fic-rubert-tiny-2 / trainer_state.json
lightsource's picture
Upload folder using huggingface_hub
ae3eae5 verified
raw
history blame
311 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1785,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028011204481792717,
"grad_norm": 0.296756774187088,
"learning_rate": 8.333333333333333e-07,
"loss": 0.1718,
"step": 1
},
{
"epoch": 0.0056022408963585435,
"grad_norm": 0.36025628447532654,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.133,
"step": 2
},
{
"epoch": 0.008403361344537815,
"grad_norm": 0.3665211796760559,
"learning_rate": 2.5e-06,
"loss": 0.1346,
"step": 3
},
{
"epoch": 0.011204481792717087,
"grad_norm": 0.47092393040657043,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2136,
"step": 4
},
{
"epoch": 0.014005602240896359,
"grad_norm": 0.5220111608505249,
"learning_rate": 4.166666666666667e-06,
"loss": 0.2203,
"step": 5
},
{
"epoch": 0.01680672268907563,
"grad_norm": 0.6161433458328247,
"learning_rate": 5e-06,
"loss": 0.0893,
"step": 6
},
{
"epoch": 0.0196078431372549,
"grad_norm": 0.34641674160957336,
"learning_rate": 5.833333333333334e-06,
"loss": 0.1721,
"step": 7
},
{
"epoch": 0.022408963585434174,
"grad_norm": 0.6245524287223816,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0883,
"step": 8
},
{
"epoch": 0.025210084033613446,
"grad_norm": 0.32467812299728394,
"learning_rate": 7.5e-06,
"loss": 0.167,
"step": 9
},
{
"epoch": 0.028011204481792718,
"grad_norm": 0.7720906138420105,
"learning_rate": 8.333333333333334e-06,
"loss": 0.2531,
"step": 10
},
{
"epoch": 0.03081232492997199,
"grad_norm": 0.38567203283309937,
"learning_rate": 9.166666666666666e-06,
"loss": 0.1309,
"step": 11
},
{
"epoch": 0.03361344537815126,
"grad_norm": 0.5268588662147522,
"learning_rate": 1e-05,
"loss": 0.2119,
"step": 12
},
{
"epoch": 0.036414565826330535,
"grad_norm": 0.3139660060405731,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.1741,
"step": 13
},
{
"epoch": 0.0392156862745098,
"grad_norm": 0.35874423384666443,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.1337,
"step": 14
},
{
"epoch": 0.04201680672268908,
"grad_norm": 0.3121141195297241,
"learning_rate": 1.25e-05,
"loss": 0.1745,
"step": 15
},
{
"epoch": 0.04481792717086835,
"grad_norm": 0.4937639534473419,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.208,
"step": 16
},
{
"epoch": 0.047619047619047616,
"grad_norm": 1.4663589000701904,
"learning_rate": 1.4166666666666668e-05,
"loss": 0.3469,
"step": 17
},
{
"epoch": 0.05042016806722689,
"grad_norm": 0.3288300633430481,
"learning_rate": 1.5e-05,
"loss": 0.1735,
"step": 18
},
{
"epoch": 0.05322128851540616,
"grad_norm": 0.7657830119132996,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.252,
"step": 19
},
{
"epoch": 0.056022408963585436,
"grad_norm": 0.36719515919685364,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1694,
"step": 20
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.2963164448738098,
"learning_rate": 1.75e-05,
"loss": 0.1647,
"step": 21
},
{
"epoch": 0.06162464985994398,
"grad_norm": 0.2958138883113861,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.1738,
"step": 22
},
{
"epoch": 0.06442577030812324,
"grad_norm": 0.6379082202911377,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.0961,
"step": 23
},
{
"epoch": 0.06722689075630252,
"grad_norm": 0.40202125906944275,
"learning_rate": 2e-05,
"loss": 0.1353,
"step": 24
},
{
"epoch": 0.0700280112044818,
"grad_norm": 0.631089448928833,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0944,
"step": 25
},
{
"epoch": 0.07282913165266107,
"grad_norm": 0.6378069519996643,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.0931,
"step": 26
},
{
"epoch": 0.07563025210084033,
"grad_norm": 0.3574202060699463,
"learning_rate": 2.25e-05,
"loss": 0.1729,
"step": 27
},
{
"epoch": 0.0784313725490196,
"grad_norm": 0.3499799370765686,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.1732,
"step": 28
},
{
"epoch": 0.08123249299719888,
"grad_norm": 0.5255885124206543,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.2186,
"step": 29
},
{
"epoch": 0.08403361344537816,
"grad_norm": 0.35638874769210815,
"learning_rate": 2.5e-05,
"loss": 0.1338,
"step": 30
},
{
"epoch": 0.08683473389355742,
"grad_norm": 0.3849603235721588,
"learning_rate": 2.5833333333333336e-05,
"loss": 0.1326,
"step": 31
},
{
"epoch": 0.0896358543417367,
"grad_norm": 0.32195600867271423,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.1715,
"step": 32
},
{
"epoch": 0.09243697478991597,
"grad_norm": 0.8319142460823059,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.2664,
"step": 33
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.7899373769760132,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.2529,
"step": 34
},
{
"epoch": 0.09803921568627451,
"grad_norm": 0.5739774107933044,
"learning_rate": 2.916666666666667e-05,
"loss": 0.0865,
"step": 35
},
{
"epoch": 0.10084033613445378,
"grad_norm": 0.3802482783794403,
"learning_rate": 3e-05,
"loss": 0.131,
"step": 36
},
{
"epoch": 0.10364145658263306,
"grad_norm": 0.4794873893260956,
"learning_rate": 3.0833333333333335e-05,
"loss": 0.2136,
"step": 37
},
{
"epoch": 0.10644257703081232,
"grad_norm": 0.3657197654247284,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.1318,
"step": 38
},
{
"epoch": 0.1092436974789916,
"grad_norm": 0.3748391270637512,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.1775,
"step": 39
},
{
"epoch": 0.11204481792717087,
"grad_norm": 0.4693293869495392,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.2114,
"step": 40
},
{
"epoch": 0.11484593837535013,
"grad_norm": 0.3253132998943329,
"learning_rate": 3.4166666666666666e-05,
"loss": 0.1775,
"step": 41
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.5953426361083984,
"learning_rate": 3.5e-05,
"loss": 0.2213,
"step": 42
},
{
"epoch": 0.12044817927170869,
"grad_norm": 0.3064591586589813,
"learning_rate": 3.5833333333333335e-05,
"loss": 0.1328,
"step": 43
},
{
"epoch": 0.12324929971988796,
"grad_norm": 0.37691906094551086,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.131,
"step": 44
},
{
"epoch": 0.12605042016806722,
"grad_norm": 1.2265459299087524,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.3374,
"step": 45
},
{
"epoch": 0.12885154061624648,
"grad_norm": 0.30182263255119324,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.1743,
"step": 46
},
{
"epoch": 0.13165266106442577,
"grad_norm": 0.3886899948120117,
"learning_rate": 3.9166666666666665e-05,
"loss": 0.131,
"step": 47
},
{
"epoch": 0.13445378151260504,
"grad_norm": 0.5219215154647827,
"learning_rate": 4e-05,
"loss": 0.2145,
"step": 48
},
{
"epoch": 0.13725490196078433,
"grad_norm": 0.2958211898803711,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.1715,
"step": 49
},
{
"epoch": 0.1400560224089636,
"grad_norm": 0.290129154920578,
"learning_rate": 4.166666666666667e-05,
"loss": 0.1739,
"step": 50
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.3508636951446533,
"learning_rate": 4.25e-05,
"loss": 0.132,
"step": 51
},
{
"epoch": 0.14565826330532214,
"grad_norm": 0.6013919711112976,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.0944,
"step": 52
},
{
"epoch": 0.1484593837535014,
"grad_norm": 0.5936094522476196,
"learning_rate": 4.4166666666666665e-05,
"loss": 0.0949,
"step": 53
},
{
"epoch": 0.15126050420168066,
"grad_norm": 0.38572150468826294,
"learning_rate": 4.5e-05,
"loss": 0.135,
"step": 54
},
{
"epoch": 0.15406162464985995,
"grad_norm": 0.2600800693035126,
"learning_rate": 4.5833333333333334e-05,
"loss": 0.1692,
"step": 55
},
{
"epoch": 0.1568627450980392,
"grad_norm": 0.3701837360858917,
"learning_rate": 4.666666666666667e-05,
"loss": 0.1291,
"step": 56
},
{
"epoch": 0.15966386554621848,
"grad_norm": 0.34277698397636414,
"learning_rate": 4.75e-05,
"loss": 0.1346,
"step": 57
},
{
"epoch": 0.16246498599439776,
"grad_norm": 0.35784104466438293,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.177,
"step": 58
},
{
"epoch": 0.16526610644257703,
"grad_norm": 0.8476487994194031,
"learning_rate": 4.9166666666666665e-05,
"loss": 0.2581,
"step": 59
},
{
"epoch": 0.16806722689075632,
"grad_norm": 0.8182944655418396,
"learning_rate": 5e-05,
"loss": 0.2639,
"step": 60
},
{
"epoch": 0.17086834733893558,
"grad_norm": 0.29115599393844604,
"learning_rate": 4.999995853979553e-05,
"loss": 0.1691,
"step": 61
},
{
"epoch": 0.17366946778711484,
"grad_norm": 0.7699853777885437,
"learning_rate": 4.9999834159319655e-05,
"loss": 0.264,
"step": 62
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.7362309694290161,
"learning_rate": 4.999962685898491e-05,
"loss": 0.2508,
"step": 63
},
{
"epoch": 0.1792717086834734,
"grad_norm": 0.4930090010166168,
"learning_rate": 4.999933663947886e-05,
"loss": 0.2159,
"step": 64
},
{
"epoch": 0.18207282913165265,
"grad_norm": 0.3581504821777344,
"learning_rate": 4.999896350176413e-05,
"loss": 0.1278,
"step": 65
},
{
"epoch": 0.18487394957983194,
"grad_norm": 0.5248085260391235,
"learning_rate": 4.999850744707835e-05,
"loss": 0.0941,
"step": 66
},
{
"epoch": 0.1876750700280112,
"grad_norm": 0.3348913788795471,
"learning_rate": 4.9997968476934156e-05,
"loss": 0.1326,
"step": 67
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.5374695658683777,
"learning_rate": 4.999734659311921e-05,
"loss": 0.095,
"step": 68
},
{
"epoch": 0.19327731092436976,
"grad_norm": 0.28596481680870056,
"learning_rate": 4.9996641797696206e-05,
"loss": 0.1703,
"step": 69
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.3090248703956604,
"learning_rate": 4.999585409300281e-05,
"loss": 0.1358,
"step": 70
},
{
"epoch": 0.19887955182072828,
"grad_norm": 0.3411775827407837,
"learning_rate": 4.999498348165169e-05,
"loss": 0.1323,
"step": 71
},
{
"epoch": 0.20168067226890757,
"grad_norm": 0.32754117250442505,
"learning_rate": 4.999402996653051e-05,
"loss": 0.1324,
"step": 72
},
{
"epoch": 0.20448179271708683,
"grad_norm": 0.3174399137496948,
"learning_rate": 4.9992993550801905e-05,
"loss": 0.1304,
"step": 73
},
{
"epoch": 0.20728291316526612,
"grad_norm": 0.286386102437973,
"learning_rate": 4.999187423790347e-05,
"loss": 0.1289,
"step": 74
},
{
"epoch": 0.21008403361344538,
"grad_norm": 0.6507259607315063,
"learning_rate": 4.999067203154777e-05,
"loss": 0.2545,
"step": 75
},
{
"epoch": 0.21288515406162464,
"grad_norm": 0.2852478623390198,
"learning_rate": 4.998938693572229e-05,
"loss": 0.1309,
"step": 76
},
{
"epoch": 0.21568627450980393,
"grad_norm": 0.2769582271575928,
"learning_rate": 4.9988018954689465e-05,
"loss": 0.1694,
"step": 77
},
{
"epoch": 0.2184873949579832,
"grad_norm": 0.3016033172607422,
"learning_rate": 4.998656809298663e-05,
"loss": 0.1752,
"step": 78
},
{
"epoch": 0.22128851540616246,
"grad_norm": 0.2839262783527374,
"learning_rate": 4.998503435542604e-05,
"loss": 0.173,
"step": 79
},
{
"epoch": 0.22408963585434175,
"grad_norm": 0.4902794659137726,
"learning_rate": 4.9983417747094816e-05,
"loss": 0.0811,
"step": 80
},
{
"epoch": 0.226890756302521,
"grad_norm": 0.26926159858703613,
"learning_rate": 4.998171827335494e-05,
"loss": 0.1241,
"step": 81
},
{
"epoch": 0.22969187675070027,
"grad_norm": 0.2879544198513031,
"learning_rate": 4.997993593984327e-05,
"loss": 0.1698,
"step": 82
},
{
"epoch": 0.23249299719887956,
"grad_norm": 0.4781981110572815,
"learning_rate": 4.997807075247146e-05,
"loss": 0.0775,
"step": 83
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.5029699206352234,
"learning_rate": 4.997612271742601e-05,
"loss": 0.215,
"step": 84
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.27541235089302063,
"learning_rate": 4.9974091841168195e-05,
"loss": 0.1675,
"step": 85
},
{
"epoch": 0.24089635854341737,
"grad_norm": 0.25892165303230286,
"learning_rate": 4.997197813043404e-05,
"loss": 0.1221,
"step": 86
},
{
"epoch": 0.24369747899159663,
"grad_norm": 0.47153759002685547,
"learning_rate": 4.996978159223436e-05,
"loss": 0.0734,
"step": 87
},
{
"epoch": 0.24649859943977592,
"grad_norm": 0.3439052402973175,
"learning_rate": 4.9967502233854654e-05,
"loss": 0.1698,
"step": 88
},
{
"epoch": 0.24929971988795518,
"grad_norm": 1.2146350145339966,
"learning_rate": 4.996514006285514e-05,
"loss": 0.3275,
"step": 89
},
{
"epoch": 0.25210084033613445,
"grad_norm": 0.5989933013916016,
"learning_rate": 4.99626950870707e-05,
"loss": 0.2192,
"step": 90
},
{
"epoch": 0.2549019607843137,
"grad_norm": 0.481522798538208,
"learning_rate": 4.996016731461088e-05,
"loss": 0.0737,
"step": 91
},
{
"epoch": 0.25770308123249297,
"grad_norm": 0.8134711384773254,
"learning_rate": 4.995755675385982e-05,
"loss": 0.2394,
"step": 92
},
{
"epoch": 0.2605042016806723,
"grad_norm": 0.3162464201450348,
"learning_rate": 4.995486341347629e-05,
"loss": 0.1734,
"step": 93
},
{
"epoch": 0.26330532212885155,
"grad_norm": 1.2472445964813232,
"learning_rate": 4.99520873023936e-05,
"loss": 0.331,
"step": 94
},
{
"epoch": 0.2661064425770308,
"grad_norm": 0.4896290600299835,
"learning_rate": 4.994922842981958e-05,
"loss": 0.0775,
"step": 95
},
{
"epoch": 0.2689075630252101,
"grad_norm": 0.6393519043922424,
"learning_rate": 4.9946286805236616e-05,
"loss": 0.2254,
"step": 96
},
{
"epoch": 0.27170868347338933,
"grad_norm": 0.3101232945919037,
"learning_rate": 4.994326243840153e-05,
"loss": 0.1741,
"step": 97
},
{
"epoch": 0.27450980392156865,
"grad_norm": 0.396138995885849,
"learning_rate": 4.994015533934557e-05,
"loss": 0.2115,
"step": 98
},
{
"epoch": 0.2773109243697479,
"grad_norm": 0.4914068877696991,
"learning_rate": 4.993696551837444e-05,
"loss": 0.2207,
"step": 99
},
{
"epoch": 0.2801120448179272,
"grad_norm": 0.31758832931518555,
"learning_rate": 4.9933692986068165e-05,
"loss": 0.1266,
"step": 100
},
{
"epoch": 0.28291316526610644,
"grad_norm": 0.24751000106334686,
"learning_rate": 4.993033775328115e-05,
"loss": 0.1694,
"step": 101
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.2383459210395813,
"learning_rate": 4.992689983114209e-05,
"loss": 0.17,
"step": 102
},
{
"epoch": 0.28851540616246496,
"grad_norm": 0.3043301999568939,
"learning_rate": 4.9923379231053925e-05,
"loss": 0.1747,
"step": 103
},
{
"epoch": 0.2913165266106443,
"grad_norm": 0.311767041683197,
"learning_rate": 4.9919775964693854e-05,
"loss": 0.1345,
"step": 104
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.9271151423454285,
"learning_rate": 4.991609004401324e-05,
"loss": 0.3113,
"step": 105
},
{
"epoch": 0.2969187675070028,
"grad_norm": 0.24133501946926117,
"learning_rate": 4.991232148123761e-05,
"loss": 0.1685,
"step": 106
},
{
"epoch": 0.29971988795518206,
"grad_norm": 0.7193834781646729,
"learning_rate": 4.99084702888666e-05,
"loss": 0.2805,
"step": 107
},
{
"epoch": 0.3025210084033613,
"grad_norm": 0.21335840225219727,
"learning_rate": 4.990453647967389e-05,
"loss": 0.1717,
"step": 108
},
{
"epoch": 0.30532212885154064,
"grad_norm": 0.27446791529655457,
"learning_rate": 4.9900520066707215e-05,
"loss": 0.2081,
"step": 109
},
{
"epoch": 0.3081232492997199,
"grad_norm": 0.5605149269104004,
"learning_rate": 4.9896421063288286e-05,
"loss": 0.1085,
"step": 110
},
{
"epoch": 0.31092436974789917,
"grad_norm": 0.2565538287162781,
"learning_rate": 4.989223948301273e-05,
"loss": 0.1717,
"step": 111
},
{
"epoch": 0.3137254901960784,
"grad_norm": 0.5685631632804871,
"learning_rate": 4.988797533975009e-05,
"loss": 0.1095,
"step": 112
},
{
"epoch": 0.3165266106442577,
"grad_norm": 0.25811776518821716,
"learning_rate": 4.9883628647643744e-05,
"loss": 0.173,
"step": 113
},
{
"epoch": 0.31932773109243695,
"grad_norm": 0.3600546717643738,
"learning_rate": 4.9879199421110865e-05,
"loss": 0.1432,
"step": 114
},
{
"epoch": 0.32212885154061627,
"grad_norm": 0.22939907014369965,
"learning_rate": 4.98746876748424e-05,
"loss": 0.2009,
"step": 115
},
{
"epoch": 0.32492997198879553,
"grad_norm": 0.3546235263347626,
"learning_rate": 4.9870093423802964e-05,
"loss": 0.1419,
"step": 116
},
{
"epoch": 0.3277310924369748,
"grad_norm": 0.2727724015712738,
"learning_rate": 4.986541668323086e-05,
"loss": 0.2094,
"step": 117
},
{
"epoch": 0.33053221288515405,
"grad_norm": 0.2383224368095398,
"learning_rate": 4.986065746863797e-05,
"loss": 0.1735,
"step": 118
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.23489497601985931,
"learning_rate": 4.985581579580973e-05,
"loss": 0.1738,
"step": 119
},
{
"epoch": 0.33613445378151263,
"grad_norm": 0.3256724178791046,
"learning_rate": 4.985089168080509e-05,
"loss": 0.1336,
"step": 120
},
{
"epoch": 0.3389355742296919,
"grad_norm": 0.2290939837694168,
"learning_rate": 4.9845885139956435e-05,
"loss": 0.1732,
"step": 121
},
{
"epoch": 0.34173669467787116,
"grad_norm": 0.2946733236312866,
"learning_rate": 4.984079618986953e-05,
"loss": 0.1377,
"step": 122
},
{
"epoch": 0.3445378151260504,
"grad_norm": 0.9020264744758606,
"learning_rate": 4.983562484742349e-05,
"loss": 0.3229,
"step": 123
},
{
"epoch": 0.3473389355742297,
"grad_norm": 0.2107078731060028,
"learning_rate": 4.983037112977072e-05,
"loss": 0.1738,
"step": 124
},
{
"epoch": 0.35014005602240894,
"grad_norm": 0.21474942564964294,
"learning_rate": 4.982503505433683e-05,
"loss": 0.1698,
"step": 125
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.3706744909286499,
"learning_rate": 4.98196166388206e-05,
"loss": 0.2061,
"step": 126
},
{
"epoch": 0.3557422969187675,
"grad_norm": 0.19645719230175018,
"learning_rate": 4.981411590119392e-05,
"loss": 0.1687,
"step": 127
},
{
"epoch": 0.3585434173669468,
"grad_norm": 0.4788144528865814,
"learning_rate": 4.980853285970173e-05,
"loss": 0.097,
"step": 128
},
{
"epoch": 0.36134453781512604,
"grad_norm": 0.30618858337402344,
"learning_rate": 4.980286753286195e-05,
"loss": 0.1367,
"step": 129
},
{
"epoch": 0.3641456582633053,
"grad_norm": 0.5136687159538269,
"learning_rate": 4.979711993946543e-05,
"loss": 0.2502,
"step": 130
},
{
"epoch": 0.36694677871148457,
"grad_norm": 0.38218584656715393,
"learning_rate": 4.9791290098575876e-05,
"loss": 0.2166,
"step": 131
},
{
"epoch": 0.3697478991596639,
"grad_norm": 0.3716464936733246,
"learning_rate": 4.978537802952981e-05,
"loss": 0.2071,
"step": 132
},
{
"epoch": 0.37254901960784315,
"grad_norm": 0.21404078602790833,
"learning_rate": 4.977938375193648e-05,
"loss": 0.177,
"step": 133
},
{
"epoch": 0.3753501400560224,
"grad_norm": 0.3054008483886719,
"learning_rate": 4.9773307285677785e-05,
"loss": 0.212,
"step": 134
},
{
"epoch": 0.37815126050420167,
"grad_norm": 0.7401143312454224,
"learning_rate": 4.976714865090827e-05,
"loss": 0.278,
"step": 135
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.3005881607532501,
"learning_rate": 4.976090786805498e-05,
"loss": 0.2057,
"step": 136
},
{
"epoch": 0.38375350140056025,
"grad_norm": 1.065142035484314,
"learning_rate": 4.975458495781746e-05,
"loss": 0.3504,
"step": 137
},
{
"epoch": 0.3865546218487395,
"grad_norm": 0.291591614484787,
"learning_rate": 4.974817994116764e-05,
"loss": 0.2105,
"step": 138
},
{
"epoch": 0.38935574229691877,
"grad_norm": 0.21302799880504608,
"learning_rate": 4.9741692839349765e-05,
"loss": 0.1722,
"step": 139
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.3730943500995636,
"learning_rate": 4.973512367388038e-05,
"loss": 0.2368,
"step": 140
},
{
"epoch": 0.3949579831932773,
"grad_norm": 0.25385361909866333,
"learning_rate": 4.9728472466548194e-05,
"loss": 0.2002,
"step": 141
},
{
"epoch": 0.39775910364145656,
"grad_norm": 0.25864890217781067,
"learning_rate": 4.9721739239414034e-05,
"loss": 0.1719,
"step": 142
},
{
"epoch": 0.4005602240896359,
"grad_norm": 0.44104957580566406,
"learning_rate": 4.971492401481079e-05,
"loss": 0.1467,
"step": 143
},
{
"epoch": 0.40336134453781514,
"grad_norm": 0.6197791695594788,
"learning_rate": 4.9708026815343314e-05,
"loss": 0.1173,
"step": 144
},
{
"epoch": 0.4061624649859944,
"grad_norm": 0.6132116317749023,
"learning_rate": 4.970104766388832e-05,
"loss": 0.1187,
"step": 145
},
{
"epoch": 0.40896358543417366,
"grad_norm": 0.28150931000709534,
"learning_rate": 4.969398658359441e-05,
"loss": 0.1739,
"step": 146
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.41110119223594666,
"learning_rate": 4.968684359788187e-05,
"loss": 0.1441,
"step": 147
},
{
"epoch": 0.41456582633053224,
"grad_norm": 0.2573298513889313,
"learning_rate": 4.967961873044267e-05,
"loss": 0.2043,
"step": 148
},
{
"epoch": 0.4173669467787115,
"grad_norm": 0.5403711199760437,
"learning_rate": 4.967231200524037e-05,
"loss": 0.1091,
"step": 149
},
{
"epoch": 0.42016806722689076,
"grad_norm": 0.25525861978530884,
"learning_rate": 4.966492344651005e-05,
"loss": 0.2077,
"step": 150
},
{
"epoch": 0.42296918767507,
"grad_norm": 0.2501123547554016,
"learning_rate": 4.965745307875819e-05,
"loss": 0.2061,
"step": 151
},
{
"epoch": 0.4257703081232493,
"grad_norm": 0.2235931009054184,
"learning_rate": 4.964990092676263e-05,
"loss": 0.1766,
"step": 152
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.3031710386276245,
"learning_rate": 4.9642267015572464e-05,
"loss": 0.1371,
"step": 153
},
{
"epoch": 0.43137254901960786,
"grad_norm": 0.48257577419281006,
"learning_rate": 4.9634551370507986e-05,
"loss": 0.0987,
"step": 154
},
{
"epoch": 0.4341736694677871,
"grad_norm": 0.254314124584198,
"learning_rate": 4.962675401716056e-05,
"loss": 0.1627,
"step": 155
},
{
"epoch": 0.4369747899159664,
"grad_norm": 0.46130886673927307,
"learning_rate": 4.9618874981392596e-05,
"loss": 0.0952,
"step": 156
},
{
"epoch": 0.43977591036414565,
"grad_norm": 0.48739495873451233,
"learning_rate": 4.961091428933738e-05,
"loss": 0.2433,
"step": 157
},
{
"epoch": 0.4425770308123249,
"grad_norm": 0.3604317307472229,
"learning_rate": 4.96028719673991e-05,
"loss": 0.2135,
"step": 158
},
{
"epoch": 0.44537815126050423,
"grad_norm": 0.31823596358299255,
"learning_rate": 4.959474804225263e-05,
"loss": 0.209,
"step": 159
},
{
"epoch": 0.4481792717086835,
"grad_norm": 0.4376252591609955,
"learning_rate": 4.958654254084355e-05,
"loss": 0.0907,
"step": 160
},
{
"epoch": 0.45098039215686275,
"grad_norm": 0.5917844772338867,
"learning_rate": 4.9578255490388007e-05,
"loss": 0.2649,
"step": 161
},
{
"epoch": 0.453781512605042,
"grad_norm": 0.33242353796958923,
"learning_rate": 4.956988691837262e-05,
"loss": 0.2071,
"step": 162
},
{
"epoch": 0.4565826330532213,
"grad_norm": 0.19680732488632202,
"learning_rate": 4.956143685255441e-05,
"loss": 0.1706,
"step": 163
},
{
"epoch": 0.45938375350140054,
"grad_norm": 0.19730661809444427,
"learning_rate": 4.955290532096068e-05,
"loss": 0.1656,
"step": 164
},
{
"epoch": 0.46218487394957986,
"grad_norm": 0.24366319179534912,
"learning_rate": 4.9544292351888966e-05,
"loss": 0.129,
"step": 165
},
{
"epoch": 0.4649859943977591,
"grad_norm": 0.20081248879432678,
"learning_rate": 4.95355979739069e-05,
"loss": 0.1659,
"step": 166
},
{
"epoch": 0.4677871148459384,
"grad_norm": 0.24853447079658508,
"learning_rate": 4.9526822215852145e-05,
"loss": 0.1797,
"step": 167
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.3966836929321289,
"learning_rate": 4.951796510683227e-05,
"loss": 0.2161,
"step": 168
},
{
"epoch": 0.4733893557422969,
"grad_norm": 1.1678446531295776,
"learning_rate": 4.950902667622468e-05,
"loss": 0.3802,
"step": 169
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.4717535674571991,
"learning_rate": 4.95000069536765e-05,
"loss": 0.222,
"step": 170
},
{
"epoch": 0.4789915966386555,
"grad_norm": 0.2482030987739563,
"learning_rate": 4.9490905969104514e-05,
"loss": 0.1356,
"step": 171
},
{
"epoch": 0.48179271708683474,
"grad_norm": 0.20962095260620117,
"learning_rate": 4.9481723752695006e-05,
"loss": 0.1644,
"step": 172
},
{
"epoch": 0.484593837535014,
"grad_norm": 0.46536725759506226,
"learning_rate": 4.9472460334903703e-05,
"loss": 0.2479,
"step": 173
},
{
"epoch": 0.48739495798319327,
"grad_norm": 0.4335561990737915,
"learning_rate": 4.946311574645565e-05,
"loss": 0.0947,
"step": 174
},
{
"epoch": 0.49019607843137253,
"grad_norm": 0.4413890838623047,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.0965,
"step": 175
},
{
"epoch": 0.49299719887955185,
"grad_norm": 0.17996694147586823,
"learning_rate": 4.944418318183559e-05,
"loss": 0.1686,
"step": 176
},
{
"epoch": 0.4957983193277311,
"grad_norm": 0.17904537916183472,
"learning_rate": 4.943459526845942e-05,
"loss": 0.171,
"step": 177
},
{
"epoch": 0.49859943977591037,
"grad_norm": 0.4312843382358551,
"learning_rate": 4.9424926310017975e-05,
"loss": 0.0961,
"step": 178
},
{
"epoch": 0.5014005602240896,
"grad_norm": 0.27155429124832153,
"learning_rate": 4.941517633858141e-05,
"loss": 0.2079,
"step": 179
},
{
"epoch": 0.5042016806722689,
"grad_norm": 0.2025201916694641,
"learning_rate": 4.9405345386488614e-05,
"loss": 0.1742,
"step": 180
},
{
"epoch": 0.5070028011204482,
"grad_norm": 0.20862695574760437,
"learning_rate": 4.939543348634703e-05,
"loss": 0.1693,
"step": 181
},
{
"epoch": 0.5098039215686274,
"grad_norm": 0.43479472398757935,
"learning_rate": 4.9385440671032626e-05,
"loss": 0.0942,
"step": 182
},
{
"epoch": 0.5126050420168067,
"grad_norm": 0.2722318470478058,
"learning_rate": 4.937536697368971e-05,
"loss": 0.2083,
"step": 183
},
{
"epoch": 0.5154061624649859,
"grad_norm": 0.4572315216064453,
"learning_rate": 4.936521242773091e-05,
"loss": 0.2492,
"step": 184
},
{
"epoch": 0.5182072829131653,
"grad_norm": 0.5211023092269897,
"learning_rate": 4.9354977066836986e-05,
"loss": 0.2559,
"step": 185
},
{
"epoch": 0.5210084033613446,
"grad_norm": 0.4220978319644928,
"learning_rate": 4.934466092495673e-05,
"loss": 0.0944,
"step": 186
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.25524213910102844,
"learning_rate": 4.9334264036306916e-05,
"loss": 0.1332,
"step": 187
},
{
"epoch": 0.5266106442577031,
"grad_norm": 0.1635034680366516,
"learning_rate": 4.93237864353721e-05,
"loss": 0.1691,
"step": 188
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.1833125352859497,
"learning_rate": 4.931322815690457e-05,
"loss": 0.172,
"step": 189
},
{
"epoch": 0.5322128851540616,
"grad_norm": 0.22643063962459564,
"learning_rate": 4.930258923592418e-05,
"loss": 0.1768,
"step": 190
},
{
"epoch": 0.5350140056022409,
"grad_norm": 0.4311389923095703,
"learning_rate": 4.9291869707718304e-05,
"loss": 0.094,
"step": 191
},
{
"epoch": 0.5378151260504201,
"grad_norm": 0.41472041606903076,
"learning_rate": 4.9281069607841624e-05,
"loss": 0.0918,
"step": 192
},
{
"epoch": 0.5406162464985994,
"grad_norm": 0.19142650067806244,
"learning_rate": 4.927018897211609e-05,
"loss": 0.1737,
"step": 193
},
{
"epoch": 0.5434173669467787,
"grad_norm": 0.27896803617477417,
"learning_rate": 4.925922783663079e-05,
"loss": 0.2096,
"step": 194
},
{
"epoch": 0.5462184873949579,
"grad_norm": 0.2846761643886566,
"learning_rate": 4.924818623774178e-05,
"loss": 0.1254,
"step": 195
},
{
"epoch": 0.5490196078431373,
"grad_norm": 0.1776442974805832,
"learning_rate": 4.923706421207202e-05,
"loss": 0.1715,
"step": 196
},
{
"epoch": 0.5518207282913166,
"grad_norm": 0.2530427873134613,
"learning_rate": 4.922586179651124e-05,
"loss": 0.1286,
"step": 197
},
{
"epoch": 0.5546218487394958,
"grad_norm": 0.3111589550971985,
"learning_rate": 4.9214579028215776e-05,
"loss": 0.209,
"step": 198
},
{
"epoch": 0.5574229691876751,
"grad_norm": 0.4025686979293823,
"learning_rate": 4.920321594460852e-05,
"loss": 0.0844,
"step": 199
},
{
"epoch": 0.5602240896358543,
"grad_norm": 0.4007948040962219,
"learning_rate": 4.9191772583378705e-05,
"loss": 0.0839,
"step": 200
},
{
"epoch": 0.5630252100840336,
"grad_norm": 0.24552220106124878,
"learning_rate": 4.9180248982481876e-05,
"loss": 0.1228,
"step": 201
},
{
"epoch": 0.5658263305322129,
"grad_norm": 0.3994346261024475,
"learning_rate": 4.916864518013971e-05,
"loss": 0.0792,
"step": 202
},
{
"epoch": 0.5686274509803921,
"grad_norm": 0.24182085692882538,
"learning_rate": 4.915696121483986e-05,
"loss": 0.1216,
"step": 203
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.6845493912696838,
"learning_rate": 4.9145197125335916e-05,
"loss": 0.271,
"step": 204
},
{
"epoch": 0.5742296918767507,
"grad_norm": 0.35140976309776306,
"learning_rate": 4.91333529506472e-05,
"loss": 0.1824,
"step": 205
},
{
"epoch": 0.5770308123249299,
"grad_norm": 0.2534499168395996,
"learning_rate": 4.912142873005866e-05,
"loss": 0.1623,
"step": 206
},
{
"epoch": 0.5798319327731093,
"grad_norm": 0.21057064831256866,
"learning_rate": 4.910942450312075e-05,
"loss": 0.1185,
"step": 207
},
{
"epoch": 0.5826330532212886,
"grad_norm": 0.2223557084798813,
"learning_rate": 4.909734030964929e-05,
"loss": 0.1259,
"step": 208
},
{
"epoch": 0.5854341736694678,
"grad_norm": 0.2970251739025116,
"learning_rate": 4.9085176189725314e-05,
"loss": 0.1735,
"step": 209
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.8583146929740906,
"learning_rate": 4.907293218369499e-05,
"loss": 0.2865,
"step": 210
},
{
"epoch": 0.5910364145658263,
"grad_norm": 0.2618134617805481,
"learning_rate": 4.906060833216942e-05,
"loss": 0.1733,
"step": 211
},
{
"epoch": 0.5938375350140056,
"grad_norm": 0.3876262605190277,
"learning_rate": 4.904820467602458e-05,
"loss": 0.0753,
"step": 212
},
{
"epoch": 0.5966386554621849,
"grad_norm": 0.7885448932647705,
"learning_rate": 4.90357212564011e-05,
"loss": 0.2769,
"step": 213
},
{
"epoch": 0.5994397759103641,
"grad_norm": 0.44926995038986206,
"learning_rate": 4.9023158114704206e-05,
"loss": 0.2188,
"step": 214
},
{
"epoch": 0.6022408963585434,
"grad_norm": 1.0731271505355835,
"learning_rate": 4.901051529260352e-05,
"loss": 0.3633,
"step": 215
},
{
"epoch": 0.6050420168067226,
"grad_norm": 0.38666459918022156,
"learning_rate": 4.899779283203296e-05,
"loss": 0.0786,
"step": 216
},
{
"epoch": 0.6078431372549019,
"grad_norm": 0.40130844712257385,
"learning_rate": 4.89849907751906e-05,
"loss": 0.2174,
"step": 217
},
{
"epoch": 0.6106442577030813,
"grad_norm": 0.22078561782836914,
"learning_rate": 4.897210916453851e-05,
"loss": 0.131,
"step": 218
},
{
"epoch": 0.6134453781512605,
"grad_norm": 0.3916260600090027,
"learning_rate": 4.895914804280262e-05,
"loss": 0.0827,
"step": 219
},
{
"epoch": 0.6162464985994398,
"grad_norm": 0.5899614095687866,
"learning_rate": 4.89461074529726e-05,
"loss": 0.2647,
"step": 220
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.37688153982162476,
"learning_rate": 4.893298743830168e-05,
"loss": 0.2185,
"step": 221
},
{
"epoch": 0.6218487394957983,
"grad_norm": 0.19744780659675598,
"learning_rate": 4.891978804230656e-05,
"loss": 0.1755,
"step": 222
},
{
"epoch": 0.6246498599439776,
"grad_norm": 0.23267637193202972,
"learning_rate": 4.890650930876719e-05,
"loss": 0.1316,
"step": 223
},
{
"epoch": 0.6274509803921569,
"grad_norm": 0.1993289738893509,
"learning_rate": 4.889315128172669e-05,
"loss": 0.1755,
"step": 224
},
{
"epoch": 0.6302521008403361,
"grad_norm": 0.22977744042873383,
"learning_rate": 4.88797140054912e-05,
"loss": 0.1308,
"step": 225
},
{
"epoch": 0.6330532212885154,
"grad_norm": 0.3395256996154785,
"learning_rate": 4.88661975246297e-05,
"loss": 0.2184,
"step": 226
},
{
"epoch": 0.6358543417366946,
"grad_norm": 0.276090145111084,
"learning_rate": 4.8852601883973846e-05,
"loss": 0.1294,
"step": 227
},
{
"epoch": 0.6386554621848739,
"grad_norm": 0.2874520421028137,
"learning_rate": 4.883892712861791e-05,
"loss": 0.2176,
"step": 228
},
{
"epoch": 0.6414565826330533,
"grad_norm": 0.3347153663635254,
"learning_rate": 4.882517330391854e-05,
"loss": 0.2182,
"step": 229
},
{
"epoch": 0.6442577030812325,
"grad_norm": 0.40832483768463135,
"learning_rate": 4.8811340455494624e-05,
"loss": 0.0955,
"step": 230
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.2539953589439392,
"learning_rate": 4.879742862922721e-05,
"loss": 0.1308,
"step": 231
},
{
"epoch": 0.6498599439775911,
"grad_norm": 0.26753100752830505,
"learning_rate": 4.8783437871259254e-05,
"loss": 0.2096,
"step": 232
},
{
"epoch": 0.6526610644257703,
"grad_norm": 0.16798768937587738,
"learning_rate": 4.876936822799553e-05,
"loss": 0.1713,
"step": 233
},
{
"epoch": 0.6554621848739496,
"grad_norm": 0.2817080020904541,
"learning_rate": 4.875521974610247e-05,
"loss": 0.2118,
"step": 234
},
{
"epoch": 0.6582633053221288,
"grad_norm": 0.40791934728622437,
"learning_rate": 4.874099247250798e-05,
"loss": 0.0928,
"step": 235
},
{
"epoch": 0.6610644257703081,
"grad_norm": 0.24818055331707,
"learning_rate": 4.8726686454401325e-05,
"loss": 0.2097,
"step": 236
},
{
"epoch": 0.6638655462184874,
"grad_norm": 0.420173317193985,
"learning_rate": 4.8712301739232935e-05,
"loss": 0.2445,
"step": 237
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2433900535106659,
"learning_rate": 4.869783837471427e-05,
"loss": 0.1331,
"step": 238
},
{
"epoch": 0.6694677871148459,
"grad_norm": 0.3094242811203003,
"learning_rate": 4.8683296408817644e-05,
"loss": 0.2125,
"step": 239
},
{
"epoch": 0.6722689075630253,
"grad_norm": 0.24864792823791504,
"learning_rate": 4.8668675889776095e-05,
"loss": 0.1326,
"step": 240
},
{
"epoch": 0.6750700280112045,
"grad_norm": 0.16234812140464783,
"learning_rate": 4.8653976866083206e-05,
"loss": 0.17,
"step": 241
},
{
"epoch": 0.6778711484593838,
"grad_norm": 0.30067870020866394,
"learning_rate": 4.863919938649293e-05,
"loss": 0.206,
"step": 242
},
{
"epoch": 0.680672268907563,
"grad_norm": 0.43154123425483704,
"learning_rate": 4.862434350001945e-05,
"loss": 0.2505,
"step": 243
},
{
"epoch": 0.6834733893557423,
"grad_norm": 0.4572393596172333,
"learning_rate": 4.860940925593703e-05,
"loss": 0.2417,
"step": 244
},
{
"epoch": 0.6862745098039216,
"grad_norm": 0.2604037821292877,
"learning_rate": 4.85943967037798e-05,
"loss": 0.1362,
"step": 245
},
{
"epoch": 0.6890756302521008,
"grad_norm": 0.2577713429927826,
"learning_rate": 4.857930589334164e-05,
"loss": 0.1339,
"step": 246
},
{
"epoch": 0.6918767507002801,
"grad_norm": 0.17431576550006866,
"learning_rate": 4.8564136874676e-05,
"loss": 0.1719,
"step": 247
},
{
"epoch": 0.6946778711484594,
"grad_norm": 0.271586537361145,
"learning_rate": 4.854888969809573e-05,
"loss": 0.1349,
"step": 248
},
{
"epoch": 0.6974789915966386,
"grad_norm": 0.2800721824169159,
"learning_rate": 4.8533564414172915e-05,
"loss": 0.1354,
"step": 249
},
{
"epoch": 0.7002801120448179,
"grad_norm": 0.388351172208786,
"learning_rate": 4.851816107373871e-05,
"loss": 0.2419,
"step": 250
},
{
"epoch": 0.7030812324929971,
"grad_norm": 0.2688753604888916,
"learning_rate": 4.850267972788316e-05,
"loss": 0.1368,
"step": 251
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.43289467692375183,
"learning_rate": 4.848712042795505e-05,
"loss": 0.0977,
"step": 252
},
{
"epoch": 0.7086834733893558,
"grad_norm": 0.3105364739894867,
"learning_rate": 4.847148322556171e-05,
"loss": 0.218,
"step": 253
},
{
"epoch": 0.711484593837535,
"grad_norm": 0.44322580099105835,
"learning_rate": 4.8455768172568886e-05,
"loss": 0.2419,
"step": 254
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.8140190839767456,
"learning_rate": 4.843997532110051e-05,
"loss": 0.3223,
"step": 255
},
{
"epoch": 0.7170868347338936,
"grad_norm": 0.26502156257629395,
"learning_rate": 4.842410472353858e-05,
"loss": 0.1313,
"step": 256
},
{
"epoch": 0.7198879551820728,
"grad_norm": 0.8722953200340271,
"learning_rate": 4.840815643252294e-05,
"loss": 0.3257,
"step": 257
},
{
"epoch": 0.7226890756302521,
"grad_norm": 0.2759450376033783,
"learning_rate": 4.839213050095116e-05,
"loss": 0.1385,
"step": 258
},
{
"epoch": 0.7254901960784313,
"grad_norm": 0.17727164924144745,
"learning_rate": 4.83760269819783e-05,
"loss": 0.1713,
"step": 259
},
{
"epoch": 0.7282913165266106,
"grad_norm": 0.20785532891750336,
"learning_rate": 4.835984592901678e-05,
"loss": 0.1776,
"step": 260
},
{
"epoch": 0.7310924369747899,
"grad_norm": 0.5836495757102966,
"learning_rate": 4.834358739573618e-05,
"loss": 0.2779,
"step": 261
},
{
"epoch": 0.7338935574229691,
"grad_norm": 0.33494001626968384,
"learning_rate": 4.8327251436063064e-05,
"loss": 0.2319,
"step": 262
},
{
"epoch": 0.7366946778711485,
"grad_norm": 0.29367175698280334,
"learning_rate": 4.831083810418082e-05,
"loss": 0.1383,
"step": 263
},
{
"epoch": 0.7394957983193278,
"grad_norm": 0.31937041878700256,
"learning_rate": 4.8294347454529445e-05,
"loss": 0.1407,
"step": 264
},
{
"epoch": 0.742296918767507,
"grad_norm": 0.48022958636283875,
"learning_rate": 4.82777795418054e-05,
"loss": 0.1067,
"step": 265
},
{
"epoch": 0.7450980392156863,
"grad_norm": 0.48949047923088074,
"learning_rate": 4.826113442096141e-05,
"loss": 0.1079,
"step": 266
},
{
"epoch": 0.7478991596638656,
"grad_norm": 0.5554994344711304,
"learning_rate": 4.8244412147206284e-05,
"loss": 0.2744,
"step": 267
},
{
"epoch": 0.7507002801120448,
"grad_norm": 0.22763153910636902,
"learning_rate": 4.822761277600474e-05,
"loss": 0.2015,
"step": 268
},
{
"epoch": 0.7535014005602241,
"grad_norm": 0.45823562145233154,
"learning_rate": 4.821073636307719e-05,
"loss": 0.1057,
"step": 269
},
{
"epoch": 0.7563025210084033,
"grad_norm": 0.21038657426834106,
"learning_rate": 4.819378296439961e-05,
"loss": 0.1802,
"step": 270
},
{
"epoch": 0.7591036414565826,
"grad_norm": 0.21373850107192993,
"learning_rate": 4.8176752636203314e-05,
"loss": 0.1706,
"step": 271
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.2585920989513397,
"learning_rate": 4.815964543497476e-05,
"loss": 0.2091,
"step": 272
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.4250545799732208,
"learning_rate": 4.81424614174554e-05,
"loss": 0.2504,
"step": 273
},
{
"epoch": 0.7675070028011205,
"grad_norm": 0.350320041179657,
"learning_rate": 4.8125200640641455e-05,
"loss": 0.2373,
"step": 274
},
{
"epoch": 0.7703081232492998,
"grad_norm": 0.2049403339624405,
"learning_rate": 4.8107863161783773e-05,
"loss": 0.1697,
"step": 275
},
{
"epoch": 0.773109243697479,
"grad_norm": 0.20530298352241516,
"learning_rate": 4.8090449038387564e-05,
"loss": 0.1794,
"step": 276
},
{
"epoch": 0.7759103641456583,
"grad_norm": 0.18878710269927979,
"learning_rate": 4.8072958328212294e-05,
"loss": 0.1672,
"step": 277
},
{
"epoch": 0.7787114845938375,
"grad_norm": 0.23467376828193665,
"learning_rate": 4.805539108927142e-05,
"loss": 0.2107,
"step": 278
},
{
"epoch": 0.7815126050420168,
"grad_norm": 0.3365757465362549,
"learning_rate": 4.8037747379832265e-05,
"loss": 0.1352,
"step": 279
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.47438693046569824,
"learning_rate": 4.8020027258415764e-05,
"loss": 0.1032,
"step": 280
},
{
"epoch": 0.7871148459383753,
"grad_norm": 0.29125773906707764,
"learning_rate": 4.8002230783796315e-05,
"loss": 0.1374,
"step": 281
},
{
"epoch": 0.7899159663865546,
"grad_norm": 0.597076952457428,
"learning_rate": 4.798435801500154e-05,
"loss": 0.2789,
"step": 282
},
{
"epoch": 0.7927170868347339,
"grad_norm": 0.40489983558654785,
"learning_rate": 4.7966409011312145e-05,
"loss": 0.2436,
"step": 283
},
{
"epoch": 0.7955182072829131,
"grad_norm": 0.2765220105648041,
"learning_rate": 4.7948383832261665e-05,
"loss": 0.1397,
"step": 284
},
{
"epoch": 0.7983193277310925,
"grad_norm": 0.28682276606559753,
"learning_rate": 4.793028253763633e-05,
"loss": 0.1353,
"step": 285
},
{
"epoch": 0.8011204481792717,
"grad_norm": 0.2538602352142334,
"learning_rate": 4.791210518747479e-05,
"loss": 0.2152,
"step": 286
},
{
"epoch": 0.803921568627451,
"grad_norm": 0.2573007643222809,
"learning_rate": 4.7893851842067984e-05,
"loss": 0.2034,
"step": 287
},
{
"epoch": 0.8067226890756303,
"grad_norm": 0.2098461240530014,
"learning_rate": 4.7875522561958906e-05,
"loss": 0.1647,
"step": 288
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.20104509592056274,
"learning_rate": 4.785711740794242e-05,
"loss": 0.1711,
"step": 289
},
{
"epoch": 0.8123249299719888,
"grad_norm": 0.17355301976203918,
"learning_rate": 4.783863644106502e-05,
"loss": 0.1701,
"step": 290
},
{
"epoch": 0.8151260504201681,
"grad_norm": 0.4620436728000641,
"learning_rate": 4.782007972262471e-05,
"loss": 0.1005,
"step": 291
},
{
"epoch": 0.8179271708683473,
"grad_norm": 0.18848982453346252,
"learning_rate": 4.7801447314170695e-05,
"loss": 0.1764,
"step": 292
},
{
"epoch": 0.8207282913165266,
"grad_norm": 0.2763855457305908,
"learning_rate": 4.7782739277503266e-05,
"loss": 0.1372,
"step": 293
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.29234346747398376,
"learning_rate": 4.776395567467353e-05,
"loss": 0.1395,
"step": 294
},
{
"epoch": 0.8263305322128851,
"grad_norm": 0.30519282817840576,
"learning_rate": 4.7745096567983256e-05,
"loss": 0.2119,
"step": 295
},
{
"epoch": 0.8291316526610645,
"grad_norm": 0.16128459572792053,
"learning_rate": 4.772616201998464e-05,
"loss": 0.1722,
"step": 296
},
{
"epoch": 0.8319327731092437,
"grad_norm": 0.43482664227485657,
"learning_rate": 4.770715209348009e-05,
"loss": 0.0938,
"step": 297
},
{
"epoch": 0.834733893557423,
"grad_norm": 0.19980540871620178,
"learning_rate": 4.768806685152206e-05,
"loss": 0.1747,
"step": 298
},
{
"epoch": 0.8375350140056023,
"grad_norm": 0.23939190804958344,
"learning_rate": 4.766890635741278e-05,
"loss": 0.1334,
"step": 299
},
{
"epoch": 0.8403361344537815,
"grad_norm": 0.40733805298805237,
"learning_rate": 4.76496706747041e-05,
"loss": 0.0893,
"step": 300
},
{
"epoch": 0.8431372549019608,
"grad_norm": 0.22405779361724854,
"learning_rate": 4.763035986719722e-05,
"loss": 0.1742,
"step": 301
},
{
"epoch": 0.84593837535014,
"grad_norm": 0.23476524651050568,
"learning_rate": 4.761097399894257e-05,
"loss": 0.129,
"step": 302
},
{
"epoch": 0.8487394957983193,
"grad_norm": 0.2294142097234726,
"learning_rate": 4.7591513134239505e-05,
"loss": 0.13,
"step": 303
},
{
"epoch": 0.8515406162464986,
"grad_norm": 0.24955376982688904,
"learning_rate": 4.757197733763615e-05,
"loss": 0.1777,
"step": 304
},
{
"epoch": 0.8543417366946778,
"grad_norm": 0.19835838675498962,
"learning_rate": 4.7552366673929136e-05,
"loss": 0.1683,
"step": 305
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.2246866673231125,
"learning_rate": 4.7532681208163444e-05,
"loss": 0.167,
"step": 306
},
{
"epoch": 0.8599439775910365,
"grad_norm": 0.2068600207567215,
"learning_rate": 4.751292100563215e-05,
"loss": 0.17,
"step": 307
},
{
"epoch": 0.8627450980392157,
"grad_norm": 0.2158859223127365,
"learning_rate": 4.7493086131876216e-05,
"loss": 0.166,
"step": 308
},
{
"epoch": 0.865546218487395,
"grad_norm": 0.44647109508514404,
"learning_rate": 4.747317665268427e-05,
"loss": 0.2246,
"step": 309
},
{
"epoch": 0.8683473389355743,
"grad_norm": 0.38463255763053894,
"learning_rate": 4.74531926340924e-05,
"loss": 0.0782,
"step": 310
},
{
"epoch": 0.8711484593837535,
"grad_norm": 0.20261278748512268,
"learning_rate": 4.743313414238394e-05,
"loss": 0.1246,
"step": 311
},
{
"epoch": 0.8739495798319328,
"grad_norm": 0.24954895675182343,
"learning_rate": 4.74130012440892e-05,
"loss": 0.1735,
"step": 312
},
{
"epoch": 0.876750700280112,
"grad_norm": 0.3798324167728424,
"learning_rate": 4.7392794005985326e-05,
"loss": 0.0757,
"step": 313
},
{
"epoch": 0.8795518207282913,
"grad_norm": 0.1796898990869522,
"learning_rate": 4.7372512495096e-05,
"loss": 0.1273,
"step": 314
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.675372838973999,
"learning_rate": 4.735215677869128e-05,
"loss": 0.2697,
"step": 315
},
{
"epoch": 0.8851540616246498,
"grad_norm": 1.0348286628723145,
"learning_rate": 4.733172692428734e-05,
"loss": 0.3294,
"step": 316
},
{
"epoch": 0.8879551820728291,
"grad_norm": 0.22474908828735352,
"learning_rate": 4.731122299964625e-05,
"loss": 0.1263,
"step": 317
},
{
"epoch": 0.8907563025210085,
"grad_norm": 0.4402739107608795,
"learning_rate": 4.7290645072775764e-05,
"loss": 0.2248,
"step": 318
},
{
"epoch": 0.8935574229691877,
"grad_norm": 0.22885151207447052,
"learning_rate": 4.726999321192908e-05,
"loss": 0.1709,
"step": 319
},
{
"epoch": 0.896358543417367,
"grad_norm": 0.1985294669866562,
"learning_rate": 4.7249267485604644e-05,
"loss": 0.1238,
"step": 320
},
{
"epoch": 0.8991596638655462,
"grad_norm": 0.3767232298851013,
"learning_rate": 4.7228467962545866e-05,
"loss": 0.0794,
"step": 321
},
{
"epoch": 0.9019607843137255,
"grad_norm": 0.20609763264656067,
"learning_rate": 4.720759471174096e-05,
"loss": 0.1668,
"step": 322
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.6048465371131897,
"learning_rate": 4.7186647802422644e-05,
"loss": 0.2622,
"step": 323
},
{
"epoch": 0.907563025210084,
"grad_norm": 0.23643067479133606,
"learning_rate": 4.7165627304068e-05,
"loss": 0.1786,
"step": 324
},
{
"epoch": 0.9103641456582633,
"grad_norm": 0.3819364011287689,
"learning_rate": 4.714453328639814e-05,
"loss": 0.0801,
"step": 325
},
{
"epoch": 0.9131652661064426,
"grad_norm": 0.382179856300354,
"learning_rate": 4.712336581937805e-05,
"loss": 0.0812,
"step": 326
},
{
"epoch": 0.9159663865546218,
"grad_norm": 0.20134811103343964,
"learning_rate": 4.710212497321633e-05,
"loss": 0.1285,
"step": 327
},
{
"epoch": 0.9187675070028011,
"grad_norm": 0.3812258541584015,
"learning_rate": 4.7080810818364974e-05,
"loss": 0.0812,
"step": 328
},
{
"epoch": 0.9215686274509803,
"grad_norm": 0.6299641132354736,
"learning_rate": 4.7059423425519105e-05,
"loss": 0.2613,
"step": 329
},
{
"epoch": 0.9243697478991597,
"grad_norm": 0.20863133668899536,
"learning_rate": 4.703796286561679e-05,
"loss": 0.1234,
"step": 330
},
{
"epoch": 0.927170868347339,
"grad_norm": 0.5638542175292969,
"learning_rate": 4.7016429209838764e-05,
"loss": 0.2572,
"step": 331
},
{
"epoch": 0.9299719887955182,
"grad_norm": 0.8459420800209045,
"learning_rate": 4.6994822529608204e-05,
"loss": 0.31,
"step": 332
},
{
"epoch": 0.9327731092436975,
"grad_norm": 0.20429813861846924,
"learning_rate": 4.697314289659051e-05,
"loss": 0.1265,
"step": 333
},
{
"epoch": 0.9355742296918768,
"grad_norm": 0.5639265179634094,
"learning_rate": 4.695139038269303e-05,
"loss": 0.2573,
"step": 334
},
{
"epoch": 0.938375350140056,
"grad_norm": 0.24481046199798584,
"learning_rate": 4.6929565060064864e-05,
"loss": 0.1265,
"step": 335
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.2361876666545868,
"learning_rate": 4.690766700109659e-05,
"loss": 0.171,
"step": 336
},
{
"epoch": 0.9439775910364145,
"grad_norm": 0.5476352572441101,
"learning_rate": 4.688569627842007e-05,
"loss": 0.2601,
"step": 337
},
{
"epoch": 0.9467787114845938,
"grad_norm": 0.4084392786026001,
"learning_rate": 4.686365296490813e-05,
"loss": 0.0887,
"step": 338
},
{
"epoch": 0.9495798319327731,
"grad_norm": 0.4765351414680481,
"learning_rate": 4.684153713367442e-05,
"loss": 0.2469,
"step": 339
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.24255749583244324,
"learning_rate": 4.681934885807308e-05,
"loss": 0.1327,
"step": 340
},
{
"epoch": 0.9551820728291317,
"grad_norm": 0.2432311475276947,
"learning_rate": 4.6797088211698524e-05,
"loss": 0.1349,
"step": 341
},
{
"epoch": 0.957983193277311,
"grad_norm": 0.41482043266296387,
"learning_rate": 4.677475526838526e-05,
"loss": 0.0934,
"step": 342
},
{
"epoch": 0.9607843137254902,
"grad_norm": 0.4962022006511688,
"learning_rate": 4.675235010220754e-05,
"loss": 0.2515,
"step": 343
},
{
"epoch": 0.9635854341736695,
"grad_norm": 0.26970216631889343,
"learning_rate": 4.672987278747919e-05,
"loss": 0.2126,
"step": 344
},
{
"epoch": 0.9663865546218487,
"grad_norm": 0.1891782432794571,
"learning_rate": 4.6707323398753346e-05,
"loss": 0.1656,
"step": 345
},
{
"epoch": 0.969187675070028,
"grad_norm": 0.18780910968780518,
"learning_rate": 4.668470201082218e-05,
"loss": 0.1719,
"step": 346
},
{
"epoch": 0.9719887955182073,
"grad_norm": 0.6354146599769592,
"learning_rate": 4.6662008698716675e-05,
"loss": 0.2906,
"step": 347
},
{
"epoch": 0.9747899159663865,
"grad_norm": 0.19006231427192688,
"learning_rate": 4.663924353770639e-05,
"loss": 0.1676,
"step": 348
},
{
"epoch": 0.9775910364145658,
"grad_norm": 0.1768893450498581,
"learning_rate": 4.6616406603299176e-05,
"loss": 0.1664,
"step": 349
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.19590170681476593,
"learning_rate": 4.6593497971240956e-05,
"loss": 0.1647,
"step": 350
},
{
"epoch": 0.9831932773109243,
"grad_norm": 0.4365139901638031,
"learning_rate": 4.657051771751546e-05,
"loss": 0.0991,
"step": 351
},
{
"epoch": 0.9859943977591037,
"grad_norm": 0.7374207377433777,
"learning_rate": 4.654746591834396e-05,
"loss": 0.31,
"step": 352
},
{
"epoch": 0.988795518207283,
"grad_norm": 0.2521171271800995,
"learning_rate": 4.652434265018504e-05,
"loss": 0.1356,
"step": 353
},
{
"epoch": 0.9915966386554622,
"grad_norm": 0.2702823281288147,
"learning_rate": 4.6501147989734346e-05,
"loss": 0.2081,
"step": 354
},
{
"epoch": 0.9943977591036415,
"grad_norm": 0.24237488210201263,
"learning_rate": 4.647788201392429e-05,
"loss": 0.175,
"step": 355
},
{
"epoch": 0.9971988795518207,
"grad_norm": 0.2869884669780731,
"learning_rate": 4.645454479992386e-05,
"loss": 0.1384,
"step": 356
},
{
"epoch": 1.0,
"grad_norm": 0.4562254548072815,
"learning_rate": 4.64311364251383e-05,
"loss": 0.1006,
"step": 357
},
{
"epoch": 1.0,
"eval_f1 (minor class)": 0.12357723577235774,
"eval_loss": 0.1737803816795349,
"eval_roc_auc": 0.5386335507088105,
"eval_runtime": 2.9217,
"eval_samples_per_second": 433.989,
"eval_steps_per_second": 13.69,
"step": 357
},
{
"epoch": 1.0028011204481793,
"grad_norm": 0.2548682689666748,
"learning_rate": 4.6407656967208876e-05,
"loss": 0.2014,
"step": 358
},
{
"epoch": 1.0056022408963585,
"grad_norm": 0.2920522689819336,
"learning_rate": 4.638410650401267e-05,
"loss": 0.144,
"step": 359
},
{
"epoch": 1.0084033613445378,
"grad_norm": 0.44480934739112854,
"learning_rate": 4.6360485113662216e-05,
"loss": 0.0998,
"step": 360
},
{
"epoch": 1.011204481792717,
"grad_norm": 0.44798406958580017,
"learning_rate": 4.633679287450534e-05,
"loss": 0.0989,
"step": 361
},
{
"epoch": 1.0140056022408963,
"grad_norm": 0.21220609545707703,
"learning_rate": 4.631302986512485e-05,
"loss": 0.1732,
"step": 362
},
{
"epoch": 1.0168067226890756,
"grad_norm": 0.4394678771495819,
"learning_rate": 4.628919616433827e-05,
"loss": 0.2458,
"step": 363
},
{
"epoch": 1.0196078431372548,
"grad_norm": 0.6233344674110413,
"learning_rate": 4.6265291851197626e-05,
"loss": 0.2756,
"step": 364
},
{
"epoch": 1.022408963585434,
"grad_norm": 0.43582528829574585,
"learning_rate": 4.6241317004989126e-05,
"loss": 0.2477,
"step": 365
},
{
"epoch": 1.0252100840336134,
"grad_norm": 0.22646282613277435,
"learning_rate": 4.621727170523293e-05,
"loss": 0.1612,
"step": 366
},
{
"epoch": 1.0280112044817926,
"grad_norm": 0.2966563403606415,
"learning_rate": 4.619315603168289e-05,
"loss": 0.1287,
"step": 367
},
{
"epoch": 1.0308123249299719,
"grad_norm": 0.45250192284584045,
"learning_rate": 4.6168970064326266e-05,
"loss": 0.0975,
"step": 368
},
{
"epoch": 1.0336134453781514,
"grad_norm": 1.0906392335891724,
"learning_rate": 4.614471388338346e-05,
"loss": 0.3621,
"step": 369
},
{
"epoch": 1.0364145658263306,
"grad_norm": 0.3004130721092224,
"learning_rate": 4.6120387569307775e-05,
"loss": 0.1335,
"step": 370
},
{
"epoch": 1.0392156862745099,
"grad_norm": 0.20164428651332855,
"learning_rate": 4.609599120278514e-05,
"loss": 0.1761,
"step": 371
},
{
"epoch": 1.0420168067226891,
"grad_norm": 0.4512525796890259,
"learning_rate": 4.6071524864733794e-05,
"loss": 0.0978,
"step": 372
},
{
"epoch": 1.0448179271708684,
"grad_norm": 0.2054992914199829,
"learning_rate": 4.604698863630411e-05,
"loss": 0.1603,
"step": 373
},
{
"epoch": 1.0476190476190477,
"grad_norm": 0.27855244278907776,
"learning_rate": 4.602238259887825e-05,
"loss": 0.1278,
"step": 374
},
{
"epoch": 1.050420168067227,
"grad_norm": 0.2679445743560791,
"learning_rate": 4.599770683406991e-05,
"loss": 0.1353,
"step": 375
},
{
"epoch": 1.0532212885154062,
"grad_norm": 0.4758155941963196,
"learning_rate": 4.5972961423724087e-05,
"loss": 0.2309,
"step": 376
},
{
"epoch": 1.0560224089635855,
"grad_norm": 0.4918065071105957,
"learning_rate": 4.594814644991674e-05,
"loss": 0.2486,
"step": 377
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.45264533162117004,
"learning_rate": 4.592326199495461e-05,
"loss": 0.0964,
"step": 378
},
{
"epoch": 1.061624649859944,
"grad_norm": 0.2610427439212799,
"learning_rate": 4.5898308141374836e-05,
"loss": 0.1328,
"step": 379
},
{
"epoch": 1.0644257703081232,
"grad_norm": 0.6690388917922974,
"learning_rate": 4.5873284971944784e-05,
"loss": 0.2738,
"step": 380
},
{
"epoch": 1.0672268907563025,
"grad_norm": 0.28492769598960876,
"learning_rate": 4.5848192569661706e-05,
"loss": 0.1286,
"step": 381
},
{
"epoch": 1.0700280112044818,
"grad_norm": 0.30735185742378235,
"learning_rate": 4.5823031017752485e-05,
"loss": 0.2082,
"step": 382
},
{
"epoch": 1.072829131652661,
"grad_norm": 0.390432208776474,
"learning_rate": 4.579780039967339e-05,
"loss": 0.2061,
"step": 383
},
{
"epoch": 1.0756302521008403,
"grad_norm": 0.5083768367767334,
"learning_rate": 4.577250079910973e-05,
"loss": 0.2584,
"step": 384
},
{
"epoch": 1.0784313725490196,
"grad_norm": 0.2757357060909271,
"learning_rate": 4.574713229997563e-05,
"loss": 0.1371,
"step": 385
},
{
"epoch": 1.0812324929971988,
"grad_norm": 0.5543646216392517,
"learning_rate": 4.5721694986413753e-05,
"loss": 0.2586,
"step": 386
},
{
"epoch": 1.084033613445378,
"grad_norm": 0.29792216420173645,
"learning_rate": 4.5696188942795e-05,
"loss": 0.21,
"step": 387
},
{
"epoch": 1.0868347338935573,
"grad_norm": 0.2785909175872803,
"learning_rate": 4.5670614253718224e-05,
"loss": 0.213,
"step": 388
},
{
"epoch": 1.0896358543417366,
"grad_norm": 0.6011048555374146,
"learning_rate": 4.5644971004009984e-05,
"loss": 0.2512,
"step": 389
},
{
"epoch": 1.092436974789916,
"grad_norm": 0.32919472455978394,
"learning_rate": 4.5619259278724214e-05,
"loss": 0.1317,
"step": 390
},
{
"epoch": 1.0952380952380953,
"grad_norm": 0.19824554026126862,
"learning_rate": 4.5593479163141994e-05,
"loss": 0.1699,
"step": 391
},
{
"epoch": 1.0980392156862746,
"grad_norm": 0.20258326828479767,
"learning_rate": 4.556763074277124e-05,
"loss": 0.1732,
"step": 392
},
{
"epoch": 1.1008403361344539,
"grad_norm": 0.30259111523628235,
"learning_rate": 4.55417141033464e-05,
"loss": 0.138,
"step": 393
},
{
"epoch": 1.1036414565826331,
"grad_norm": 0.3482188880443573,
"learning_rate": 4.551572933082822e-05,
"loss": 0.1307,
"step": 394
},
{
"epoch": 1.1064425770308124,
"grad_norm": 0.24273444712162018,
"learning_rate": 4.548967651140341e-05,
"loss": 0.2059,
"step": 395
},
{
"epoch": 1.1092436974789917,
"grad_norm": 0.2121374011039734,
"learning_rate": 4.54635557314844e-05,
"loss": 0.1707,
"step": 396
},
{
"epoch": 1.112044817927171,
"grad_norm": 0.22848618030548096,
"learning_rate": 4.5437367077709e-05,
"loss": 0.2048,
"step": 397
},
{
"epoch": 1.1148459383753502,
"grad_norm": 0.4918600022792816,
"learning_rate": 4.541111063694019e-05,
"loss": 0.1049,
"step": 398
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.47923439741134644,
"learning_rate": 4.538478649626574e-05,
"loss": 0.0996,
"step": 399
},
{
"epoch": 1.1204481792717087,
"grad_norm": 0.42145538330078125,
"learning_rate": 4.5358394742998e-05,
"loss": 0.2455,
"step": 400
},
{
"epoch": 1.123249299719888,
"grad_norm": 0.8408336639404297,
"learning_rate": 4.533193546467357e-05,
"loss": 0.3064,
"step": 401
},
{
"epoch": 1.1260504201680672,
"grad_norm": 0.5717082619667053,
"learning_rate": 4.530540874905302e-05,
"loss": 0.2691,
"step": 402
},
{
"epoch": 1.1288515406162465,
"grad_norm": 0.4880336821079254,
"learning_rate": 4.527881468412058e-05,
"loss": 0.1031,
"step": 403
},
{
"epoch": 1.1316526610644257,
"grad_norm": 0.23222537338733673,
"learning_rate": 4.52521533580839e-05,
"loss": 0.2062,
"step": 404
},
{
"epoch": 1.134453781512605,
"grad_norm": 0.4019043445587158,
"learning_rate": 4.522542485937369e-05,
"loss": 0.2336,
"step": 405
},
{
"epoch": 1.1372549019607843,
"grad_norm": 0.4039878845214844,
"learning_rate": 4.5198629276643465e-05,
"loss": 0.2291,
"step": 406
},
{
"epoch": 1.1400560224089635,
"grad_norm": 0.24419677257537842,
"learning_rate": 4.517176669876927e-05,
"loss": 0.1874,
"step": 407
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.5417842268943787,
"learning_rate": 4.5144837214849334e-05,
"loss": 0.1077,
"step": 408
},
{
"epoch": 1.145658263305322,
"grad_norm": 0.5518860816955566,
"learning_rate": 4.5117840914203805e-05,
"loss": 0.1073,
"step": 409
},
{
"epoch": 1.1484593837535013,
"grad_norm": 0.282936692237854,
"learning_rate": 4.509077788637446e-05,
"loss": 0.1585,
"step": 410
},
{
"epoch": 1.1512605042016806,
"grad_norm": 0.22835111618041992,
"learning_rate": 4.5063648221124386e-05,
"loss": 0.1841,
"step": 411
},
{
"epoch": 1.1540616246498598,
"grad_norm": 0.36185768246650696,
"learning_rate": 4.503645200843771e-05,
"loss": 0.1392,
"step": 412
},
{
"epoch": 1.156862745098039,
"grad_norm": 0.5351378321647644,
"learning_rate": 4.500918933851928e-05,
"loss": 0.1027,
"step": 413
},
{
"epoch": 1.1596638655462184,
"grad_norm": 0.2676061987876892,
"learning_rate": 4.498186030179434e-05,
"loss": 0.158,
"step": 414
},
{
"epoch": 1.1624649859943978,
"grad_norm": 0.3645527958869934,
"learning_rate": 4.495446498890831e-05,
"loss": 0.1314,
"step": 415
},
{
"epoch": 1.165266106442577,
"grad_norm": 0.29105129837989807,
"learning_rate": 4.4927003490726404e-05,
"loss": 0.1341,
"step": 416
},
{
"epoch": 1.1680672268907564,
"grad_norm": 0.7233947515487671,
"learning_rate": 4.4899475898333367e-05,
"loss": 0.2707,
"step": 417
},
{
"epoch": 1.1708683473389356,
"grad_norm": 0.342913419008255,
"learning_rate": 4.487188230303316e-05,
"loss": 0.1763,
"step": 418
},
{
"epoch": 1.173669467787115,
"grad_norm": 0.2501223385334015,
"learning_rate": 4.48442227963487e-05,
"loss": 0.1555,
"step": 419
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.32546159625053406,
"learning_rate": 4.4816497470021454e-05,
"loss": 0.1288,
"step": 420
},
{
"epoch": 1.1792717086834734,
"grad_norm": 0.23415644466876984,
"learning_rate": 4.478870641601127e-05,
"loss": 0.1575,
"step": 421
},
{
"epoch": 1.1820728291316527,
"grad_norm": 0.2671819031238556,
"learning_rate": 4.4760849726495945e-05,
"loss": 0.13,
"step": 422
},
{
"epoch": 1.184873949579832,
"grad_norm": 1.1153579950332642,
"learning_rate": 4.473292749387102e-05,
"loss": 0.3314,
"step": 423
},
{
"epoch": 1.1876750700280112,
"grad_norm": 0.2628577947616577,
"learning_rate": 4.47049398107494e-05,
"loss": 0.1561,
"step": 424
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.2286321073770523,
"learning_rate": 4.467688676996111e-05,
"loss": 0.1583,
"step": 425
},
{
"epoch": 1.1932773109243697,
"grad_norm": 0.29673612117767334,
"learning_rate": 4.464876846455291e-05,
"loss": 0.1242,
"step": 426
},
{
"epoch": 1.196078431372549,
"grad_norm": 0.44082918763160706,
"learning_rate": 4.4620584987788065e-05,
"loss": 0.2101,
"step": 427
},
{
"epoch": 1.1988795518207283,
"grad_norm": 0.2973793148994446,
"learning_rate": 4.4592336433146e-05,
"loss": 0.1493,
"step": 428
},
{
"epoch": 1.2016806722689075,
"grad_norm": 0.2873530089855194,
"learning_rate": 4.4564022894321966e-05,
"loss": 0.1652,
"step": 429
},
{
"epoch": 1.2044817927170868,
"grad_norm": 0.5498677492141724,
"learning_rate": 4.4535644465226796e-05,
"loss": 0.0821,
"step": 430
},
{
"epoch": 1.207282913165266,
"grad_norm": 0.249070942401886,
"learning_rate": 4.450720123998651e-05,
"loss": 0.1611,
"step": 431
},
{
"epoch": 1.2100840336134453,
"grad_norm": 0.5754450559616089,
"learning_rate": 4.4478693312942054e-05,
"loss": 0.236,
"step": 432
},
{
"epoch": 1.2128851540616246,
"grad_norm": 0.31797465682029724,
"learning_rate": 4.4450120778649014e-05,
"loss": 0.1199,
"step": 433
},
{
"epoch": 1.215686274509804,
"grad_norm": 0.4542624056339264,
"learning_rate": 4.4421483731877214e-05,
"loss": 0.1787,
"step": 434
},
{
"epoch": 1.2184873949579833,
"grad_norm": 0.35622158646583557,
"learning_rate": 4.43927822676105e-05,
"loss": 0.1261,
"step": 435
},
{
"epoch": 1.2212885154061626,
"grad_norm": 0.9345407485961914,
"learning_rate": 4.4364016481046336e-05,
"loss": 0.2648,
"step": 436
},
{
"epoch": 1.2240896358543418,
"grad_norm": 0.4623335301876068,
"learning_rate": 4.433518646759558e-05,
"loss": 0.187,
"step": 437
},
{
"epoch": 1.226890756302521,
"grad_norm": 0.8016291856765747,
"learning_rate": 4.4306292322882066e-05,
"loss": 0.2376,
"step": 438
},
{
"epoch": 1.2296918767507004,
"grad_norm": 0.3608405292034149,
"learning_rate": 4.4277334142742375e-05,
"loss": 0.1337,
"step": 439
},
{
"epoch": 1.2324929971988796,
"grad_norm": 0.9408867359161377,
"learning_rate": 4.424831202322548e-05,
"loss": 0.2775,
"step": 440
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.36630916595458984,
"learning_rate": 4.421922606059242e-05,
"loss": 0.1437,
"step": 441
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.5501814484596252,
"learning_rate": 4.419007635131598e-05,
"loss": 0.129,
"step": 442
},
{
"epoch": 1.2408963585434174,
"grad_norm": 0.5585311651229858,
"learning_rate": 4.416086299208041e-05,
"loss": 0.2171,
"step": 443
},
{
"epoch": 1.2436974789915967,
"grad_norm": 0.5523871779441833,
"learning_rate": 4.413158607978104e-05,
"loss": 0.1953,
"step": 444
},
{
"epoch": 1.246498599439776,
"grad_norm": 0.778249979019165,
"learning_rate": 4.410224571152403e-05,
"loss": 0.0988,
"step": 445
},
{
"epoch": 1.2492997198879552,
"grad_norm": 0.3328462839126587,
"learning_rate": 4.407284198462597e-05,
"loss": 0.189,
"step": 446
},
{
"epoch": 1.2521008403361344,
"grad_norm": 0.3411165177822113,
"learning_rate": 4.404337499661364e-05,
"loss": 0.1717,
"step": 447
},
{
"epoch": 1.2549019607843137,
"grad_norm": 0.3889773488044739,
"learning_rate": 4.4013844845223626e-05,
"loss": 0.2114,
"step": 448
},
{
"epoch": 1.257703081232493,
"grad_norm": 0.2957543134689331,
"learning_rate": 4.398425162840202e-05,
"loss": 0.1762,
"step": 449
},
{
"epoch": 1.2605042016806722,
"grad_norm": 0.4489629864692688,
"learning_rate": 4.395459544430407e-05,
"loss": 0.2177,
"step": 450
},
{
"epoch": 1.2633053221288515,
"grad_norm": 0.5136566162109375,
"learning_rate": 4.3924876391293915e-05,
"loss": 0.1395,
"step": 451
},
{
"epoch": 1.2661064425770308,
"grad_norm": 0.6125023365020752,
"learning_rate": 4.3895094567944186e-05,
"loss": 0.1241,
"step": 452
},
{
"epoch": 1.26890756302521,
"grad_norm": 0.3510366380214691,
"learning_rate": 4.386525007303571e-05,
"loss": 0.1614,
"step": 453
},
{
"epoch": 1.2717086834733893,
"grad_norm": 0.31088364124298096,
"learning_rate": 4.3835343005557215e-05,
"loss": 0.1631,
"step": 454
},
{
"epoch": 1.2745098039215685,
"grad_norm": 0.4494378864765167,
"learning_rate": 4.380537346470495e-05,
"loss": 0.1324,
"step": 455
},
{
"epoch": 1.2773109243697478,
"grad_norm": 0.5657458901405334,
"learning_rate": 4.3775341549882364e-05,
"loss": 0.1197,
"step": 456
},
{
"epoch": 1.280112044817927,
"grad_norm": 0.26143553853034973,
"learning_rate": 4.374524736069982e-05,
"loss": 0.1596,
"step": 457
},
{
"epoch": 1.2829131652661063,
"grad_norm": 0.3509024977684021,
"learning_rate": 4.37150909969742e-05,
"loss": 0.1464,
"step": 458
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.3572234809398651,
"learning_rate": 4.368487255872864e-05,
"loss": 0.1751,
"step": 459
},
{
"epoch": 1.2885154061624648,
"grad_norm": 0.50468909740448,
"learning_rate": 4.365459214619214e-05,
"loss": 0.0963,
"step": 460
},
{
"epoch": 1.2913165266106443,
"grad_norm": 0.3895762860774994,
"learning_rate": 4.3624249859799274e-05,
"loss": 0.1136,
"step": 461
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.6248763203620911,
"learning_rate": 4.359384580018982e-05,
"loss": 0.1675,
"step": 462
},
{
"epoch": 1.2969187675070029,
"grad_norm": 0.37576088309288025,
"learning_rate": 4.356338006820849e-05,
"loss": 0.1191,
"step": 463
},
{
"epoch": 1.2997198879551821,
"grad_norm": 0.5103853940963745,
"learning_rate": 4.35328527649045e-05,
"loss": 0.1591,
"step": 464
},
{
"epoch": 1.3025210084033614,
"grad_norm": 0.6339231133460999,
"learning_rate": 4.35022639915313e-05,
"loss": 0.1657,
"step": 465
},
{
"epoch": 1.3053221288515406,
"grad_norm": 0.3810737729072571,
"learning_rate": 4.347161384954626e-05,
"loss": 0.1179,
"step": 466
},
{
"epoch": 1.30812324929972,
"grad_norm": 1.3601053953170776,
"learning_rate": 4.344090244061024e-05,
"loss": 0.2517,
"step": 467
},
{
"epoch": 1.3109243697478992,
"grad_norm": 0.40818560123443604,
"learning_rate": 4.341012986658738e-05,
"loss": 0.123,
"step": 468
},
{
"epoch": 1.3137254901960784,
"grad_norm": 1.1468818187713623,
"learning_rate": 4.337929622954463e-05,
"loss": 0.2502,
"step": 469
},
{
"epoch": 1.3165266106442577,
"grad_norm": 1.0096850395202637,
"learning_rate": 4.334840163175151e-05,
"loss": 0.2248,
"step": 470
},
{
"epoch": 1.319327731092437,
"grad_norm": 0.43895837664604187,
"learning_rate": 4.3317446175679735e-05,
"loss": 0.1442,
"step": 471
},
{
"epoch": 1.3221288515406162,
"grad_norm": 1.211270809173584,
"learning_rate": 4.3286429964002856e-05,
"loss": 0.2676,
"step": 472
},
{
"epoch": 1.3249299719887955,
"grad_norm": 0.5929017663002014,
"learning_rate": 4.325535309959596e-05,
"loss": 0.0812,
"step": 473
},
{
"epoch": 1.3277310924369747,
"grad_norm": 0.3913727104663849,
"learning_rate": 4.3224215685535294e-05,
"loss": 0.1334,
"step": 474
},
{
"epoch": 1.330532212885154,
"grad_norm": 0.4079723358154297,
"learning_rate": 4.3193017825097936e-05,
"loss": 0.1742,
"step": 475
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.3363523781299591,
"learning_rate": 4.316175962176148e-05,
"loss": 0.1315,
"step": 476
},
{
"epoch": 1.3361344537815127,
"grad_norm": 0.5548957586288452,
"learning_rate": 4.313044117920363e-05,
"loss": 0.0904,
"step": 477
},
{
"epoch": 1.338935574229692,
"grad_norm": 0.2717703580856323,
"learning_rate": 4.3099062601301904e-05,
"loss": 0.1617,
"step": 478
},
{
"epoch": 1.3417366946778713,
"grad_norm": 0.48116743564605713,
"learning_rate": 4.30676239921333e-05,
"loss": 0.2158,
"step": 479
},
{
"epoch": 1.3445378151260505,
"grad_norm": 0.5152187943458557,
"learning_rate": 4.3036125455973896e-05,
"loss": 0.0882,
"step": 480
},
{
"epoch": 1.3473389355742298,
"grad_norm": 0.747715175151825,
"learning_rate": 4.300456709729856e-05,
"loss": 0.2583,
"step": 481
},
{
"epoch": 1.350140056022409,
"grad_norm": 0.24533413350582123,
"learning_rate": 4.2972949020780575e-05,
"loss": 0.1618,
"step": 482
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.2675352394580841,
"learning_rate": 4.294127133129128e-05,
"loss": 0.1735,
"step": 483
},
{
"epoch": 1.3557422969187676,
"grad_norm": 0.40604180097579956,
"learning_rate": 4.290953413389977e-05,
"loss": 0.1971,
"step": 484
},
{
"epoch": 1.3585434173669468,
"grad_norm": 0.5014567971229553,
"learning_rate": 4.2877737533872485e-05,
"loss": 0.0903,
"step": 485
},
{
"epoch": 1.361344537815126,
"grad_norm": 0.3054341971874237,
"learning_rate": 4.284588163667292e-05,
"loss": 0.1307,
"step": 486
},
{
"epoch": 1.3641456582633054,
"grad_norm": 0.5000020265579224,
"learning_rate": 4.281396654796124e-05,
"loss": 0.0904,
"step": 487
},
{
"epoch": 1.3669467787114846,
"grad_norm": 0.3115267753601074,
"learning_rate": 4.278199237359392e-05,
"loss": 0.1209,
"step": 488
},
{
"epoch": 1.3697478991596639,
"grad_norm": 0.22438497841358185,
"learning_rate": 4.274995921962343e-05,
"loss": 0.1775,
"step": 489
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.25052976608276367,
"learning_rate": 4.271786719229786e-05,
"loss": 0.1337,
"step": 490
},
{
"epoch": 1.3753501400560224,
"grad_norm": 0.2692072093486786,
"learning_rate": 4.268571639806057e-05,
"loss": 0.1773,
"step": 491
},
{
"epoch": 1.3781512605042017,
"grad_norm": 0.5472849011421204,
"learning_rate": 4.265350694354985e-05,
"loss": 0.2203,
"step": 492
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.2849833071231842,
"learning_rate": 4.2621238935598524e-05,
"loss": 0.1256,
"step": 493
},
{
"epoch": 1.3837535014005602,
"grad_norm": 0.4663335084915161,
"learning_rate": 4.2588912481233666e-05,
"loss": 0.0828,
"step": 494
},
{
"epoch": 1.3865546218487395,
"grad_norm": 0.3137625753879547,
"learning_rate": 4.2556527687676186e-05,
"loss": 0.1409,
"step": 495
},
{
"epoch": 1.3893557422969187,
"grad_norm": 0.23330673575401306,
"learning_rate": 4.2524084662340494e-05,
"loss": 0.1695,
"step": 496
},
{
"epoch": 1.392156862745098,
"grad_norm": 0.2527044117450714,
"learning_rate": 4.249158351283414e-05,
"loss": 0.125,
"step": 497
},
{
"epoch": 1.3949579831932772,
"grad_norm": 0.2422427237033844,
"learning_rate": 4.2459024346957475e-05,
"loss": 0.1719,
"step": 498
},
{
"epoch": 1.3977591036414565,
"grad_norm": 0.25663718581199646,
"learning_rate": 4.2426407272703284e-05,
"loss": 0.1757,
"step": 499
},
{
"epoch": 1.4005602240896358,
"grad_norm": 0.2737230062484741,
"learning_rate": 4.2393732398256394e-05,
"loss": 0.114,
"step": 500
},
{
"epoch": 1.403361344537815,
"grad_norm": 0.2504082918167114,
"learning_rate": 4.236099983199338e-05,
"loss": 0.1697,
"step": 501
},
{
"epoch": 1.4061624649859943,
"grad_norm": 0.911520779132843,
"learning_rate": 4.232820968248214e-05,
"loss": 0.2774,
"step": 502
},
{
"epoch": 1.4089635854341735,
"grad_norm": 0.4278314411640167,
"learning_rate": 4.229536205848158e-05,
"loss": 0.2105,
"step": 503
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.22557204961776733,
"learning_rate": 4.2262457068941245e-05,
"loss": 0.1257,
"step": 504
},
{
"epoch": 1.4145658263305323,
"grad_norm": 0.2534330189228058,
"learning_rate": 4.222949482300094e-05,
"loss": 0.1229,
"step": 505
},
{
"epoch": 1.4173669467787116,
"grad_norm": 0.20387370884418488,
"learning_rate": 4.219647542999037e-05,
"loss": 0.1262,
"step": 506
},
{
"epoch": 1.4201680672268908,
"grad_norm": 0.20558597147464752,
"learning_rate": 4.21633989994288e-05,
"loss": 0.1239,
"step": 507
},
{
"epoch": 1.42296918767507,
"grad_norm": 0.6656189560890198,
"learning_rate": 4.2130265641024705e-05,
"loss": 0.2589,
"step": 508
},
{
"epoch": 1.4257703081232493,
"grad_norm": 0.3030642569065094,
"learning_rate": 4.209707546467531e-05,
"loss": 0.1137,
"step": 509
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2517474591732025,
"learning_rate": 4.206382858046636e-05,
"loss": 0.1739,
"step": 510
},
{
"epoch": 1.4313725490196079,
"grad_norm": 0.21296930313110352,
"learning_rate": 4.2030525098671646e-05,
"loss": 0.1274,
"step": 511
},
{
"epoch": 1.4341736694677871,
"grad_norm": 0.23091089725494385,
"learning_rate": 4.199716512975272e-05,
"loss": 0.1305,
"step": 512
},
{
"epoch": 1.4369747899159664,
"grad_norm": 0.25984320044517517,
"learning_rate": 4.1963748784358456e-05,
"loss": 0.1668,
"step": 513
},
{
"epoch": 1.4397759103641457,
"grad_norm": 0.2812284231185913,
"learning_rate": 4.1930276173324756e-05,
"loss": 0.1218,
"step": 514
},
{
"epoch": 1.442577030812325,
"grad_norm": 0.42426514625549316,
"learning_rate": 4.189674740767411e-05,
"loss": 0.0794,
"step": 515
},
{
"epoch": 1.4453781512605042,
"grad_norm": 0.23902854323387146,
"learning_rate": 4.1863162598615265e-05,
"loss": 0.1354,
"step": 516
},
{
"epoch": 1.4481792717086834,
"grad_norm": 0.2747328281402588,
"learning_rate": 4.1829521857542885e-05,
"loss": 0.1168,
"step": 517
},
{
"epoch": 1.4509803921568627,
"grad_norm": 0.25116053223609924,
"learning_rate": 4.1795825296037126e-05,
"loss": 0.1312,
"step": 518
},
{
"epoch": 1.453781512605042,
"grad_norm": 0.7276238799095154,
"learning_rate": 4.176207302586329e-05,
"loss": 0.2773,
"step": 519
},
{
"epoch": 1.4565826330532212,
"grad_norm": 0.26795586943626404,
"learning_rate": 4.172826515897146e-05,
"loss": 0.1678,
"step": 520
},
{
"epoch": 1.4593837535014005,
"grad_norm": 0.42587825655937195,
"learning_rate": 4.169440180749612e-05,
"loss": 0.0729,
"step": 521
},
{
"epoch": 1.46218487394958,
"grad_norm": 0.3999551832675934,
"learning_rate": 4.166048308375579e-05,
"loss": 0.1851,
"step": 522
},
{
"epoch": 1.4649859943977592,
"grad_norm": 0.2874641418457031,
"learning_rate": 4.162650910025264e-05,
"loss": 0.162,
"step": 523
},
{
"epoch": 1.4677871148459385,
"grad_norm": 0.7796664834022522,
"learning_rate": 4.159247996967215e-05,
"loss": 0.2352,
"step": 524
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.0714823007583618,
"learning_rate": 4.1558395804882695e-05,
"loss": 0.3158,
"step": 525
},
{
"epoch": 1.473389355742297,
"grad_norm": 1.209743618965149,
"learning_rate": 4.152425671893518e-05,
"loss": 0.3378,
"step": 526
},
{
"epoch": 1.4761904761904763,
"grad_norm": 0.25424477458000183,
"learning_rate": 4.149006282506268e-05,
"loss": 0.1739,
"step": 527
},
{
"epoch": 1.4789915966386555,
"grad_norm": 0.2382468283176422,
"learning_rate": 4.145581423668008e-05,
"loss": 0.1301,
"step": 528
},
{
"epoch": 1.4817927170868348,
"grad_norm": 0.2188778817653656,
"learning_rate": 4.142151106738364e-05,
"loss": 0.1625,
"step": 529
},
{
"epoch": 1.484593837535014,
"grad_norm": 0.4427192807197571,
"learning_rate": 4.138715343095068e-05,
"loss": 0.085,
"step": 530
},
{
"epoch": 1.4873949579831933,
"grad_norm": 0.22112633287906647,
"learning_rate": 4.135274144133918e-05,
"loss": 0.1694,
"step": 531
},
{
"epoch": 1.4901960784313726,
"grad_norm": 0.27271297574043274,
"learning_rate": 4.1318275212687376e-05,
"loss": 0.1251,
"step": 532
},
{
"epoch": 1.4929971988795518,
"grad_norm": 0.3934704661369324,
"learning_rate": 4.1283754859313414e-05,
"loss": 0.2116,
"step": 533
},
{
"epoch": 1.495798319327731,
"grad_norm": 0.33457499742507935,
"learning_rate": 4.124918049571499e-05,
"loss": 0.2102,
"step": 534
},
{
"epoch": 1.4985994397759104,
"grad_norm": 0.33014121651649475,
"learning_rate": 4.12145522365689e-05,
"loss": 0.2068,
"step": 535
},
{
"epoch": 1.5014005602240896,
"grad_norm": 1.0461491346359253,
"learning_rate": 4.117987019673073e-05,
"loss": 0.3212,
"step": 536
},
{
"epoch": 1.504201680672269,
"grad_norm": 0.45663779973983765,
"learning_rate": 4.1145134491234427e-05,
"loss": 0.2363,
"step": 537
},
{
"epoch": 1.5070028011204482,
"grad_norm": 0.46917828917503357,
"learning_rate": 4.111034523529196e-05,
"loss": 0.2326,
"step": 538
},
{
"epoch": 1.5098039215686274,
"grad_norm": 0.30291152000427246,
"learning_rate": 4.1075502544292884e-05,
"loss": 0.2064,
"step": 539
},
{
"epoch": 1.5126050420168067,
"grad_norm": 0.3694068491458893,
"learning_rate": 4.1040606533804024e-05,
"loss": 0.1405,
"step": 540
},
{
"epoch": 1.515406162464986,
"grad_norm": 0.3829762935638428,
"learning_rate": 4.100565731956903e-05,
"loss": 0.1385,
"step": 541
},
{
"epoch": 1.5182072829131652,
"grad_norm": 0.2339664101600647,
"learning_rate": 4.097065501750804e-05,
"loss": 0.1658,
"step": 542
},
{
"epoch": 1.5210084033613445,
"grad_norm": 0.2567518353462219,
"learning_rate": 4.093559974371725e-05,
"loss": 0.1748,
"step": 543
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.5793279409408569,
"learning_rate": 4.0900491614468553e-05,
"loss": 0.1087,
"step": 544
},
{
"epoch": 1.526610644257703,
"grad_norm": 0.5785588622093201,
"learning_rate": 4.086533074620919e-05,
"loss": 0.1099,
"step": 545
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.5617150664329529,
"learning_rate": 4.083011725556129e-05,
"loss": 0.1074,
"step": 546
},
{
"epoch": 1.5322128851540615,
"grad_norm": 0.36825308203697205,
"learning_rate": 4.0794851259321546e-05,
"loss": 0.2374,
"step": 547
},
{
"epoch": 1.5350140056022408,
"grad_norm": 0.5937955379486084,
"learning_rate": 4.0759532874460785e-05,
"loss": 0.2677,
"step": 548
},
{
"epoch": 1.53781512605042,
"grad_norm": 0.24256612360477448,
"learning_rate": 4.0724162218123596e-05,
"loss": 0.1698,
"step": 549
},
{
"epoch": 1.5406162464985993,
"grad_norm": 0.5699710249900818,
"learning_rate": 4.068873940762796e-05,
"loss": 0.1086,
"step": 550
},
{
"epoch": 1.5434173669467786,
"grad_norm": 0.3110926151275635,
"learning_rate": 4.065326456046483e-05,
"loss": 0.2061,
"step": 551
},
{
"epoch": 1.5462184873949578,
"grad_norm": 0.36440059542655945,
"learning_rate": 4.0617737794297764e-05,
"loss": 0.2146,
"step": 552
},
{
"epoch": 1.5490196078431373,
"grad_norm": 0.2662959098815918,
"learning_rate": 4.058215922696252e-05,
"loss": 0.1728,
"step": 553
},
{
"epoch": 1.5518207282913166,
"grad_norm": 0.3178268074989319,
"learning_rate": 4.0546528976466655e-05,
"loss": 0.1402,
"step": 554
},
{
"epoch": 1.5546218487394958,
"grad_norm": 0.30780094861984253,
"learning_rate": 4.051084716098921e-05,
"loss": 0.2053,
"step": 555
},
{
"epoch": 1.557422969187675,
"grad_norm": 0.21479925513267517,
"learning_rate": 4.047511389888017e-05,
"loss": 0.1748,
"step": 556
},
{
"epoch": 1.5602240896358543,
"grad_norm": 0.24510297179222107,
"learning_rate": 4.043932930866021e-05,
"loss": 0.1683,
"step": 557
},
{
"epoch": 1.5630252100840336,
"grad_norm": 0.2353736162185669,
"learning_rate": 4.040349350902028e-05,
"loss": 0.1643,
"step": 558
},
{
"epoch": 1.5658263305322129,
"grad_norm": 0.5024756789207458,
"learning_rate": 4.036760661882109e-05,
"loss": 0.0958,
"step": 559
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.20932908356189728,
"learning_rate": 4.033166875709291e-05,
"loss": 0.1738,
"step": 560
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.20001398026943207,
"learning_rate": 4.029568004303501e-05,
"loss": 0.173,
"step": 561
},
{
"epoch": 1.5742296918767507,
"grad_norm": 0.2733915150165558,
"learning_rate": 4.025964059601535e-05,
"loss": 0.1329,
"step": 562
},
{
"epoch": 1.57703081232493,
"grad_norm": 0.48111703991889954,
"learning_rate": 4.022355053557015e-05,
"loss": 0.2336,
"step": 563
},
{
"epoch": 1.5798319327731094,
"grad_norm": 0.6872272491455078,
"learning_rate": 4.018740998140352e-05,
"loss": 0.2748,
"step": 564
},
{
"epoch": 1.5826330532212887,
"grad_norm": 0.7389792203903198,
"learning_rate": 4.015121905338704e-05,
"loss": 0.2948,
"step": 565
},
{
"epoch": 1.585434173669468,
"grad_norm": 0.46626532077789307,
"learning_rate": 4.011497787155938e-05,
"loss": 0.2341,
"step": 566
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.2073425054550171,
"learning_rate": 4.007868655612586e-05,
"loss": 0.1662,
"step": 567
},
{
"epoch": 1.5910364145658265,
"grad_norm": 0.2893884479999542,
"learning_rate": 4.004234522745813e-05,
"loss": 0.1419,
"step": 568
},
{
"epoch": 1.5938375350140057,
"grad_norm": 0.3202155828475952,
"learning_rate": 4.00059540060937e-05,
"loss": 0.1377,
"step": 569
},
{
"epoch": 1.596638655462185,
"grad_norm": 0.21610812842845917,
"learning_rate": 3.996951301273557e-05,
"loss": 0.1696,
"step": 570
},
{
"epoch": 1.5994397759103642,
"grad_norm": 0.1836165338754654,
"learning_rate": 3.993302236825181e-05,
"loss": 0.1733,
"step": 571
},
{
"epoch": 1.6022408963585435,
"grad_norm": 0.2764729857444763,
"learning_rate": 3.98964821936752e-05,
"loss": 0.1683,
"step": 572
},
{
"epoch": 1.6050420168067228,
"grad_norm": 0.31198838353157043,
"learning_rate": 3.9859892610202785e-05,
"loss": 0.1573,
"step": 573
},
{
"epoch": 1.607843137254902,
"grad_norm": 0.5770326852798462,
"learning_rate": 3.982325373919549e-05,
"loss": 0.2388,
"step": 574
},
{
"epoch": 1.6106442577030813,
"grad_norm": 0.3288671672344208,
"learning_rate": 3.9786565702177723e-05,
"loss": 0.2145,
"step": 575
},
{
"epoch": 1.6134453781512605,
"grad_norm": 0.2905415892601013,
"learning_rate": 3.974982862083697e-05,
"loss": 0.1867,
"step": 576
},
{
"epoch": 1.6162464985994398,
"grad_norm": 0.2127629816532135,
"learning_rate": 3.9713042617023386e-05,
"loss": 0.1782,
"step": 577
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.2230168879032135,
"learning_rate": 3.967620781274938e-05,
"loss": 0.1759,
"step": 578
},
{
"epoch": 1.6218487394957983,
"grad_norm": 0.3231712281703949,
"learning_rate": 3.9639324330189236e-05,
"loss": 0.207,
"step": 579
},
{
"epoch": 1.6246498599439776,
"grad_norm": 0.3769891858100891,
"learning_rate": 3.960239229167869e-05,
"loss": 0.1371,
"step": 580
},
{
"epoch": 1.6274509803921569,
"grad_norm": 0.32875877618789673,
"learning_rate": 3.956541181971455e-05,
"loss": 0.1396,
"step": 581
},
{
"epoch": 1.6302521008403361,
"grad_norm": 0.2713623046875,
"learning_rate": 3.9528383036954224e-05,
"loss": 0.1653,
"step": 582
},
{
"epoch": 1.6330532212885154,
"grad_norm": 1.0410537719726562,
"learning_rate": 3.949130606621541e-05,
"loss": 0.3346,
"step": 583
},
{
"epoch": 1.6358543417366946,
"grad_norm": 0.23961792886257172,
"learning_rate": 3.945418103047558e-05,
"loss": 0.1784,
"step": 584
},
{
"epoch": 1.638655462184874,
"grad_norm": 0.286924809217453,
"learning_rate": 3.941700805287168e-05,
"loss": 0.2005,
"step": 585
},
{
"epoch": 1.6414565826330532,
"grad_norm": 0.38071775436401367,
"learning_rate": 3.937978725669965e-05,
"loss": 0.1445,
"step": 586
},
{
"epoch": 1.6442577030812324,
"grad_norm": 0.5587998032569885,
"learning_rate": 3.934251876541404e-05,
"loss": 0.1045,
"step": 587
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.3966423273086548,
"learning_rate": 3.9305202702627575e-05,
"loss": 0.1404,
"step": 588
},
{
"epoch": 1.649859943977591,
"grad_norm": 0.3944240212440491,
"learning_rate": 3.92678391921108e-05,
"loss": 0.139,
"step": 589
},
{
"epoch": 1.6526610644257702,
"grad_norm": 0.25802481174468994,
"learning_rate": 3.9230428357791595e-05,
"loss": 0.1688,
"step": 590
},
{
"epoch": 1.6554621848739495,
"grad_norm": 0.22108642756938934,
"learning_rate": 3.919297032375485e-05,
"loss": 0.1737,
"step": 591
},
{
"epoch": 1.6582633053221287,
"grad_norm": 0.3680080473423004,
"learning_rate": 3.915546521424198e-05,
"loss": 0.1375,
"step": 592
},
{
"epoch": 1.661064425770308,
"grad_norm": 0.2191331535577774,
"learning_rate": 3.9117913153650546e-05,
"loss": 0.1674,
"step": 593
},
{
"epoch": 1.6638655462184873,
"grad_norm": 0.4968366324901581,
"learning_rate": 3.908031426653383e-05,
"loss": 0.0956,
"step": 594
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.2787998616695404,
"learning_rate": 3.9042668677600436e-05,
"loss": 0.1356,
"step": 595
},
{
"epoch": 1.6694677871148458,
"grad_norm": 0.37599462270736694,
"learning_rate": 3.900497651171388e-05,
"loss": 0.2054,
"step": 596
},
{
"epoch": 1.6722689075630253,
"grad_norm": 0.28965288400650024,
"learning_rate": 3.8967237893892134e-05,
"loss": 0.1325,
"step": 597
},
{
"epoch": 1.6750700280112045,
"grad_norm": 0.44063901901245117,
"learning_rate": 3.892945294930728e-05,
"loss": 0.0864,
"step": 598
},
{
"epoch": 1.6778711484593838,
"grad_norm": 0.29227393865585327,
"learning_rate": 3.889162180328505e-05,
"loss": 0.1258,
"step": 599
},
{
"epoch": 1.680672268907563,
"grad_norm": 0.3660801351070404,
"learning_rate": 3.885374458130438e-05,
"loss": 0.2033,
"step": 600
},
{
"epoch": 1.6834733893557423,
"grad_norm": 0.4446983337402344,
"learning_rate": 3.881582140899707e-05,
"loss": 0.0834,
"step": 601
},
{
"epoch": 1.6862745098039216,
"grad_norm": 0.7357985377311707,
"learning_rate": 3.877785241214733e-05,
"loss": 0.244,
"step": 602
},
{
"epoch": 1.6890756302521008,
"grad_norm": 0.2903584837913513,
"learning_rate": 3.873983771669133e-05,
"loss": 0.1743,
"step": 603
},
{
"epoch": 1.69187675070028,
"grad_norm": 0.2409980297088623,
"learning_rate": 3.8701777448716856e-05,
"loss": 0.1273,
"step": 604
},
{
"epoch": 1.6946778711484594,
"grad_norm": 0.22482717037200928,
"learning_rate": 3.866367173446281e-05,
"loss": 0.1268,
"step": 605
},
{
"epoch": 1.6974789915966386,
"grad_norm": 0.2362489402294159,
"learning_rate": 3.862552070031886e-05,
"loss": 0.1251,
"step": 606
},
{
"epoch": 1.7002801120448179,
"grad_norm": 0.23961283266544342,
"learning_rate": 3.858732447282497e-05,
"loss": 0.1698,
"step": 607
},
{
"epoch": 1.7030812324929971,
"grad_norm": 0.5947995185852051,
"learning_rate": 3.854908317867102e-05,
"loss": 0.2085,
"step": 608
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.8783779740333557,
"learning_rate": 3.851079694469636e-05,
"loss": 0.2709,
"step": 609
},
{
"epoch": 1.708683473389356,
"grad_norm": 0.23939639329910278,
"learning_rate": 3.8472465897889394e-05,
"loss": 0.1278,
"step": 610
},
{
"epoch": 1.7114845938375352,
"grad_norm": 1.0147463083267212,
"learning_rate": 3.843409016538716e-05,
"loss": 0.2919,
"step": 611
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.7156258225440979,
"learning_rate": 3.8395669874474915e-05,
"loss": 0.2452,
"step": 612
},
{
"epoch": 1.7170868347338937,
"grad_norm": 0.23990198969841003,
"learning_rate": 3.835720515258572e-05,
"loss": 0.1744,
"step": 613
},
{
"epoch": 1.719887955182073,
"grad_norm": 0.6649200320243835,
"learning_rate": 3.831869612729999e-05,
"loss": 0.2606,
"step": 614
},
{
"epoch": 1.7226890756302522,
"grad_norm": 0.2535988390445709,
"learning_rate": 3.828014292634509e-05,
"loss": 0.1681,
"step": 615
},
{
"epoch": 1.7254901960784315,
"grad_norm": 0.620143711566925,
"learning_rate": 3.8241545677594895e-05,
"loss": 0.2662,
"step": 616
},
{
"epoch": 1.7282913165266107,
"grad_norm": 0.2728918492794037,
"learning_rate": 3.820290450906941e-05,
"loss": 0.1692,
"step": 617
},
{
"epoch": 1.73109243697479,
"grad_norm": 0.2622087001800537,
"learning_rate": 3.816421954893428e-05,
"loss": 0.1215,
"step": 618
},
{
"epoch": 1.7338935574229692,
"grad_norm": 0.3146950304508209,
"learning_rate": 3.8125490925500425e-05,
"loss": 0.1987,
"step": 619
},
{
"epoch": 1.7366946778711485,
"grad_norm": 0.5234703421592712,
"learning_rate": 3.808671876722357e-05,
"loss": 0.2537,
"step": 620
},
{
"epoch": 1.7394957983193278,
"grad_norm": 0.356790155172348,
"learning_rate": 3.804790320270384e-05,
"loss": 0.2111,
"step": 621
},
{
"epoch": 1.742296918767507,
"grad_norm": 0.4688396155834198,
"learning_rate": 3.800904436068533e-05,
"loss": 0.0948,
"step": 622
},
{
"epoch": 1.7450980392156863,
"grad_norm": 0.2965690493583679,
"learning_rate": 3.797014237005571e-05,
"loss": 0.1953,
"step": 623
},
{
"epoch": 1.7478991596638656,
"grad_norm": 0.45940619707107544,
"learning_rate": 3.793119735984572e-05,
"loss": 0.2438,
"step": 624
},
{
"epoch": 1.7507002801120448,
"grad_norm": 0.34540608525276184,
"learning_rate": 3.78922094592288e-05,
"loss": 0.1378,
"step": 625
},
{
"epoch": 1.753501400560224,
"grad_norm": 0.2714684307575226,
"learning_rate": 3.785317879752066e-05,
"loss": 0.2209,
"step": 626
},
{
"epoch": 1.7563025210084033,
"grad_norm": 0.6053199172019958,
"learning_rate": 3.781410550417885e-05,
"loss": 0.2887,
"step": 627
},
{
"epoch": 1.7591036414565826,
"grad_norm": 0.3671252429485321,
"learning_rate": 3.77749897088023e-05,
"loss": 0.1822,
"step": 628
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.2441740483045578,
"learning_rate": 3.773583154113092e-05,
"loss": 0.1712,
"step": 629
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.5259542465209961,
"learning_rate": 3.769663113104516e-05,
"loss": 0.1065,
"step": 630
},
{
"epoch": 1.7675070028011204,
"grad_norm": 0.2503260374069214,
"learning_rate": 3.765738860856557e-05,
"loss": 0.1761,
"step": 631
},
{
"epoch": 1.7703081232492996,
"grad_norm": 0.5481187701225281,
"learning_rate": 3.7618104103852415e-05,
"loss": 0.2614,
"step": 632
},
{
"epoch": 1.773109243697479,
"grad_norm": 0.27601489424705505,
"learning_rate": 3.757877774720517e-05,
"loss": 0.1901,
"step": 633
},
{
"epoch": 1.7759103641456582,
"grad_norm": 0.2434099018573761,
"learning_rate": 3.7539409669062136e-05,
"loss": 0.205,
"step": 634
},
{
"epoch": 1.7787114845938374,
"grad_norm": 0.28954920172691345,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1636,
"step": 635
},
{
"epoch": 1.7815126050420167,
"grad_norm": 0.5615999698638916,
"learning_rate": 3.74605488707334e-05,
"loss": 0.1061,
"step": 636
},
{
"epoch": 1.784313725490196,
"grad_norm": 0.5544323325157166,
"learning_rate": 3.742105641211449e-05,
"loss": 0.106,
"step": 637
},
{
"epoch": 1.7871148459383752,
"grad_norm": 0.5592769384384155,
"learning_rate": 3.738152275513249e-05,
"loss": 0.1079,
"step": 638
},
{
"epoch": 1.7899159663865545,
"grad_norm": 0.2856994569301605,
"learning_rate": 3.7341948030913294e-05,
"loss": 0.1731,
"step": 639
},
{
"epoch": 1.7927170868347337,
"grad_norm": 0.3745407164096832,
"learning_rate": 3.730233237071898e-05,
"loss": 0.1404,
"step": 640
},
{
"epoch": 1.795518207282913,
"grad_norm": 0.33862021565437317,
"learning_rate": 3.726267590594744e-05,
"loss": 0.1952,
"step": 641
},
{
"epoch": 1.7983193277310925,
"grad_norm": 0.3554513156414032,
"learning_rate": 3.7222978768131854e-05,
"loss": 0.1322,
"step": 642
},
{
"epoch": 1.8011204481792717,
"grad_norm": 0.22815661132335663,
"learning_rate": 3.718324108894036e-05,
"loss": 0.1714,
"step": 643
},
{
"epoch": 1.803921568627451,
"grad_norm": 0.32825687527656555,
"learning_rate": 3.7143463000175546e-05,
"loss": 0.1421,
"step": 644
},
{
"epoch": 1.8067226890756303,
"grad_norm": 0.24787546694278717,
"learning_rate": 3.7103644633774014e-05,
"loss": 0.1682,
"step": 645
},
{
"epoch": 1.8095238095238095,
"grad_norm": 0.3160790801048279,
"learning_rate": 3.706378612180598e-05,
"loss": 0.193,
"step": 646
},
{
"epoch": 1.8123249299719888,
"grad_norm": 0.22702836990356445,
"learning_rate": 3.70238875964748e-05,
"loss": 0.171,
"step": 647
},
{
"epoch": 1.815126050420168,
"grad_norm": 0.2879033386707306,
"learning_rate": 3.6983949190116576e-05,
"loss": 0.1391,
"step": 648
},
{
"epoch": 1.8179271708683473,
"grad_norm": 0.2816312313079834,
"learning_rate": 3.6943971035199645e-05,
"loss": 0.1351,
"step": 649
},
{
"epoch": 1.8207282913165266,
"grad_norm": 0.2494056075811386,
"learning_rate": 3.690395326432421e-05,
"loss": 0.165,
"step": 650
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.5620803236961365,
"learning_rate": 3.686389601022188e-05,
"loss": 0.256,
"step": 651
},
{
"epoch": 1.826330532212885,
"grad_norm": 0.36324793100357056,
"learning_rate": 3.682379940575519e-05,
"loss": 0.1187,
"step": 652
},
{
"epoch": 1.8291316526610646,
"grad_norm": 0.32306012511253357,
"learning_rate": 3.678366358391723e-05,
"loss": 0.1183,
"step": 653
},
{
"epoch": 1.8319327731092439,
"grad_norm": 0.25600212812423706,
"learning_rate": 3.674348867783115e-05,
"loss": 0.1684,
"step": 654
},
{
"epoch": 1.8347338935574231,
"grad_norm": 0.27342215180397034,
"learning_rate": 3.670327482074973e-05,
"loss": 0.1249,
"step": 655
},
{
"epoch": 1.8375350140056024,
"grad_norm": 0.4090156555175781,
"learning_rate": 3.666302214605495e-05,
"loss": 0.179,
"step": 656
},
{
"epoch": 1.8403361344537816,
"grad_norm": 0.4678898751735687,
"learning_rate": 3.662273078725754e-05,
"loss": 0.2021,
"step": 657
},
{
"epoch": 1.843137254901961,
"grad_norm": 0.28741687536239624,
"learning_rate": 3.6582400877996546e-05,
"loss": 0.1641,
"step": 658
},
{
"epoch": 1.8459383753501402,
"grad_norm": 0.4612816572189331,
"learning_rate": 3.654203255203886e-05,
"loss": 0.2167,
"step": 659
},
{
"epoch": 1.8487394957983194,
"grad_norm": 0.27487486600875854,
"learning_rate": 3.6501625943278805e-05,
"loss": 0.1567,
"step": 660
},
{
"epoch": 1.8515406162464987,
"grad_norm": 0.4421831965446472,
"learning_rate": 3.6461181185737694e-05,
"loss": 0.2094,
"step": 661
},
{
"epoch": 1.854341736694678,
"grad_norm": 0.5941998362541199,
"learning_rate": 3.6420698413563345e-05,
"loss": 0.2084,
"step": 662
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.5112701058387756,
"learning_rate": 3.6380177761029685e-05,
"loss": 0.0836,
"step": 663
},
{
"epoch": 1.8599439775910365,
"grad_norm": 0.8637323975563049,
"learning_rate": 3.633961936253628e-05,
"loss": 0.2437,
"step": 664
},
{
"epoch": 1.8627450980392157,
"grad_norm": 0.4770047962665558,
"learning_rate": 3.629902335260789e-05,
"loss": 0.1965,
"step": 665
},
{
"epoch": 1.865546218487395,
"grad_norm": 0.3599564731121063,
"learning_rate": 3.625838986589403e-05,
"loss": 0.1134,
"step": 666
},
{
"epoch": 1.8683473389355743,
"grad_norm": 0.5263693928718567,
"learning_rate": 3.621771903716849e-05,
"loss": 0.0875,
"step": 667
},
{
"epoch": 1.8711484593837535,
"grad_norm": 0.5898877382278442,
"learning_rate": 3.617701100132897e-05,
"loss": 0.2261,
"step": 668
},
{
"epoch": 1.8739495798319328,
"grad_norm": 0.369739830493927,
"learning_rate": 3.613626589339653e-05,
"loss": 0.1718,
"step": 669
},
{
"epoch": 1.876750700280112,
"grad_norm": 0.368196576833725,
"learning_rate": 3.609548384851522e-05,
"loss": 0.194,
"step": 670
},
{
"epoch": 1.8795518207282913,
"grad_norm": 0.2884950339794159,
"learning_rate": 3.605466500195159e-05,
"loss": 0.1578,
"step": 671
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.7132876515388489,
"learning_rate": 3.601380948909425e-05,
"loss": 0.2391,
"step": 672
},
{
"epoch": 1.8851540616246498,
"grad_norm": 0.29729971289634705,
"learning_rate": 3.597291744545344e-05,
"loss": 0.1518,
"step": 673
},
{
"epoch": 1.887955182072829,
"grad_norm": 0.24395136535167694,
"learning_rate": 3.593198900666056e-05,
"loss": 0.1675,
"step": 674
},
{
"epoch": 1.8907563025210083,
"grad_norm": 0.28539448976516724,
"learning_rate": 3.589102430846773e-05,
"loss": 0.1573,
"step": 675
},
{
"epoch": 1.8935574229691876,
"grad_norm": 0.38136017322540283,
"learning_rate": 3.585002348674733e-05,
"loss": 0.2004,
"step": 676
},
{
"epoch": 1.8963585434173669,
"grad_norm": 0.5776926279067993,
"learning_rate": 3.5808986677491555e-05,
"loss": 0.0913,
"step": 677
},
{
"epoch": 1.8991596638655461,
"grad_norm": 0.4445839524269104,
"learning_rate": 3.576791401681194e-05,
"loss": 0.185,
"step": 678
},
{
"epoch": 1.9019607843137254,
"grad_norm": 0.31285786628723145,
"learning_rate": 3.5726805640939e-05,
"loss": 0.1609,
"step": 679
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.38080793619155884,
"learning_rate": 3.5685661686221644e-05,
"loss": 0.1928,
"step": 680
},
{
"epoch": 1.907563025210084,
"grad_norm": 0.37026283144950867,
"learning_rate": 3.564448228912682e-05,
"loss": 0.1829,
"step": 681
},
{
"epoch": 1.9103641456582632,
"grad_norm": 0.3265126347541809,
"learning_rate": 3.5603267586239026e-05,
"loss": 0.1563,
"step": 682
},
{
"epoch": 1.9131652661064424,
"grad_norm": 0.6355715990066528,
"learning_rate": 3.556201771425985e-05,
"loss": 0.2228,
"step": 683
},
{
"epoch": 1.9159663865546217,
"grad_norm": 0.5335772037506104,
"learning_rate": 3.5520732810007566e-05,
"loss": 0.1654,
"step": 684
},
{
"epoch": 1.918767507002801,
"grad_norm": 0.6501393914222717,
"learning_rate": 3.547941301041661e-05,
"loss": 0.097,
"step": 685
},
{
"epoch": 1.9215686274509802,
"grad_norm": 0.6850645542144775,
"learning_rate": 3.543805845253716e-05,
"loss": 0.1012,
"step": 686
},
{
"epoch": 1.9243697478991597,
"grad_norm": 0.659045398235321,
"learning_rate": 3.539666927353469e-05,
"loss": 0.0977,
"step": 687
},
{
"epoch": 1.927170868347339,
"grad_norm": 0.47713983058929443,
"learning_rate": 3.535524561068952e-05,
"loss": 0.1192,
"step": 688
},
{
"epoch": 1.9299719887955182,
"grad_norm": 0.4114561975002289,
"learning_rate": 3.5313787601396325e-05,
"loss": 0.1243,
"step": 689
},
{
"epoch": 1.9327731092436975,
"grad_norm": 0.4006536900997162,
"learning_rate": 3.527229538316371e-05,
"loss": 0.1811,
"step": 690
},
{
"epoch": 1.9355742296918768,
"grad_norm": 0.520041823387146,
"learning_rate": 3.523076909361373e-05,
"loss": 0.2083,
"step": 691
},
{
"epoch": 1.938375350140056,
"grad_norm": 0.3322596251964569,
"learning_rate": 3.518920887048149e-05,
"loss": 0.1568,
"step": 692
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.3033127188682556,
"learning_rate": 3.514761485161458e-05,
"loss": 0.1544,
"step": 693
},
{
"epoch": 1.9439775910364145,
"grad_norm": 1.5677649974822998,
"learning_rate": 3.510598717497276e-05,
"loss": 0.3283,
"step": 694
},
{
"epoch": 1.9467787114845938,
"grad_norm": 0.9142319560050964,
"learning_rate": 3.506432597862737e-05,
"loss": 0.2276,
"step": 695
},
{
"epoch": 1.949579831932773,
"grad_norm": 0.3185567259788513,
"learning_rate": 3.5022631400760944e-05,
"loss": 0.1542,
"step": 696
},
{
"epoch": 1.9523809523809523,
"grad_norm": 0.6337863802909851,
"learning_rate": 3.4980903579666744e-05,
"loss": 0.2202,
"step": 697
},
{
"epoch": 1.9551820728291318,
"grad_norm": 0.3815496861934662,
"learning_rate": 3.493914265374829e-05,
"loss": 0.1303,
"step": 698
},
{
"epoch": 1.957983193277311,
"grad_norm": 0.4644532799720764,
"learning_rate": 3.489734876151891e-05,
"loss": 0.1881,
"step": 699
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.43058714270591736,
"learning_rate": 3.485552204160126e-05,
"loss": 0.1121,
"step": 700
},
{
"epoch": 1.9635854341736696,
"grad_norm": 1.0429691076278687,
"learning_rate": 3.48136626327269e-05,
"loss": 0.2764,
"step": 701
},
{
"epoch": 1.9663865546218489,
"grad_norm": 0.6157497763633728,
"learning_rate": 3.4771770673735796e-05,
"loss": 0.0843,
"step": 702
},
{
"epoch": 1.9691876750700281,
"grad_norm": 0.33218902349472046,
"learning_rate": 3.472984630357587e-05,
"loss": 0.1626,
"step": 703
},
{
"epoch": 1.9719887955182074,
"grad_norm": 0.6079518795013428,
"learning_rate": 3.4687889661302576e-05,
"loss": 0.0859,
"step": 704
},
{
"epoch": 1.9747899159663866,
"grad_norm": 0.2988344728946686,
"learning_rate": 3.464590088607839e-05,
"loss": 0.1604,
"step": 705
},
{
"epoch": 1.977591036414566,
"grad_norm": 0.47207456827163696,
"learning_rate": 3.460388011717236e-05,
"loss": 0.1837,
"step": 706
},
{
"epoch": 1.9803921568627452,
"grad_norm": 0.35358136892318726,
"learning_rate": 3.456182749395966e-05,
"loss": 0.1482,
"step": 707
},
{
"epoch": 1.9831932773109244,
"grad_norm": 0.405224084854126,
"learning_rate": 3.451974315592113e-05,
"loss": 0.1288,
"step": 708
},
{
"epoch": 1.9859943977591037,
"grad_norm": 0.4823918640613556,
"learning_rate": 3.4477627242642776e-05,
"loss": 0.1835,
"step": 709
},
{
"epoch": 1.988795518207283,
"grad_norm": 0.3758796155452728,
"learning_rate": 3.443547989381536e-05,
"loss": 0.1303,
"step": 710
},
{
"epoch": 1.9915966386554622,
"grad_norm": 0.47359490394592285,
"learning_rate": 3.43933012492339e-05,
"loss": 0.1844,
"step": 711
},
{
"epoch": 1.9943977591036415,
"grad_norm": 0.3807266652584076,
"learning_rate": 3.43510914487972e-05,
"loss": 0.1357,
"step": 712
},
{
"epoch": 1.9971988795518207,
"grad_norm": 0.6917422413825989,
"learning_rate": 3.430885063250743e-05,
"loss": 0.2231,
"step": 713
},
{
"epoch": 2.0,
"grad_norm": 0.6366453766822815,
"learning_rate": 3.42665789404696e-05,
"loss": 0.0794,
"step": 714
},
{
"epoch": 2.0,
"eval_f1 (minor class)": 0.11274509803921569,
"eval_loss": 0.17888100445270538,
"eval_roc_auc": 0.5086586164934943,
"eval_runtime": 2.9373,
"eval_samples_per_second": 431.683,
"eval_steps_per_second": 13.618,
"step": 714
},
{
"epoch": 2.0028011204481793,
"grad_norm": 0.6449307203292847,
"learning_rate": 3.422427651289118e-05,
"loss": 0.0814,
"step": 715
},
{
"epoch": 2.0056022408963585,
"grad_norm": 1.047551155090332,
"learning_rate": 3.418194349008153e-05,
"loss": 0.2542,
"step": 716
},
{
"epoch": 2.008403361344538,
"grad_norm": 0.486785888671875,
"learning_rate": 3.413958001245152e-05,
"loss": 0.1403,
"step": 717
},
{
"epoch": 2.011204481792717,
"grad_norm": 1.1322331428527832,
"learning_rate": 3.409718622051303e-05,
"loss": 0.2194,
"step": 718
},
{
"epoch": 2.0140056022408963,
"grad_norm": 0.41819918155670166,
"learning_rate": 3.4054762254878476e-05,
"loss": 0.151,
"step": 719
},
{
"epoch": 2.0168067226890756,
"grad_norm": 0.38247114419937134,
"learning_rate": 3.401230825626037e-05,
"loss": 0.1593,
"step": 720
},
{
"epoch": 2.019607843137255,
"grad_norm": 0.7601361274719238,
"learning_rate": 3.396982436547082e-05,
"loss": 0.2249,
"step": 721
},
{
"epoch": 2.022408963585434,
"grad_norm": 1.248322606086731,
"learning_rate": 3.392731072342109e-05,
"loss": 0.197,
"step": 722
},
{
"epoch": 2.0252100840336134,
"grad_norm": 0.5805521607398987,
"learning_rate": 3.388476747112113e-05,
"loss": 0.1443,
"step": 723
},
{
"epoch": 2.0280112044817926,
"grad_norm": 0.47655874490737915,
"learning_rate": 3.384219474967908e-05,
"loss": 0.1962,
"step": 724
},
{
"epoch": 2.030812324929972,
"grad_norm": 0.48123157024383545,
"learning_rate": 3.3799592700300866e-05,
"loss": 0.1266,
"step": 725
},
{
"epoch": 2.033613445378151,
"grad_norm": 0.9366979002952576,
"learning_rate": 3.375696146428963e-05,
"loss": 0.2132,
"step": 726
},
{
"epoch": 2.0364145658263304,
"grad_norm": 0.38628774881362915,
"learning_rate": 3.3714301183045385e-05,
"loss": 0.1599,
"step": 727
},
{
"epoch": 2.0392156862745097,
"grad_norm": 0.5700298547744751,
"learning_rate": 3.3671611998064425e-05,
"loss": 0.1271,
"step": 728
},
{
"epoch": 2.042016806722689,
"grad_norm": 0.8660473823547363,
"learning_rate": 3.3628894050938945e-05,
"loss": 0.2301,
"step": 729
},
{
"epoch": 2.044817927170868,
"grad_norm": 0.7136172652244568,
"learning_rate": 3.3586147483356535e-05,
"loss": 0.1177,
"step": 730
},
{
"epoch": 2.0476190476190474,
"grad_norm": 0.727861225605011,
"learning_rate": 3.354337243709971e-05,
"loss": 0.2178,
"step": 731
},
{
"epoch": 2.0504201680672267,
"grad_norm": 1.5354722738265991,
"learning_rate": 3.350056905404543e-05,
"loss": 0.3138,
"step": 732
},
{
"epoch": 2.053221288515406,
"grad_norm": 0.6516889333724976,
"learning_rate": 3.345773747616467e-05,
"loss": 0.221,
"step": 733
},
{
"epoch": 2.0560224089635852,
"grad_norm": 0.4907413125038147,
"learning_rate": 3.34148778455219e-05,
"loss": 0.1524,
"step": 734
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.5165950059890747,
"learning_rate": 3.3371990304274656e-05,
"loss": 0.199,
"step": 735
},
{
"epoch": 2.0616246498599438,
"grad_norm": 1.439732313156128,
"learning_rate": 3.332907499467302e-05,
"loss": 0.2949,
"step": 736
},
{
"epoch": 2.064425770308123,
"grad_norm": 0.650676965713501,
"learning_rate": 3.328613205905921e-05,
"loss": 0.145,
"step": 737
},
{
"epoch": 2.0672268907563027,
"grad_norm": 0.444158673286438,
"learning_rate": 3.324316163986704e-05,
"loss": 0.219,
"step": 738
},
{
"epoch": 2.070028011204482,
"grad_norm": 0.7450339794158936,
"learning_rate": 3.320016387962151e-05,
"loss": 0.1589,
"step": 739
},
{
"epoch": 2.0728291316526612,
"grad_norm": 1.0164347887039185,
"learning_rate": 3.315713892093829e-05,
"loss": 0.1207,
"step": 740
},
{
"epoch": 2.0756302521008405,
"grad_norm": 0.6916231513023376,
"learning_rate": 3.3114086906523266e-05,
"loss": 0.17,
"step": 741
},
{
"epoch": 2.0784313725490198,
"grad_norm": 1.029963493347168,
"learning_rate": 3.307100797917207e-05,
"loss": 0.1218,
"step": 742
},
{
"epoch": 2.081232492997199,
"grad_norm": 0.8726139068603516,
"learning_rate": 3.302790228176959e-05,
"loss": 0.1337,
"step": 743
},
{
"epoch": 2.0840336134453783,
"grad_norm": 0.6706817746162415,
"learning_rate": 3.29847699572895e-05,
"loss": 0.1637,
"step": 744
},
{
"epoch": 2.0868347338935576,
"grad_norm": 0.7627484798431396,
"learning_rate": 3.294161114879382e-05,
"loss": 0.2022,
"step": 745
},
{
"epoch": 2.089635854341737,
"grad_norm": 0.6804760098457336,
"learning_rate": 3.289842599943237e-05,
"loss": 0.1245,
"step": 746
},
{
"epoch": 2.092436974789916,
"grad_norm": 0.5034189224243164,
"learning_rate": 3.285521465244237e-05,
"loss": 0.141,
"step": 747
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.9262709617614746,
"learning_rate": 3.281197725114792e-05,
"loss": 0.099,
"step": 748
},
{
"epoch": 2.0980392156862746,
"grad_norm": 0.45192310214042664,
"learning_rate": 3.276871393895954e-05,
"loss": 0.1634,
"step": 749
},
{
"epoch": 2.100840336134454,
"grad_norm": 1.2695709466934204,
"learning_rate": 3.272542485937369e-05,
"loss": 0.2591,
"step": 750
},
{
"epoch": 2.103641456582633,
"grad_norm": 0.6426392197608948,
"learning_rate": 3.26821101559723e-05,
"loss": 0.1701,
"step": 751
},
{
"epoch": 2.1064425770308124,
"grad_norm": 0.44048747420310974,
"learning_rate": 3.263876997242229e-05,
"loss": 0.1532,
"step": 752
},
{
"epoch": 2.1092436974789917,
"grad_norm": 1.7888472080230713,
"learning_rate": 3.259540445247509e-05,
"loss": 0.2987,
"step": 753
},
{
"epoch": 2.112044817927171,
"grad_norm": 1.113670825958252,
"learning_rate": 3.2552013739966145e-05,
"loss": 0.2087,
"step": 754
},
{
"epoch": 2.11484593837535,
"grad_norm": 0.6354279518127441,
"learning_rate": 3.250859797881452e-05,
"loss": 0.1307,
"step": 755
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.5894302129745483,
"learning_rate": 3.2465157313022284e-05,
"loss": 0.094,
"step": 756
},
{
"epoch": 2.1204481792717087,
"grad_norm": 0.5198976397514343,
"learning_rate": 3.242169188667416e-05,
"loss": 0.1224,
"step": 757
},
{
"epoch": 2.123249299719888,
"grad_norm": 0.4988885521888733,
"learning_rate": 3.2378201843937e-05,
"loss": 0.1241,
"step": 758
},
{
"epoch": 2.1260504201680672,
"grad_norm": 1.0682430267333984,
"learning_rate": 3.233468732905927e-05,
"loss": 0.2033,
"step": 759
},
{
"epoch": 2.1288515406162465,
"grad_norm": 1.5533567667007446,
"learning_rate": 3.2291148486370626e-05,
"loss": 0.2584,
"step": 760
},
{
"epoch": 2.1316526610644257,
"grad_norm": 0.7770599126815796,
"learning_rate": 3.224758546028143e-05,
"loss": 0.0703,
"step": 761
},
{
"epoch": 2.134453781512605,
"grad_norm": 0.8279226422309875,
"learning_rate": 3.220399839528222e-05,
"loss": 0.0776,
"step": 762
},
{
"epoch": 2.1372549019607843,
"grad_norm": 1.2002817392349243,
"learning_rate": 3.2160387435943296e-05,
"loss": 0.2055,
"step": 763
},
{
"epoch": 2.1400560224089635,
"grad_norm": 0.5353344678878784,
"learning_rate": 3.21167527269142e-05,
"loss": 0.1137,
"step": 764
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.7878655791282654,
"learning_rate": 3.207309441292325e-05,
"loss": 0.0716,
"step": 765
},
{
"epoch": 2.145658263305322,
"grad_norm": 0.562883198261261,
"learning_rate": 3.202941263877706e-05,
"loss": 0.0879,
"step": 766
},
{
"epoch": 2.1484593837535013,
"grad_norm": 1.004749059677124,
"learning_rate": 3.198570754936004e-05,
"loss": 0.1883,
"step": 767
},
{
"epoch": 2.1512605042016806,
"grad_norm": 0.8928613066673279,
"learning_rate": 3.194197928963396e-05,
"loss": 0.164,
"step": 768
},
{
"epoch": 2.15406162464986,
"grad_norm": 2.246786594390869,
"learning_rate": 3.189822800463742e-05,
"loss": 0.1953,
"step": 769
},
{
"epoch": 2.156862745098039,
"grad_norm": 1.1755541563034058,
"learning_rate": 3.1854453839485385e-05,
"loss": 0.1598,
"step": 770
},
{
"epoch": 2.1596638655462184,
"grad_norm": 0.8687044382095337,
"learning_rate": 3.1810656939368744e-05,
"loss": 0.0719,
"step": 771
},
{
"epoch": 2.1624649859943976,
"grad_norm": 1.5823578834533691,
"learning_rate": 3.176683744955377e-05,
"loss": 0.1969,
"step": 772
},
{
"epoch": 2.165266106442577,
"grad_norm": 0.6787047982215881,
"learning_rate": 3.172299551538164e-05,
"loss": 0.1161,
"step": 773
},
{
"epoch": 2.168067226890756,
"grad_norm": 2.3936171531677246,
"learning_rate": 3.167913128226803e-05,
"loss": 0.2243,
"step": 774
},
{
"epoch": 2.1708683473389354,
"grad_norm": 0.9804347157478333,
"learning_rate": 3.1635244895702524e-05,
"loss": 0.0667,
"step": 775
},
{
"epoch": 2.1736694677871147,
"grad_norm": 1.6783931255340576,
"learning_rate": 3.159133650124821e-05,
"loss": 0.177,
"step": 776
},
{
"epoch": 2.176470588235294,
"grad_norm": 1.7065593004226685,
"learning_rate": 3.1547406244541175e-05,
"loss": 0.2228,
"step": 777
},
{
"epoch": 2.179271708683473,
"grad_norm": 1.4685626029968262,
"learning_rate": 3.150345427129002e-05,
"loss": 0.212,
"step": 778
},
{
"epoch": 2.1820728291316525,
"grad_norm": 1.0432506799697876,
"learning_rate": 3.145948072727535e-05,
"loss": 0.1374,
"step": 779
},
{
"epoch": 2.184873949579832,
"grad_norm": 1.9101930856704712,
"learning_rate": 3.1415485758349346e-05,
"loss": 0.2451,
"step": 780
},
{
"epoch": 2.1876750700280114,
"grad_norm": 1.068935513496399,
"learning_rate": 3.137146951043524e-05,
"loss": 0.1353,
"step": 781
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.9940873980522156,
"learning_rate": 3.132743212952684e-05,
"loss": 0.0726,
"step": 782
},
{
"epoch": 2.19327731092437,
"grad_norm": 1.092490315437317,
"learning_rate": 3.128337376168805e-05,
"loss": 0.174,
"step": 783
},
{
"epoch": 2.196078431372549,
"grad_norm": 0.75323885679245,
"learning_rate": 3.123929455305239e-05,
"loss": 0.1166,
"step": 784
},
{
"epoch": 2.1988795518207285,
"grad_norm": 1.346554160118103,
"learning_rate": 3.11951946498225e-05,
"loss": 0.1646,
"step": 785
},
{
"epoch": 2.2016806722689077,
"grad_norm": 0.9711296558380127,
"learning_rate": 3.115107419826966e-05,
"loss": 0.1301,
"step": 786
},
{
"epoch": 2.204481792717087,
"grad_norm": 0.9502242207527161,
"learning_rate": 3.1106933344733304e-05,
"loss": 0.1036,
"step": 787
},
{
"epoch": 2.2072829131652663,
"grad_norm": 1.8469451665878296,
"learning_rate": 3.106277223562054e-05,
"loss": 0.2247,
"step": 788
},
{
"epoch": 2.2100840336134455,
"grad_norm": 0.8027406334877014,
"learning_rate": 3.101859101740564e-05,
"loss": 0.1404,
"step": 789
},
{
"epoch": 2.212885154061625,
"grad_norm": 1.0038503408432007,
"learning_rate": 3.0974389836629624e-05,
"loss": 0.1889,
"step": 790
},
{
"epoch": 2.215686274509804,
"grad_norm": 2.352829933166504,
"learning_rate": 3.0930168839899666e-05,
"loss": 0.2958,
"step": 791
},
{
"epoch": 2.2184873949579833,
"grad_norm": 0.6628269553184509,
"learning_rate": 3.088592817388869e-05,
"loss": 0.1324,
"step": 792
},
{
"epoch": 2.2212885154061626,
"grad_norm": 0.7565025091171265,
"learning_rate": 3.084166798533489e-05,
"loss": 0.1405,
"step": 793
},
{
"epoch": 2.224089635854342,
"grad_norm": 0.8182944655418396,
"learning_rate": 3.079738842104115e-05,
"loss": 0.1145,
"step": 794
},
{
"epoch": 2.226890756302521,
"grad_norm": 0.6144469380378723,
"learning_rate": 3.075308962787466e-05,
"loss": 0.1457,
"step": 795
},
{
"epoch": 2.2296918767507004,
"grad_norm": 1.0344878435134888,
"learning_rate": 3.0708771752766394e-05,
"loss": 0.118,
"step": 796
},
{
"epoch": 2.2324929971988796,
"grad_norm": 0.6334454417228699,
"learning_rate": 3.06644349427106e-05,
"loss": 0.137,
"step": 797
},
{
"epoch": 2.235294117647059,
"grad_norm": 1.0241148471832275,
"learning_rate": 3.062007934476433e-05,
"loss": 0.073,
"step": 798
},
{
"epoch": 2.238095238095238,
"grad_norm": 1.3902757167816162,
"learning_rate": 3.057570510604696e-05,
"loss": 0.1635,
"step": 799
},
{
"epoch": 2.2408963585434174,
"grad_norm": 1.1507909297943115,
"learning_rate": 3.05313123737397e-05,
"loss": 0.1311,
"step": 800
},
{
"epoch": 2.2436974789915967,
"grad_norm": 1.0565153360366821,
"learning_rate": 3.048690129508507e-05,
"loss": 0.1478,
"step": 801
},
{
"epoch": 2.246498599439776,
"grad_norm": 1.574903964996338,
"learning_rate": 3.0442472017386468e-05,
"loss": 0.1736,
"step": 802
},
{
"epoch": 2.249299719887955,
"grad_norm": 1.0273772478103638,
"learning_rate": 3.039802468800767e-05,
"loss": 0.069,
"step": 803
},
{
"epoch": 2.2521008403361344,
"grad_norm": 0.6728970408439636,
"learning_rate": 3.0353559454372283e-05,
"loss": 0.1135,
"step": 804
},
{
"epoch": 2.2549019607843137,
"grad_norm": 2.3881945610046387,
"learning_rate": 3.0309076463963332e-05,
"loss": 0.2483,
"step": 805
},
{
"epoch": 2.257703081232493,
"grad_norm": 0.9350632429122925,
"learning_rate": 3.026457586432272e-05,
"loss": 0.1364,
"step": 806
},
{
"epoch": 2.2605042016806722,
"grad_norm": 1.813084602355957,
"learning_rate": 3.0220057803050766e-05,
"loss": 0.1363,
"step": 807
},
{
"epoch": 2.2633053221288515,
"grad_norm": 1.0090512037277222,
"learning_rate": 3.0175522427805702e-05,
"loss": 0.0537,
"step": 808
},
{
"epoch": 2.2661064425770308,
"grad_norm": 2.049363613128662,
"learning_rate": 3.0130969886303202e-05,
"loss": 0.1873,
"step": 809
},
{
"epoch": 2.26890756302521,
"grad_norm": 1.8174175024032593,
"learning_rate": 3.008640032631585e-05,
"loss": 0.1829,
"step": 810
},
{
"epoch": 2.2717086834733893,
"grad_norm": 1.599669337272644,
"learning_rate": 3.0041813895672693e-05,
"loss": 0.1405,
"step": 811
},
{
"epoch": 2.2745098039215685,
"grad_norm": 0.9827849268913269,
"learning_rate": 2.9997210742258735e-05,
"loss": 0.1344,
"step": 812
},
{
"epoch": 2.277310924369748,
"grad_norm": 1.021662950515747,
"learning_rate": 2.9952591014014452e-05,
"loss": 0.0605,
"step": 813
},
{
"epoch": 2.280112044817927,
"grad_norm": 1.5521609783172607,
"learning_rate": 2.9907954858935278e-05,
"loss": 0.1679,
"step": 814
},
{
"epoch": 2.2829131652661063,
"grad_norm": 3.6788723468780518,
"learning_rate": 2.9863302425071153e-05,
"loss": 0.2745,
"step": 815
},
{
"epoch": 2.2857142857142856,
"grad_norm": 3.890761137008667,
"learning_rate": 2.9818633860525992e-05,
"loss": 0.2846,
"step": 816
},
{
"epoch": 2.288515406162465,
"grad_norm": 0.9732990860939026,
"learning_rate": 2.9773949313457218e-05,
"loss": 0.0869,
"step": 817
},
{
"epoch": 2.291316526610644,
"grad_norm": 3.4630930423736572,
"learning_rate": 2.9729248932075277e-05,
"loss": 0.2344,
"step": 818
},
{
"epoch": 2.2941176470588234,
"grad_norm": 1.538347840309143,
"learning_rate": 2.9684532864643122e-05,
"loss": 0.0815,
"step": 819
},
{
"epoch": 2.2969187675070026,
"grad_norm": 0.8786672353744507,
"learning_rate": 2.963980125947573e-05,
"loss": 0.1294,
"step": 820
},
{
"epoch": 2.299719887955182,
"grad_norm": 2.4151713848114014,
"learning_rate": 2.9595054264939625e-05,
"loss": 0.2426,
"step": 821
},
{
"epoch": 2.302521008403361,
"grad_norm": 0.9119096994400024,
"learning_rate": 2.9550292029452374e-05,
"loss": 0.1187,
"step": 822
},
{
"epoch": 2.3053221288515404,
"grad_norm": 1.4019802808761597,
"learning_rate": 2.9505514701482094e-05,
"loss": 0.0792,
"step": 823
},
{
"epoch": 2.3081232492997197,
"grad_norm": 2.1555142402648926,
"learning_rate": 2.946072242954695e-05,
"loss": 0.2305,
"step": 824
},
{
"epoch": 2.310924369747899,
"grad_norm": 2.0481019020080566,
"learning_rate": 2.9415915362214692e-05,
"loss": 0.2277,
"step": 825
},
{
"epoch": 2.313725490196078,
"grad_norm": 1.6008906364440918,
"learning_rate": 2.9371093648102115e-05,
"loss": 0.1847,
"step": 826
},
{
"epoch": 2.3165266106442575,
"grad_norm": 1.3652608394622803,
"learning_rate": 2.9326257435874633e-05,
"loss": 0.1658,
"step": 827
},
{
"epoch": 2.3193277310924367,
"grad_norm": 0.9234902262687683,
"learning_rate": 2.9281406874245728e-05,
"loss": 0.1248,
"step": 828
},
{
"epoch": 2.3221288515406164,
"grad_norm": 3.829730749130249,
"learning_rate": 2.9236542111976467e-05,
"loss": 0.2828,
"step": 829
},
{
"epoch": 2.3249299719887957,
"grad_norm": 1.4697741270065308,
"learning_rate": 2.9191663297875025e-05,
"loss": 0.203,
"step": 830
},
{
"epoch": 2.327731092436975,
"grad_norm": 1.3902533054351807,
"learning_rate": 2.9146770580796206e-05,
"loss": 0.1166,
"step": 831
},
{
"epoch": 2.330532212885154,
"grad_norm": 1.8676822185516357,
"learning_rate": 2.9101864109640885e-05,
"loss": 0.1062,
"step": 832
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.5584545135498047,
"learning_rate": 2.9056944033355588e-05,
"loss": 0.1372,
"step": 833
},
{
"epoch": 2.3361344537815127,
"grad_norm": 1.1754084825515747,
"learning_rate": 2.9012010500931965e-05,
"loss": 0.1565,
"step": 834
},
{
"epoch": 2.338935574229692,
"grad_norm": 1.4818142652511597,
"learning_rate": 2.8967063661406285e-05,
"loss": 0.1558,
"step": 835
},
{
"epoch": 2.3417366946778713,
"grad_norm": 1.2028563022613525,
"learning_rate": 2.8922103663858962e-05,
"loss": 0.1037,
"step": 836
},
{
"epoch": 2.3445378151260505,
"grad_norm": 1.0404908657073975,
"learning_rate": 2.8877130657414055e-05,
"loss": 0.1289,
"step": 837
},
{
"epoch": 2.34733893557423,
"grad_norm": 2.1311583518981934,
"learning_rate": 2.8832144791238757e-05,
"loss": 0.2058,
"step": 838
},
{
"epoch": 2.350140056022409,
"grad_norm": 0.9418230056762695,
"learning_rate": 2.8787146214542936e-05,
"loss": 0.1284,
"step": 839
},
{
"epoch": 2.3529411764705883,
"grad_norm": 3.9797799587249756,
"learning_rate": 2.874213507657861e-05,
"loss": 0.284,
"step": 840
},
{
"epoch": 2.3557422969187676,
"grad_norm": 1.02471923828125,
"learning_rate": 2.869711152663945e-05,
"loss": 0.1083,
"step": 841
},
{
"epoch": 2.358543417366947,
"grad_norm": 2.1199262142181396,
"learning_rate": 2.8652075714060295e-05,
"loss": 0.1628,
"step": 842
},
{
"epoch": 2.361344537815126,
"grad_norm": 3.4246649742126465,
"learning_rate": 2.8607027788216673e-05,
"loss": 0.2235,
"step": 843
},
{
"epoch": 2.3641456582633054,
"grad_norm": 2.611994504928589,
"learning_rate": 2.856196789852429e-05,
"loss": 0.1928,
"step": 844
},
{
"epoch": 2.3669467787114846,
"grad_norm": 1.5862823724746704,
"learning_rate": 2.8516896194438514e-05,
"loss": 0.1598,
"step": 845
},
{
"epoch": 2.369747899159664,
"grad_norm": 2.3910162448883057,
"learning_rate": 2.8471812825453912e-05,
"loss": 0.1882,
"step": 846
},
{
"epoch": 2.372549019607843,
"grad_norm": 1.9114694595336914,
"learning_rate": 2.8426717941103742e-05,
"loss": 0.1644,
"step": 847
},
{
"epoch": 2.3753501400560224,
"grad_norm": 2.0703580379486084,
"learning_rate": 2.8381611690959452e-05,
"loss": 0.1611,
"step": 848
},
{
"epoch": 2.3781512605042017,
"grad_norm": 2.1538355350494385,
"learning_rate": 2.833649422463019e-05,
"loss": 0.212,
"step": 849
},
{
"epoch": 2.380952380952381,
"grad_norm": 1.4592348337173462,
"learning_rate": 2.829136569176231e-05,
"loss": 0.1827,
"step": 850
},
{
"epoch": 2.38375350140056,
"grad_norm": 1.1074143648147583,
"learning_rate": 2.824622624203887e-05,
"loss": 0.0923,
"step": 851
},
{
"epoch": 2.3865546218487395,
"grad_norm": 1.632165789604187,
"learning_rate": 2.820107602517913e-05,
"loss": 0.0798,
"step": 852
},
{
"epoch": 2.3893557422969187,
"grad_norm": 1.7588852643966675,
"learning_rate": 2.815591519093806e-05,
"loss": 0.0824,
"step": 853
},
{
"epoch": 2.392156862745098,
"grad_norm": 1.0905377864837646,
"learning_rate": 2.8110743889105873e-05,
"loss": 0.1431,
"step": 854
},
{
"epoch": 2.3949579831932772,
"grad_norm": 1.8013345003128052,
"learning_rate": 2.8065562269507463e-05,
"loss": 0.083,
"step": 855
},
{
"epoch": 2.3977591036414565,
"grad_norm": 1.3905547857284546,
"learning_rate": 2.802037048200198e-05,
"loss": 0.1831,
"step": 856
},
{
"epoch": 2.4005602240896358,
"grad_norm": 3.3773820400238037,
"learning_rate": 2.7975168676482276e-05,
"loss": 0.2509,
"step": 857
},
{
"epoch": 2.403361344537815,
"grad_norm": 3.573317766189575,
"learning_rate": 2.7929957002874434e-05,
"loss": 0.2583,
"step": 858
},
{
"epoch": 2.4061624649859943,
"grad_norm": 0.9735562801361084,
"learning_rate": 2.7884735611137287e-05,
"loss": 0.113,
"step": 859
},
{
"epoch": 2.4089635854341735,
"grad_norm": 1.2774683237075806,
"learning_rate": 2.7839504651261872e-05,
"loss": 0.1054,
"step": 860
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.908938467502594,
"learning_rate": 2.7794264273270986e-05,
"loss": 0.1364,
"step": 861
},
{
"epoch": 2.414565826330532,
"grad_norm": 2.184770345687866,
"learning_rate": 2.7749014627218654e-05,
"loss": 0.2112,
"step": 862
},
{
"epoch": 2.4173669467787113,
"grad_norm": 1.4857324361801147,
"learning_rate": 2.770375586318964e-05,
"loss": 0.0705,
"step": 863
},
{
"epoch": 2.4201680672268906,
"grad_norm": 3.609208345413208,
"learning_rate": 2.765848813129895e-05,
"loss": 0.2855,
"step": 864
},
{
"epoch": 2.42296918767507,
"grad_norm": 2.6520326137542725,
"learning_rate": 2.761321158169134e-05,
"loss": 0.1879,
"step": 865
},
{
"epoch": 2.425770308123249,
"grad_norm": 1.848484754562378,
"learning_rate": 2.7567926364540814e-05,
"loss": 0.1432,
"step": 866
},
{
"epoch": 2.4285714285714284,
"grad_norm": 1.1588749885559082,
"learning_rate": 2.7522632630050117e-05,
"loss": 0.1075,
"step": 867
},
{
"epoch": 2.431372549019608,
"grad_norm": 1.314877986907959,
"learning_rate": 2.7477330528450246e-05,
"loss": 0.0853,
"step": 868
},
{
"epoch": 2.4341736694677873,
"grad_norm": 0.8727567195892334,
"learning_rate": 2.7432020209999955e-05,
"loss": 0.087,
"step": 869
},
{
"epoch": 2.4369747899159666,
"grad_norm": 1.754386305809021,
"learning_rate": 2.7386701824985255e-05,
"loss": 0.1586,
"step": 870
},
{
"epoch": 2.439775910364146,
"grad_norm": 1.5930638313293457,
"learning_rate": 2.73413755237189e-05,
"loss": 0.1684,
"step": 871
},
{
"epoch": 2.442577030812325,
"grad_norm": 3.0343246459960938,
"learning_rate": 2.7296041456539913e-05,
"loss": 0.2159,
"step": 872
},
{
"epoch": 2.4453781512605044,
"grad_norm": 1.294217824935913,
"learning_rate": 2.7250699773813065e-05,
"loss": 0.0609,
"step": 873
},
{
"epoch": 2.4481792717086837,
"grad_norm": 1.2340281009674072,
"learning_rate": 2.720535062592838e-05,
"loss": 0.1437,
"step": 874
},
{
"epoch": 2.450980392156863,
"grad_norm": 1.1075295209884644,
"learning_rate": 2.715999416330068e-05,
"loss": 0.1246,
"step": 875
},
{
"epoch": 2.453781512605042,
"grad_norm": 0.8321152329444885,
"learning_rate": 2.7114630536369002e-05,
"loss": 0.1264,
"step": 876
},
{
"epoch": 2.4565826330532214,
"grad_norm": 1.457123041152954,
"learning_rate": 2.706925989559617e-05,
"loss": 0.0573,
"step": 877
},
{
"epoch": 2.4593837535014007,
"grad_norm": 3.3637664318084717,
"learning_rate": 2.7023882391468275e-05,
"loss": 0.1873,
"step": 878
},
{
"epoch": 2.46218487394958,
"grad_norm": 2.1541194915771484,
"learning_rate": 2.6978498174494148e-05,
"loss": 0.1426,
"step": 879
},
{
"epoch": 2.4649859943977592,
"grad_norm": 2.2974648475646973,
"learning_rate": 2.6933107395204926e-05,
"loss": 0.1586,
"step": 880
},
{
"epoch": 2.4677871148459385,
"grad_norm": 1.1513004302978516,
"learning_rate": 2.6887710204153464e-05,
"loss": 0.1239,
"step": 881
},
{
"epoch": 2.4705882352941178,
"grad_norm": 1.440656065940857,
"learning_rate": 2.6842306751913927e-05,
"loss": 0.1369,
"step": 882
},
{
"epoch": 2.473389355742297,
"grad_norm": 1.4952887296676636,
"learning_rate": 2.6796897189081216e-05,
"loss": 0.1562,
"step": 883
},
{
"epoch": 2.4761904761904763,
"grad_norm": 1.7169839143753052,
"learning_rate": 2.675148166627051e-05,
"loss": 0.0699,
"step": 884
},
{
"epoch": 2.4789915966386555,
"grad_norm": 2.272355794906616,
"learning_rate": 2.6706060334116777e-05,
"loss": 0.1072,
"step": 885
},
{
"epoch": 2.481792717086835,
"grad_norm": 1.0691149234771729,
"learning_rate": 2.6660633343274217e-05,
"loss": 0.1161,
"step": 886
},
{
"epoch": 2.484593837535014,
"grad_norm": 1.3935844898223877,
"learning_rate": 2.6615200844415818e-05,
"loss": 0.061,
"step": 887
},
{
"epoch": 2.4873949579831933,
"grad_norm": 2.5838394165039062,
"learning_rate": 2.656976298823284e-05,
"loss": 0.1642,
"step": 888
},
{
"epoch": 2.4901960784313726,
"grad_norm": 2.358081817626953,
"learning_rate": 2.6524319925434298e-05,
"loss": 0.1791,
"step": 889
},
{
"epoch": 2.492997198879552,
"grad_norm": 2.429168462753296,
"learning_rate": 2.6478871806746497e-05,
"loss": 0.1494,
"step": 890
},
{
"epoch": 2.495798319327731,
"grad_norm": 1.4072517156600952,
"learning_rate": 2.6433418782912505e-05,
"loss": 0.1284,
"step": 891
},
{
"epoch": 2.4985994397759104,
"grad_norm": 1.0104848146438599,
"learning_rate": 2.6387961004691646e-05,
"loss": 0.1196,
"step": 892
},
{
"epoch": 2.5014005602240896,
"grad_norm": 1.6268606185913086,
"learning_rate": 2.6342498622859012e-05,
"loss": 0.1645,
"step": 893
},
{
"epoch": 2.504201680672269,
"grad_norm": 1.4942986965179443,
"learning_rate": 2.6297031788205002e-05,
"loss": 0.1631,
"step": 894
},
{
"epoch": 2.507002801120448,
"grad_norm": 2.0987398624420166,
"learning_rate": 2.625156065153473e-05,
"loss": 0.1939,
"step": 895
},
{
"epoch": 2.5098039215686274,
"grad_norm": 2.244086980819702,
"learning_rate": 2.6206085363667614e-05,
"loss": 0.2049,
"step": 896
},
{
"epoch": 2.5126050420168067,
"grad_norm": 0.9332670569419861,
"learning_rate": 2.6160606075436844e-05,
"loss": 0.1232,
"step": 897
},
{
"epoch": 2.515406162464986,
"grad_norm": 1.4196544885635376,
"learning_rate": 2.6115122937688858e-05,
"loss": 0.0894,
"step": 898
},
{
"epoch": 2.518207282913165,
"grad_norm": 1.8320996761322021,
"learning_rate": 2.6069636101282864e-05,
"loss": 0.0721,
"step": 899
},
{
"epoch": 2.5210084033613445,
"grad_norm": 1.893038034439087,
"learning_rate": 2.6024145717090358e-05,
"loss": 0.093,
"step": 900
},
{
"epoch": 2.5238095238095237,
"grad_norm": 1.4464538097381592,
"learning_rate": 2.5978651935994587e-05,
"loss": 0.0873,
"step": 901
},
{
"epoch": 2.526610644257703,
"grad_norm": 3.647059202194214,
"learning_rate": 2.593315490889006e-05,
"loss": 0.2706,
"step": 902
},
{
"epoch": 2.5294117647058822,
"grad_norm": 1.4356290102005005,
"learning_rate": 2.5887654786682074e-05,
"loss": 0.116,
"step": 903
},
{
"epoch": 2.5322128851540615,
"grad_norm": 0.9853848218917847,
"learning_rate": 2.584215172028618e-05,
"loss": 0.1188,
"step": 904
},
{
"epoch": 2.5350140056022408,
"grad_norm": 1.500677227973938,
"learning_rate": 2.5796645860627666e-05,
"loss": 0.0558,
"step": 905
},
{
"epoch": 2.53781512605042,
"grad_norm": 1.1618036031723022,
"learning_rate": 2.5751137358641142e-05,
"loss": 0.0892,
"step": 906
},
{
"epoch": 2.5406162464985993,
"grad_norm": 0.8631600141525269,
"learning_rate": 2.5705626365269936e-05,
"loss": 0.0864,
"step": 907
},
{
"epoch": 2.5434173669467786,
"grad_norm": 0.9771242737770081,
"learning_rate": 2.5660113031465665e-05,
"loss": 0.1298,
"step": 908
},
{
"epoch": 2.546218487394958,
"grad_norm": 0.6834169626235962,
"learning_rate": 2.561459750818769e-05,
"loss": 0.0985,
"step": 909
},
{
"epoch": 2.549019607843137,
"grad_norm": 6.005595684051514,
"learning_rate": 2.556907994640264e-05,
"loss": 0.2469,
"step": 910
},
{
"epoch": 2.5518207282913163,
"grad_norm": 3.9378793239593506,
"learning_rate": 2.5523560497083926e-05,
"loss": 0.2189,
"step": 911
},
{
"epoch": 2.5546218487394956,
"grad_norm": 5.9815802574157715,
"learning_rate": 2.547803931121119e-05,
"loss": 0.272,
"step": 912
},
{
"epoch": 2.557422969187675,
"grad_norm": 1.720476508140564,
"learning_rate": 2.543251653976984e-05,
"loss": 0.0776,
"step": 913
},
{
"epoch": 2.560224089635854,
"grad_norm": 1.5094176530838013,
"learning_rate": 2.5386992333750565e-05,
"loss": 0.1011,
"step": 914
},
{
"epoch": 2.5630252100840334,
"grad_norm": 1.1552016735076904,
"learning_rate": 2.5341466844148775e-05,
"loss": 0.0941,
"step": 915
},
{
"epoch": 2.5658263305322127,
"grad_norm": 5.176620006561279,
"learning_rate": 2.5295940221964183e-05,
"loss": 0.2609,
"step": 916
},
{
"epoch": 2.568627450980392,
"grad_norm": 1.7582367658615112,
"learning_rate": 2.525041261820022e-05,
"loss": 0.1071,
"step": 917
},
{
"epoch": 2.571428571428571,
"grad_norm": 4.4167656898498535,
"learning_rate": 2.520488418386358e-05,
"loss": 0.3358,
"step": 918
},
{
"epoch": 2.5742296918767504,
"grad_norm": 1.6718422174453735,
"learning_rate": 2.515935506996374e-05,
"loss": 0.109,
"step": 919
},
{
"epoch": 2.5770308123249297,
"grad_norm": 6.082570552825928,
"learning_rate": 2.5113825427512387e-05,
"loss": 0.3443,
"step": 920
},
{
"epoch": 2.5798319327731094,
"grad_norm": 4.4841389656066895,
"learning_rate": 2.5068295407523002e-05,
"loss": 0.1837,
"step": 921
},
{
"epoch": 2.5826330532212887,
"grad_norm": 5.3200154304504395,
"learning_rate": 2.5022765161010285e-05,
"loss": 0.2542,
"step": 922
},
{
"epoch": 2.585434173669468,
"grad_norm": 2.247319221496582,
"learning_rate": 2.4977234838989714e-05,
"loss": 0.1716,
"step": 923
},
{
"epoch": 2.588235294117647,
"grad_norm": 1.0447925329208374,
"learning_rate": 2.4931704592477e-05,
"loss": 0.1269,
"step": 924
},
{
"epoch": 2.5910364145658265,
"grad_norm": 0.9518100619316101,
"learning_rate": 2.4886174572487612e-05,
"loss": 0.1492,
"step": 925
},
{
"epoch": 2.5938375350140057,
"grad_norm": 1.3479211330413818,
"learning_rate": 2.4840644930036265e-05,
"loss": 0.1175,
"step": 926
},
{
"epoch": 2.596638655462185,
"grad_norm": 0.9772087335586548,
"learning_rate": 2.479511581613642e-05,
"loss": 0.1481,
"step": 927
},
{
"epoch": 2.5994397759103642,
"grad_norm": 2.1998813152313232,
"learning_rate": 2.4749587381799785e-05,
"loss": 0.1811,
"step": 928
},
{
"epoch": 2.6022408963585435,
"grad_norm": 2.205763578414917,
"learning_rate": 2.4704059778035823e-05,
"loss": 0.108,
"step": 929
},
{
"epoch": 2.6050420168067228,
"grad_norm": 1.233022689819336,
"learning_rate": 2.4658533155851228e-05,
"loss": 0.1735,
"step": 930
},
{
"epoch": 2.607843137254902,
"grad_norm": 1.2709368467330933,
"learning_rate": 2.4613007666249444e-05,
"loss": 0.1537,
"step": 931
},
{
"epoch": 2.6106442577030813,
"grad_norm": 1.1181148290634155,
"learning_rate": 2.4567483460230166e-05,
"loss": 0.1699,
"step": 932
},
{
"epoch": 2.6134453781512605,
"grad_norm": 2.1665706634521484,
"learning_rate": 2.4521960688788813e-05,
"loss": 0.1121,
"step": 933
},
{
"epoch": 2.61624649859944,
"grad_norm": 2.5444371700286865,
"learning_rate": 2.447643950291608e-05,
"loss": 0.1106,
"step": 934
},
{
"epoch": 2.619047619047619,
"grad_norm": 1.5478941202163696,
"learning_rate": 2.4430920053597356e-05,
"loss": 0.1466,
"step": 935
},
{
"epoch": 2.6218487394957983,
"grad_norm": 1.6474449634552002,
"learning_rate": 2.4385402491812317e-05,
"loss": 0.1433,
"step": 936
},
{
"epoch": 2.6246498599439776,
"grad_norm": 1.5256372690200806,
"learning_rate": 2.433988696853434e-05,
"loss": 0.142,
"step": 937
},
{
"epoch": 2.627450980392157,
"grad_norm": 3.403135299682617,
"learning_rate": 2.4294373634730067e-05,
"loss": 0.2463,
"step": 938
},
{
"epoch": 2.630252100840336,
"grad_norm": 1.661600947380066,
"learning_rate": 2.4248862641358867e-05,
"loss": 0.0999,
"step": 939
},
{
"epoch": 2.6330532212885154,
"grad_norm": 2.347033977508545,
"learning_rate": 2.4203354139372336e-05,
"loss": 0.1872,
"step": 940
},
{
"epoch": 2.6358543417366946,
"grad_norm": 1.0385260581970215,
"learning_rate": 2.4157848279713836e-05,
"loss": 0.0964,
"step": 941
},
{
"epoch": 2.638655462184874,
"grad_norm": 1.9232137203216553,
"learning_rate": 2.4112345213317932e-05,
"loss": 0.0593,
"step": 942
},
{
"epoch": 2.641456582633053,
"grad_norm": 1.9710360765457153,
"learning_rate": 2.4066845091109945e-05,
"loss": 0.0703,
"step": 943
},
{
"epoch": 2.6442577030812324,
"grad_norm": 3.1349337100982666,
"learning_rate": 2.4021348064005412e-05,
"loss": 0.2024,
"step": 944
},
{
"epoch": 2.6470588235294117,
"grad_norm": 1.115425705909729,
"learning_rate": 2.3975854282909644e-05,
"loss": 0.1145,
"step": 945
},
{
"epoch": 2.649859943977591,
"grad_norm": 2.5821077823638916,
"learning_rate": 2.393036389871714e-05,
"loss": 0.0916,
"step": 946
},
{
"epoch": 2.65266106442577,
"grad_norm": 2.563978672027588,
"learning_rate": 2.3884877062311148e-05,
"loss": 0.1482,
"step": 947
},
{
"epoch": 2.6554621848739495,
"grad_norm": 1.5142000913619995,
"learning_rate": 2.3839393924563162e-05,
"loss": 0.0888,
"step": 948
},
{
"epoch": 2.6582633053221287,
"grad_norm": 1.4172172546386719,
"learning_rate": 2.379391463633239e-05,
"loss": 0.1059,
"step": 949
},
{
"epoch": 2.661064425770308,
"grad_norm": 4.520923137664795,
"learning_rate": 2.374843934846528e-05,
"loss": 0.3047,
"step": 950
},
{
"epoch": 2.6638655462184873,
"grad_norm": 1.8254413604736328,
"learning_rate": 2.3702968211795014e-05,
"loss": 0.0708,
"step": 951
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.337383270263672,
"learning_rate": 2.3657501377140994e-05,
"loss": 0.1196,
"step": 952
},
{
"epoch": 2.669467787114846,
"grad_norm": 2.389855146408081,
"learning_rate": 2.3612038995308366e-05,
"loss": 0.1203,
"step": 953
},
{
"epoch": 2.6722689075630255,
"grad_norm": 3.353264808654785,
"learning_rate": 2.3566581217087494e-05,
"loss": 0.3071,
"step": 954
},
{
"epoch": 2.6750700280112047,
"grad_norm": 0.9094104170799255,
"learning_rate": 2.35211281932535e-05,
"loss": 0.12,
"step": 955
},
{
"epoch": 2.677871148459384,
"grad_norm": 1.1269475221633911,
"learning_rate": 2.3475680074565704e-05,
"loss": 0.0879,
"step": 956
},
{
"epoch": 2.6806722689075633,
"grad_norm": 3.381889581680298,
"learning_rate": 2.3430237011767167e-05,
"loss": 0.1277,
"step": 957
},
{
"epoch": 2.6834733893557425,
"grad_norm": 3.6173455715179443,
"learning_rate": 2.3384799155584188e-05,
"loss": 0.1896,
"step": 958
},
{
"epoch": 2.686274509803922,
"grad_norm": 2.380185127258301,
"learning_rate": 2.333936665672579e-05,
"loss": 0.1528,
"step": 959
},
{
"epoch": 2.689075630252101,
"grad_norm": 4.736597537994385,
"learning_rate": 2.329393966588323e-05,
"loss": 0.2742,
"step": 960
},
{
"epoch": 2.6918767507002803,
"grad_norm": 1.3075121641159058,
"learning_rate": 2.3248518333729492e-05,
"loss": 0.1345,
"step": 961
},
{
"epoch": 2.6946778711484596,
"grad_norm": 7.774576187133789,
"learning_rate": 2.3203102810918793e-05,
"loss": 0.2945,
"step": 962
},
{
"epoch": 2.697478991596639,
"grad_norm": 4.586057662963867,
"learning_rate": 2.3157693248086082e-05,
"loss": 0.2048,
"step": 963
},
{
"epoch": 2.700280112044818,
"grad_norm": 1.880879282951355,
"learning_rate": 2.3112289795846535e-05,
"loss": 0.1447,
"step": 964
},
{
"epoch": 2.7030812324929974,
"grad_norm": 1.6627247333526611,
"learning_rate": 2.3066892604795083e-05,
"loss": 0.1739,
"step": 965
},
{
"epoch": 2.7058823529411766,
"grad_norm": 2.666111707687378,
"learning_rate": 2.3021501825505848e-05,
"loss": 0.1068,
"step": 966
},
{
"epoch": 2.708683473389356,
"grad_norm": 1.956619381904602,
"learning_rate": 2.297611760853173e-05,
"loss": 0.1249,
"step": 967
},
{
"epoch": 2.711484593837535,
"grad_norm": 2.4250824451446533,
"learning_rate": 2.2930740104403838e-05,
"loss": 0.0901,
"step": 968
},
{
"epoch": 2.7142857142857144,
"grad_norm": 1.3982925415039062,
"learning_rate": 2.2885369463631e-05,
"loss": 0.1411,
"step": 969
},
{
"epoch": 2.7170868347338937,
"grad_norm": 2.247079372406006,
"learning_rate": 2.2840005836699327e-05,
"loss": 0.1423,
"step": 970
},
{
"epoch": 2.719887955182073,
"grad_norm": 2.0201003551483154,
"learning_rate": 2.2794649374071624e-05,
"loss": 0.1771,
"step": 971
},
{
"epoch": 2.722689075630252,
"grad_norm": 3.928633213043213,
"learning_rate": 2.2749300226186948e-05,
"loss": 0.2267,
"step": 972
},
{
"epoch": 2.7254901960784315,
"grad_norm": 1.7197332382202148,
"learning_rate": 2.27039585434601e-05,
"loss": 0.1331,
"step": 973
},
{
"epoch": 2.7282913165266107,
"grad_norm": 4.6833319664001465,
"learning_rate": 2.2658624476281106e-05,
"loss": 0.4137,
"step": 974
},
{
"epoch": 2.73109243697479,
"grad_norm": 2.0049855709075928,
"learning_rate": 2.261329817501475e-05,
"loss": 0.1425,
"step": 975
},
{
"epoch": 2.7338935574229692,
"grad_norm": 3.6578054428100586,
"learning_rate": 2.2567979790000044e-05,
"loss": 0.2481,
"step": 976
},
{
"epoch": 2.7366946778711485,
"grad_norm": 3.138701915740967,
"learning_rate": 2.252266947154976e-05,
"loss": 0.217,
"step": 977
},
{
"epoch": 2.7394957983193278,
"grad_norm": 2.2167491912841797,
"learning_rate": 2.2477367369949885e-05,
"loss": 0.1679,
"step": 978
},
{
"epoch": 2.742296918767507,
"grad_norm": 1.97732412815094,
"learning_rate": 2.2432073635459195e-05,
"loss": 0.1166,
"step": 979
},
{
"epoch": 2.7450980392156863,
"grad_norm": 3.3067071437835693,
"learning_rate": 2.238678841830867e-05,
"loss": 0.2084,
"step": 980
},
{
"epoch": 2.7478991596638656,
"grad_norm": 2.3965907096862793,
"learning_rate": 2.2341511868701056e-05,
"loss": 0.1414,
"step": 981
},
{
"epoch": 2.750700280112045,
"grad_norm": 1.1365553140640259,
"learning_rate": 2.2296244136810372e-05,
"loss": 0.1233,
"step": 982
},
{
"epoch": 2.753501400560224,
"grad_norm": 2.6060173511505127,
"learning_rate": 2.2250985372781358e-05,
"loss": 0.1747,
"step": 983
},
{
"epoch": 2.7563025210084033,
"grad_norm": 1.867843508720398,
"learning_rate": 2.2205735726729023e-05,
"loss": 0.1697,
"step": 984
},
{
"epoch": 2.7591036414565826,
"grad_norm": 2.642951011657715,
"learning_rate": 2.2160495348738123e-05,
"loss": 0.2173,
"step": 985
},
{
"epoch": 2.761904761904762,
"grad_norm": 1.9568394422531128,
"learning_rate": 2.2115264388862716e-05,
"loss": 0.1009,
"step": 986
},
{
"epoch": 2.764705882352941,
"grad_norm": 2.103360652923584,
"learning_rate": 2.2070042997125568e-05,
"loss": 0.2042,
"step": 987
},
{
"epoch": 2.7675070028011204,
"grad_norm": 4.358123302459717,
"learning_rate": 2.2024831323517727e-05,
"loss": 0.2433,
"step": 988
},
{
"epoch": 2.7703081232492996,
"grad_norm": 1.3602516651153564,
"learning_rate": 2.1979629517998027e-05,
"loss": 0.1339,
"step": 989
},
{
"epoch": 2.773109243697479,
"grad_norm": 1.3080668449401855,
"learning_rate": 2.1934437730492543e-05,
"loss": 0.1076,
"step": 990
},
{
"epoch": 2.775910364145658,
"grad_norm": 3.740341901779175,
"learning_rate": 2.1889256110894136e-05,
"loss": 0.1864,
"step": 991
},
{
"epoch": 2.7787114845938374,
"grad_norm": 1.519460916519165,
"learning_rate": 2.1844084809061947e-05,
"loss": 0.0937,
"step": 992
},
{
"epoch": 2.7815126050420167,
"grad_norm": 1.7029025554656982,
"learning_rate": 2.1798923974820883e-05,
"loss": 0.1736,
"step": 993
},
{
"epoch": 2.784313725490196,
"grad_norm": 6.329875469207764,
"learning_rate": 2.175377375796114e-05,
"loss": 0.2977,
"step": 994
},
{
"epoch": 2.787114845938375,
"grad_norm": 1.558143138885498,
"learning_rate": 2.1708634308237685e-05,
"loss": 0.0992,
"step": 995
},
{
"epoch": 2.7899159663865545,
"grad_norm": 7.823278903961182,
"learning_rate": 2.166350577536981e-05,
"loss": 0.3846,
"step": 996
},
{
"epoch": 2.7927170868347337,
"grad_norm": 2.097930669784546,
"learning_rate": 2.1618388309040554e-05,
"loss": 0.0771,
"step": 997
},
{
"epoch": 2.795518207282913,
"grad_norm": 3.3652889728546143,
"learning_rate": 2.157328205889626e-05,
"loss": 0.2203,
"step": 998
},
{
"epoch": 2.7983193277310923,
"grad_norm": 2.2427425384521484,
"learning_rate": 2.1528187174546093e-05,
"loss": 0.2252,
"step": 999
},
{
"epoch": 2.8011204481792715,
"grad_norm": 2.1080129146575928,
"learning_rate": 2.1483103805561492e-05,
"loss": 0.0739,
"step": 1000
},
{
"epoch": 2.803921568627451,
"grad_norm": 4.240324974060059,
"learning_rate": 2.1438032101475717e-05,
"loss": 0.2332,
"step": 1001
},
{
"epoch": 2.80672268907563,
"grad_norm": 1.8164169788360596,
"learning_rate": 2.1392972211783333e-05,
"loss": 0.1475,
"step": 1002
},
{
"epoch": 2.8095238095238093,
"grad_norm": 1.6740729808807373,
"learning_rate": 2.1347924285939714e-05,
"loss": 0.0843,
"step": 1003
},
{
"epoch": 2.8123249299719886,
"grad_norm": 5.382768154144287,
"learning_rate": 2.1302888473360567e-05,
"loss": 0.239,
"step": 1004
},
{
"epoch": 2.815126050420168,
"grad_norm": 1.6590102910995483,
"learning_rate": 2.1257864923421404e-05,
"loss": 0.1047,
"step": 1005
},
{
"epoch": 2.817927170868347,
"grad_norm": 1.018723487854004,
"learning_rate": 2.1212853785457063e-05,
"loss": 0.0965,
"step": 1006
},
{
"epoch": 2.8207282913165264,
"grad_norm": 2.8001303672790527,
"learning_rate": 2.116785520876124e-05,
"loss": 0.146,
"step": 1007
},
{
"epoch": 2.8235294117647056,
"grad_norm": 1.4516608715057373,
"learning_rate": 2.1122869342585948e-05,
"loss": 0.1119,
"step": 1008
},
{
"epoch": 2.826330532212885,
"grad_norm": 2.7861437797546387,
"learning_rate": 2.1077896336141043e-05,
"loss": 0.2351,
"step": 1009
},
{
"epoch": 2.8291316526610646,
"grad_norm": 3.8527379035949707,
"learning_rate": 2.1032936338593718e-05,
"loss": 0.1761,
"step": 1010
},
{
"epoch": 2.831932773109244,
"grad_norm": 1.841584324836731,
"learning_rate": 2.098798949906804e-05,
"loss": 0.1167,
"step": 1011
},
{
"epoch": 2.834733893557423,
"grad_norm": 1.2715080976486206,
"learning_rate": 2.094305596664442e-05,
"loss": 0.1063,
"step": 1012
},
{
"epoch": 2.8375350140056024,
"grad_norm": 1.0730732679367065,
"learning_rate": 2.089813589035912e-05,
"loss": 0.1142,
"step": 1013
},
{
"epoch": 2.8403361344537816,
"grad_norm": 2.1539816856384277,
"learning_rate": 2.0853229419203807e-05,
"loss": 0.1385,
"step": 1014
},
{
"epoch": 2.843137254901961,
"grad_norm": 1.6405160427093506,
"learning_rate": 2.080833670212498e-05,
"loss": 0.1202,
"step": 1015
},
{
"epoch": 2.84593837535014,
"grad_norm": 6.016746520996094,
"learning_rate": 2.0763457888023536e-05,
"loss": 0.1775,
"step": 1016
},
{
"epoch": 2.8487394957983194,
"grad_norm": 2.172192096710205,
"learning_rate": 2.071859312575427e-05,
"loss": 0.0765,
"step": 1017
},
{
"epoch": 2.8515406162464987,
"grad_norm": 2.1204257011413574,
"learning_rate": 2.0673742564125366e-05,
"loss": 0.0646,
"step": 1018
},
{
"epoch": 2.854341736694678,
"grad_norm": 1.5806128978729248,
"learning_rate": 2.0628906351897888e-05,
"loss": 0.0917,
"step": 1019
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.5115206241607666,
"learning_rate": 2.0584084637785317e-05,
"loss": 0.0867,
"step": 1020
},
{
"epoch": 2.8599439775910365,
"grad_norm": 4.569616317749023,
"learning_rate": 2.0539277570453056e-05,
"loss": 0.196,
"step": 1021
},
{
"epoch": 2.8627450980392157,
"grad_norm": 1.7789644002914429,
"learning_rate": 2.049448529851791e-05,
"loss": 0.0636,
"step": 1022
},
{
"epoch": 2.865546218487395,
"grad_norm": 5.349003314971924,
"learning_rate": 2.044970797054763e-05,
"loss": 0.1957,
"step": 1023
},
{
"epoch": 2.8683473389355743,
"grad_norm": 3.394336223602295,
"learning_rate": 2.0404945735060384e-05,
"loss": 0.1502,
"step": 1024
},
{
"epoch": 2.8711484593837535,
"grad_norm": 2.537426471710205,
"learning_rate": 2.0360198740524277e-05,
"loss": 0.1638,
"step": 1025
},
{
"epoch": 2.8739495798319328,
"grad_norm": 2.9345662593841553,
"learning_rate": 2.031546713535688e-05,
"loss": 0.1451,
"step": 1026
},
{
"epoch": 2.876750700280112,
"grad_norm": 1.415229320526123,
"learning_rate": 2.0270751067924726e-05,
"loss": 0.0903,
"step": 1027
},
{
"epoch": 2.8795518207282913,
"grad_norm": 5.642995357513428,
"learning_rate": 2.0226050686542785e-05,
"loss": 0.259,
"step": 1028
},
{
"epoch": 2.8823529411764706,
"grad_norm": 2.897869825363159,
"learning_rate": 2.018136613947401e-05,
"loss": 0.1846,
"step": 1029
},
{
"epoch": 2.88515406162465,
"grad_norm": 1.0580120086669922,
"learning_rate": 2.0136697574928853e-05,
"loss": 0.1207,
"step": 1030
},
{
"epoch": 2.887955182072829,
"grad_norm": 1.414197325706482,
"learning_rate": 2.0092045141064728e-05,
"loss": 0.1204,
"step": 1031
},
{
"epoch": 2.8907563025210083,
"grad_norm": 4.534923553466797,
"learning_rate": 2.0047408985985554e-05,
"loss": 0.2321,
"step": 1032
},
{
"epoch": 2.8935574229691876,
"grad_norm": 1.720337986946106,
"learning_rate": 2.000278925774127e-05,
"loss": 0.0847,
"step": 1033
},
{
"epoch": 2.896358543417367,
"grad_norm": 1.1151013374328613,
"learning_rate": 1.9958186104327317e-05,
"loss": 0.1009,
"step": 1034
},
{
"epoch": 2.899159663865546,
"grad_norm": 2.245469093322754,
"learning_rate": 1.991359967368416e-05,
"loss": 0.1659,
"step": 1035
},
{
"epoch": 2.9019607843137254,
"grad_norm": 2.1737239360809326,
"learning_rate": 1.986903011369681e-05,
"loss": 0.0708,
"step": 1036
},
{
"epoch": 2.9047619047619047,
"grad_norm": 1.8480969667434692,
"learning_rate": 1.9824477572194293e-05,
"loss": 0.1006,
"step": 1037
},
{
"epoch": 2.907563025210084,
"grad_norm": 2.1421828269958496,
"learning_rate": 1.977994219694924e-05,
"loss": 0.0719,
"step": 1038
},
{
"epoch": 2.910364145658263,
"grad_norm": 1.80349600315094,
"learning_rate": 1.9735424135677283e-05,
"loss": 0.1409,
"step": 1039
},
{
"epoch": 2.9131652661064424,
"grad_norm": 4.127012252807617,
"learning_rate": 1.9690923536036674e-05,
"loss": 0.2044,
"step": 1040
},
{
"epoch": 2.9159663865546217,
"grad_norm": 1.261323094367981,
"learning_rate": 1.9646440545627723e-05,
"loss": 0.1205,
"step": 1041
},
{
"epoch": 2.918767507002801,
"grad_norm": 1.5377665758132935,
"learning_rate": 1.9601975311992336e-05,
"loss": 0.143,
"step": 1042
},
{
"epoch": 2.9215686274509802,
"grad_norm": 4.650557041168213,
"learning_rate": 1.9557527982613534e-05,
"loss": 0.1921,
"step": 1043
},
{
"epoch": 2.92436974789916,
"grad_norm": 1.1912533044815063,
"learning_rate": 1.951309870491494e-05,
"loss": 0.1071,
"step": 1044
},
{
"epoch": 2.927170868347339,
"grad_norm": 4.09713888168335,
"learning_rate": 1.946868762626031e-05,
"loss": 0.1706,
"step": 1045
},
{
"epoch": 2.9299719887955185,
"grad_norm": 0.8855944275856018,
"learning_rate": 1.942429489395305e-05,
"loss": 0.0771,
"step": 1046
},
{
"epoch": 2.9327731092436977,
"grad_norm": 1.456910252571106,
"learning_rate": 1.9379920655235674e-05,
"loss": 0.1087,
"step": 1047
},
{
"epoch": 2.935574229691877,
"grad_norm": 2.284022569656372,
"learning_rate": 1.93355650572894e-05,
"loss": 0.1672,
"step": 1048
},
{
"epoch": 2.9383753501400562,
"grad_norm": 1.4423737525939941,
"learning_rate": 1.9291228247233605e-05,
"loss": 0.1369,
"step": 1049
},
{
"epoch": 2.9411764705882355,
"grad_norm": 2.2354118824005127,
"learning_rate": 1.9246910372125342e-05,
"loss": 0.0673,
"step": 1050
},
{
"epoch": 2.9439775910364148,
"grad_norm": 4.511037826538086,
"learning_rate": 1.9202611578958856e-05,
"loss": 0.2114,
"step": 1051
},
{
"epoch": 2.946778711484594,
"grad_norm": 1.7781668901443481,
"learning_rate": 1.915833201466512e-05,
"loss": 0.1061,
"step": 1052
},
{
"epoch": 2.9495798319327733,
"grad_norm": 2.758974075317383,
"learning_rate": 1.911407182611131e-05,
"loss": 0.2044,
"step": 1053
},
{
"epoch": 2.9523809523809526,
"grad_norm": 1.5675272941589355,
"learning_rate": 1.9069831160100336e-05,
"loss": 0.1365,
"step": 1054
},
{
"epoch": 2.955182072829132,
"grad_norm": 1.3328542709350586,
"learning_rate": 1.9025610163370382e-05,
"loss": 0.1121,
"step": 1055
},
{
"epoch": 2.957983193277311,
"grad_norm": 2.2529404163360596,
"learning_rate": 1.8981408982594366e-05,
"loss": 0.1053,
"step": 1056
},
{
"epoch": 2.9607843137254903,
"grad_norm": 1.1923927068710327,
"learning_rate": 1.8937227764379468e-05,
"loss": 0.0899,
"step": 1057
},
{
"epoch": 2.9635854341736696,
"grad_norm": 5.314030647277832,
"learning_rate": 1.8893066655266698e-05,
"loss": 0.2127,
"step": 1058
},
{
"epoch": 2.966386554621849,
"grad_norm": 1.430845856666565,
"learning_rate": 1.8848925801730342e-05,
"loss": 0.1402,
"step": 1059
},
{
"epoch": 2.969187675070028,
"grad_norm": 6.605301380157471,
"learning_rate": 1.8804805350177505e-05,
"loss": 0.3004,
"step": 1060
},
{
"epoch": 2.9719887955182074,
"grad_norm": 1.6426587104797363,
"learning_rate": 1.8760705446947612e-05,
"loss": 0.0928,
"step": 1061
},
{
"epoch": 2.9747899159663866,
"grad_norm": 1.2367888689041138,
"learning_rate": 1.8716626238311958e-05,
"loss": 0.0813,
"step": 1062
},
{
"epoch": 2.977591036414566,
"grad_norm": 3.249281883239746,
"learning_rate": 1.867256787047317e-05,
"loss": 0.2224,
"step": 1063
},
{
"epoch": 2.980392156862745,
"grad_norm": 2.45151424407959,
"learning_rate": 1.862853048956477e-05,
"loss": 0.0713,
"step": 1064
},
{
"epoch": 2.9831932773109244,
"grad_norm": 4.494724273681641,
"learning_rate": 1.8584514241650666e-05,
"loss": 0.2052,
"step": 1065
},
{
"epoch": 2.9859943977591037,
"grad_norm": 2.8811893463134766,
"learning_rate": 1.854051927272466e-05,
"loss": 0.1478,
"step": 1066
},
{
"epoch": 2.988795518207283,
"grad_norm": 1.8865958452224731,
"learning_rate": 1.849654572870999e-05,
"loss": 0.0716,
"step": 1067
},
{
"epoch": 2.991596638655462,
"grad_norm": 1.676823616027832,
"learning_rate": 1.845259375545882e-05,
"loss": 0.0817,
"step": 1068
},
{
"epoch": 2.9943977591036415,
"grad_norm": 1.9959278106689453,
"learning_rate": 1.8408663498751787e-05,
"loss": 0.0945,
"step": 1069
},
{
"epoch": 2.9971988795518207,
"grad_norm": 4.297255039215088,
"learning_rate": 1.8364755104297475e-05,
"loss": 0.1461,
"step": 1070
},
{
"epoch": 3.0,
"grad_norm": 4.1172919273376465,
"learning_rate": 1.8320868717731977e-05,
"loss": 0.14,
"step": 1071
},
{
"epoch": 3.0,
"eval_f1 (minor class)": 0.07164179104477611,
"eval_loss": 0.27444425225257874,
"eval_roc_auc": 0.47768521118703655,
"eval_runtime": 2.9384,
"eval_samples_per_second": 431.534,
"eval_steps_per_second": 13.613,
"step": 1071
},
{
"epoch": 3.0028011204481793,
"grad_norm": 1.9041872024536133,
"learning_rate": 1.827700448461836e-05,
"loss": 0.1121,
"step": 1072
},
{
"epoch": 3.0056022408963585,
"grad_norm": 4.570765972137451,
"learning_rate": 1.8233162550446237e-05,
"loss": 0.164,
"step": 1073
},
{
"epoch": 3.008403361344538,
"grad_norm": 3.6286909580230713,
"learning_rate": 1.818934306063126e-05,
"loss": 0.1787,
"step": 1074
},
{
"epoch": 3.011204481792717,
"grad_norm": 1.2501143217086792,
"learning_rate": 1.814554616051462e-05,
"loss": 0.0779,
"step": 1075
},
{
"epoch": 3.0140056022408963,
"grad_norm": 1.1278096437454224,
"learning_rate": 1.8101771995362593e-05,
"loss": 0.1068,
"step": 1076
},
{
"epoch": 3.0168067226890756,
"grad_norm": 3.9806535243988037,
"learning_rate": 1.8058020710366052e-05,
"loss": 0.1387,
"step": 1077
},
{
"epoch": 3.019607843137255,
"grad_norm": 1.3316892385482788,
"learning_rate": 1.8014292450639962e-05,
"loss": 0.1017,
"step": 1078
},
{
"epoch": 3.022408963585434,
"grad_norm": 4.150597095489502,
"learning_rate": 1.7970587361222945e-05,
"loss": 0.1456,
"step": 1079
},
{
"epoch": 3.0252100840336134,
"grad_norm": 1.0515501499176025,
"learning_rate": 1.7926905587076747e-05,
"loss": 0.0972,
"step": 1080
},
{
"epoch": 3.0280112044817926,
"grad_norm": 4.736774921417236,
"learning_rate": 1.78832472730858e-05,
"loss": 0.2442,
"step": 1081
},
{
"epoch": 3.030812324929972,
"grad_norm": 4.363551139831543,
"learning_rate": 1.7839612564056713e-05,
"loss": 0.2249,
"step": 1082
},
{
"epoch": 3.033613445378151,
"grad_norm": 1.5223008394241333,
"learning_rate": 1.7796001604717787e-05,
"loss": 0.0879,
"step": 1083
},
{
"epoch": 3.0364145658263304,
"grad_norm": 2.6861369609832764,
"learning_rate": 1.775241453971858e-05,
"loss": 0.1543,
"step": 1084
},
{
"epoch": 3.0392156862745097,
"grad_norm": 1.0746591091156006,
"learning_rate": 1.7708851513629377e-05,
"loss": 0.094,
"step": 1085
},
{
"epoch": 3.042016806722689,
"grad_norm": 3.822098731994629,
"learning_rate": 1.766531267094074e-05,
"loss": 0.1346,
"step": 1086
},
{
"epoch": 3.044817927170868,
"grad_norm": 1.1148676872253418,
"learning_rate": 1.7621798156063014e-05,
"loss": 0.0527,
"step": 1087
},
{
"epoch": 3.0476190476190474,
"grad_norm": 1.9202200174331665,
"learning_rate": 1.7578308113325842e-05,
"loss": 0.1012,
"step": 1088
},
{
"epoch": 3.0504201680672267,
"grad_norm": 1.3763988018035889,
"learning_rate": 1.7534842686977722e-05,
"loss": 0.0719,
"step": 1089
},
{
"epoch": 3.053221288515406,
"grad_norm": 2.6884231567382812,
"learning_rate": 1.749140202118549e-05,
"loss": 0.1288,
"step": 1090
},
{
"epoch": 3.0560224089635852,
"grad_norm": 3.1106390953063965,
"learning_rate": 1.7447986260033854e-05,
"loss": 0.1485,
"step": 1091
},
{
"epoch": 3.0588235294117645,
"grad_norm": 2.4417479038238525,
"learning_rate": 1.740459554752492e-05,
"loss": 0.1173,
"step": 1092
},
{
"epoch": 3.0616246498599438,
"grad_norm": 1.9952789545059204,
"learning_rate": 1.7361230027577717e-05,
"loss": 0.1014,
"step": 1093
},
{
"epoch": 3.064425770308123,
"grad_norm": 4.222140312194824,
"learning_rate": 1.7317889844027707e-05,
"loss": 0.1561,
"step": 1094
},
{
"epoch": 3.0672268907563027,
"grad_norm": 1.7875876426696777,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.0453,
"step": 1095
},
{
"epoch": 3.070028011204482,
"grad_norm": 1.239638328552246,
"learning_rate": 1.723128606104047e-05,
"loss": 0.0793,
"step": 1096
},
{
"epoch": 3.0728291316526612,
"grad_norm": 1.499316930770874,
"learning_rate": 1.7188022748852093e-05,
"loss": 0.1036,
"step": 1097
},
{
"epoch": 3.0756302521008405,
"grad_norm": 8.684480667114258,
"learning_rate": 1.7144785347557643e-05,
"loss": 0.1959,
"step": 1098
},
{
"epoch": 3.0784313725490198,
"grad_norm": 1.5142842531204224,
"learning_rate": 1.7101574000567632e-05,
"loss": 0.0675,
"step": 1099
},
{
"epoch": 3.081232492997199,
"grad_norm": 1.4972301721572876,
"learning_rate": 1.7058388851206187e-05,
"loss": 0.0726,
"step": 1100
},
{
"epoch": 3.0840336134453783,
"grad_norm": 3.245042562484741,
"learning_rate": 1.70152300427105e-05,
"loss": 0.1299,
"step": 1101
},
{
"epoch": 3.0868347338935576,
"grad_norm": 2.845457077026367,
"learning_rate": 1.6972097718230415e-05,
"loss": 0.1117,
"step": 1102
},
{
"epoch": 3.089635854341737,
"grad_norm": 2.242424249649048,
"learning_rate": 1.6928992020827935e-05,
"loss": 0.0942,
"step": 1103
},
{
"epoch": 3.092436974789916,
"grad_norm": 1.7120102643966675,
"learning_rate": 1.688591309347674e-05,
"loss": 0.0353,
"step": 1104
},
{
"epoch": 3.0952380952380953,
"grad_norm": 2.4106523990631104,
"learning_rate": 1.6842861079061717e-05,
"loss": 0.0453,
"step": 1105
},
{
"epoch": 3.0980392156862746,
"grad_norm": 5.073875427246094,
"learning_rate": 1.67998361203785e-05,
"loss": 0.1652,
"step": 1106
},
{
"epoch": 3.100840336134454,
"grad_norm": 5.638187885284424,
"learning_rate": 1.6756838360132967e-05,
"loss": 0.1419,
"step": 1107
},
{
"epoch": 3.103641456582633,
"grad_norm": 1.8764796257019043,
"learning_rate": 1.6713867940940802e-05,
"loss": 0.0354,
"step": 1108
},
{
"epoch": 3.1064425770308124,
"grad_norm": 5.068608283996582,
"learning_rate": 1.6670925005326975e-05,
"loss": 0.1406,
"step": 1109
},
{
"epoch": 3.1092436974789917,
"grad_norm": 1.0536717176437378,
"learning_rate": 1.6628009695725346e-05,
"loss": 0.0664,
"step": 1110
},
{
"epoch": 3.112044817927171,
"grad_norm": 1.4604605436325073,
"learning_rate": 1.6585122154478093e-05,
"loss": 0.0462,
"step": 1111
},
{
"epoch": 3.11484593837535,
"grad_norm": 1.783050775527954,
"learning_rate": 1.6542262523835334e-05,
"loss": 0.0419,
"step": 1112
},
{
"epoch": 3.1176470588235294,
"grad_norm": 0.7703757882118225,
"learning_rate": 1.6499430945954576e-05,
"loss": 0.0457,
"step": 1113
},
{
"epoch": 3.1204481792717087,
"grad_norm": 6.189941883087158,
"learning_rate": 1.6456627562900297e-05,
"loss": 0.1177,
"step": 1114
},
{
"epoch": 3.123249299719888,
"grad_norm": 0.8514342308044434,
"learning_rate": 1.6413852516643468e-05,
"loss": 0.0555,
"step": 1115
},
{
"epoch": 3.1260504201680672,
"grad_norm": 1.1006991863250732,
"learning_rate": 1.637110594906106e-05,
"loss": 0.0209,
"step": 1116
},
{
"epoch": 3.1288515406162465,
"grad_norm": 1.2455596923828125,
"learning_rate": 1.6328388001935584e-05,
"loss": 0.0588,
"step": 1117
},
{
"epoch": 3.1316526610644257,
"grad_norm": 4.493983268737793,
"learning_rate": 1.6285698816954624e-05,
"loss": 0.1099,
"step": 1118
},
{
"epoch": 3.134453781512605,
"grad_norm": 4.895142078399658,
"learning_rate": 1.6243038535710366e-05,
"loss": 0.0951,
"step": 1119
},
{
"epoch": 3.1372549019607843,
"grad_norm": 7.513343811035156,
"learning_rate": 1.6200407299699143e-05,
"loss": 0.135,
"step": 1120
},
{
"epoch": 3.1400560224089635,
"grad_norm": 1.311148762702942,
"learning_rate": 1.6157805250320916e-05,
"loss": 0.0229,
"step": 1121
},
{
"epoch": 3.142857142857143,
"grad_norm": 3.4543368816375732,
"learning_rate": 1.6115232528878877e-05,
"loss": 0.1027,
"step": 1122
},
{
"epoch": 3.145658263305322,
"grad_norm": 1.4137835502624512,
"learning_rate": 1.6072689276578916e-05,
"loss": 0.0639,
"step": 1123
},
{
"epoch": 3.1484593837535013,
"grad_norm": 9.049639701843262,
"learning_rate": 1.603017563452919e-05,
"loss": 0.1938,
"step": 1124
},
{
"epoch": 3.1512605042016806,
"grad_norm": 6.14802885055542,
"learning_rate": 1.5987691743739636e-05,
"loss": 0.1298,
"step": 1125
},
{
"epoch": 3.15406162464986,
"grad_norm": 4.0769500732421875,
"learning_rate": 1.5945237745121527e-05,
"loss": 0.0923,
"step": 1126
},
{
"epoch": 3.156862745098039,
"grad_norm": 4.114642143249512,
"learning_rate": 1.5902813779486974e-05,
"loss": 0.1167,
"step": 1127
},
{
"epoch": 3.1596638655462184,
"grad_norm": 8.57524299621582,
"learning_rate": 1.5860419987548486e-05,
"loss": 0.153,
"step": 1128
},
{
"epoch": 3.1624649859943976,
"grad_norm": 5.372265815734863,
"learning_rate": 1.5818056509918476e-05,
"loss": 0.1754,
"step": 1129
},
{
"epoch": 3.165266106442577,
"grad_norm": 1.3174500465393066,
"learning_rate": 1.577572348710882e-05,
"loss": 0.0411,
"step": 1130
},
{
"epoch": 3.168067226890756,
"grad_norm": 5.699807167053223,
"learning_rate": 1.57334210595304e-05,
"loss": 0.1272,
"step": 1131
},
{
"epoch": 3.1708683473389354,
"grad_norm": 2.9178154468536377,
"learning_rate": 1.569114936749258e-05,
"loss": 0.0579,
"step": 1132
},
{
"epoch": 3.1736694677871147,
"grad_norm": 11.699190139770508,
"learning_rate": 1.5648908551202805e-05,
"loss": 0.2907,
"step": 1133
},
{
"epoch": 3.176470588235294,
"grad_norm": 5.31898307800293,
"learning_rate": 1.5606698750766106e-05,
"loss": 0.1472,
"step": 1134
},
{
"epoch": 3.179271708683473,
"grad_norm": 1.5398555994033813,
"learning_rate": 1.5564520106184644e-05,
"loss": 0.1009,
"step": 1135
},
{
"epoch": 3.1820728291316525,
"grad_norm": 3.491788864135742,
"learning_rate": 1.5522372757357223e-05,
"loss": 0.0922,
"step": 1136
},
{
"epoch": 3.184873949579832,
"grad_norm": 2.5744388103485107,
"learning_rate": 1.5480256844078877e-05,
"loss": 0.1341,
"step": 1137
},
{
"epoch": 3.1876750700280114,
"grad_norm": 2.21545147895813,
"learning_rate": 1.543817250604035e-05,
"loss": 0.0803,
"step": 1138
},
{
"epoch": 3.1904761904761907,
"grad_norm": 2.675712823867798,
"learning_rate": 1.539611988282765e-05,
"loss": 0.0835,
"step": 1139
},
{
"epoch": 3.19327731092437,
"grad_norm": 3.099294662475586,
"learning_rate": 1.5354099113921615e-05,
"loss": 0.3112,
"step": 1140
},
{
"epoch": 3.196078431372549,
"grad_norm": 2.0022711753845215,
"learning_rate": 1.5312110338697426e-05,
"loss": 0.1004,
"step": 1141
},
{
"epoch": 3.1988795518207285,
"grad_norm": 2.4308621883392334,
"learning_rate": 1.5270153696424133e-05,
"loss": 0.0707,
"step": 1142
},
{
"epoch": 3.2016806722689077,
"grad_norm": 3.7195255756378174,
"learning_rate": 1.522822932626421e-05,
"loss": 0.1451,
"step": 1143
},
{
"epoch": 3.204481792717087,
"grad_norm": 3.308135509490967,
"learning_rate": 1.5186337367273105e-05,
"loss": 0.0804,
"step": 1144
},
{
"epoch": 3.2072829131652663,
"grad_norm": 3.8628289699554443,
"learning_rate": 1.514447795839874e-05,
"loss": 0.1371,
"step": 1145
},
{
"epoch": 3.2100840336134455,
"grad_norm": 2.1035282611846924,
"learning_rate": 1.510265123848109e-05,
"loss": 0.0949,
"step": 1146
},
{
"epoch": 3.212885154061625,
"grad_norm": 1.18038010597229,
"learning_rate": 1.5060857346251711e-05,
"loss": 0.0558,
"step": 1147
},
{
"epoch": 3.215686274509804,
"grad_norm": 3.4646565914154053,
"learning_rate": 1.5019096420333257e-05,
"loss": 0.0763,
"step": 1148
},
{
"epoch": 3.2184873949579833,
"grad_norm": 3.8320508003234863,
"learning_rate": 1.4977368599239062e-05,
"loss": 0.1264,
"step": 1149
},
{
"epoch": 3.2212885154061626,
"grad_norm": 8.55019760131836,
"learning_rate": 1.493567402137263e-05,
"loss": 0.2867,
"step": 1150
},
{
"epoch": 3.224089635854342,
"grad_norm": 9.773296356201172,
"learning_rate": 1.489401282502724e-05,
"loss": 0.272,
"step": 1151
},
{
"epoch": 3.226890756302521,
"grad_norm": 2.1806998252868652,
"learning_rate": 1.4852385148385411e-05,
"loss": 0.0633,
"step": 1152
},
{
"epoch": 3.2296918767507004,
"grad_norm": 1.791035771369934,
"learning_rate": 1.4810791129518516e-05,
"loss": 0.0833,
"step": 1153
},
{
"epoch": 3.2324929971988796,
"grad_norm": 2.499708652496338,
"learning_rate": 1.4769230906386272e-05,
"loss": 0.0769,
"step": 1154
},
{
"epoch": 3.235294117647059,
"grad_norm": 1.1026753187179565,
"learning_rate": 1.4727704616836296e-05,
"loss": 0.0456,
"step": 1155
},
{
"epoch": 3.238095238095238,
"grad_norm": 2.9074044227600098,
"learning_rate": 1.4686212398603682e-05,
"loss": 0.047,
"step": 1156
},
{
"epoch": 3.2408963585434174,
"grad_norm": 1.6559323072433472,
"learning_rate": 1.464475438931049e-05,
"loss": 0.0687,
"step": 1157
},
{
"epoch": 3.2436974789915967,
"grad_norm": 0.9927138686180115,
"learning_rate": 1.4603330726465314e-05,
"loss": 0.0433,
"step": 1158
},
{
"epoch": 3.246498599439776,
"grad_norm": 6.823101043701172,
"learning_rate": 1.4561941547462855e-05,
"loss": 0.1453,
"step": 1159
},
{
"epoch": 3.249299719887955,
"grad_norm": 1.3782508373260498,
"learning_rate": 1.4520586989583406e-05,
"loss": 0.0707,
"step": 1160
},
{
"epoch": 3.2521008403361344,
"grad_norm": 1.30823814868927,
"learning_rate": 1.4479267189992435e-05,
"loss": 0.0213,
"step": 1161
},
{
"epoch": 3.2549019607843137,
"grad_norm": 8.884556770324707,
"learning_rate": 1.4437982285740151e-05,
"loss": 0.1646,
"step": 1162
},
{
"epoch": 3.257703081232493,
"grad_norm": 2.384504795074463,
"learning_rate": 1.4396732413760976e-05,
"loss": 0.0792,
"step": 1163
},
{
"epoch": 3.2605042016806722,
"grad_norm": 1.9138203859329224,
"learning_rate": 1.4355517710873184e-05,
"loss": 0.0714,
"step": 1164
},
{
"epoch": 3.2633053221288515,
"grad_norm": 20.88504409790039,
"learning_rate": 1.4314338313778358e-05,
"loss": 0.3717,
"step": 1165
},
{
"epoch": 3.2661064425770308,
"grad_norm": 20.60991859436035,
"learning_rate": 1.4273194359061006e-05,
"loss": 0.4143,
"step": 1166
},
{
"epoch": 3.26890756302521,
"grad_norm": 4.24085807800293,
"learning_rate": 1.4232085983188065e-05,
"loss": 0.077,
"step": 1167
},
{
"epoch": 3.2717086834733893,
"grad_norm": 4.002956867218018,
"learning_rate": 1.4191013322508468e-05,
"loss": 0.1215,
"step": 1168
},
{
"epoch": 3.2745098039215685,
"grad_norm": 8.213194847106934,
"learning_rate": 1.4149976513252677e-05,
"loss": 0.1479,
"step": 1169
},
{
"epoch": 3.277310924369748,
"grad_norm": 0.9012970328330994,
"learning_rate": 1.4108975691532272e-05,
"loss": 0.0436,
"step": 1170
},
{
"epoch": 3.280112044817927,
"grad_norm": 6.056758880615234,
"learning_rate": 1.4068010993339436e-05,
"loss": 0.1932,
"step": 1171
},
{
"epoch": 3.2829131652661063,
"grad_norm": 6.821181297302246,
"learning_rate": 1.402708255454656e-05,
"loss": 0.1307,
"step": 1172
},
{
"epoch": 3.2857142857142856,
"grad_norm": 3.345459461212158,
"learning_rate": 1.3986190510905758e-05,
"loss": 0.0906,
"step": 1173
},
{
"epoch": 3.288515406162465,
"grad_norm": 1.9614676237106323,
"learning_rate": 1.3945334998048423e-05,
"loss": 0.0312,
"step": 1174
},
{
"epoch": 3.291316526610644,
"grad_norm": 1.3649321794509888,
"learning_rate": 1.3904516151484793e-05,
"loss": 0.0447,
"step": 1175
},
{
"epoch": 3.2941176470588234,
"grad_norm": 3.0201363563537598,
"learning_rate": 1.3863734106603471e-05,
"loss": 0.0703,
"step": 1176
},
{
"epoch": 3.2969187675070026,
"grad_norm": 1.9470489025115967,
"learning_rate": 1.3822988998671032e-05,
"loss": 0.0889,
"step": 1177
},
{
"epoch": 3.299719887955182,
"grad_norm": 1.7316004037857056,
"learning_rate": 1.3782280962831511e-05,
"loss": 0.0395,
"step": 1178
},
{
"epoch": 3.302521008403361,
"grad_norm": 3.4648783206939697,
"learning_rate": 1.3741610134105985e-05,
"loss": 0.0815,
"step": 1179
},
{
"epoch": 3.3053221288515404,
"grad_norm": 5.406269550323486,
"learning_rate": 1.3700976647392119e-05,
"loss": 0.1189,
"step": 1180
},
{
"epoch": 3.3081232492997197,
"grad_norm": 2.2869129180908203,
"learning_rate": 1.366038063746372e-05,
"loss": 0.0518,
"step": 1181
},
{
"epoch": 3.310924369747899,
"grad_norm": 4.437301158905029,
"learning_rate": 1.361982223897032e-05,
"loss": 0.1117,
"step": 1182
},
{
"epoch": 3.313725490196078,
"grad_norm": 4.043997287750244,
"learning_rate": 1.3579301586436658e-05,
"loss": 0.1187,
"step": 1183
},
{
"epoch": 3.3165266106442575,
"grad_norm": 1.651050090789795,
"learning_rate": 1.353881881426231e-05,
"loss": 0.0562,
"step": 1184
},
{
"epoch": 3.3193277310924367,
"grad_norm": 3.4451749324798584,
"learning_rate": 1.3498374056721197e-05,
"loss": 0.131,
"step": 1185
},
{
"epoch": 3.3221288515406164,
"grad_norm": 9.893393516540527,
"learning_rate": 1.345796744796115e-05,
"loss": 0.2424,
"step": 1186
},
{
"epoch": 3.3249299719887957,
"grad_norm": 6.996748447418213,
"learning_rate": 1.3417599122003464e-05,
"loss": 0.099,
"step": 1187
},
{
"epoch": 3.327731092436975,
"grad_norm": 3.4951329231262207,
"learning_rate": 1.3377269212742457e-05,
"loss": 0.091,
"step": 1188
},
{
"epoch": 3.330532212885154,
"grad_norm": 1.6428848505020142,
"learning_rate": 1.3336977853945055e-05,
"loss": 0.0635,
"step": 1189
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2.470294237136841,
"learning_rate": 1.3296725179250274e-05,
"loss": 0.075,
"step": 1190
},
{
"epoch": 3.3361344537815127,
"grad_norm": 7.2729105949401855,
"learning_rate": 1.325651132216886e-05,
"loss": 0.3581,
"step": 1191
},
{
"epoch": 3.338935574229692,
"grad_norm": 3.032115936279297,
"learning_rate": 1.321633641608277e-05,
"loss": 0.0465,
"step": 1192
},
{
"epoch": 3.3417366946778713,
"grad_norm": 4.1879096031188965,
"learning_rate": 1.3176200594244817e-05,
"loss": 0.0818,
"step": 1193
},
{
"epoch": 3.3445378151260505,
"grad_norm": 2.855759859085083,
"learning_rate": 1.3136103989778137e-05,
"loss": 0.1257,
"step": 1194
},
{
"epoch": 3.34733893557423,
"grad_norm": 1.19746732711792,
"learning_rate": 1.3096046735675793e-05,
"loss": 0.0917,
"step": 1195
},
{
"epoch": 3.350140056022409,
"grad_norm": 2.3846075534820557,
"learning_rate": 1.3056028964800366e-05,
"loss": 0.0417,
"step": 1196
},
{
"epoch": 3.3529411764705883,
"grad_norm": 1.6664929389953613,
"learning_rate": 1.3016050809883435e-05,
"loss": 0.0641,
"step": 1197
},
{
"epoch": 3.3557422969187676,
"grad_norm": 3.2694690227508545,
"learning_rate": 1.2976112403525204e-05,
"loss": 0.0594,
"step": 1198
},
{
"epoch": 3.358543417366947,
"grad_norm": 1.9079506397247314,
"learning_rate": 1.2936213878194031e-05,
"loss": 0.0815,
"step": 1199
},
{
"epoch": 3.361344537815126,
"grad_norm": 3.0590572357177734,
"learning_rate": 1.2896355366225999e-05,
"loss": 0.0666,
"step": 1200
},
{
"epoch": 3.3641456582633054,
"grad_norm": 8.331475257873535,
"learning_rate": 1.2856536999824458e-05,
"loss": 0.1884,
"step": 1201
},
{
"epoch": 3.3669467787114846,
"grad_norm": 2.459279775619507,
"learning_rate": 1.2816758911059635e-05,
"loss": 0.0792,
"step": 1202
},
{
"epoch": 3.369747899159664,
"grad_norm": 3.9899792671203613,
"learning_rate": 1.2777021231868145e-05,
"loss": 0.1138,
"step": 1203
},
{
"epoch": 3.372549019607843,
"grad_norm": 9.031767845153809,
"learning_rate": 1.273732409405257e-05,
"loss": 0.1623,
"step": 1204
},
{
"epoch": 3.3753501400560224,
"grad_norm": 2.037651777267456,
"learning_rate": 1.2697667629281024e-05,
"loss": 0.0407,
"step": 1205
},
{
"epoch": 3.3781512605042017,
"grad_norm": 4.093064785003662,
"learning_rate": 1.2658051969086715e-05,
"loss": 0.103,
"step": 1206
},
{
"epoch": 3.380952380952381,
"grad_norm": 3.442111015319824,
"learning_rate": 1.261847724486751e-05,
"loss": 0.1132,
"step": 1207
},
{
"epoch": 3.38375350140056,
"grad_norm": 1.749991774559021,
"learning_rate": 1.2578943587885516e-05,
"loss": 0.0856,
"step": 1208
},
{
"epoch": 3.3865546218487395,
"grad_norm": 4.175179481506348,
"learning_rate": 1.2539451129266603e-05,
"loss": 0.1442,
"step": 1209
},
{
"epoch": 3.3893557422969187,
"grad_norm": 6.753386497497559,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.1425,
"step": 1210
},
{
"epoch": 3.392156862745098,
"grad_norm": 8.538300514221191,
"learning_rate": 1.2460590330937875e-05,
"loss": 0.2035,
"step": 1211
},
{
"epoch": 3.3949579831932772,
"grad_norm": 2.291215181350708,
"learning_rate": 1.2421222252794834e-05,
"loss": 0.087,
"step": 1212
},
{
"epoch": 3.3977591036414565,
"grad_norm": 2.0655384063720703,
"learning_rate": 1.238189589614759e-05,
"loss": 0.0612,
"step": 1213
},
{
"epoch": 3.4005602240896358,
"grad_norm": 1.291670322418213,
"learning_rate": 1.2342611391434424e-05,
"loss": 0.0643,
"step": 1214
},
{
"epoch": 3.403361344537815,
"grad_norm": 2.476210832595825,
"learning_rate": 1.2303368868954848e-05,
"loss": 0.0537,
"step": 1215
},
{
"epoch": 3.4061624649859943,
"grad_norm": 6.289384365081787,
"learning_rate": 1.2264168458869089e-05,
"loss": 0.2037,
"step": 1216
},
{
"epoch": 3.4089635854341735,
"grad_norm": 7.985877513885498,
"learning_rate": 1.2225010291197708e-05,
"loss": 0.2083,
"step": 1217
},
{
"epoch": 3.411764705882353,
"grad_norm": 1.0264813899993896,
"learning_rate": 1.218589449582116e-05,
"loss": 0.0622,
"step": 1218
},
{
"epoch": 3.414565826330532,
"grad_norm": 1.2536002397537231,
"learning_rate": 1.2146821202479348e-05,
"loss": 0.0398,
"step": 1219
},
{
"epoch": 3.4173669467787113,
"grad_norm": 2.4298698902130127,
"learning_rate": 1.2107790540771208e-05,
"loss": 0.036,
"step": 1220
},
{
"epoch": 3.4201680672268906,
"grad_norm": 9.198648452758789,
"learning_rate": 1.2068802640154292e-05,
"loss": 0.1996,
"step": 1221
},
{
"epoch": 3.42296918767507,
"grad_norm": 1.9860378503799438,
"learning_rate": 1.2029857629944297e-05,
"loss": 0.0769,
"step": 1222
},
{
"epoch": 3.425770308123249,
"grad_norm": 12.668886184692383,
"learning_rate": 1.1990955639314664e-05,
"loss": 0.1511,
"step": 1223
},
{
"epoch": 3.4285714285714284,
"grad_norm": 4.27272891998291,
"learning_rate": 1.1952096797296166e-05,
"loss": 0.0515,
"step": 1224
},
{
"epoch": 3.431372549019608,
"grad_norm": 4.90073299407959,
"learning_rate": 1.1913281232776444e-05,
"loss": 0.1255,
"step": 1225
},
{
"epoch": 3.4341736694677873,
"grad_norm": 8.19225025177002,
"learning_rate": 1.187450907449958e-05,
"loss": 0.1752,
"step": 1226
},
{
"epoch": 3.4369747899159666,
"grad_norm": 1.2287348508834839,
"learning_rate": 1.1835780451065724e-05,
"loss": 0.0711,
"step": 1227
},
{
"epoch": 3.439775910364146,
"grad_norm": 1.460504174232483,
"learning_rate": 1.1797095490930597e-05,
"loss": 0.0573,
"step": 1228
},
{
"epoch": 3.442577030812325,
"grad_norm": 2.864814281463623,
"learning_rate": 1.175845432240511e-05,
"loss": 0.0456,
"step": 1229
},
{
"epoch": 3.4453781512605044,
"grad_norm": 1.8840250968933105,
"learning_rate": 1.1719857073654922e-05,
"loss": 0.1108,
"step": 1230
},
{
"epoch": 3.4481792717086837,
"grad_norm": 1.9371321201324463,
"learning_rate": 1.168130387270002e-05,
"loss": 0.0342,
"step": 1231
},
{
"epoch": 3.450980392156863,
"grad_norm": 2.4567437171936035,
"learning_rate": 1.1642794847414281e-05,
"loss": 0.0746,
"step": 1232
},
{
"epoch": 3.453781512605042,
"grad_norm": 4.603306770324707,
"learning_rate": 1.1604330125525079e-05,
"loss": 0.194,
"step": 1233
},
{
"epoch": 3.4565826330532214,
"grad_norm": 1.1506344079971313,
"learning_rate": 1.1565909834612842e-05,
"loss": 0.0292,
"step": 1234
},
{
"epoch": 3.4593837535014007,
"grad_norm": 8.21558666229248,
"learning_rate": 1.1527534102110612e-05,
"loss": 0.1685,
"step": 1235
},
{
"epoch": 3.46218487394958,
"grad_norm": 2.137861967086792,
"learning_rate": 1.1489203055303647e-05,
"loss": 0.0731,
"step": 1236
},
{
"epoch": 3.4649859943977592,
"grad_norm": 8.72636890411377,
"learning_rate": 1.1450916821328987e-05,
"loss": 0.0957,
"step": 1237
},
{
"epoch": 3.4677871148459385,
"grad_norm": 4.765244960784912,
"learning_rate": 1.1412675527175038e-05,
"loss": 0.1223,
"step": 1238
},
{
"epoch": 3.4705882352941178,
"grad_norm": 1.8884990215301514,
"learning_rate": 1.1374479299681143e-05,
"loss": 0.0375,
"step": 1239
},
{
"epoch": 3.473389355742297,
"grad_norm": 7.054959774017334,
"learning_rate": 1.1336328265537194e-05,
"loss": 0.1534,
"step": 1240
},
{
"epoch": 3.4761904761904763,
"grad_norm": 5.714659690856934,
"learning_rate": 1.1298222551283152e-05,
"loss": 0.1357,
"step": 1241
},
{
"epoch": 3.4789915966386555,
"grad_norm": 6.959196090698242,
"learning_rate": 1.1260162283308679e-05,
"loss": 0.2064,
"step": 1242
},
{
"epoch": 3.481792717086835,
"grad_norm": 7.022974014282227,
"learning_rate": 1.1222147587852677e-05,
"loss": 0.1596,
"step": 1243
},
{
"epoch": 3.484593837535014,
"grad_norm": 15.019680976867676,
"learning_rate": 1.1184178591002936e-05,
"loss": 0.3435,
"step": 1244
},
{
"epoch": 3.4873949579831933,
"grad_norm": 1.869878888130188,
"learning_rate": 1.1146255418695634e-05,
"loss": 0.0558,
"step": 1245
},
{
"epoch": 3.4901960784313726,
"grad_norm": 2.9518184661865234,
"learning_rate": 1.110837819671496e-05,
"loss": 0.1011,
"step": 1246
},
{
"epoch": 3.492997198879552,
"grad_norm": 2.555640935897827,
"learning_rate": 1.107054705069272e-05,
"loss": 0.0679,
"step": 1247
},
{
"epoch": 3.495798319327731,
"grad_norm": 1.9027478694915771,
"learning_rate": 1.1032762106107871e-05,
"loss": 0.074,
"step": 1248
},
{
"epoch": 3.4985994397759104,
"grad_norm": 2.643005847930908,
"learning_rate": 1.0995023488286132e-05,
"loss": 0.0459,
"step": 1249
},
{
"epoch": 3.5014005602240896,
"grad_norm": 2.0106723308563232,
"learning_rate": 1.0957331322399575e-05,
"loss": 0.0642,
"step": 1250
},
{
"epoch": 3.504201680672269,
"grad_norm": 2.9285848140716553,
"learning_rate": 1.0919685733466175e-05,
"loss": 0.0481,
"step": 1251
},
{
"epoch": 3.507002801120448,
"grad_norm": 1.3682835102081299,
"learning_rate": 1.088208684634946e-05,
"loss": 0.0424,
"step": 1252
},
{
"epoch": 3.5098039215686274,
"grad_norm": 1.431214690208435,
"learning_rate": 1.0844534785758024e-05,
"loss": 0.0412,
"step": 1253
},
{
"epoch": 3.5126050420168067,
"grad_norm": 4.357926845550537,
"learning_rate": 1.0807029676245145e-05,
"loss": 0.1033,
"step": 1254
},
{
"epoch": 3.515406162464986,
"grad_norm": 1.1506880521774292,
"learning_rate": 1.0769571642208404e-05,
"loss": 0.0223,
"step": 1255
},
{
"epoch": 3.518207282913165,
"grad_norm": 7.2023396492004395,
"learning_rate": 1.0732160807889211e-05,
"loss": 0.1147,
"step": 1256
},
{
"epoch": 3.5210084033613445,
"grad_norm": 3.6651108264923096,
"learning_rate": 1.0694797297372432e-05,
"loss": 0.092,
"step": 1257
},
{
"epoch": 3.5238095238095237,
"grad_norm": 19.00282096862793,
"learning_rate": 1.0657481234585965e-05,
"loss": 0.2943,
"step": 1258
},
{
"epoch": 3.526610644257703,
"grad_norm": 4.939517498016357,
"learning_rate": 1.062021274330035e-05,
"loss": 0.094,
"step": 1259
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.8424512147903442,
"learning_rate": 1.0582991947128324e-05,
"loss": 0.0781,
"step": 1260
},
{
"epoch": 3.5322128851540615,
"grad_norm": 1.7380822896957397,
"learning_rate": 1.0545818969524432e-05,
"loss": 0.0417,
"step": 1261
},
{
"epoch": 3.5350140056022408,
"grad_norm": 0.913371741771698,
"learning_rate": 1.050869393378461e-05,
"loss": 0.0598,
"step": 1262
},
{
"epoch": 3.53781512605042,
"grad_norm": 4.327756881713867,
"learning_rate": 1.0471616963045789e-05,
"loss": 0.0889,
"step": 1263
},
{
"epoch": 3.5406162464985993,
"grad_norm": 15.264612197875977,
"learning_rate": 1.043458818028546e-05,
"loss": 0.2762,
"step": 1264
},
{
"epoch": 3.5434173669467786,
"grad_norm": 4.662524223327637,
"learning_rate": 1.0397607708321303e-05,
"loss": 0.2163,
"step": 1265
},
{
"epoch": 3.546218487394958,
"grad_norm": 9.964065551757812,
"learning_rate": 1.0360675669810765e-05,
"loss": 0.1604,
"step": 1266
},
{
"epoch": 3.549019607843137,
"grad_norm": 3.6799726486206055,
"learning_rate": 1.0323792187250625e-05,
"loss": 0.08,
"step": 1267
},
{
"epoch": 3.5518207282913163,
"grad_norm": 1.5393813848495483,
"learning_rate": 1.028695738297662e-05,
"loss": 0.0394,
"step": 1268
},
{
"epoch": 3.5546218487394956,
"grad_norm": 5.216233253479004,
"learning_rate": 1.0250171379163035e-05,
"loss": 0.192,
"step": 1269
},
{
"epoch": 3.557422969187675,
"grad_norm": 3.2655088901519775,
"learning_rate": 1.0213434297822274e-05,
"loss": 0.0812,
"step": 1270
},
{
"epoch": 3.560224089635854,
"grad_norm": 2.0399258136749268,
"learning_rate": 1.0176746260804512e-05,
"loss": 0.1101,
"step": 1271
},
{
"epoch": 3.5630252100840334,
"grad_norm": 2.1905429363250732,
"learning_rate": 1.0140107389797224e-05,
"loss": 0.0743,
"step": 1272
},
{
"epoch": 3.5658263305322127,
"grad_norm": 2.318368434906006,
"learning_rate": 1.0103517806324809e-05,
"loss": 0.0475,
"step": 1273
},
{
"epoch": 3.568627450980392,
"grad_norm": 2.9474756717681885,
"learning_rate": 1.0066977631748193e-05,
"loss": 0.0409,
"step": 1274
},
{
"epoch": 3.571428571428571,
"grad_norm": 5.024738788604736,
"learning_rate": 1.0030486987264437e-05,
"loss": 0.0758,
"step": 1275
},
{
"epoch": 3.5742296918767504,
"grad_norm": 10.777070999145508,
"learning_rate": 9.994045993906306e-06,
"loss": 0.2476,
"step": 1276
},
{
"epoch": 3.5770308123249297,
"grad_norm": 10.371776580810547,
"learning_rate": 9.957654772541867e-06,
"loss": 0.2626,
"step": 1277
},
{
"epoch": 3.5798319327731094,
"grad_norm": 3.480252742767334,
"learning_rate": 9.921313443874141e-06,
"loss": 0.0764,
"step": 1278
},
{
"epoch": 3.5826330532212887,
"grad_norm": 3.2019546031951904,
"learning_rate": 9.88502212844063e-06,
"loss": 0.0482,
"step": 1279
},
{
"epoch": 3.585434173669468,
"grad_norm": 3.009918451309204,
"learning_rate": 9.848780946612963e-06,
"loss": 0.1114,
"step": 1280
},
{
"epoch": 3.588235294117647,
"grad_norm": 2.430823564529419,
"learning_rate": 9.812590018596486e-06,
"loss": 0.0798,
"step": 1281
},
{
"epoch": 3.5910364145658265,
"grad_norm": 9.197877883911133,
"learning_rate": 9.77644946442986e-06,
"loss": 0.2076,
"step": 1282
},
{
"epoch": 3.5938375350140057,
"grad_norm": 1.2582132816314697,
"learning_rate": 9.740359403984656e-06,
"loss": 0.0544,
"step": 1283
},
{
"epoch": 3.596638655462185,
"grad_norm": 1.6079927682876587,
"learning_rate": 9.704319956964996e-06,
"loss": 0.0845,
"step": 1284
},
{
"epoch": 3.5994397759103642,
"grad_norm": 11.193923950195312,
"learning_rate": 9.668331242907089e-06,
"loss": 0.1999,
"step": 1285
},
{
"epoch": 3.6022408963585435,
"grad_norm": 1.8941559791564941,
"learning_rate": 9.632393381178911e-06,
"loss": 0.084,
"step": 1286
},
{
"epoch": 3.6050420168067228,
"grad_norm": 10.134256362915039,
"learning_rate": 9.596506490979738e-06,
"loss": 0.1817,
"step": 1287
},
{
"epoch": 3.607843137254902,
"grad_norm": 15.522900581359863,
"learning_rate": 9.560670691339793e-06,
"loss": 0.3554,
"step": 1288
},
{
"epoch": 3.6106442577030813,
"grad_norm": 5.754158973693848,
"learning_rate": 9.524886101119846e-06,
"loss": 0.159,
"step": 1289
},
{
"epoch": 3.6134453781512605,
"grad_norm": 2.8589224815368652,
"learning_rate": 9.489152839010797e-06,
"loss": 0.0577,
"step": 1290
},
{
"epoch": 3.61624649859944,
"grad_norm": 12.999177932739258,
"learning_rate": 9.453471023533343e-06,
"loss": 0.233,
"step": 1291
},
{
"epoch": 3.619047619047619,
"grad_norm": 3.228353500366211,
"learning_rate": 9.41784077303749e-06,
"loss": 0.0644,
"step": 1292
},
{
"epoch": 3.6218487394957983,
"grad_norm": 2.4915523529052734,
"learning_rate": 9.382262205702247e-06,
"loss": 0.0401,
"step": 1293
},
{
"epoch": 3.6246498599439776,
"grad_norm": 2.3028035163879395,
"learning_rate": 9.346735439535182e-06,
"loss": 0.0669,
"step": 1294
},
{
"epoch": 3.627450980392157,
"grad_norm": 2.022444486618042,
"learning_rate": 9.311260592372045e-06,
"loss": 0.0647,
"step": 1295
},
{
"epoch": 3.630252100840336,
"grad_norm": 3.0425169467926025,
"learning_rate": 9.275837781876403e-06,
"loss": 0.038,
"step": 1296
},
{
"epoch": 3.6330532212885154,
"grad_norm": 2.807349443435669,
"learning_rate": 9.240467125539218e-06,
"loss": 0.0816,
"step": 1297
},
{
"epoch": 3.6358543417366946,
"grad_norm": 11.162236213684082,
"learning_rate": 9.205148740678458e-06,
"loss": 0.322,
"step": 1298
},
{
"epoch": 3.638655462184874,
"grad_norm": 1.8583990335464478,
"learning_rate": 9.16988274443871e-06,
"loss": 0.095,
"step": 1299
},
{
"epoch": 3.641456582633053,
"grad_norm": 23.761890411376953,
"learning_rate": 9.134669253790815e-06,
"loss": 0.1911,
"step": 1300
},
{
"epoch": 3.6442577030812324,
"grad_norm": 2.7331414222717285,
"learning_rate": 9.099508385531452e-06,
"loss": 0.0671,
"step": 1301
},
{
"epoch": 3.6470588235294117,
"grad_norm": 2.361318588256836,
"learning_rate": 9.064400256282757e-06,
"loss": 0.0711,
"step": 1302
},
{
"epoch": 3.649859943977591,
"grad_norm": 3.064162254333496,
"learning_rate": 9.029344982491963e-06,
"loss": 0.0556,
"step": 1303
},
{
"epoch": 3.65266106442577,
"grad_norm": 1.9346349239349365,
"learning_rate": 8.99434268043097e-06,
"loss": 0.0471,
"step": 1304
},
{
"epoch": 3.6554621848739495,
"grad_norm": 1.4097012281417847,
"learning_rate": 8.959393466195972e-06,
"loss": 0.0251,
"step": 1305
},
{
"epoch": 3.6582633053221287,
"grad_norm": 3.3399229049682617,
"learning_rate": 8.924497455707115e-06,
"loss": 0.092,
"step": 1306
},
{
"epoch": 3.661064425770308,
"grad_norm": 26.91242790222168,
"learning_rate": 8.889654764708046e-06,
"loss": 0.512,
"step": 1307
},
{
"epoch": 3.6638655462184873,
"grad_norm": 1.973227858543396,
"learning_rate": 8.854865508765578e-06,
"loss": 0.0423,
"step": 1308
},
{
"epoch": 3.6666666666666665,
"grad_norm": 9.74034309387207,
"learning_rate": 8.820129803269272e-06,
"loss": 0.2736,
"step": 1309
},
{
"epoch": 3.669467787114846,
"grad_norm": 1.1904311180114746,
"learning_rate": 8.785447763431101e-06,
"loss": 0.0186,
"step": 1310
},
{
"epoch": 3.6722689075630255,
"grad_norm": 8.14508056640625,
"learning_rate": 8.750819504285015e-06,
"loss": 0.1088,
"step": 1311
},
{
"epoch": 3.6750700280112047,
"grad_norm": 1.9873169660568237,
"learning_rate": 8.71624514068659e-06,
"loss": 0.0228,
"step": 1312
},
{
"epoch": 3.677871148459384,
"grad_norm": 1.4725542068481445,
"learning_rate": 8.68172478731264e-06,
"loss": 0.0167,
"step": 1313
},
{
"epoch": 3.6806722689075633,
"grad_norm": 2.066056489944458,
"learning_rate": 8.647258558660829e-06,
"loss": 0.0738,
"step": 1314
},
{
"epoch": 3.6834733893557425,
"grad_norm": 5.671342849731445,
"learning_rate": 8.612846569049323e-06,
"loss": 0.0943,
"step": 1315
},
{
"epoch": 3.686274509803922,
"grad_norm": 1.975346326828003,
"learning_rate": 8.57848893261636e-06,
"loss": 0.0316,
"step": 1316
},
{
"epoch": 3.689075630252101,
"grad_norm": 10.312047958374023,
"learning_rate": 8.544185763319926e-06,
"loss": 0.1668,
"step": 1317
},
{
"epoch": 3.6918767507002803,
"grad_norm": 1.698010802268982,
"learning_rate": 8.50993717493732e-06,
"loss": 0.0209,
"step": 1318
},
{
"epoch": 3.6946778711484596,
"grad_norm": 6.265659809112549,
"learning_rate": 8.475743281064829e-06,
"loss": 0.1098,
"step": 1319
},
{
"epoch": 3.697478991596639,
"grad_norm": 16.2908992767334,
"learning_rate": 8.441604195117314e-06,
"loss": 0.5784,
"step": 1320
},
{
"epoch": 3.700280112044818,
"grad_norm": 6.113296985626221,
"learning_rate": 8.407520030327845e-06,
"loss": 0.0942,
"step": 1321
},
{
"epoch": 3.7030812324929974,
"grad_norm": 4.005929946899414,
"learning_rate": 8.373490899747358e-06,
"loss": 0.0839,
"step": 1322
},
{
"epoch": 3.7058823529411766,
"grad_norm": 12.831454277038574,
"learning_rate": 8.339516916244217e-06,
"loss": 0.2103,
"step": 1323
},
{
"epoch": 3.708683473389356,
"grad_norm": 6.1074981689453125,
"learning_rate": 8.30559819250389e-06,
"loss": 0.106,
"step": 1324
},
{
"epoch": 3.711484593837535,
"grad_norm": 2.1068618297576904,
"learning_rate": 8.271734841028553e-06,
"loss": 0.0911,
"step": 1325
},
{
"epoch": 3.7142857142857144,
"grad_norm": 1.2227877378463745,
"learning_rate": 8.237926974136715e-06,
"loss": 0.046,
"step": 1326
},
{
"epoch": 3.7170868347338937,
"grad_norm": 6.126723289489746,
"learning_rate": 8.204174703962878e-06,
"loss": 0.1022,
"step": 1327
},
{
"epoch": 3.719887955182073,
"grad_norm": 1.6104238033294678,
"learning_rate": 8.17047814245711e-06,
"loss": 0.0633,
"step": 1328
},
{
"epoch": 3.722689075630252,
"grad_norm": 1.5701721906661987,
"learning_rate": 8.136837401384734e-06,
"loss": 0.0429,
"step": 1329
},
{
"epoch": 3.7254901960784315,
"grad_norm": 15.184452056884766,
"learning_rate": 8.103252592325896e-06,
"loss": 0.4572,
"step": 1330
},
{
"epoch": 3.7282913165266107,
"grad_norm": 1.1463077068328857,
"learning_rate": 8.069723826675249e-06,
"loss": 0.0494,
"step": 1331
},
{
"epoch": 3.73109243697479,
"grad_norm": 9.456391334533691,
"learning_rate": 8.036251215641546e-06,
"loss": 0.1326,
"step": 1332
},
{
"epoch": 3.7338935574229692,
"grad_norm": 4.1337890625,
"learning_rate": 8.002834870247283e-06,
"loss": 0.0966,
"step": 1333
},
{
"epoch": 3.7366946778711485,
"grad_norm": 11.9760103225708,
"learning_rate": 7.969474901328358e-06,
"loss": 0.4693,
"step": 1334
},
{
"epoch": 3.7394957983193278,
"grad_norm": 9.769811630249023,
"learning_rate": 7.936171419533653e-06,
"loss": 0.1692,
"step": 1335
},
{
"epoch": 3.742296918767507,
"grad_norm": 4.583395004272461,
"learning_rate": 7.902924535324691e-06,
"loss": 0.1152,
"step": 1336
},
{
"epoch": 3.7450980392156863,
"grad_norm": 1.4525033235549927,
"learning_rate": 7.869734358975302e-06,
"loss": 0.0496,
"step": 1337
},
{
"epoch": 3.7478991596638656,
"grad_norm": 5.7722859382629395,
"learning_rate": 7.836601000571198e-06,
"loss": 0.1252,
"step": 1338
},
{
"epoch": 3.750700280112045,
"grad_norm": 9.7979097366333,
"learning_rate": 7.803524570009637e-06,
"loss": 0.2204,
"step": 1339
},
{
"epoch": 3.753501400560224,
"grad_norm": 3.631242275238037,
"learning_rate": 7.770505176999066e-06,
"loss": 0.0699,
"step": 1340
},
{
"epoch": 3.7563025210084033,
"grad_norm": 4.886704444885254,
"learning_rate": 7.737542931058756e-06,
"loss": 0.125,
"step": 1341
},
{
"epoch": 3.7591036414565826,
"grad_norm": 4.995973587036133,
"learning_rate": 7.704637941518422e-06,
"loss": 0.1187,
"step": 1342
},
{
"epoch": 3.761904761904762,
"grad_norm": 3.329430341720581,
"learning_rate": 7.671790317517865e-06,
"loss": 0.0936,
"step": 1343
},
{
"epoch": 3.764705882352941,
"grad_norm": 3.955146312713623,
"learning_rate": 7.63900016800663e-06,
"loss": 0.1008,
"step": 1344
},
{
"epoch": 3.7675070028011204,
"grad_norm": 3.2486631870269775,
"learning_rate": 7.606267601743614e-06,
"loss": 0.0833,
"step": 1345
},
{
"epoch": 3.7703081232492996,
"grad_norm": 2.8611109256744385,
"learning_rate": 7.573592727296722e-06,
"loss": 0.0673,
"step": 1346
},
{
"epoch": 3.773109243697479,
"grad_norm": 3.060568332672119,
"learning_rate": 7.54097565304252e-06,
"loss": 0.0608,
"step": 1347
},
{
"epoch": 3.775910364145658,
"grad_norm": 2.8913657665252686,
"learning_rate": 7.508416487165862e-06,
"loss": 0.06,
"step": 1348
},
{
"epoch": 3.7787114845938374,
"grad_norm": 2.9590132236480713,
"learning_rate": 7.475915337659517e-06,
"loss": 0.0708,
"step": 1349
},
{
"epoch": 3.7815126050420167,
"grad_norm": 1.5255166292190552,
"learning_rate": 7.4434723123238236e-06,
"loss": 0.0537,
"step": 1350
},
{
"epoch": 3.784313725490196,
"grad_norm": 3.0520412921905518,
"learning_rate": 7.411087518766341e-06,
"loss": 0.0514,
"step": 1351
},
{
"epoch": 3.787114845938375,
"grad_norm": 2.7965035438537598,
"learning_rate": 7.378761064401485e-06,
"loss": 0.0919,
"step": 1352
},
{
"epoch": 3.7899159663865545,
"grad_norm": 2.2356274127960205,
"learning_rate": 7.3464930564501575e-06,
"loss": 0.0655,
"step": 1353
},
{
"epoch": 3.7927170868347337,
"grad_norm": 2.5678279399871826,
"learning_rate": 7.314283601939431e-06,
"loss": 0.0592,
"step": 1354
},
{
"epoch": 3.795518207282913,
"grad_norm": 3.243337631225586,
"learning_rate": 7.282132807702144e-06,
"loss": 0.0729,
"step": 1355
},
{
"epoch": 3.7983193277310923,
"grad_norm": 2.403731107711792,
"learning_rate": 7.250040780376577e-06,
"loss": 0.086,
"step": 1356
},
{
"epoch": 3.8011204481792715,
"grad_norm": 1.7365161180496216,
"learning_rate": 7.218007626406082e-06,
"loss": 0.0375,
"step": 1357
},
{
"epoch": 3.803921568627451,
"grad_norm": 9.282291412353516,
"learning_rate": 7.1860334520387654e-06,
"loss": 0.2492,
"step": 1358
},
{
"epoch": 3.80672268907563,
"grad_norm": 7.384303569793701,
"learning_rate": 7.1541183633270755e-06,
"loss": 0.1021,
"step": 1359
},
{
"epoch": 3.8095238095238093,
"grad_norm": 3.1575872898101807,
"learning_rate": 7.122262466127514e-06,
"loss": 0.0834,
"step": 1360
},
{
"epoch": 3.8123249299719886,
"grad_norm": 1.5287607908248901,
"learning_rate": 7.090465866100238e-06,
"loss": 0.0291,
"step": 1361
},
{
"epoch": 3.815126050420168,
"grad_norm": 1.460954189300537,
"learning_rate": 7.058728668708728e-06,
"loss": 0.0284,
"step": 1362
},
{
"epoch": 3.817927170868347,
"grad_norm": 8.310348510742188,
"learning_rate": 7.027050979219438e-06,
"loss": 0.1777,
"step": 1363
},
{
"epoch": 3.8207282913165264,
"grad_norm": 0.7145363688468933,
"learning_rate": 6.995432902701452e-06,
"loss": 0.0271,
"step": 1364
},
{
"epoch": 3.8235294117647056,
"grad_norm": 5.249289035797119,
"learning_rate": 6.9638745440261084e-06,
"loss": 0.3472,
"step": 1365
},
{
"epoch": 3.826330532212885,
"grad_norm": 9.49671459197998,
"learning_rate": 6.9323760078667085e-06,
"loss": 0.1544,
"step": 1366
},
{
"epoch": 3.8291316526610646,
"grad_norm": 4.925049781799316,
"learning_rate": 6.9009373986980955e-06,
"loss": 0.0977,
"step": 1367
},
{
"epoch": 3.831932773109244,
"grad_norm": 10.142794609069824,
"learning_rate": 6.8695588207963765e-06,
"loss": 0.2152,
"step": 1368
},
{
"epoch": 3.834733893557423,
"grad_norm": 3.3772802352905273,
"learning_rate": 6.838240378238528e-06,
"loss": 0.0861,
"step": 1369
},
{
"epoch": 3.8375350140056024,
"grad_norm": 4.451547622680664,
"learning_rate": 6.806982174902066e-06,
"loss": 0.1098,
"step": 1370
},
{
"epoch": 3.8403361344537816,
"grad_norm": 1.6721394062042236,
"learning_rate": 6.775784314464717e-06,
"loss": 0.0545,
"step": 1371
},
{
"epoch": 3.843137254901961,
"grad_norm": 1.5572619438171387,
"learning_rate": 6.744646900404045e-06,
"loss": 0.0825,
"step": 1372
},
{
"epoch": 3.84593837535014,
"grad_norm": 11.100934028625488,
"learning_rate": 6.713570035997147e-06,
"loss": 0.1646,
"step": 1373
},
{
"epoch": 3.8487394957983194,
"grad_norm": 6.442490100860596,
"learning_rate": 6.68255382432027e-06,
"loss": 0.3148,
"step": 1374
},
{
"epoch": 3.8515406162464987,
"grad_norm": 1.8811453580856323,
"learning_rate": 6.651598368248493e-06,
"loss": 0.0318,
"step": 1375
},
{
"epoch": 3.854341736694678,
"grad_norm": 12.277639389038086,
"learning_rate": 6.62070377045538e-06,
"loss": 0.1851,
"step": 1376
},
{
"epoch": 3.857142857142857,
"grad_norm": 5.6130051612854,
"learning_rate": 6.589870133412626e-06,
"loss": 0.1043,
"step": 1377
},
{
"epoch": 3.8599439775910365,
"grad_norm": 1.940450668334961,
"learning_rate": 6.559097559389754e-06,
"loss": 0.0504,
"step": 1378
},
{
"epoch": 3.8627450980392157,
"grad_norm": 3.3685734272003174,
"learning_rate": 6.528386150453747e-06,
"loss": 0.0568,
"step": 1379
},
{
"epoch": 3.865546218487395,
"grad_norm": 8.066683769226074,
"learning_rate": 6.497736008468702e-06,
"loss": 0.1339,
"step": 1380
},
{
"epoch": 3.8683473389355743,
"grad_norm": 1.7001276016235352,
"learning_rate": 6.467147235095511e-06,
"loss": 0.05,
"step": 1381
},
{
"epoch": 3.8711484593837535,
"grad_norm": 2.7757482528686523,
"learning_rate": 6.436619931791518e-06,
"loss": 0.0653,
"step": 1382
},
{
"epoch": 3.8739495798319328,
"grad_norm": 2.4557902812957764,
"learning_rate": 6.4061541998101795e-06,
"loss": 0.0889,
"step": 1383
},
{
"epoch": 3.876750700280112,
"grad_norm": 2.238588571548462,
"learning_rate": 6.375750140200729e-06,
"loss": 0.0648,
"step": 1384
},
{
"epoch": 3.8795518207282913,
"grad_norm": 6.886146545410156,
"learning_rate": 6.3454078538078635e-06,
"loss": 0.2117,
"step": 1385
},
{
"epoch": 3.8823529411764706,
"grad_norm": 2.0907630920410156,
"learning_rate": 6.315127441271368e-06,
"loss": 0.0643,
"step": 1386
},
{
"epoch": 3.88515406162465,
"grad_norm": 13.973304748535156,
"learning_rate": 6.284909003025808e-06,
"loss": 0.2389,
"step": 1387
},
{
"epoch": 3.887955182072829,
"grad_norm": 0.9553418755531311,
"learning_rate": 6.254752639300185e-06,
"loss": 0.0406,
"step": 1388
},
{
"epoch": 3.8907563025210083,
"grad_norm": 1.759007215499878,
"learning_rate": 6.224658450117637e-06,
"loss": 0.0477,
"step": 1389
},
{
"epoch": 3.8935574229691876,
"grad_norm": 1.847176432609558,
"learning_rate": 6.194626535295059e-06,
"loss": 0.0384,
"step": 1390
},
{
"epoch": 3.896358543417367,
"grad_norm": 2.2088205814361572,
"learning_rate": 6.164656994442783e-06,
"loss": 0.0485,
"step": 1391
},
{
"epoch": 3.899159663865546,
"grad_norm": 14.182723999023438,
"learning_rate": 6.134749926964289e-06,
"loss": 0.2399,
"step": 1392
},
{
"epoch": 3.9019607843137254,
"grad_norm": 1.9838969707489014,
"learning_rate": 6.104905432055824e-06,
"loss": 0.0728,
"step": 1393
},
{
"epoch": 3.9047619047619047,
"grad_norm": 18.831859588623047,
"learning_rate": 6.075123608706093e-06,
"loss": 0.2583,
"step": 1394
},
{
"epoch": 3.907563025210084,
"grad_norm": 9.972983360290527,
"learning_rate": 6.045404555695935e-06,
"loss": 0.1796,
"step": 1395
},
{
"epoch": 3.910364145658263,
"grad_norm": 3.596752166748047,
"learning_rate": 6.0157483715979876e-06,
"loss": 0.0758,
"step": 1396
},
{
"epoch": 3.9131652661064424,
"grad_norm": 1.0203285217285156,
"learning_rate": 5.986155154776377e-06,
"loss": 0.0507,
"step": 1397
},
{
"epoch": 3.9159663865546217,
"grad_norm": 2.0499589443206787,
"learning_rate": 5.956625003386357e-06,
"loss": 0.0689,
"step": 1398
},
{
"epoch": 3.918767507002801,
"grad_norm": 19.545166015625,
"learning_rate": 5.9271580153740316e-06,
"loss": 0.2548,
"step": 1399
},
{
"epoch": 3.9215686274509802,
"grad_norm": 3.7680435180664062,
"learning_rate": 5.897754288475979e-06,
"loss": 0.0777,
"step": 1400
},
{
"epoch": 3.92436974789916,
"grad_norm": 1.9220796823501587,
"learning_rate": 5.868413920218965e-06,
"loss": 0.0543,
"step": 1401
},
{
"epoch": 3.927170868347339,
"grad_norm": 2.944732189178467,
"learning_rate": 5.839137007919598e-06,
"loss": 0.1321,
"step": 1402
},
{
"epoch": 3.9299719887955185,
"grad_norm": 3.1687591075897217,
"learning_rate": 5.809923648684018e-06,
"loss": 0.0829,
"step": 1403
},
{
"epoch": 3.9327731092436977,
"grad_norm": 2.992490768432617,
"learning_rate": 5.780773939407586e-06,
"loss": 0.0424,
"step": 1404
},
{
"epoch": 3.935574229691877,
"grad_norm": 3.319343090057373,
"learning_rate": 5.751687976774523e-06,
"loss": 0.0521,
"step": 1405
},
{
"epoch": 3.9383753501400562,
"grad_norm": 6.968481063842773,
"learning_rate": 5.722665857257628e-06,
"loss": 0.1416,
"step": 1406
},
{
"epoch": 3.9411764705882355,
"grad_norm": 1.7675644159317017,
"learning_rate": 5.693707677117943e-06,
"loss": 0.0347,
"step": 1407
},
{
"epoch": 3.9439775910364148,
"grad_norm": 18.082834243774414,
"learning_rate": 5.6648135324044285e-06,
"loss": 0.2809,
"step": 1408
},
{
"epoch": 3.946778711484594,
"grad_norm": 11.232237815856934,
"learning_rate": 5.635983518953664e-06,
"loss": 0.1947,
"step": 1409
},
{
"epoch": 3.9495798319327733,
"grad_norm": 1.4977034330368042,
"learning_rate": 5.607217732389503e-06,
"loss": 0.0407,
"step": 1410
},
{
"epoch": 3.9523809523809526,
"grad_norm": 13.30891227722168,
"learning_rate": 5.578516268122785e-06,
"loss": 0.2298,
"step": 1411
},
{
"epoch": 3.955182072829132,
"grad_norm": 2.625317335128784,
"learning_rate": 5.5498792213509926e-06,
"loss": 0.0487,
"step": 1412
},
{
"epoch": 3.957983193277311,
"grad_norm": 5.572171688079834,
"learning_rate": 5.521306687057947e-06,
"loss": 0.0979,
"step": 1413
},
{
"epoch": 3.9607843137254903,
"grad_norm": 1.1671191453933716,
"learning_rate": 5.492798760013504e-06,
"loss": 0.0523,
"step": 1414
},
{
"epoch": 3.9635854341736696,
"grad_norm": 1.6150152683258057,
"learning_rate": 5.464355534773216e-06,
"loss": 0.0671,
"step": 1415
},
{
"epoch": 3.966386554621849,
"grad_norm": 4.571686267852783,
"learning_rate": 5.435977105678033e-06,
"loss": 0.1125,
"step": 1416
},
{
"epoch": 3.969187675070028,
"grad_norm": 1.6142526865005493,
"learning_rate": 5.4076635668540075e-06,
"loss": 0.0516,
"step": 1417
},
{
"epoch": 3.9719887955182074,
"grad_norm": 4.262225151062012,
"learning_rate": 5.3794150122119416e-06,
"loss": 0.1635,
"step": 1418
},
{
"epoch": 3.9747899159663866,
"grad_norm": 2.3288111686706543,
"learning_rate": 5.351231535447096e-06,
"loss": 0.0676,
"step": 1419
},
{
"epoch": 3.977591036414566,
"grad_norm": 1.823206901550293,
"learning_rate": 5.323113230038898e-06,
"loss": 0.051,
"step": 1420
},
{
"epoch": 3.980392156862745,
"grad_norm": 3.124669075012207,
"learning_rate": 5.2950601892506004e-06,
"loss": 0.0416,
"step": 1421
},
{
"epoch": 3.9831932773109244,
"grad_norm": 2.188183546066284,
"learning_rate": 5.267072506128981e-06,
"loss": 0.0853,
"step": 1422
},
{
"epoch": 3.9859943977591037,
"grad_norm": 1.9526631832122803,
"learning_rate": 5.239150273504054e-06,
"loss": 0.0709,
"step": 1423
},
{
"epoch": 3.988795518207283,
"grad_norm": 1.2215217351913452,
"learning_rate": 5.211293583988735e-06,
"loss": 0.0368,
"step": 1424
},
{
"epoch": 3.991596638655462,
"grad_norm": 15.94097900390625,
"learning_rate": 5.183502529978548e-06,
"loss": 0.1376,
"step": 1425
},
{
"epoch": 3.9943977591036415,
"grad_norm": 12.16773509979248,
"learning_rate": 5.15577720365131e-06,
"loss": 0.1905,
"step": 1426
},
{
"epoch": 3.9971988795518207,
"grad_norm": 3.074533462524414,
"learning_rate": 5.128117696966841e-06,
"loss": 0.0416,
"step": 1427
},
{
"epoch": 4.0,
"grad_norm": 1.2314493656158447,
"learning_rate": 5.10052410166664e-06,
"loss": 0.0157,
"step": 1428
},
{
"epoch": 4.0,
"eval_f1 (minor class)": 0.04291845493562232,
"eval_loss": 0.46329978108406067,
"eval_roc_auc": 0.47434657347603143,
"eval_runtime": 2.9689,
"eval_samples_per_second": 427.093,
"eval_steps_per_second": 13.473,
"step": 1428
},
{
"epoch": 4.002801120448179,
"grad_norm": 5.048481464385986,
"learning_rate": 5.072996509273598e-06,
"loss": 0.0944,
"step": 1429
},
{
"epoch": 4.0056022408963585,
"grad_norm": 2.3314294815063477,
"learning_rate": 5.0455350110916924e-06,
"loss": 0.0502,
"step": 1430
},
{
"epoch": 4.008403361344538,
"grad_norm": 1.6619155406951904,
"learning_rate": 5.018139698205665e-06,
"loss": 0.0383,
"step": 1431
},
{
"epoch": 4.011204481792717,
"grad_norm": 1.6886764764785767,
"learning_rate": 4.990810661480733e-06,
"loss": 0.0278,
"step": 1432
},
{
"epoch": 4.014005602240896,
"grad_norm": 5.145814418792725,
"learning_rate": 4.963547991562295e-06,
"loss": 0.0704,
"step": 1433
},
{
"epoch": 4.016806722689076,
"grad_norm": 3.0751662254333496,
"learning_rate": 4.9363517788756195e-06,
"loss": 0.0809,
"step": 1434
},
{
"epoch": 4.019607843137255,
"grad_norm": 6.961215972900391,
"learning_rate": 4.9092221136255444e-06,
"loss": 0.0796,
"step": 1435
},
{
"epoch": 4.022408963585434,
"grad_norm": 13.316800117492676,
"learning_rate": 4.882159085796201e-06,
"loss": 0.2108,
"step": 1436
},
{
"epoch": 4.025210084033613,
"grad_norm": 8.000237464904785,
"learning_rate": 4.855162785150674e-06,
"loss": 0.4321,
"step": 1437
},
{
"epoch": 4.028011204481793,
"grad_norm": 1.922743320465088,
"learning_rate": 4.828233301230736e-06,
"loss": 0.0193,
"step": 1438
},
{
"epoch": 4.030812324929972,
"grad_norm": 7.537874698638916,
"learning_rate": 4.801370723356532e-06,
"loss": 0.1475,
"step": 1439
},
{
"epoch": 4.033613445378151,
"grad_norm": 10.09809398651123,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.1252,
"step": 1440
},
{
"epoch": 4.03641456582633,
"grad_norm": 1.498267650604248,
"learning_rate": 4.7478466419161066e-06,
"loss": 0.0246,
"step": 1441
},
{
"epoch": 4.03921568627451,
"grad_norm": 5.92438268661499,
"learning_rate": 4.7211853158794175e-06,
"loss": 0.0685,
"step": 1442
},
{
"epoch": 4.042016806722689,
"grad_norm": 1.3531486988067627,
"learning_rate": 4.694591250946984e-06,
"loss": 0.0363,
"step": 1443
},
{
"epoch": 4.044817927170868,
"grad_norm": 1.6662750244140625,
"learning_rate": 4.668064535326433e-06,
"loss": 0.0492,
"step": 1444
},
{
"epoch": 4.0476190476190474,
"grad_norm": 1.088405966758728,
"learning_rate": 4.6416052570020046e-06,
"loss": 0.032,
"step": 1445
},
{
"epoch": 4.050420168067227,
"grad_norm": 4.784574508666992,
"learning_rate": 4.615213503734267e-06,
"loss": 0.0843,
"step": 1446
},
{
"epoch": 4.053221288515406,
"grad_norm": 1.8354049921035767,
"learning_rate": 4.588889363059815e-06,
"loss": 0.0223,
"step": 1447
},
{
"epoch": 4.056022408963585,
"grad_norm": 1.975907564163208,
"learning_rate": 4.562632922290999e-06,
"loss": 0.0189,
"step": 1448
},
{
"epoch": 4.0588235294117645,
"grad_norm": 2.2125046253204346,
"learning_rate": 4.536444268515608e-06,
"loss": 0.0606,
"step": 1449
},
{
"epoch": 4.061624649859944,
"grad_norm": 5.576314926147461,
"learning_rate": 4.510323488596588e-06,
"loss": 0.068,
"step": 1450
},
{
"epoch": 4.064425770308123,
"grad_norm": 5.911201000213623,
"learning_rate": 4.484270669171781e-06,
"loss": 0.1,
"step": 1451
},
{
"epoch": 4.067226890756302,
"grad_norm": 14.300566673278809,
"learning_rate": 4.458285896653603e-06,
"loss": 0.2765,
"step": 1452
},
{
"epoch": 4.0700280112044815,
"grad_norm": 8.598773956298828,
"learning_rate": 4.4323692572287665e-06,
"loss": 0.1291,
"step": 1453
},
{
"epoch": 4.072829131652661,
"grad_norm": 1.7347263097763062,
"learning_rate": 4.406520836858003e-06,
"loss": 0.0169,
"step": 1454
},
{
"epoch": 4.07563025210084,
"grad_norm": 1.1106011867523193,
"learning_rate": 4.380740721275786e-06,
"loss": 0.024,
"step": 1455
},
{
"epoch": 4.078431372549019,
"grad_norm": 2.621875762939453,
"learning_rate": 4.355028995990021e-06,
"loss": 0.0331,
"step": 1456
},
{
"epoch": 4.081232492997199,
"grad_norm": 9.854007720947266,
"learning_rate": 4.3293857462817775e-06,
"loss": 0.2571,
"step": 1457
},
{
"epoch": 4.084033613445378,
"grad_norm": 21.680360794067383,
"learning_rate": 4.3038110572050065e-06,
"loss": 0.4809,
"step": 1458
},
{
"epoch": 4.086834733893557,
"grad_norm": 1.7910068035125732,
"learning_rate": 4.2783050135862455e-06,
"loss": 0.0267,
"step": 1459
},
{
"epoch": 4.089635854341736,
"grad_norm": 2.5836455821990967,
"learning_rate": 4.252867700024374e-06,
"loss": 0.071,
"step": 1460
},
{
"epoch": 4.092436974789916,
"grad_norm": 2.3723514080047607,
"learning_rate": 4.227499200890275e-06,
"loss": 0.0306,
"step": 1461
},
{
"epoch": 4.095238095238095,
"grad_norm": 11.093085289001465,
"learning_rate": 4.202199600326615e-06,
"loss": 0.1667,
"step": 1462
},
{
"epoch": 4.098039215686274,
"grad_norm": 12.050922393798828,
"learning_rate": 4.176968982247514e-06,
"loss": 0.2057,
"step": 1463
},
{
"epoch": 4.100840336134453,
"grad_norm": 2.999164581298828,
"learning_rate": 4.1518074303383006e-06,
"loss": 0.072,
"step": 1464
},
{
"epoch": 4.103641456582633,
"grad_norm": 12.901005744934082,
"learning_rate": 4.126715028055225e-06,
"loss": 0.2034,
"step": 1465
},
{
"epoch": 4.106442577030812,
"grad_norm": 3.1244161128997803,
"learning_rate": 4.1016918586251645e-06,
"loss": 0.0487,
"step": 1466
},
{
"epoch": 4.109243697478991,
"grad_norm": 1.664584755897522,
"learning_rate": 4.076738005045394e-06,
"loss": 0.0471,
"step": 1467
},
{
"epoch": 4.1120448179271705,
"grad_norm": 16.134794235229492,
"learning_rate": 4.051853550083257e-06,
"loss": 0.4147,
"step": 1468
},
{
"epoch": 4.11484593837535,
"grad_norm": 15.253890037536621,
"learning_rate": 4.027038576275921e-06,
"loss": 0.2513,
"step": 1469
},
{
"epoch": 4.117647058823529,
"grad_norm": 2.4090123176574707,
"learning_rate": 4.002293165930087e-06,
"loss": 0.0924,
"step": 1470
},
{
"epoch": 4.120448179271708,
"grad_norm": 1.3718992471694946,
"learning_rate": 3.977617401121752e-06,
"loss": 0.0398,
"step": 1471
},
{
"epoch": 4.1232492997198875,
"grad_norm": 1.8119758367538452,
"learning_rate": 3.953011363695891e-06,
"loss": 0.0567,
"step": 1472
},
{
"epoch": 4.126050420168067,
"grad_norm": 1.2629765272140503,
"learning_rate": 3.9284751352662045e-06,
"loss": 0.0385,
"step": 1473
},
{
"epoch": 4.128851540616246,
"grad_norm": 1.7177892923355103,
"learning_rate": 3.904008797214867e-06,
"loss": 0.0272,
"step": 1474
},
{
"epoch": 4.131652661064426,
"grad_norm": 2.41703462600708,
"learning_rate": 3.879612430692223e-06,
"loss": 0.068,
"step": 1475
},
{
"epoch": 4.1344537815126055,
"grad_norm": 2.153484344482422,
"learning_rate": 3.855286116616541e-06,
"loss": 0.0304,
"step": 1476
},
{
"epoch": 4.137254901960785,
"grad_norm": 1.6554349660873413,
"learning_rate": 3.831029935673739e-06,
"loss": 0.0619,
"step": 1477
},
{
"epoch": 4.140056022408964,
"grad_norm": 1.987120270729065,
"learning_rate": 3.8068439683171113e-06,
"loss": 0.0199,
"step": 1478
},
{
"epoch": 4.142857142857143,
"grad_norm": 1.3541096448898315,
"learning_rate": 3.7827282947670685e-06,
"loss": 0.0191,
"step": 1479
},
{
"epoch": 4.1456582633053225,
"grad_norm": 7.670234680175781,
"learning_rate": 3.758682995010879e-06,
"loss": 0.1025,
"step": 1480
},
{
"epoch": 4.148459383753502,
"grad_norm": 4.780028343200684,
"learning_rate": 3.734708148802374e-06,
"loss": 0.0719,
"step": 1481
},
{
"epoch": 4.151260504201681,
"grad_norm": 5.820712089538574,
"learning_rate": 3.7108038356617302e-06,
"loss": 0.0777,
"step": 1482
},
{
"epoch": 4.15406162464986,
"grad_norm": 11.232895851135254,
"learning_rate": 3.686970134875159e-06,
"loss": 0.1607,
"step": 1483
},
{
"epoch": 4.1568627450980395,
"grad_norm": 2.310209274291992,
"learning_rate": 3.6632071254946666e-06,
"loss": 0.0634,
"step": 1484
},
{
"epoch": 4.159663865546219,
"grad_norm": 21.566038131713867,
"learning_rate": 3.6395148863377858e-06,
"loss": 0.3186,
"step": 1485
},
{
"epoch": 4.162464985994398,
"grad_norm": 2.0398006439208984,
"learning_rate": 3.6158934959873353e-06,
"loss": 0.0344,
"step": 1486
},
{
"epoch": 4.165266106442577,
"grad_norm": 2.561113119125366,
"learning_rate": 3.5923430327911234e-06,
"loss": 0.04,
"step": 1487
},
{
"epoch": 4.168067226890757,
"grad_norm": 1.6639447212219238,
"learning_rate": 3.5688635748617083e-06,
"loss": 0.0504,
"step": 1488
},
{
"epoch": 4.170868347338936,
"grad_norm": 1.4207217693328857,
"learning_rate": 3.5454552000761477e-06,
"loss": 0.0146,
"step": 1489
},
{
"epoch": 4.173669467787115,
"grad_norm": 10.304887771606445,
"learning_rate": 3.5221179860757154e-06,
"loss": 0.1718,
"step": 1490
},
{
"epoch": 4.176470588235294,
"grad_norm": 0.8173993825912476,
"learning_rate": 3.4988520102656603e-06,
"loss": 0.0279,
"step": 1491
},
{
"epoch": 4.179271708683474,
"grad_norm": 6.045089244842529,
"learning_rate": 3.475657349814959e-06,
"loss": 0.077,
"step": 1492
},
{
"epoch": 4.182072829131653,
"grad_norm": 1.4789886474609375,
"learning_rate": 3.4525340816560446e-06,
"loss": 0.0359,
"step": 1493
},
{
"epoch": 4.184873949579832,
"grad_norm": 8.492941856384277,
"learning_rate": 3.4294822824845446e-06,
"loss": 0.1165,
"step": 1494
},
{
"epoch": 4.187675070028011,
"grad_norm": 1.121145486831665,
"learning_rate": 3.4065020287590454e-06,
"loss": 0.0305,
"step": 1495
},
{
"epoch": 4.190476190476191,
"grad_norm": 15.198166847229004,
"learning_rate": 3.3835933967008294e-06,
"loss": 0.1947,
"step": 1496
},
{
"epoch": 4.19327731092437,
"grad_norm": 20.755802154541016,
"learning_rate": 3.360756462293621e-06,
"loss": 0.4492,
"step": 1497
},
{
"epoch": 4.196078431372549,
"grad_norm": 1.9873195886611938,
"learning_rate": 3.33799130128333e-06,
"loss": 0.0433,
"step": 1498
},
{
"epoch": 4.1988795518207285,
"grad_norm": 6.29288911819458,
"learning_rate": 3.315297989177829e-06,
"loss": 0.0876,
"step": 1499
},
{
"epoch": 4.201680672268908,
"grad_norm": 10.196606636047363,
"learning_rate": 3.292676601246661e-06,
"loss": 0.1346,
"step": 1500
},
{
"epoch": 4.204481792717087,
"grad_norm": 2.1695783138275146,
"learning_rate": 3.270127212520807e-06,
"loss": 0.0426,
"step": 1501
},
{
"epoch": 4.207282913165266,
"grad_norm": 16.231338500976562,
"learning_rate": 3.2476498977924625e-06,
"loss": 0.1547,
"step": 1502
},
{
"epoch": 4.2100840336134455,
"grad_norm": 1.5948389768600464,
"learning_rate": 3.2252447316147462e-06,
"loss": 0.0322,
"step": 1503
},
{
"epoch": 4.212885154061625,
"grad_norm": 3.197258472442627,
"learning_rate": 3.2029117883014796e-06,
"loss": 0.0715,
"step": 1504
},
{
"epoch": 4.215686274509804,
"grad_norm": 4.440106391906738,
"learning_rate": 3.1806511419269298e-06,
"loss": 0.1035,
"step": 1505
},
{
"epoch": 4.218487394957983,
"grad_norm": 1.9581427574157715,
"learning_rate": 3.1584628663255843e-06,
"loss": 0.025,
"step": 1506
},
{
"epoch": 4.221288515406163,
"grad_norm": 15.279425621032715,
"learning_rate": 3.1363470350918688e-06,
"loss": 0.1811,
"step": 1507
},
{
"epoch": 4.224089635854342,
"grad_norm": 4.951991558074951,
"learning_rate": 3.114303721579939e-06,
"loss": 0.0832,
"step": 1508
},
{
"epoch": 4.226890756302521,
"grad_norm": 0.6301396489143372,
"learning_rate": 3.092332998903416e-06,
"loss": 0.0246,
"step": 1509
},
{
"epoch": 4.2296918767507,
"grad_norm": 1.0005985498428345,
"learning_rate": 3.0704349399351435e-06,
"loss": 0.0359,
"step": 1510
},
{
"epoch": 4.23249299719888,
"grad_norm": 6.002378463745117,
"learning_rate": 3.04860961730698e-06,
"loss": 0.098,
"step": 1511
},
{
"epoch": 4.235294117647059,
"grad_norm": 6.9902825355529785,
"learning_rate": 3.0268571034094944e-06,
"loss": 0.1459,
"step": 1512
},
{
"epoch": 4.238095238095238,
"grad_norm": 2.461008071899414,
"learning_rate": 3.0051774703917958e-06,
"loss": 0.0408,
"step": 1513
},
{
"epoch": 4.240896358543417,
"grad_norm": 1.7934045791625977,
"learning_rate": 2.983570790161236e-06,
"loss": 0.074,
"step": 1514
},
{
"epoch": 4.243697478991597,
"grad_norm": 4.491225719451904,
"learning_rate": 2.962037134383211e-06,
"loss": 0.0882,
"step": 1515
},
{
"epoch": 4.246498599439776,
"grad_norm": 2.34145188331604,
"learning_rate": 2.9405765744808994e-06,
"loss": 0.0591,
"step": 1516
},
{
"epoch": 4.249299719887955,
"grad_norm": 13.59142780303955,
"learning_rate": 2.919189181635032e-06,
"loss": 0.1959,
"step": 1517
},
{
"epoch": 4.2521008403361344,
"grad_norm": 2.3551502227783203,
"learning_rate": 2.8978750267836754e-06,
"loss": 0.0537,
"step": 1518
},
{
"epoch": 4.254901960784314,
"grad_norm": 1.709115743637085,
"learning_rate": 2.8766341806219564e-06,
"loss": 0.0421,
"step": 1519
},
{
"epoch": 4.257703081232493,
"grad_norm": 3.4162774085998535,
"learning_rate": 2.8554667136018676e-06,
"loss": 0.056,
"step": 1520
},
{
"epoch": 4.260504201680672,
"grad_norm": 1.602628231048584,
"learning_rate": 2.8343726959320082e-06,
"loss": 0.0559,
"step": 1521
},
{
"epoch": 4.2633053221288515,
"grad_norm": 1.7536978721618652,
"learning_rate": 2.8133521975773543e-06,
"loss": 0.02,
"step": 1522
},
{
"epoch": 4.266106442577031,
"grad_norm": 15.273795127868652,
"learning_rate": 2.7924052882590505e-06,
"loss": 0.242,
"step": 1523
},
{
"epoch": 4.26890756302521,
"grad_norm": 3.253753662109375,
"learning_rate": 2.771532037454136e-06,
"loss": 0.0336,
"step": 1524
},
{
"epoch": 4.271708683473389,
"grad_norm": 11.549338340759277,
"learning_rate": 2.7507325143953626e-06,
"loss": 0.1732,
"step": 1525
},
{
"epoch": 4.2745098039215685,
"grad_norm": 2.729703903198242,
"learning_rate": 2.730006788070924e-06,
"loss": 0.0792,
"step": 1526
},
{
"epoch": 4.277310924369748,
"grad_norm": 2.084292411804199,
"learning_rate": 2.7093549272242447e-06,
"loss": 0.0281,
"step": 1527
},
{
"epoch": 4.280112044817927,
"grad_norm": 2.7604475021362305,
"learning_rate": 2.6887770003537595e-06,
"loss": 0.0495,
"step": 1528
},
{
"epoch": 4.282913165266106,
"grad_norm": 3.681213617324829,
"learning_rate": 2.668273075712663e-06,
"loss": 0.0607,
"step": 1529
},
{
"epoch": 4.285714285714286,
"grad_norm": 1.6016509532928467,
"learning_rate": 2.6478432213087213e-06,
"loss": 0.05,
"step": 1530
},
{
"epoch": 4.288515406162465,
"grad_norm": 1.4813611507415771,
"learning_rate": 2.6274875049040025e-06,
"loss": 0.0454,
"step": 1531
},
{
"epoch": 4.291316526610644,
"grad_norm": 2.653531551361084,
"learning_rate": 2.6072059940146775e-06,
"loss": 0.0342,
"step": 1532
},
{
"epoch": 4.294117647058823,
"grad_norm": 2.021646022796631,
"learning_rate": 2.5869987559107995e-06,
"loss": 0.082,
"step": 1533
},
{
"epoch": 4.296918767507003,
"grad_norm": 3.9387238025665283,
"learning_rate": 2.5668658576160658e-06,
"loss": 0.0846,
"step": 1534
},
{
"epoch": 4.299719887955182,
"grad_norm": 8.06607723236084,
"learning_rate": 2.5468073659076e-06,
"loss": 0.1411,
"step": 1535
},
{
"epoch": 4.302521008403361,
"grad_norm": 1.206432819366455,
"learning_rate": 2.526823347315729e-06,
"loss": 0.0408,
"step": 1536
},
{
"epoch": 4.30532212885154,
"grad_norm": 4.089168071746826,
"learning_rate": 2.5069138681237862e-06,
"loss": 0.052,
"step": 1537
},
{
"epoch": 4.30812324929972,
"grad_norm": 2.05041766166687,
"learning_rate": 2.487078994367853e-06,
"loss": 0.0328,
"step": 1538
},
{
"epoch": 4.310924369747899,
"grad_norm": 3.3000848293304443,
"learning_rate": 2.467318791836559e-06,
"loss": 0.0677,
"step": 1539
},
{
"epoch": 4.313725490196078,
"grad_norm": 9.812804222106934,
"learning_rate": 2.4476333260708696e-06,
"loss": 0.1494,
"step": 1540
},
{
"epoch": 4.3165266106442575,
"grad_norm": 1.3348217010498047,
"learning_rate": 2.42802266236386e-06,
"loss": 0.025,
"step": 1541
},
{
"epoch": 4.319327731092437,
"grad_norm": 2.383312463760376,
"learning_rate": 2.4084868657604954e-06,
"loss": 0.0773,
"step": 1542
},
{
"epoch": 4.322128851540616,
"grad_norm": 5.496630668640137,
"learning_rate": 2.389026001057429e-06,
"loss": 0.0905,
"step": 1543
},
{
"epoch": 4.324929971988795,
"grad_norm": 2.8773908615112305,
"learning_rate": 2.3696401328027806e-06,
"loss": 0.0499,
"step": 1544
},
{
"epoch": 4.3277310924369745,
"grad_norm": 6.333166122436523,
"learning_rate": 2.3503293252959136e-06,
"loss": 0.0853,
"step": 1545
},
{
"epoch": 4.330532212885154,
"grad_norm": 7.183204650878906,
"learning_rate": 2.331093642587223e-06,
"loss": 0.1166,
"step": 1546
},
{
"epoch": 4.333333333333333,
"grad_norm": 1.5117511749267578,
"learning_rate": 2.3119331484779432e-06,
"loss": 0.0452,
"step": 1547
},
{
"epoch": 4.336134453781512,
"grad_norm": 7.2051496505737305,
"learning_rate": 2.292847906519907e-06,
"loss": 0.0896,
"step": 1548
},
{
"epoch": 4.338935574229692,
"grad_norm": 6.0130181312561035,
"learning_rate": 2.273837980015364e-06,
"loss": 0.0912,
"step": 1549
},
{
"epoch": 4.341736694677871,
"grad_norm": 3.2291836738586426,
"learning_rate": 2.25490343201675e-06,
"loss": 0.0754,
"step": 1550
},
{
"epoch": 4.34453781512605,
"grad_norm": 1.2099117040634155,
"learning_rate": 2.236044325326478e-06,
"loss": 0.0492,
"step": 1551
},
{
"epoch": 4.347338935574229,
"grad_norm": 2.219330310821533,
"learning_rate": 2.2172607224967446e-06,
"loss": 0.0486,
"step": 1552
},
{
"epoch": 4.350140056022409,
"grad_norm": 11.192909240722656,
"learning_rate": 2.198552685829308e-06,
"loss": 0.264,
"step": 1553
},
{
"epoch": 4.352941176470588,
"grad_norm": 9.376605033874512,
"learning_rate": 2.1799202773752943e-06,
"loss": 0.3257,
"step": 1554
},
{
"epoch": 4.355742296918767,
"grad_norm": 10.101871490478516,
"learning_rate": 2.1613635589349756e-06,
"loss": 0.1396,
"step": 1555
},
{
"epoch": 4.358543417366946,
"grad_norm": 4.612691402435303,
"learning_rate": 2.142882592057588e-06,
"loss": 0.0681,
"step": 1556
},
{
"epoch": 4.361344537815126,
"grad_norm": 2.978179693222046,
"learning_rate": 2.1244774380410975e-06,
"loss": 0.0873,
"step": 1557
},
{
"epoch": 4.364145658263305,
"grad_norm": 3.6708643436431885,
"learning_rate": 2.1061481579320227e-06,
"loss": 0.0857,
"step": 1558
},
{
"epoch": 4.366946778711484,
"grad_norm": 2.827366352081299,
"learning_rate": 2.087894812525218e-06,
"loss": 0.0473,
"step": 1559
},
{
"epoch": 4.369747899159664,
"grad_norm": 1.7137267589569092,
"learning_rate": 2.0697174623636794e-06,
"loss": 0.0266,
"step": 1560
},
{
"epoch": 4.372549019607844,
"grad_norm": 2.542445421218872,
"learning_rate": 2.0516161677383322e-06,
"loss": 0.0781,
"step": 1561
},
{
"epoch": 4.375350140056023,
"grad_norm": 2.5947771072387695,
"learning_rate": 2.0335909886878583e-06,
"loss": 0.0503,
"step": 1562
},
{
"epoch": 4.378151260504202,
"grad_norm": 1.3462882041931152,
"learning_rate": 2.015641984998459e-06,
"loss": 0.0376,
"step": 1563
},
{
"epoch": 4.380952380952381,
"grad_norm": 5.996583938598633,
"learning_rate": 1.9977692162036876e-06,
"loss": 0.092,
"step": 1564
},
{
"epoch": 4.383753501400561,
"grad_norm": 2.0592055320739746,
"learning_rate": 1.9799727415842323e-06,
"loss": 0.0564,
"step": 1565
},
{
"epoch": 4.38655462184874,
"grad_norm": 2.7037978172302246,
"learning_rate": 1.9622526201677345e-06,
"loss": 0.0555,
"step": 1566
},
{
"epoch": 4.389355742296919,
"grad_norm": 15.573929786682129,
"learning_rate": 1.944608910728582e-06,
"loss": 0.212,
"step": 1567
},
{
"epoch": 4.392156862745098,
"grad_norm": 2.237061023712158,
"learning_rate": 1.9270416717877106e-06,
"loss": 0.0696,
"step": 1568
},
{
"epoch": 4.394957983193278,
"grad_norm": 2.5163397789001465,
"learning_rate": 1.9095509616124385e-06,
"loss": 0.0457,
"step": 1569
},
{
"epoch": 4.397759103641457,
"grad_norm": 13.488228797912598,
"learning_rate": 1.8921368382162353e-06,
"loss": 0.1515,
"step": 1570
},
{
"epoch": 4.400560224089636,
"grad_norm": 13.238293647766113,
"learning_rate": 1.8747993593585477e-06,
"loss": 0.1779,
"step": 1571
},
{
"epoch": 4.4033613445378155,
"grad_norm": 2.18967604637146,
"learning_rate": 1.8575385825446102e-06,
"loss": 0.0456,
"step": 1572
},
{
"epoch": 4.406162464985995,
"grad_norm": 6.665509223937988,
"learning_rate": 1.8403545650252456e-06,
"loss": 0.0974,
"step": 1573
},
{
"epoch": 4.408963585434174,
"grad_norm": 1.9710391759872437,
"learning_rate": 1.8232473637966874e-06,
"loss": 0.0461,
"step": 1574
},
{
"epoch": 4.411764705882353,
"grad_norm": 2.088134765625,
"learning_rate": 1.8062170356003855e-06,
"loss": 0.0685,
"step": 1575
},
{
"epoch": 4.4145658263305325,
"grad_norm": 3.6738247871398926,
"learning_rate": 1.7892636369228083e-06,
"loss": 0.0494,
"step": 1576
},
{
"epoch": 4.417366946778712,
"grad_norm": 6.500718116760254,
"learning_rate": 1.7723872239952644e-06,
"loss": 0.0939,
"step": 1577
},
{
"epoch": 4.420168067226891,
"grad_norm": 2.2293624877929688,
"learning_rate": 1.7555878527937164e-06,
"loss": 0.0303,
"step": 1578
},
{
"epoch": 4.42296918767507,
"grad_norm": 1.980463981628418,
"learning_rate": 1.7388655790385927e-06,
"loss": 0.0496,
"step": 1579
},
{
"epoch": 4.42577030812325,
"grad_norm": 11.144430160522461,
"learning_rate": 1.7222204581946038e-06,
"loss": 0.1785,
"step": 1580
},
{
"epoch": 4.428571428571429,
"grad_norm": 6.809680938720703,
"learning_rate": 1.7056525454705623e-06,
"loss": 0.0816,
"step": 1581
},
{
"epoch": 4.431372549019608,
"grad_norm": 1.3695803880691528,
"learning_rate": 1.6891618958191884e-06,
"loss": 0.0334,
"step": 1582
},
{
"epoch": 4.434173669467787,
"grad_norm": 14.421440124511719,
"learning_rate": 1.6727485639369433e-06,
"loss": 0.2655,
"step": 1583
},
{
"epoch": 4.436974789915967,
"grad_norm": 1.357820749282837,
"learning_rate": 1.656412604263824e-06,
"loss": 0.0441,
"step": 1584
},
{
"epoch": 4.439775910364146,
"grad_norm": 3.2685511112213135,
"learning_rate": 1.6401540709832242e-06,
"loss": 0.0625,
"step": 1585
},
{
"epoch": 4.442577030812325,
"grad_norm": 3.0453453063964844,
"learning_rate": 1.6239730180217016e-06,
"loss": 0.0605,
"step": 1586
},
{
"epoch": 4.445378151260504,
"grad_norm": 3.6903421878814697,
"learning_rate": 1.6078694990488392e-06,
"loss": 0.072,
"step": 1587
},
{
"epoch": 4.448179271708684,
"grad_norm": 1.6126353740692139,
"learning_rate": 1.5918435674770582e-06,
"loss": 0.0598,
"step": 1588
},
{
"epoch": 4.450980392156863,
"grad_norm": 1.6641380786895752,
"learning_rate": 1.5758952764614255e-06,
"loss": 0.0613,
"step": 1589
},
{
"epoch": 4.453781512605042,
"grad_norm": 1.817109227180481,
"learning_rate": 1.5600246788994938e-06,
"loss": 0.0408,
"step": 1590
},
{
"epoch": 4.456582633053221,
"grad_norm": 2.2255218029022217,
"learning_rate": 1.5442318274311223e-06,
"loss": 0.0784,
"step": 1591
},
{
"epoch": 4.459383753501401,
"grad_norm": 10.7179536819458,
"learning_rate": 1.5285167744382933e-06,
"loss": 0.1236,
"step": 1592
},
{
"epoch": 4.46218487394958,
"grad_norm": 16.515138626098633,
"learning_rate": 1.5128795720449617e-06,
"loss": 0.2339,
"step": 1593
},
{
"epoch": 4.464985994397759,
"grad_norm": 1.4610596895217896,
"learning_rate": 1.497320272116845e-06,
"loss": 0.0405,
"step": 1594
},
{
"epoch": 4.4677871148459385,
"grad_norm": 6.288466930389404,
"learning_rate": 1.4818389262612947e-06,
"loss": 0.1256,
"step": 1595
},
{
"epoch": 4.470588235294118,
"grad_norm": 2.1592063903808594,
"learning_rate": 1.4664355858270862e-06,
"loss": 0.0313,
"step": 1596
},
{
"epoch": 4.473389355742297,
"grad_norm": 4.703954219818115,
"learning_rate": 1.451110301904271e-06,
"loss": 0.0736,
"step": 1597
},
{
"epoch": 4.476190476190476,
"grad_norm": 3.23107647895813,
"learning_rate": 1.4358631253240024e-06,
"loss": 0.0381,
"step": 1598
},
{
"epoch": 4.4789915966386555,
"grad_norm": 5.452665328979492,
"learning_rate": 1.420694106658363e-06,
"loss": 0.3105,
"step": 1599
},
{
"epoch": 4.481792717086835,
"grad_norm": 3.9024038314819336,
"learning_rate": 1.4056032962202036e-06,
"loss": 0.1058,
"step": 1600
},
{
"epoch": 4.484593837535014,
"grad_norm": 1.7305136919021606,
"learning_rate": 1.3905907440629752e-06,
"loss": 0.05,
"step": 1601
},
{
"epoch": 4.487394957983193,
"grad_norm": 2.359851121902466,
"learning_rate": 1.3756564999805515e-06,
"loss": 0.0409,
"step": 1602
},
{
"epoch": 4.490196078431373,
"grad_norm": 2.569563388824463,
"learning_rate": 1.3608006135070767e-06,
"loss": 0.026,
"step": 1603
},
{
"epoch": 4.492997198879552,
"grad_norm": 5.982087135314941,
"learning_rate": 1.3460231339168016e-06,
"loss": 0.152,
"step": 1604
},
{
"epoch": 4.495798319327731,
"grad_norm": 6.946103572845459,
"learning_rate": 1.3313241102239054e-06,
"loss": 0.2391,
"step": 1605
},
{
"epoch": 4.49859943977591,
"grad_norm": 1.6757957935333252,
"learning_rate": 1.3167035911823584e-06,
"loss": 0.0172,
"step": 1606
},
{
"epoch": 4.50140056022409,
"grad_norm": 19.9091739654541,
"learning_rate": 1.302161625285736e-06,
"loss": 0.2357,
"step": 1607
},
{
"epoch": 4.504201680672269,
"grad_norm": 11.021063804626465,
"learning_rate": 1.2876982607670673e-06,
"loss": 0.0897,
"step": 1608
},
{
"epoch": 4.507002801120448,
"grad_norm": 1.462570309638977,
"learning_rate": 1.2733135455986755e-06,
"loss": 0.0574,
"step": 1609
},
{
"epoch": 4.509803921568627,
"grad_norm": 10.852596282958984,
"learning_rate": 1.2590075274920205e-06,
"loss": 0.1951,
"step": 1610
},
{
"epoch": 4.512605042016807,
"grad_norm": 3.872061014175415,
"learning_rate": 1.2447802538975345e-06,
"loss": 0.07,
"step": 1611
},
{
"epoch": 4.515406162464986,
"grad_norm": 1.7108198404312134,
"learning_rate": 1.2306317720044702e-06,
"loss": 0.0332,
"step": 1612
},
{
"epoch": 4.518207282913165,
"grad_norm": 1.4256621599197388,
"learning_rate": 1.2165621287407502e-06,
"loss": 0.0554,
"step": 1613
},
{
"epoch": 4.5210084033613445,
"grad_norm": 1.342751383781433,
"learning_rate": 1.2025713707727954e-06,
"loss": 0.0231,
"step": 1614
},
{
"epoch": 4.523809523809524,
"grad_norm": 3.968289375305176,
"learning_rate": 1.1886595445053745e-06,
"loss": 0.0447,
"step": 1615
},
{
"epoch": 4.526610644257703,
"grad_norm": 3.041936159133911,
"learning_rate": 1.1748266960814685e-06,
"loss": 0.0408,
"step": 1616
},
{
"epoch": 4.529411764705882,
"grad_norm": 2.2960402965545654,
"learning_rate": 1.1610728713820906e-06,
"loss": 0.0288,
"step": 1617
},
{
"epoch": 4.5322128851540615,
"grad_norm": 2.9857239723205566,
"learning_rate": 1.147398116026152e-06,
"loss": 0.0326,
"step": 1618
},
{
"epoch": 4.535014005602241,
"grad_norm": 11.323493003845215,
"learning_rate": 1.1338024753703074e-06,
"loss": 0.215,
"step": 1619
},
{
"epoch": 4.53781512605042,
"grad_norm": 14.352209091186523,
"learning_rate": 1.120285994508799e-06,
"loss": 0.1771,
"step": 1620
},
{
"epoch": 4.540616246498599,
"grad_norm": 1.062800407409668,
"learning_rate": 1.1068487182733094e-06,
"loss": 0.0246,
"step": 1621
},
{
"epoch": 4.543417366946779,
"grad_norm": 1.4831277132034302,
"learning_rate": 1.0934906912328208e-06,
"loss": 0.0562,
"step": 1622
},
{
"epoch": 4.546218487394958,
"grad_norm": 7.015320301055908,
"learning_rate": 1.08021195769345e-06,
"loss": 0.2891,
"step": 1623
},
{
"epoch": 4.549019607843137,
"grad_norm": 3.3961236476898193,
"learning_rate": 1.067012561698319e-06,
"loss": 0.0467,
"step": 1624
},
{
"epoch": 4.551820728291316,
"grad_norm": 6.722127914428711,
"learning_rate": 1.0538925470274019e-06,
"loss": 0.0768,
"step": 1625
},
{
"epoch": 4.554621848739496,
"grad_norm": 2.404068946838379,
"learning_rate": 1.0408519571973807e-06,
"loss": 0.052,
"step": 1626
},
{
"epoch": 4.557422969187675,
"grad_norm": 9.304272651672363,
"learning_rate": 1.0278908354614924e-06,
"loss": 0.3272,
"step": 1627
},
{
"epoch": 4.560224089635854,
"grad_norm": 12.329230308532715,
"learning_rate": 1.0150092248094018e-06,
"loss": 0.1709,
"step": 1628
},
{
"epoch": 4.563025210084033,
"grad_norm": 11.739825248718262,
"learning_rate": 1.0022071679670424e-06,
"loss": 0.2198,
"step": 1629
},
{
"epoch": 4.565826330532213,
"grad_norm": 10.221098899841309,
"learning_rate": 9.894847073964876e-07,
"loss": 0.1181,
"step": 1630
},
{
"epoch": 4.568627450980392,
"grad_norm": 9.965012550354004,
"learning_rate": 9.768418852957984e-07,
"loss": 0.1115,
"step": 1631
},
{
"epoch": 4.571428571428571,
"grad_norm": 2.6115660667419434,
"learning_rate": 9.642787435989009e-07,
"loss": 0.0291,
"step": 1632
},
{
"epoch": 4.57422969187675,
"grad_norm": 2.7376770973205566,
"learning_rate": 9.51795323975424e-07,
"loss": 0.0363,
"step": 1633
},
{
"epoch": 4.57703081232493,
"grad_norm": 12.229462623596191,
"learning_rate": 9.39391667830583e-07,
"loss": 0.1141,
"step": 1634
},
{
"epoch": 4.579831932773109,
"grad_norm": 1.8331098556518555,
"learning_rate": 9.270678163050217e-07,
"loss": 0.0228,
"step": 1635
},
{
"epoch": 4.582633053221288,
"grad_norm": 2.132624864578247,
"learning_rate": 9.148238102746898e-07,
"loss": 0.0509,
"step": 1636
},
{
"epoch": 4.5854341736694675,
"grad_norm": 2.425008773803711,
"learning_rate": 9.026596903507156e-07,
"loss": 0.0308,
"step": 1637
},
{
"epoch": 4.588235294117647,
"grad_norm": 2.0792341232299805,
"learning_rate": 8.905754968792479e-07,
"loss": 0.0576,
"step": 1638
},
{
"epoch": 4.591036414565826,
"grad_norm": 1.6930265426635742,
"learning_rate": 8.78571269941339e-07,
"loss": 0.0766,
"step": 1639
},
{
"epoch": 4.593837535014005,
"grad_norm": 2.5589306354522705,
"learning_rate": 8.666470493528006e-07,
"loss": 0.0772,
"step": 1640
},
{
"epoch": 4.5966386554621845,
"grad_norm": 3.1330699920654297,
"learning_rate": 8.548028746640846e-07,
"loss": 0.0485,
"step": 1641
},
{
"epoch": 4.599439775910364,
"grad_norm": 4.326323509216309,
"learning_rate": 8.43038785160144e-07,
"loss": 0.0909,
"step": 1642
},
{
"epoch": 4.602240896358543,
"grad_norm": 3.274137496948242,
"learning_rate": 8.313548198603028e-07,
"loss": 0.0732,
"step": 1643
},
{
"epoch": 4.605042016806722,
"grad_norm": 4.196802139282227,
"learning_rate": 8.197510175181278e-07,
"loss": 0.0803,
"step": 1644
},
{
"epoch": 4.607843137254902,
"grad_norm": 5.336474895477295,
"learning_rate": 8.082274166213016e-07,
"loss": 0.1106,
"step": 1645
},
{
"epoch": 4.610644257703081,
"grad_norm": 14.818458557128906,
"learning_rate": 7.96784055391489e-07,
"loss": 0.1967,
"step": 1646
},
{
"epoch": 4.61344537815126,
"grad_norm": 4.849156856536865,
"learning_rate": 7.854209717842231e-07,
"loss": 0.0707,
"step": 1647
},
{
"epoch": 4.616246498599439,
"grad_norm": 2.6177124977111816,
"learning_rate": 7.741382034887612e-07,
"loss": 0.0938,
"step": 1648
},
{
"epoch": 4.619047619047619,
"grad_norm": 1.4032827615737915,
"learning_rate": 7.629357879279764e-07,
"loss": 0.0563,
"step": 1649
},
{
"epoch": 4.621848739495798,
"grad_norm": 2.6066806316375732,
"learning_rate": 7.518137622582188e-07,
"loss": 0.0321,
"step": 1650
},
{
"epoch": 4.624649859943977,
"grad_norm": 3.374037027359009,
"learning_rate": 7.407721633692133e-07,
"loss": 0.1005,
"step": 1651
},
{
"epoch": 4.627450980392156,
"grad_norm": 2.4223220348358154,
"learning_rate": 7.298110278839088e-07,
"loss": 0.043,
"step": 1652
},
{
"epoch": 4.630252100840336,
"grad_norm": 1.6764289140701294,
"learning_rate": 7.189303921583818e-07,
"loss": 0.0259,
"step": 1653
},
{
"epoch": 4.633053221288515,
"grad_norm": 15.260522842407227,
"learning_rate": 7.08130292281703e-07,
"loss": 0.5412,
"step": 1654
},
{
"epoch": 4.635854341736694,
"grad_norm": 0.99358069896698,
"learning_rate": 6.974107640758177e-07,
"loss": 0.0334,
"step": 1655
},
{
"epoch": 4.6386554621848735,
"grad_norm": 9.05520248413086,
"learning_rate": 6.867718430954351e-07,
"loss": 0.1177,
"step": 1656
},
{
"epoch": 4.641456582633054,
"grad_norm": 2.503553867340088,
"learning_rate": 6.762135646279005e-07,
"loss": 0.0787,
"step": 1657
},
{
"epoch": 4.644257703081233,
"grad_norm": 2.3541131019592285,
"learning_rate": 6.657359636930871e-07,
"loss": 0.0636,
"step": 1658
},
{
"epoch": 4.647058823529412,
"grad_norm": 1.380083441734314,
"learning_rate": 6.553390750432709e-07,
"loss": 0.0463,
"step": 1659
},
{
"epoch": 4.649859943977591,
"grad_norm": 0.6362089514732361,
"learning_rate": 6.450229331630253e-07,
"loss": 0.0259,
"step": 1660
},
{
"epoch": 4.652661064425771,
"grad_norm": 3.1322717666625977,
"learning_rate": 6.347875722690938e-07,
"loss": 0.0531,
"step": 1661
},
{
"epoch": 4.65546218487395,
"grad_norm": 2.5103180408477783,
"learning_rate": 6.246330263102895e-07,
"loss": 0.0489,
"step": 1662
},
{
"epoch": 4.658263305322129,
"grad_norm": 2.5276596546173096,
"learning_rate": 6.145593289673818e-07,
"loss": 0.0431,
"step": 1663
},
{
"epoch": 4.661064425770308,
"grad_norm": 11.838348388671875,
"learning_rate": 6.045665136529682e-07,
"loss": 0.1969,
"step": 1664
},
{
"epoch": 4.663865546218488,
"grad_norm": 12.39102554321289,
"learning_rate": 5.946546135113862e-07,
"loss": 0.1764,
"step": 1665
},
{
"epoch": 4.666666666666667,
"grad_norm": 2.167029857635498,
"learning_rate": 5.848236614185875e-07,
"loss": 0.0451,
"step": 1666
},
{
"epoch": 4.669467787114846,
"grad_norm": 2.474241018295288,
"learning_rate": 5.750736899820335e-07,
"loss": 0.0549,
"step": 1667
},
{
"epoch": 4.6722689075630255,
"grad_norm": 1.713193655014038,
"learning_rate": 5.654047315405892e-07,
"loss": 0.0457,
"step": 1668
},
{
"epoch": 4.675070028011205,
"grad_norm": 2.2506814002990723,
"learning_rate": 5.558168181644147e-07,
"loss": 0.0671,
"step": 1669
},
{
"epoch": 4.677871148459384,
"grad_norm": 2.6729841232299805,
"learning_rate": 5.463099816548579e-07,
"loss": 0.0623,
"step": 1670
},
{
"epoch": 4.680672268907563,
"grad_norm": 2.53517746925354,
"learning_rate": 5.368842535443507e-07,
"loss": 0.0728,
"step": 1671
},
{
"epoch": 4.6834733893557425,
"grad_norm": 3.004045248031616,
"learning_rate": 5.275396650963022e-07,
"loss": 0.0366,
"step": 1672
},
{
"epoch": 4.686274509803922,
"grad_norm": 2.0710742473602295,
"learning_rate": 5.182762473049968e-07,
"loss": 0.08,
"step": 1673
},
{
"epoch": 4.689075630252101,
"grad_norm": 2.197892904281616,
"learning_rate": 5.09094030895485e-07,
"loss": 0.0494,
"step": 1674
},
{
"epoch": 4.69187675070028,
"grad_norm": 2.688249349594116,
"learning_rate": 4.999930463234964e-07,
"loss": 0.0281,
"step": 1675
},
{
"epoch": 4.69467787114846,
"grad_norm": 11.430791854858398,
"learning_rate": 4.909733237753261e-07,
"loss": 0.1581,
"step": 1676
},
{
"epoch": 4.697478991596639,
"grad_norm": 3.3218257427215576,
"learning_rate": 4.820348931677349e-07,
"loss": 0.0395,
"step": 1677
},
{
"epoch": 4.700280112044818,
"grad_norm": 3.3020308017730713,
"learning_rate": 4.731777841478574e-07,
"loss": 0.0405,
"step": 1678
},
{
"epoch": 4.703081232492997,
"grad_norm": 8.893743515014648,
"learning_rate": 4.644020260930998e-07,
"loss": 0.1208,
"step": 1679
},
{
"epoch": 4.705882352941177,
"grad_norm": 2.8727614879608154,
"learning_rate": 4.5570764811103674e-07,
"loss": 0.0331,
"step": 1680
},
{
"epoch": 4.708683473389356,
"grad_norm": 1.768341302871704,
"learning_rate": 4.470946790393199e-07,
"loss": 0.0732,
"step": 1681
},
{
"epoch": 4.711484593837535,
"grad_norm": 5.500733375549316,
"learning_rate": 4.3856314744559477e-07,
"loss": 0.139,
"step": 1682
},
{
"epoch": 4.714285714285714,
"grad_norm": 7.111898422241211,
"learning_rate": 4.301130816273813e-07,
"loss": 0.2025,
"step": 1683
},
{
"epoch": 4.717086834733894,
"grad_norm": 2.4582126140594482,
"learning_rate": 4.217445096119932e-07,
"loss": 0.0387,
"step": 1684
},
{
"epoch": 4.719887955182073,
"grad_norm": 8.703099250793457,
"learning_rate": 4.134574591564494e-07,
"loss": 0.1172,
"step": 1685
},
{
"epoch": 4.722689075630252,
"grad_norm": 8.003055572509766,
"learning_rate": 4.05251957747374e-07,
"loss": 0.1201,
"step": 1686
},
{
"epoch": 4.7254901960784315,
"grad_norm": 1.542994499206543,
"learning_rate": 3.9712803260090467e-07,
"loss": 0.0397,
"step": 1687
},
{
"epoch": 4.728291316526611,
"grad_norm": 6.328291893005371,
"learning_rate": 3.8908571066261503e-07,
"loss": 0.0821,
"step": 1688
},
{
"epoch": 4.73109243697479,
"grad_norm": 2.74688720703125,
"learning_rate": 3.811250186074089e-07,
"loss": 0.0822,
"step": 1689
},
{
"epoch": 4.733893557422969,
"grad_norm": 8.745429039001465,
"learning_rate": 3.732459828394402e-07,
"loss": 0.1282,
"step": 1690
},
{
"epoch": 4.7366946778711485,
"grad_norm": 18.845354080200195,
"learning_rate": 3.654486294920212e-07,
"loss": 0.2173,
"step": 1691
},
{
"epoch": 4.739495798319328,
"grad_norm": 1.8658539056777954,
"learning_rate": 3.57732984427539e-07,
"loss": 0.069,
"step": 1692
},
{
"epoch": 4.742296918767507,
"grad_norm": 2.340552806854248,
"learning_rate": 3.5009907323737825e-07,
"loss": 0.0434,
"step": 1693
},
{
"epoch": 4.745098039215686,
"grad_norm": 7.247988224029541,
"learning_rate": 3.425469212418125e-07,
"loss": 0.142,
"step": 1694
},
{
"epoch": 4.7478991596638656,
"grad_norm": 7.049402713775635,
"learning_rate": 3.350765534899519e-07,
"loss": 0.0934,
"step": 1695
},
{
"epoch": 4.750700280112045,
"grad_norm": 3.384856939315796,
"learning_rate": 3.276879947596262e-07,
"loss": 0.0324,
"step": 1696
},
{
"epoch": 4.753501400560224,
"grad_norm": 2.604999303817749,
"learning_rate": 3.203812695573322e-07,
"loss": 0.0385,
"step": 1697
},
{
"epoch": 4.756302521008403,
"grad_norm": 2.4760732650756836,
"learning_rate": 3.131564021181338e-07,
"loss": 0.0591,
"step": 1698
},
{
"epoch": 4.759103641456583,
"grad_norm": 3.1976983547210693,
"learning_rate": 3.0601341640559276e-07,
"loss": 0.0345,
"step": 1699
},
{
"epoch": 4.761904761904762,
"grad_norm": 2.2834088802337646,
"learning_rate": 2.9895233611167697e-07,
"loss": 0.0289,
"step": 1700
},
{
"epoch": 4.764705882352941,
"grad_norm": 2.7305784225463867,
"learning_rate": 2.9197318465669363e-07,
"loss": 0.0267,
"step": 1701
},
{
"epoch": 4.76750700280112,
"grad_norm": 2.3291826248168945,
"learning_rate": 2.850759851892093e-07,
"loss": 0.051,
"step": 1702
},
{
"epoch": 4.7703081232493,
"grad_norm": 1.806341528892517,
"learning_rate": 2.7826076058596607e-07,
"loss": 0.075,
"step": 1703
},
{
"epoch": 4.773109243697479,
"grad_norm": 7.36749267578125,
"learning_rate": 2.7152753345181245e-07,
"loss": 0.1183,
"step": 1704
},
{
"epoch": 4.775910364145658,
"grad_norm": 2.591308355331421,
"learning_rate": 2.6487632611962576e-07,
"loss": 0.0547,
"step": 1705
},
{
"epoch": 4.778711484593837,
"grad_norm": 13.91858959197998,
"learning_rate": 2.5830716065023687e-07,
"loss": 0.2223,
"step": 1706
},
{
"epoch": 4.781512605042017,
"grad_norm": 1.7399249076843262,
"learning_rate": 2.518200588323666e-07,
"loss": 0.0415,
"step": 1707
},
{
"epoch": 4.784313725490196,
"grad_norm": 13.468841552734375,
"learning_rate": 2.454150421825396e-07,
"loss": 0.233,
"step": 1708
},
{
"epoch": 4.787114845938375,
"grad_norm": 1.9121205806732178,
"learning_rate": 2.3909213194501513e-07,
"loss": 0.0368,
"step": 1709
},
{
"epoch": 4.7899159663865545,
"grad_norm": 2.452716827392578,
"learning_rate": 2.3285134909173112e-07,
"loss": 0.051,
"step": 1710
},
{
"epoch": 4.792717086834734,
"grad_norm": 2.3272078037261963,
"learning_rate": 2.2669271432221583e-07,
"loss": 0.0213,
"step": 1711
},
{
"epoch": 4.795518207282913,
"grad_norm": 8.261398315429688,
"learning_rate": 2.2061624806352932e-07,
"loss": 0.1205,
"step": 1712
},
{
"epoch": 4.798319327731092,
"grad_norm": 1.9649708271026611,
"learning_rate": 2.1462197047019127e-07,
"loss": 0.0367,
"step": 1713
},
{
"epoch": 4.8011204481792715,
"grad_norm": 1.853360891342163,
"learning_rate": 2.087099014241256e-07,
"loss": 0.034,
"step": 1714
},
{
"epoch": 4.803921568627451,
"grad_norm": 2.8028903007507324,
"learning_rate": 2.0288006053457708e-07,
"loss": 0.0363,
"step": 1715
},
{
"epoch": 4.80672268907563,
"grad_norm": 2.87454891204834,
"learning_rate": 1.9713246713805588e-07,
"loss": 0.0468,
"step": 1716
},
{
"epoch": 4.809523809523809,
"grad_norm": 1.8069205284118652,
"learning_rate": 1.9146714029827374e-07,
"loss": 0.0589,
"step": 1717
},
{
"epoch": 4.812324929971989,
"grad_norm": 7.590226173400879,
"learning_rate": 1.858840988060828e-07,
"loss": 0.1066,
"step": 1718
},
{
"epoch": 4.815126050420168,
"grad_norm": 3.4766526222229004,
"learning_rate": 1.803833611794037e-07,
"loss": 0.0997,
"step": 1719
},
{
"epoch": 4.817927170868347,
"grad_norm": 1.1857209205627441,
"learning_rate": 1.7496494566317245e-07,
"loss": 0.0268,
"step": 1720
},
{
"epoch": 4.820728291316526,
"grad_norm": 9.3518648147583,
"learning_rate": 1.6962887022927977e-07,
"loss": 0.1597,
"step": 1721
},
{
"epoch": 4.823529411764706,
"grad_norm": 1.3974069356918335,
"learning_rate": 1.6437515257650972e-07,
"loss": 0.0423,
"step": 1722
},
{
"epoch": 4.826330532212885,
"grad_norm": 4.613117218017578,
"learning_rate": 1.5920381013047603e-07,
"loss": 0.1046,
"step": 1723
},
{
"epoch": 4.829131652661064,
"grad_norm": 6.378235816955566,
"learning_rate": 1.541148600435721e-07,
"loss": 0.0736,
"step": 1724
},
{
"epoch": 4.831932773109243,
"grad_norm": 2.638489007949829,
"learning_rate": 1.4910831919490996e-07,
"loss": 0.0344,
"step": 1725
},
{
"epoch": 4.834733893557423,
"grad_norm": 3.4409339427948,
"learning_rate": 1.4418420419026746e-07,
"loss": 0.0442,
"step": 1726
},
{
"epoch": 4.837535014005602,
"grad_norm": 1.2881028652191162,
"learning_rate": 1.3934253136203279e-07,
"loss": 0.0546,
"step": 1727
},
{
"epoch": 4.840336134453781,
"grad_norm": 1.417250633239746,
"learning_rate": 1.3458331676914071e-07,
"loss": 0.0331,
"step": 1728
},
{
"epoch": 4.8431372549019605,
"grad_norm": 7.4041428565979,
"learning_rate": 1.2990657619703362e-07,
"loss": 0.1291,
"step": 1729
},
{
"epoch": 4.84593837535014,
"grad_norm": 1.3169057369232178,
"learning_rate": 1.2531232515760328e-07,
"loss": 0.038,
"step": 1730
},
{
"epoch": 4.848739495798319,
"grad_norm": 20.743854522705078,
"learning_rate": 1.2080057888913255e-07,
"loss": 0.3051,
"step": 1731
},
{
"epoch": 4.851540616246498,
"grad_norm": 2.0885722637176514,
"learning_rate": 1.1637135235625929e-07,
"loss": 0.045,
"step": 1732
},
{
"epoch": 4.8543417366946775,
"grad_norm": 1.63369882106781,
"learning_rate": 1.1202466024991254e-07,
"loss": 0.0323,
"step": 1733
},
{
"epoch": 4.857142857142857,
"grad_norm": 12.961833000183105,
"learning_rate": 1.0776051698727362e-07,
"loss": 0.1602,
"step": 1734
},
{
"epoch": 4.859943977591037,
"grad_norm": 12.232168197631836,
"learning_rate": 1.0357893671171792e-07,
"loss": 0.106,
"step": 1735
},
{
"epoch": 4.862745098039216,
"grad_norm": 1.9717285633087158,
"learning_rate": 9.947993329278705e-08,
"loss": 0.0258,
"step": 1736
},
{
"epoch": 4.865546218487395,
"grad_norm": 2.950434446334839,
"learning_rate": 9.546352032611394e-08,
"loss": 0.0649,
"step": 1737
},
{
"epoch": 4.868347338935575,
"grad_norm": 1.6429107189178467,
"learning_rate": 9.152971113340902e-08,
"loss": 0.0167,
"step": 1738
},
{
"epoch": 4.871148459383754,
"grad_norm": 2.8018219470977783,
"learning_rate": 8.767851876239074e-08,
"loss": 0.0726,
"step": 1739
},
{
"epoch": 4.873949579831933,
"grad_norm": 4.093255043029785,
"learning_rate": 8.390995598676066e-08,
"loss": 0.0648,
"step": 1740
},
{
"epoch": 4.8767507002801125,
"grad_norm": 1.5745418071746826,
"learning_rate": 8.022403530614786e-08,
"loss": 0.0832,
"step": 1741
},
{
"epoch": 4.879551820728292,
"grad_norm": 12.333577156066895,
"learning_rate": 7.662076894607573e-08,
"loss": 0.3143,
"step": 1742
},
{
"epoch": 4.882352941176471,
"grad_norm": 1.0486894845962524,
"learning_rate": 7.310016885791471e-08,
"loss": 0.0257,
"step": 1743
},
{
"epoch": 4.88515406162465,
"grad_norm": 1.6549791097640991,
"learning_rate": 6.966224671884903e-08,
"loss": 0.028,
"step": 1744
},
{
"epoch": 4.8879551820728295,
"grad_norm": 1.3248844146728516,
"learning_rate": 6.63070139318378e-08,
"loss": 0.0361,
"step": 1745
},
{
"epoch": 4.890756302521009,
"grad_norm": 7.451545238494873,
"learning_rate": 6.30344816255679e-08,
"loss": 0.153,
"step": 1746
},
{
"epoch": 4.893557422969188,
"grad_norm": 1.918972373008728,
"learning_rate": 5.984466065443173e-08,
"loss": 0.0248,
"step": 1747
},
{
"epoch": 4.896358543417367,
"grad_norm": 6.333373546600342,
"learning_rate": 5.673756159847721e-08,
"loss": 0.1058,
"step": 1748
},
{
"epoch": 4.899159663865547,
"grad_norm": 7.948291778564453,
"learning_rate": 5.371319476338288e-08,
"loss": 0.1141,
"step": 1749
},
{
"epoch": 4.901960784313726,
"grad_norm": 2.0265696048736572,
"learning_rate": 5.0771570180416226e-08,
"loss": 0.0356,
"step": 1750
},
{
"epoch": 4.904761904761905,
"grad_norm": 11.392507553100586,
"learning_rate": 4.7912697606405935e-08,
"loss": 0.1442,
"step": 1751
},
{
"epoch": 4.907563025210084,
"grad_norm": 4.750543594360352,
"learning_rate": 4.5136586523711335e-08,
"loss": 0.1115,
"step": 1752
},
{
"epoch": 4.910364145658264,
"grad_norm": 3.7499547004699707,
"learning_rate": 4.244324614017803e-08,
"loss": 0.0454,
"step": 1753
},
{
"epoch": 4.913165266106443,
"grad_norm": 6.997133255004883,
"learning_rate": 3.983268538912399e-08,
"loss": 0.1016,
"step": 1754
},
{
"epoch": 4.915966386554622,
"grad_norm": 2.791280746459961,
"learning_rate": 3.730491292930072e-08,
"loss": 0.1355,
"step": 1755
},
{
"epoch": 4.918767507002801,
"grad_norm": 2.445356845855713,
"learning_rate": 3.4859937144862684e-08,
"loss": 0.0521,
"step": 1756
},
{
"epoch": 4.921568627450981,
"grad_norm": 17.968975067138672,
"learning_rate": 3.249776614534794e-08,
"loss": 0.3338,
"step": 1757
},
{
"epoch": 4.92436974789916,
"grad_norm": 2.179201126098633,
"learning_rate": 3.0218407765641995e-08,
"loss": 0.0513,
"step": 1758
},
{
"epoch": 4.927170868347339,
"grad_norm": 1.389999270439148,
"learning_rate": 2.8021869565958427e-08,
"loss": 0.0263,
"step": 1759
},
{
"epoch": 4.9299719887955185,
"grad_norm": 20.252166748046875,
"learning_rate": 2.590815883181108e-08,
"loss": 0.281,
"step": 1760
},
{
"epoch": 4.932773109243698,
"grad_norm": 2.8117334842681885,
"learning_rate": 2.387728257399191e-08,
"loss": 0.028,
"step": 1761
},
{
"epoch": 4.935574229691877,
"grad_norm": 3.943692922592163,
"learning_rate": 2.192924752854042e-08,
"loss": 0.0796,
"step": 1762
},
{
"epoch": 4.938375350140056,
"grad_norm": 29.34433364868164,
"learning_rate": 2.006406015673812e-08,
"loss": 0.3524,
"step": 1763
},
{
"epoch": 4.9411764705882355,
"grad_norm": 1.402475118637085,
"learning_rate": 1.8281726645061336e-08,
"loss": 0.0504,
"step": 1764
},
{
"epoch": 4.943977591036415,
"grad_norm": 7.353896141052246,
"learning_rate": 1.658225290518678e-08,
"loss": 0.094,
"step": 1765
},
{
"epoch": 4.946778711484594,
"grad_norm": 1.7359479665756226,
"learning_rate": 1.4965644573958216e-08,
"loss": 0.043,
"step": 1766
},
{
"epoch": 4.949579831932773,
"grad_norm": 2.4236812591552734,
"learning_rate": 1.3431907013367051e-08,
"loss": 0.0693,
"step": 1767
},
{
"epoch": 4.9523809523809526,
"grad_norm": 2.8057467937469482,
"learning_rate": 1.1981045310535676e-08,
"loss": 0.0356,
"step": 1768
},
{
"epoch": 4.955182072829132,
"grad_norm": 1.7950048446655273,
"learning_rate": 1.0613064277711916e-08,
"loss": 0.0315,
"step": 1769
},
{
"epoch": 4.957983193277311,
"grad_norm": 4.1644511222839355,
"learning_rate": 9.327968452232938e-09,
"loss": 0.0458,
"step": 1770
},
{
"epoch": 4.96078431372549,
"grad_norm": 4.522883415222168,
"learning_rate": 8.12576209653082e-09,
"loss": 0.091,
"step": 1771
},
{
"epoch": 4.96358543417367,
"grad_norm": 8.26240348815918,
"learning_rate": 7.006449198096454e-09,
"loss": 0.1189,
"step": 1772
},
{
"epoch": 4.966386554621849,
"grad_norm": 3.297781467437744,
"learning_rate": 5.970033469490655e-09,
"loss": 0.0897,
"step": 1773
},
{
"epoch": 4.969187675070028,
"grad_norm": 8.993282318115234,
"learning_rate": 5.01651834831085e-09,
"loss": 0.0968,
"step": 1774
},
{
"epoch": 4.971988795518207,
"grad_norm": 11.676566123962402,
"learning_rate": 4.1459069971938606e-09,
"loss": 0.1407,
"step": 1775
},
{
"epoch": 4.974789915966387,
"grad_norm": 2.719485282897949,
"learning_rate": 3.358202303796465e-09,
"loss": 0.0762,
"step": 1776
},
{
"epoch": 4.977591036414566,
"grad_norm": 1.5511162281036377,
"learning_rate": 2.653406880789855e-09,
"loss": 0.0516,
"step": 1777
},
{
"epoch": 4.980392156862745,
"grad_norm": 2.307431697845459,
"learning_rate": 2.031523065851304e-09,
"loss": 0.0221,
"step": 1778
},
{
"epoch": 4.983193277310924,
"grad_norm": 9.107227325439453,
"learning_rate": 1.492552921655843e-09,
"loss": 0.1387,
"step": 1779
},
{
"epoch": 4.985994397759104,
"grad_norm": 4.414126873016357,
"learning_rate": 1.0364982358707087e-09,
"loss": 0.0932,
"step": 1780
},
{
"epoch": 4.988795518207283,
"grad_norm": 4.6520819664001465,
"learning_rate": 6.633605211414651e-10,
"loss": 0.0866,
"step": 1781
},
{
"epoch": 4.991596638655462,
"grad_norm": 2.0537655353546143,
"learning_rate": 3.731410150975556e-10,
"loss": 0.0565,
"step": 1782
},
{
"epoch": 4.9943977591036415,
"grad_norm": 1.9146981239318848,
"learning_rate": 1.658406803467516e-10,
"loss": 0.0307,
"step": 1783
},
{
"epoch": 4.997198879551821,
"grad_norm": 1.4750840663909912,
"learning_rate": 4.1460204466825524e-11,
"loss": 0.0387,
"step": 1784
},
{
"epoch": 5.0,
"grad_norm": 1.6340241432189941,
"learning_rate": 0.0,
"loss": 0.0155,
"step": 1785
}
],
"logging_steps": 1,
"max_steps": 1785,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 429013224499200.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}