{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003194888178913738,
"grad_norm": 6.409946841419442,
"learning_rate": 4.2553191489361704e-07,
"loss": 0.9383,
"step": 1
},
{
"epoch": 0.006389776357827476,
"grad_norm": 5.883137519343072,
"learning_rate": 8.510638297872341e-07,
"loss": 0.8501,
"step": 2
},
{
"epoch": 0.009584664536741214,
"grad_norm": 5.946755544649649,
"learning_rate": 1.276595744680851e-06,
"loss": 0.887,
"step": 3
},
{
"epoch": 0.012779552715654952,
"grad_norm": 5.642225969670194,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.8099,
"step": 4
},
{
"epoch": 0.01597444089456869,
"grad_norm": 5.891932559187005,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.876,
"step": 5
},
{
"epoch": 0.019169329073482427,
"grad_norm": 4.9025637632451495,
"learning_rate": 2.553191489361702e-06,
"loss": 0.8158,
"step": 6
},
{
"epoch": 0.022364217252396165,
"grad_norm": 4.57848090524475,
"learning_rate": 2.978723404255319e-06,
"loss": 0.8368,
"step": 7
},
{
"epoch": 0.025559105431309903,
"grad_norm": 2.658016400637032,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.7747,
"step": 8
},
{
"epoch": 0.02875399361022364,
"grad_norm": 2.4579449141086083,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.7551,
"step": 9
},
{
"epoch": 0.03194888178913738,
"grad_norm": 2.072756095086418,
"learning_rate": 4.255319148936171e-06,
"loss": 0.7211,
"step": 10
},
{
"epoch": 0.03514376996805112,
"grad_norm": 3.5764214287428824,
"learning_rate": 4.680851063829788e-06,
"loss": 0.7667,
"step": 11
},
{
"epoch": 0.038338658146964855,
"grad_norm": 4.214989656079249,
"learning_rate": 5.106382978723404e-06,
"loss": 0.8135,
"step": 12
},
{
"epoch": 0.04153354632587859,
"grad_norm": 3.695642879844999,
"learning_rate": 5.531914893617022e-06,
"loss": 0.738,
"step": 13
},
{
"epoch": 0.04472843450479233,
"grad_norm": 4.169046468693503,
"learning_rate": 5.957446808510638e-06,
"loss": 0.7383,
"step": 14
},
{
"epoch": 0.04792332268370607,
"grad_norm": 3.911669767568645,
"learning_rate": 6.382978723404256e-06,
"loss": 0.6796,
"step": 15
},
{
"epoch": 0.051118210862619806,
"grad_norm": 3.3181523814007345,
"learning_rate": 6.808510638297873e-06,
"loss": 0.6902,
"step": 16
},
{
"epoch": 0.054313099041533544,
"grad_norm": 2.287624393972901,
"learning_rate": 7.234042553191491e-06,
"loss": 0.6386,
"step": 17
},
{
"epoch": 0.05750798722044728,
"grad_norm": 1.9124190484261288,
"learning_rate": 7.659574468085107e-06,
"loss": 0.6893,
"step": 18
},
{
"epoch": 0.06070287539936102,
"grad_norm": 1.8739939120437843,
"learning_rate": 8.085106382978723e-06,
"loss": 0.6776,
"step": 19
},
{
"epoch": 0.06389776357827476,
"grad_norm": 2.1937664044946783,
"learning_rate": 8.510638297872341e-06,
"loss": 0.6205,
"step": 20
},
{
"epoch": 0.0670926517571885,
"grad_norm": 2.2715620880143916,
"learning_rate": 8.936170212765958e-06,
"loss": 0.6056,
"step": 21
},
{
"epoch": 0.07028753993610223,
"grad_norm": 1.6081424374861002,
"learning_rate": 9.361702127659576e-06,
"loss": 0.555,
"step": 22
},
{
"epoch": 0.07348242811501597,
"grad_norm": 1.763358212351945,
"learning_rate": 9.787234042553192e-06,
"loss": 0.6666,
"step": 23
},
{
"epoch": 0.07667731629392971,
"grad_norm": 1.4811530280287506,
"learning_rate": 1.0212765957446808e-05,
"loss": 0.6504,
"step": 24
},
{
"epoch": 0.07987220447284345,
"grad_norm": 1.3405676738844483,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.5772,
"step": 25
},
{
"epoch": 0.08306709265175719,
"grad_norm": 1.2991745234008487,
"learning_rate": 1.1063829787234044e-05,
"loss": 0.6445,
"step": 26
},
{
"epoch": 0.08626198083067092,
"grad_norm": 1.0505598061019923,
"learning_rate": 1.1489361702127662e-05,
"loss": 0.5923,
"step": 27
},
{
"epoch": 0.08945686900958466,
"grad_norm": 1.103208098271378,
"learning_rate": 1.1914893617021277e-05,
"loss": 0.6241,
"step": 28
},
{
"epoch": 0.0926517571884984,
"grad_norm": 1.2201972114638524,
"learning_rate": 1.2340425531914895e-05,
"loss": 0.6283,
"step": 29
},
{
"epoch": 0.09584664536741214,
"grad_norm": 1.1112064502121644,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.6223,
"step": 30
},
{
"epoch": 0.09904153354632587,
"grad_norm": 0.9923339302816265,
"learning_rate": 1.3191489361702127e-05,
"loss": 0.5931,
"step": 31
},
{
"epoch": 0.10223642172523961,
"grad_norm": 1.0477203467646528,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.6182,
"step": 32
},
{
"epoch": 0.10543130990415335,
"grad_norm": 1.1019257759272982,
"learning_rate": 1.4042553191489363e-05,
"loss": 0.5675,
"step": 33
},
{
"epoch": 0.10862619808306709,
"grad_norm": 1.0627070873952427,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.596,
"step": 34
},
{
"epoch": 0.11182108626198083,
"grad_norm": 1.1260018768412945,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.5888,
"step": 35
},
{
"epoch": 0.11501597444089456,
"grad_norm": 1.0131578830510992,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.53,
"step": 36
},
{
"epoch": 0.1182108626198083,
"grad_norm": 0.9462086601742236,
"learning_rate": 1.5744680851063832e-05,
"loss": 0.5871,
"step": 37
},
{
"epoch": 0.12140575079872204,
"grad_norm": 1.089392870614615,
"learning_rate": 1.6170212765957446e-05,
"loss": 0.5879,
"step": 38
},
{
"epoch": 0.12460063897763578,
"grad_norm": 0.9813042474086496,
"learning_rate": 1.6595744680851064e-05,
"loss": 0.5539,
"step": 39
},
{
"epoch": 0.12779552715654952,
"grad_norm": 0.8478222123412902,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.5272,
"step": 40
},
{
"epoch": 0.13099041533546327,
"grad_norm": 0.9509316220604798,
"learning_rate": 1.74468085106383e-05,
"loss": 0.5524,
"step": 41
},
{
"epoch": 0.134185303514377,
"grad_norm": 0.9910817013897578,
"learning_rate": 1.7872340425531915e-05,
"loss": 0.5781,
"step": 42
},
{
"epoch": 0.13738019169329074,
"grad_norm": 1.1162723782576445,
"learning_rate": 1.8297872340425533e-05,
"loss": 0.57,
"step": 43
},
{
"epoch": 0.14057507987220447,
"grad_norm": 0.9939969861590426,
"learning_rate": 1.872340425531915e-05,
"loss": 0.5459,
"step": 44
},
{
"epoch": 0.14376996805111822,
"grad_norm": 1.2439803148930444,
"learning_rate": 1.914893617021277e-05,
"loss": 0.5854,
"step": 45
},
{
"epoch": 0.14696485623003194,
"grad_norm": 1.2941430779218954,
"learning_rate": 1.9574468085106384e-05,
"loss": 0.5667,
"step": 46
},
{
"epoch": 0.1501597444089457,
"grad_norm": 1.062058714539539,
"learning_rate": 2e-05,
"loss": 0.6011,
"step": 47
},
{
"epoch": 0.15335463258785942,
"grad_norm": 1.0851192274830292,
"learning_rate": 2.0425531914893616e-05,
"loss": 0.58,
"step": 48
},
{
"epoch": 0.15654952076677317,
"grad_norm": 1.0375577575144754,
"learning_rate": 2.0851063829787238e-05,
"loss": 0.5729,
"step": 49
},
{
"epoch": 0.1597444089456869,
"grad_norm": 1.147815879136087,
"learning_rate": 2.1276595744680852e-05,
"loss": 0.5442,
"step": 50
},
{
"epoch": 0.16293929712460065,
"grad_norm": 1.0221514663668887,
"learning_rate": 2.1702127659574467e-05,
"loss": 0.5857,
"step": 51
},
{
"epoch": 0.16613418530351437,
"grad_norm": 1.0076524214922307,
"learning_rate": 2.2127659574468088e-05,
"loss": 0.6012,
"step": 52
},
{
"epoch": 0.16932907348242812,
"grad_norm": 0.9868930985691252,
"learning_rate": 2.2553191489361703e-05,
"loss": 0.5524,
"step": 53
},
{
"epoch": 0.17252396166134185,
"grad_norm": 0.8832842960378782,
"learning_rate": 2.2978723404255324e-05,
"loss": 0.5442,
"step": 54
},
{
"epoch": 0.1757188498402556,
"grad_norm": 0.8366367306264574,
"learning_rate": 2.340425531914894e-05,
"loss": 0.5611,
"step": 55
},
{
"epoch": 0.17891373801916932,
"grad_norm": 1.0326981018410393,
"learning_rate": 2.3829787234042553e-05,
"loss": 0.5304,
"step": 56
},
{
"epoch": 0.18210862619808307,
"grad_norm": 0.9142673475609777,
"learning_rate": 2.4255319148936175e-05,
"loss": 0.5587,
"step": 57
},
{
"epoch": 0.1853035143769968,
"grad_norm": 1.056499453713147,
"learning_rate": 2.468085106382979e-05,
"loss": 0.5374,
"step": 58
},
{
"epoch": 0.18849840255591055,
"grad_norm": 1.0459906767360574,
"learning_rate": 2.5106382978723404e-05,
"loss": 0.5635,
"step": 59
},
{
"epoch": 0.19169329073482427,
"grad_norm": 1.0845897272985423,
"learning_rate": 2.5531914893617025e-05,
"loss": 0.5536,
"step": 60
},
{
"epoch": 0.19488817891373802,
"grad_norm": 1.2724090682106446,
"learning_rate": 2.595744680851064e-05,
"loss": 0.5549,
"step": 61
},
{
"epoch": 0.19808306709265175,
"grad_norm": 0.9769443793575512,
"learning_rate": 2.6382978723404255e-05,
"loss": 0.5142,
"step": 62
},
{
"epoch": 0.2012779552715655,
"grad_norm": 1.0004128504205176,
"learning_rate": 2.6808510638297876e-05,
"loss": 0.5533,
"step": 63
},
{
"epoch": 0.20447284345047922,
"grad_norm": 1.2165255582257546,
"learning_rate": 2.723404255319149e-05,
"loss": 0.5484,
"step": 64
},
{
"epoch": 0.20766773162939298,
"grad_norm": 1.1589649633600527,
"learning_rate": 2.7659574468085112e-05,
"loss": 0.5833,
"step": 65
},
{
"epoch": 0.2108626198083067,
"grad_norm": 0.8637922666258719,
"learning_rate": 2.8085106382978727e-05,
"loss": 0.5216,
"step": 66
},
{
"epoch": 0.21405750798722045,
"grad_norm": 1.3422386117294691,
"learning_rate": 2.851063829787234e-05,
"loss": 0.5304,
"step": 67
},
{
"epoch": 0.21725239616613418,
"grad_norm": 1.0665495345944047,
"learning_rate": 2.8936170212765963e-05,
"loss": 0.5217,
"step": 68
},
{
"epoch": 0.22044728434504793,
"grad_norm": 1.1974304023992344,
"learning_rate": 2.9361702127659577e-05,
"loss": 0.5758,
"step": 69
},
{
"epoch": 0.22364217252396165,
"grad_norm": 1.055001246229523,
"learning_rate": 2.9787234042553192e-05,
"loss": 0.5959,
"step": 70
},
{
"epoch": 0.2268370607028754,
"grad_norm": 0.9295929863968017,
"learning_rate": 3.0212765957446813e-05,
"loss": 0.5113,
"step": 71
},
{
"epoch": 0.23003194888178913,
"grad_norm": 1.0895682011148657,
"learning_rate": 3.063829787234043e-05,
"loss": 0.5343,
"step": 72
},
{
"epoch": 0.23322683706070288,
"grad_norm": 1.0443823721191292,
"learning_rate": 3.1063829787234046e-05,
"loss": 0.5368,
"step": 73
},
{
"epoch": 0.2364217252396166,
"grad_norm": 1.0778250715462803,
"learning_rate": 3.1489361702127664e-05,
"loss": 0.5187,
"step": 74
},
{
"epoch": 0.23961661341853036,
"grad_norm": 1.1417565226413213,
"learning_rate": 3.191489361702128e-05,
"loss": 0.5248,
"step": 75
},
{
"epoch": 0.24281150159744408,
"grad_norm": 1.1170007131913982,
"learning_rate": 3.234042553191489e-05,
"loss": 0.547,
"step": 76
},
{
"epoch": 0.24600638977635783,
"grad_norm": 1.1070104668983525,
"learning_rate": 3.276595744680851e-05,
"loss": 0.4987,
"step": 77
},
{
"epoch": 0.24920127795527156,
"grad_norm": 1.2780289742174922,
"learning_rate": 3.319148936170213e-05,
"loss": 0.5372,
"step": 78
},
{
"epoch": 0.2523961661341853,
"grad_norm": 1.1736505513452447,
"learning_rate": 3.361702127659575e-05,
"loss": 0.5534,
"step": 79
},
{
"epoch": 0.25559105431309903,
"grad_norm": 0.9677573531833106,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.5124,
"step": 80
},
{
"epoch": 0.25878594249201275,
"grad_norm": 1.257228321448476,
"learning_rate": 3.446808510638298e-05,
"loss": 0.5622,
"step": 81
},
{
"epoch": 0.26198083067092653,
"grad_norm": 1.0555331388479319,
"learning_rate": 3.48936170212766e-05,
"loss": 0.5765,
"step": 82
},
{
"epoch": 0.26517571884984026,
"grad_norm": 1.0776914247686873,
"learning_rate": 3.531914893617022e-05,
"loss": 0.5258,
"step": 83
},
{
"epoch": 0.268370607028754,
"grad_norm": 1.0056240006776036,
"learning_rate": 3.574468085106383e-05,
"loss": 0.4866,
"step": 84
},
{
"epoch": 0.2715654952076677,
"grad_norm": 1.1298508994667116,
"learning_rate": 3.617021276595745e-05,
"loss": 0.5806,
"step": 85
},
{
"epoch": 0.2747603833865815,
"grad_norm": 1.0948113866316362,
"learning_rate": 3.6595744680851066e-05,
"loss": 0.5617,
"step": 86
},
{
"epoch": 0.2779552715654952,
"grad_norm": 1.0650985230812091,
"learning_rate": 3.7021276595744684e-05,
"loss": 0.5656,
"step": 87
},
{
"epoch": 0.28115015974440893,
"grad_norm": 1.0504970870871342,
"learning_rate": 3.74468085106383e-05,
"loss": 0.5978,
"step": 88
},
{
"epoch": 0.28434504792332266,
"grad_norm": 0.8855494930537949,
"learning_rate": 3.787234042553192e-05,
"loss": 0.4934,
"step": 89
},
{
"epoch": 0.28753993610223644,
"grad_norm": 1.030400345834846,
"learning_rate": 3.829787234042554e-05,
"loss": 0.5292,
"step": 90
},
{
"epoch": 0.29073482428115016,
"grad_norm": 1.2762844738209218,
"learning_rate": 3.872340425531915e-05,
"loss": 0.5444,
"step": 91
},
{
"epoch": 0.2939297124600639,
"grad_norm": 1.126915075174302,
"learning_rate": 3.914893617021277e-05,
"loss": 0.5111,
"step": 92
},
{
"epoch": 0.2971246006389776,
"grad_norm": 1.4237336432448202,
"learning_rate": 3.9574468085106385e-05,
"loss": 0.5495,
"step": 93
},
{
"epoch": 0.3003194888178914,
"grad_norm": 1.162381948355831,
"learning_rate": 4e-05,
"loss": 0.5999,
"step": 94
},
{
"epoch": 0.3035143769968051,
"grad_norm": 1.3353166705017592,
"learning_rate": 3.999986177524551e-05,
"loss": 0.5371,
"step": 95
},
{
"epoch": 0.30670926517571884,
"grad_norm": 1.1533614278616147,
"learning_rate": 3.999944710289265e-05,
"loss": 0.5636,
"step": 96
},
{
"epoch": 0.30990415335463256,
"grad_norm": 1.2821208981923682,
"learning_rate": 3.9998755988673205e-05,
"loss": 0.5319,
"step": 97
},
{
"epoch": 0.31309904153354634,
"grad_norm": 1.3064500394077125,
"learning_rate": 3.9997788442140105e-05,
"loss": 0.5626,
"step": 98
},
{
"epoch": 0.31629392971246006,
"grad_norm": 1.1664124835581022,
"learning_rate": 3.999654447666721e-05,
"loss": 0.5379,
"step": 99
},
{
"epoch": 0.3194888178913738,
"grad_norm": 1.1049186599225438,
"learning_rate": 3.999502410944923e-05,
"loss": 0.5968,
"step": 100
},
{
"epoch": 0.3226837060702875,
"grad_norm": 0.9776199567560799,
"learning_rate": 3.99932273615014e-05,
"loss": 0.5493,
"step": 101
},
{
"epoch": 0.3258785942492013,
"grad_norm": 1.082662657590535,
"learning_rate": 3.99911542576592e-05,
"loss": 0.582,
"step": 102
},
{
"epoch": 0.329073482428115,
"grad_norm": 1.0520705991247479,
"learning_rate": 3.998880482657809e-05,
"loss": 0.5239,
"step": 103
},
{
"epoch": 0.33226837060702874,
"grad_norm": 1.0198152234253615,
"learning_rate": 3.9986179100733e-05,
"loss": 0.5562,
"step": 104
},
{
"epoch": 0.3354632587859425,
"grad_norm": 1.1581233439140284,
"learning_rate": 3.9983277116417974e-05,
"loss": 0.5199,
"step": 105
},
{
"epoch": 0.33865814696485624,
"grad_norm": 1.2617196945442968,
"learning_rate": 3.998009891374561e-05,
"loss": 0.5158,
"step": 106
},
{
"epoch": 0.34185303514376997,
"grad_norm": 1.233474385279073,
"learning_rate": 3.997664453664654e-05,
"loss": 0.5796,
"step": 107
},
{
"epoch": 0.3450479233226837,
"grad_norm": 1.383774855291399,
"learning_rate": 3.9972914032868805e-05,
"loss": 0.5355,
"step": 108
},
{
"epoch": 0.34824281150159747,
"grad_norm": 1.2889281329856341,
"learning_rate": 3.99689074539772e-05,
"loss": 0.5459,
"step": 109
},
{
"epoch": 0.3514376996805112,
"grad_norm": 0.8238497044674719,
"learning_rate": 3.996462485535257e-05,
"loss": 0.5503,
"step": 110
},
{
"epoch": 0.3546325878594249,
"grad_norm": 1.4807124856342684,
"learning_rate": 3.996006629619103e-05,
"loss": 0.6508,
"step": 111
},
{
"epoch": 0.35782747603833864,
"grad_norm": 1.4078520868755624,
"learning_rate": 3.995523183950314e-05,
"loss": 0.6093,
"step": 112
},
{
"epoch": 0.3610223642172524,
"grad_norm": 0.9015312070363262,
"learning_rate": 3.9950121552113076e-05,
"loss": 0.5879,
"step": 113
},
{
"epoch": 0.36421725239616615,
"grad_norm": 1.386297055498617,
"learning_rate": 3.994473550465765e-05,
"loss": 0.6371,
"step": 114
},
{
"epoch": 0.36741214057507987,
"grad_norm": 1.186356563974775,
"learning_rate": 3.993907377158537e-05,
"loss": 0.5058,
"step": 115
},
{
"epoch": 0.3706070287539936,
"grad_norm": 1.046808813203276,
"learning_rate": 3.993313643115541e-05,
"loss": 0.5475,
"step": 116
},
{
"epoch": 0.3738019169329074,
"grad_norm": 1.1184791458413588,
"learning_rate": 3.992692356543649e-05,
"loss": 0.5131,
"step": 117
},
{
"epoch": 0.3769968051118211,
"grad_norm": 0.9927859768100731,
"learning_rate": 3.992043526030582e-05,
"loss": 0.5351,
"step": 118
},
{
"epoch": 0.3801916932907348,
"grad_norm": 0.7194994252219997,
"learning_rate": 3.991367160544783e-05,
"loss": 0.5241,
"step": 119
},
{
"epoch": 0.38338658146964855,
"grad_norm": 1.1103180710410239,
"learning_rate": 3.990663269435298e-05,
"loss": 0.5341,
"step": 120
},
{
"epoch": 0.3865814696485623,
"grad_norm": 0.891726161610439,
"learning_rate": 3.9899318624316424e-05,
"loss": 0.56,
"step": 121
},
{
"epoch": 0.38977635782747605,
"grad_norm": 0.9833837615596567,
"learning_rate": 3.9891729496436736e-05,
"loss": 0.5275,
"step": 122
},
{
"epoch": 0.3929712460063898,
"grad_norm": 1.0423973561059756,
"learning_rate": 3.988386541561444e-05,
"loss": 0.5726,
"step": 123
},
{
"epoch": 0.3961661341853035,
"grad_norm": 1.051551443360772,
"learning_rate": 3.9875726490550606e-05,
"loss": 0.5689,
"step": 124
},
{
"epoch": 0.3993610223642173,
"grad_norm": 1.0379195132528112,
"learning_rate": 3.986731283374533e-05,
"loss": 0.5532,
"step": 125
},
{
"epoch": 0.402555910543131,
"grad_norm": 1.066307983698186,
"learning_rate": 3.985862456149616e-05,
"loss": 0.6265,
"step": 126
},
{
"epoch": 0.4057507987220447,
"grad_norm": 1.0375163774393865,
"learning_rate": 3.9849661793896537e-05,
"loss": 0.573,
"step": 127
},
{
"epoch": 0.40894568690095845,
"grad_norm": 0.8956353921945285,
"learning_rate": 3.984042465483409e-05,
"loss": 0.5177,
"step": 128
},
{
"epoch": 0.41214057507987223,
"grad_norm": 1.079166565420564,
"learning_rate": 3.983091327198896e-05,
"loss": 0.5476,
"step": 129
},
{
"epoch": 0.41533546325878595,
"grad_norm": 0.9871062497017694,
"learning_rate": 3.982112777683199e-05,
"loss": 0.5868,
"step": 130
},
{
"epoch": 0.4185303514376997,
"grad_norm": 1.1607498270093395,
"learning_rate": 3.981106830462296e-05,
"loss": 0.5829,
"step": 131
},
{
"epoch": 0.4217252396166134,
"grad_norm": 0.938424834465439,
"learning_rate": 3.9800734994408657e-05,
"loss": 0.5233,
"step": 132
},
{
"epoch": 0.4249201277955272,
"grad_norm": 0.9723972932143454,
"learning_rate": 3.9790127989021024e-05,
"loss": 0.5452,
"step": 133
},
{
"epoch": 0.4281150159744409,
"grad_norm": 1.1133970630266223,
"learning_rate": 3.977924743507513e-05,
"loss": 0.6125,
"step": 134
},
{
"epoch": 0.43130990415335463,
"grad_norm": 0.9887899047167236,
"learning_rate": 3.976809348296716e-05,
"loss": 0.5532,
"step": 135
},
{
"epoch": 0.43450479233226835,
"grad_norm": 1.0482567177622366,
"learning_rate": 3.9756666286872345e-05,
"loss": 0.5469,
"step": 136
},
{
"epoch": 0.43769968051118213,
"grad_norm": 0.9896665341430675,
"learning_rate": 3.974496600474282e-05,
"loss": 0.5496,
"step": 137
},
{
"epoch": 0.44089456869009586,
"grad_norm": 1.1889549086086342,
"learning_rate": 3.9732992798305465e-05,
"loss": 0.5767,
"step": 138
},
{
"epoch": 0.4440894568690096,
"grad_norm": 0.9876667745869144,
"learning_rate": 3.972074683305961e-05,
"loss": 0.5746,
"step": 139
},
{
"epoch": 0.4472843450479233,
"grad_norm": 1.0145350383168896,
"learning_rate": 3.9708228278274816e-05,
"loss": 0.5332,
"step": 140
},
{
"epoch": 0.4504792332268371,
"grad_norm": 0.898832690907024,
"learning_rate": 3.96954373069885e-05,
"loss": 0.6213,
"step": 141
},
{
"epoch": 0.4536741214057508,
"grad_norm": 0.9695396273633855,
"learning_rate": 3.968237409600355e-05,
"loss": 0.5389,
"step": 142
},
{
"epoch": 0.45686900958466453,
"grad_norm": 0.8686445117159894,
"learning_rate": 3.9669038825885875e-05,
"loss": 0.5514,
"step": 143
},
{
"epoch": 0.46006389776357826,
"grad_norm": 0.9730958936696619,
"learning_rate": 3.9655431680961924e-05,
"loss": 0.5342,
"step": 144
},
{
"epoch": 0.46325878594249204,
"grad_norm": 1.085850372058602,
"learning_rate": 3.964155284931612e-05,
"loss": 0.5637,
"step": 145
},
{
"epoch": 0.46645367412140576,
"grad_norm": 0.8490885837637923,
"learning_rate": 3.962740252278827e-05,
"loss": 0.5432,
"step": 146
},
{
"epoch": 0.4696485623003195,
"grad_norm": 1.208209634263402,
"learning_rate": 3.961298089697093e-05,
"loss": 0.5406,
"step": 147
},
{
"epoch": 0.4728434504792332,
"grad_norm": 1.2369537682088747,
"learning_rate": 3.959828817120665e-05,
"loss": 0.5871,
"step": 148
},
{
"epoch": 0.476038338658147,
"grad_norm": 1.0963511451666044,
"learning_rate": 3.9583324548585276e-05,
"loss": 0.5965,
"step": 149
},
{
"epoch": 0.4792332268370607,
"grad_norm": 1.1210854346222001,
"learning_rate": 3.956809023594112e-05,
"loss": 0.4888,
"step": 150
},
{
"epoch": 0.48242811501597443,
"grad_norm": 1.10943858725056,
"learning_rate": 3.955258544385009e-05,
"loss": 0.568,
"step": 151
},
{
"epoch": 0.48562300319488816,
"grad_norm": 0.9114400050058079,
"learning_rate": 3.95368103866268e-05,
"loss": 0.5329,
"step": 152
},
{
"epoch": 0.48881789137380194,
"grad_norm": 1.2588910487562754,
"learning_rate": 3.9520765282321584e-05,
"loss": 0.5458,
"step": 153
},
{
"epoch": 0.49201277955271566,
"grad_norm": 1.161279475928506,
"learning_rate": 3.9504450352717514e-05,
"loss": 0.534,
"step": 154
},
{
"epoch": 0.4952076677316294,
"grad_norm": 1.1666419752273152,
"learning_rate": 3.948786582332728e-05,
"loss": 0.5449,
"step": 155
},
{
"epoch": 0.4984025559105431,
"grad_norm": 1.2141093053662426,
"learning_rate": 3.947101192339016e-05,
"loss": 0.5761,
"step": 156
},
{
"epoch": 0.5015974440894568,
"grad_norm": 0.9508186492271605,
"learning_rate": 3.9453888885868756e-05,
"loss": 0.5019,
"step": 157
},
{
"epoch": 0.5047923322683706,
"grad_norm": 1.0363555761012373,
"learning_rate": 3.943649694744584e-05,
"loss": 0.5156,
"step": 158
},
{
"epoch": 0.5079872204472844,
"grad_norm": 1.0787946362260752,
"learning_rate": 3.9418836348521045e-05,
"loss": 0.5405,
"step": 159
},
{
"epoch": 0.5111821086261981,
"grad_norm": 0.9707452122472157,
"learning_rate": 3.940090733320757e-05,
"loss": 0.5227,
"step": 160
},
{
"epoch": 0.5143769968051118,
"grad_norm": 0.8852169354166731,
"learning_rate": 3.93827101493288e-05,
"loss": 0.5471,
"step": 161
},
{
"epoch": 0.5175718849840255,
"grad_norm": 1.0544907272347088,
"learning_rate": 3.936424504841485e-05,
"loss": 0.5495,
"step": 162
},
{
"epoch": 0.5207667731629393,
"grad_norm": 0.8460840865557951,
"learning_rate": 3.934551228569913e-05,
"loss": 0.5919,
"step": 163
},
{
"epoch": 0.5239616613418531,
"grad_norm": 1.3423757776337415,
"learning_rate": 3.932651212011479e-05,
"loss": 0.5745,
"step": 164
},
{
"epoch": 0.5271565495207667,
"grad_norm": 0.8367417092044956,
"learning_rate": 3.930724481429114e-05,
"loss": 0.5582,
"step": 165
},
{
"epoch": 0.5303514376996805,
"grad_norm": 0.8162694542335893,
"learning_rate": 3.928771063455007e-05,
"loss": 0.5382,
"step": 166
},
{
"epoch": 0.5335463258785943,
"grad_norm": 1.055654119658905,
"learning_rate": 3.926790985090228e-05,
"loss": 0.5127,
"step": 167
},
{
"epoch": 0.536741214057508,
"grad_norm": 0.9342358155994782,
"learning_rate": 3.924784273704363e-05,
"loss": 0.5133,
"step": 168
},
{
"epoch": 0.5399361022364217,
"grad_norm": 0.8868699043103929,
"learning_rate": 3.922750957035128e-05,
"loss": 0.5757,
"step": 169
},
{
"epoch": 0.5431309904153354,
"grad_norm": 0.9258969627974063,
"learning_rate": 3.920691063187995e-05,
"loss": 0.5588,
"step": 170
},
{
"epoch": 0.5463258785942492,
"grad_norm": 0.9405026305900314,
"learning_rate": 3.918604620635797e-05,
"loss": 0.5652,
"step": 171
},
{
"epoch": 0.549520766773163,
"grad_norm": 0.8161599160408928,
"learning_rate": 3.916491658218333e-05,
"loss": 0.5586,
"step": 172
},
{
"epoch": 0.5527156549520766,
"grad_norm": 0.9068770118413155,
"learning_rate": 3.914352205141975e-05,
"loss": 0.566,
"step": 173
},
{
"epoch": 0.5559105431309904,
"grad_norm": 0.9468726642569554,
"learning_rate": 3.91218629097926e-05,
"loss": 0.6058,
"step": 174
},
{
"epoch": 0.5591054313099042,
"grad_norm": 0.8130418886800949,
"learning_rate": 3.909993945668484e-05,
"loss": 0.5453,
"step": 175
},
{
"epoch": 0.5623003194888179,
"grad_norm": 0.909689386530599,
"learning_rate": 3.907775199513286e-05,
"loss": 0.5348,
"step": 176
},
{
"epoch": 0.5654952076677316,
"grad_norm": 0.9448685038012304,
"learning_rate": 3.905530083182231e-05,
"loss": 0.5615,
"step": 177
},
{
"epoch": 0.5686900958466453,
"grad_norm": 0.8887839168967605,
"learning_rate": 3.903258627708383e-05,
"loss": 0.5318,
"step": 178
},
{
"epoch": 0.5718849840255591,
"grad_norm": 0.8148539808805777,
"learning_rate": 3.90096086448888e-05,
"loss": 0.531,
"step": 179
},
{
"epoch": 0.5750798722044729,
"grad_norm": 0.9702327272254745,
"learning_rate": 3.898636825284499e-05,
"loss": 0.5679,
"step": 180
},
{
"epoch": 0.5782747603833865,
"grad_norm": 0.9398312312400305,
"learning_rate": 3.896286542219212e-05,
"loss": 0.566,
"step": 181
},
{
"epoch": 0.5814696485623003,
"grad_norm": 0.986760124602772,
"learning_rate": 3.893910047779752e-05,
"loss": 0.5536,
"step": 182
},
{
"epoch": 0.5846645367412141,
"grad_norm": 0.7553011510468522,
"learning_rate": 3.891507374815153e-05,
"loss": 0.5491,
"step": 183
},
{
"epoch": 0.5878594249201278,
"grad_norm": 0.7490780167276647,
"learning_rate": 3.8890785565363046e-05,
"loss": 0.5278,
"step": 184
},
{
"epoch": 0.5910543130990416,
"grad_norm": 0.927901642878805,
"learning_rate": 3.8866236265154864e-05,
"loss": 0.5493,
"step": 185
},
{
"epoch": 0.5942492012779552,
"grad_norm": 0.9377247340300795,
"learning_rate": 3.8841426186859095e-05,
"loss": 0.5118,
"step": 186
},
{
"epoch": 0.597444089456869,
"grad_norm": 0.7685781592155283,
"learning_rate": 3.881635567341243e-05,
"loss": 0.5213,
"step": 187
},
{
"epoch": 0.6006389776357828,
"grad_norm": 0.9161772600103768,
"learning_rate": 3.879102507135142e-05,
"loss": 0.528,
"step": 188
},
{
"epoch": 0.6038338658146964,
"grad_norm": 0.7614758217612799,
"learning_rate": 3.876543473080771e-05,
"loss": 0.5121,
"step": 189
},
{
"epoch": 0.6070287539936102,
"grad_norm": 1.0241100063473434,
"learning_rate": 3.8739585005503136e-05,
"loss": 0.5557,
"step": 190
},
{
"epoch": 0.610223642172524,
"grad_norm": 0.8917603175893786,
"learning_rate": 3.8713476252744896e-05,
"loss": 0.5033,
"step": 191
},
{
"epoch": 0.6134185303514377,
"grad_norm": 0.8866544366619419,
"learning_rate": 3.8687108833420585e-05,
"loss": 0.5459,
"step": 192
},
{
"epoch": 0.6166134185303515,
"grad_norm": 0.8490994667110613,
"learning_rate": 3.866048311199321e-05,
"loss": 0.5761,
"step": 193
},
{
"epoch": 0.6198083067092651,
"grad_norm": 1.0117997679265072,
"learning_rate": 3.863359945649615e-05,
"loss": 0.5597,
"step": 194
},
{
"epoch": 0.6230031948881789,
"grad_norm": 0.8115827675976007,
"learning_rate": 3.860645823852808e-05,
"loss": 0.5415,
"step": 195
},
{
"epoch": 0.6261980830670927,
"grad_norm": 1.0259667321397283,
"learning_rate": 3.85790598332478e-05,
"loss": 0.5435,
"step": 196
},
{
"epoch": 0.6293929712460063,
"grad_norm": 0.7714558957563437,
"learning_rate": 3.8551404619369115e-05,
"loss": 0.5485,
"step": 197
},
{
"epoch": 0.6325878594249201,
"grad_norm": 0.8744712822558212,
"learning_rate": 3.8523492979155534e-05,
"loss": 0.5025,
"step": 198
},
{
"epoch": 0.6357827476038339,
"grad_norm": 0.9290898869366451,
"learning_rate": 3.849532529841502e-05,
"loss": 0.5205,
"step": 199
},
{
"epoch": 0.6389776357827476,
"grad_norm": 0.9659909853502232,
"learning_rate": 3.846690196649464e-05,
"loss": 0.5074,
"step": 200
},
{
"epoch": 0.6421725239616614,
"grad_norm": 0.9633294322023886,
"learning_rate": 3.84382233762752e-05,
"loss": 0.5504,
"step": 201
},
{
"epoch": 0.645367412140575,
"grad_norm": 1.0208058978197756,
"learning_rate": 3.840928992416583e-05,
"loss": 0.5683,
"step": 202
},
{
"epoch": 0.6485623003194888,
"grad_norm": 0.7720130021877678,
"learning_rate": 3.8380102010098436e-05,
"loss": 0.5101,
"step": 203
},
{
"epoch": 0.6517571884984026,
"grad_norm": 0.9815422862820409,
"learning_rate": 3.835066003752226e-05,
"loss": 0.564,
"step": 204
},
{
"epoch": 0.6549520766773163,
"grad_norm": 0.8106906464515937,
"learning_rate": 3.832096441339825e-05,
"loss": 0.5227,
"step": 205
},
{
"epoch": 0.65814696485623,
"grad_norm": 0.9833175984925693,
"learning_rate": 3.829101554819341e-05,
"loss": 0.5564,
"step": 206
},
{
"epoch": 0.6613418530351438,
"grad_norm": 0.74809638164356,
"learning_rate": 3.826081385587523e-05,
"loss": 0.5172,
"step": 207
},
{
"epoch": 0.6645367412140575,
"grad_norm": 0.8646454866233487,
"learning_rate": 3.823035975390585e-05,
"loss": 0.5888,
"step": 208
},
{
"epoch": 0.6677316293929713,
"grad_norm": 0.8049832826819769,
"learning_rate": 3.8199653663236336e-05,
"loss": 0.5792,
"step": 209
},
{
"epoch": 0.670926517571885,
"grad_norm": 0.7543833230398367,
"learning_rate": 3.8168696008300884e-05,
"loss": 0.5196,
"step": 210
},
{
"epoch": 0.6741214057507987,
"grad_norm": 0.9148118195164756,
"learning_rate": 3.813748721701091e-05,
"loss": 0.5444,
"step": 211
},
{
"epoch": 0.6773162939297125,
"grad_norm": 0.7522832027542112,
"learning_rate": 3.8106027720749176e-05,
"loss": 0.5673,
"step": 212
},
{
"epoch": 0.6805111821086262,
"grad_norm": 0.8180964056504714,
"learning_rate": 3.807431795436379e-05,
"loss": 0.5756,
"step": 213
},
{
"epoch": 0.6837060702875399,
"grad_norm": 0.9161522568428497,
"learning_rate": 3.8042358356162215e-05,
"loss": 0.5901,
"step": 214
},
{
"epoch": 0.6869009584664537,
"grad_norm": 0.943077264357207,
"learning_rate": 3.801014936790522e-05,
"loss": 0.4931,
"step": 215
},
{
"epoch": 0.6900958466453674,
"grad_norm": 0.7950064346283184,
"learning_rate": 3.797769143480075e-05,
"loss": 0.5441,
"step": 216
},
{
"epoch": 0.6932907348242812,
"grad_norm": 0.9413760937926158,
"learning_rate": 3.79449850054978e-05,
"loss": 0.5904,
"step": 217
},
{
"epoch": 0.6964856230031949,
"grad_norm": 1.212260052461721,
"learning_rate": 3.791203053208017e-05,
"loss": 0.5766,
"step": 218
},
{
"epoch": 0.6996805111821086,
"grad_norm": 0.8484051659303491,
"learning_rate": 3.7878828470060274e-05,
"loss": 0.5772,
"step": 219
},
{
"epoch": 0.7028753993610224,
"grad_norm": 1.1355674304167553,
"learning_rate": 3.7845379278372775e-05,
"loss": 0.5679,
"step": 220
},
{
"epoch": 0.7060702875399361,
"grad_norm": 0.984582570050398,
"learning_rate": 3.781168341936834e-05,
"loss": 0.5432,
"step": 221
},
{
"epoch": 0.7092651757188498,
"grad_norm": 0.9476742495794508,
"learning_rate": 3.777774135880712e-05,
"loss": 0.5682,
"step": 222
},
{
"epoch": 0.7124600638977636,
"grad_norm": 1.139069082180595,
"learning_rate": 3.774355356585243e-05,
"loss": 0.5121,
"step": 223
},
{
"epoch": 0.7156549520766773,
"grad_norm": 0.7997882642082141,
"learning_rate": 3.7709120513064196e-05,
"loss": 0.5196,
"step": 224
},
{
"epoch": 0.7188498402555911,
"grad_norm": 0.912339161175054,
"learning_rate": 3.7674442676392456e-05,
"loss": 0.5309,
"step": 225
},
{
"epoch": 0.7220447284345048,
"grad_norm": 0.8646463276372267,
"learning_rate": 3.7639520535170736e-05,
"loss": 0.5764,
"step": 226
},
{
"epoch": 0.7252396166134185,
"grad_norm": 0.8485244000483088,
"learning_rate": 3.760435457210948e-05,
"loss": 0.5711,
"step": 227
},
{
"epoch": 0.7284345047923323,
"grad_norm": 0.8913545645187945,
"learning_rate": 3.7568945273289355e-05,
"loss": 0.5355,
"step": 228
},
{
"epoch": 0.731629392971246,
"grad_norm": 0.8760365656277963,
"learning_rate": 3.753329312815453e-05,
"loss": 0.5402,
"step": 229
},
{
"epoch": 0.7348242811501597,
"grad_norm": 0.9291923706353212,
"learning_rate": 3.749739862950589e-05,
"loss": 0.5323,
"step": 230
},
{
"epoch": 0.7380191693290735,
"grad_norm": 1.1191138286045244,
"learning_rate": 3.7461262273494277e-05,
"loss": 0.5401,
"step": 231
},
{
"epoch": 0.7412140575079872,
"grad_norm": 0.8298783641280376,
"learning_rate": 3.742488455961358e-05,
"loss": 0.5489,
"step": 232
},
{
"epoch": 0.744408945686901,
"grad_norm": 0.8194676142692792,
"learning_rate": 3.738826599069385e-05,
"loss": 0.5277,
"step": 233
},
{
"epoch": 0.7476038338658147,
"grad_norm": 0.7674059358894463,
"learning_rate": 3.7351407072894356e-05,
"loss": 0.5169,
"step": 234
},
{
"epoch": 0.7507987220447284,
"grad_norm": 0.8486151770014176,
"learning_rate": 3.7314308315696604e-05,
"loss": 0.535,
"step": 235
},
{
"epoch": 0.7539936102236422,
"grad_norm": 0.7391651407617897,
"learning_rate": 3.7276970231897225e-05,
"loss": 0.504,
"step": 236
},
{
"epoch": 0.7571884984025559,
"grad_norm": 0.902259232556215,
"learning_rate": 3.723939333760099e-05,
"loss": 0.5613,
"step": 237
},
{
"epoch": 0.7603833865814696,
"grad_norm": 0.8625873135638807,
"learning_rate": 3.720157815221358e-05,
"loss": 0.5244,
"step": 238
},
{
"epoch": 0.7635782747603834,
"grad_norm": 0.8936431918114204,
"learning_rate": 3.716352519843448e-05,
"loss": 0.5426,
"step": 239
},
{
"epoch": 0.7667731629392971,
"grad_norm": 0.995320435639344,
"learning_rate": 3.71252350022497e-05,
"loss": 0.5104,
"step": 240
},
{
"epoch": 0.7699680511182109,
"grad_norm": 0.8031264080307341,
"learning_rate": 3.708670809292455e-05,
"loss": 0.5246,
"step": 241
},
{
"epoch": 0.7731629392971247,
"grad_norm": 1.0835434981781038,
"learning_rate": 3.704794500299627e-05,
"loss": 0.5003,
"step": 242
},
{
"epoch": 0.7763578274760383,
"grad_norm": 0.9478013614793179,
"learning_rate": 3.700894626826674e-05,
"loss": 0.5116,
"step": 243
},
{
"epoch": 0.7795527156549521,
"grad_norm": 1.0469893211794972,
"learning_rate": 3.696971242779499e-05,
"loss": 0.6261,
"step": 244
},
{
"epoch": 0.7827476038338658,
"grad_norm": 0.9297231377773715,
"learning_rate": 3.693024402388984e-05,
"loss": 0.5502,
"step": 245
},
{
"epoch": 0.7859424920127795,
"grad_norm": 0.7610910877457355,
"learning_rate": 3.689054160210232e-05,
"loss": 0.542,
"step": 246
},
{
"epoch": 0.7891373801916933,
"grad_norm": 1.1342717917686111,
"learning_rate": 3.6850605711218176e-05,
"loss": 0.5844,
"step": 247
},
{
"epoch": 0.792332268370607,
"grad_norm": 0.6995938546636964,
"learning_rate": 3.681043690325029e-05,
"loss": 0.5343,
"step": 248
},
{
"epoch": 0.7955271565495208,
"grad_norm": 1.1509298073605394,
"learning_rate": 3.6770035733431014e-05,
"loss": 0.5209,
"step": 249
},
{
"epoch": 0.7987220447284346,
"grad_norm": 0.8295131498958032,
"learning_rate": 3.6729402760204535e-05,
"loss": 0.5369,
"step": 250
},
{
"epoch": 0.8019169329073482,
"grad_norm": 0.9218479825070323,
"learning_rate": 3.668853854521913e-05,
"loss": 0.4855,
"step": 251
},
{
"epoch": 0.805111821086262,
"grad_norm": 0.7441334936621781,
"learning_rate": 3.66474436533194e-05,
"loss": 0.5268,
"step": 252
},
{
"epoch": 0.8083067092651757,
"grad_norm": 0.7507501619024712,
"learning_rate": 3.660611865253848e-05,
"loss": 0.5105,
"step": 253
},
{
"epoch": 0.8115015974440895,
"grad_norm": 0.7924592861345243,
"learning_rate": 3.6564564114090175e-05,
"loss": 0.4829,
"step": 254
},
{
"epoch": 0.8146964856230032,
"grad_norm": 0.7720230321659396,
"learning_rate": 3.652278061236109e-05,
"loss": 0.5,
"step": 255
},
{
"epoch": 0.8178913738019169,
"grad_norm": 0.7979403807748843,
"learning_rate": 3.648076872490263e-05,
"loss": 0.5296,
"step": 256
},
{
"epoch": 0.8210862619808307,
"grad_norm": 0.8135723649226206,
"learning_rate": 3.6438529032423086e-05,
"loss": 0.507,
"step": 257
},
{
"epoch": 0.8242811501597445,
"grad_norm": 0.7476070159945926,
"learning_rate": 3.639606211877958e-05,
"loss": 0.5006,
"step": 258
},
{
"epoch": 0.8274760383386581,
"grad_norm": 0.8362960189666772,
"learning_rate": 3.635336857096997e-05,
"loss": 0.5254,
"step": 259
},
{
"epoch": 0.8306709265175719,
"grad_norm": 0.9452074954015198,
"learning_rate": 3.631044897912478e-05,
"loss": 0.5499,
"step": 260
},
{
"epoch": 0.8338658146964856,
"grad_norm": 0.6680220539129625,
"learning_rate": 3.6267303936499006e-05,
"loss": 0.5311,
"step": 261
},
{
"epoch": 0.8370607028753994,
"grad_norm": 0.9178609027182498,
"learning_rate": 3.622393403946395e-05,
"loss": 0.5675,
"step": 262
},
{
"epoch": 0.8402555910543131,
"grad_norm": 0.8455480168827179,
"learning_rate": 3.6180339887498953e-05,
"loss": 0.5582,
"step": 263
},
{
"epoch": 0.8434504792332268,
"grad_norm": 0.7440034772147296,
"learning_rate": 3.6136522083183096e-05,
"loss": 0.5813,
"step": 264
},
{
"epoch": 0.8466453674121406,
"grad_norm": 0.718472836263842,
"learning_rate": 3.6092481232186905e-05,
"loss": 0.5302,
"step": 265
},
{
"epoch": 0.8498402555910544,
"grad_norm": 0.8057027197237966,
"learning_rate": 3.604821794326398e-05,
"loss": 0.4891,
"step": 266
},
{
"epoch": 0.853035143769968,
"grad_norm": 0.8835357018051417,
"learning_rate": 3.600373282824252e-05,
"loss": 0.5171,
"step": 267
},
{
"epoch": 0.8562300319488818,
"grad_norm": 0.7771156562797832,
"learning_rate": 3.595902650201695e-05,
"loss": 0.5085,
"step": 268
},
{
"epoch": 0.8594249201277955,
"grad_norm": 0.8465271291843096,
"learning_rate": 3.591409958253937e-05,
"loss": 0.5324,
"step": 269
},
{
"epoch": 0.8626198083067093,
"grad_norm": 0.7718211766797882,
"learning_rate": 3.5868952690811015e-05,
"loss": 0.5752,
"step": 270
},
{
"epoch": 0.865814696485623,
"grad_norm": 0.9775795997595186,
"learning_rate": 3.582358645087368e-05,
"loss": 0.5599,
"step": 271
},
{
"epoch": 0.8690095846645367,
"grad_norm": 0.8312123906165099,
"learning_rate": 3.577800148980112e-05,
"loss": 0.5223,
"step": 272
},
{
"epoch": 0.8722044728434505,
"grad_norm": 0.935336068810225,
"learning_rate": 3.573219843769033e-05,
"loss": 0.5083,
"step": 273
},
{
"epoch": 0.8753993610223643,
"grad_norm": 0.8720606095772891,
"learning_rate": 3.568617792765287e-05,
"loss": 0.5636,
"step": 274
},
{
"epoch": 0.8785942492012779,
"grad_norm": 0.7824277269020549,
"learning_rate": 3.563994059580611e-05,
"loss": 0.5461,
"step": 275
},
{
"epoch": 0.8817891373801917,
"grad_norm": 1.1055706045113576,
"learning_rate": 3.559348708126445e-05,
"loss": 0.5623,
"step": 276
},
{
"epoch": 0.8849840255591054,
"grad_norm": 0.7710258784149644,
"learning_rate": 3.5546818026130444e-05,
"loss": 0.5279,
"step": 277
},
{
"epoch": 0.8881789137380192,
"grad_norm": 0.8879078744940178,
"learning_rate": 3.549993407548595e-05,
"loss": 0.4966,
"step": 278
},
{
"epoch": 0.8913738019169329,
"grad_norm": 1.0405338141997835,
"learning_rate": 3.545283587738324e-05,
"loss": 0.5365,
"step": 279
},
{
"epoch": 0.8945686900958466,
"grad_norm": 0.7971271443117728,
"learning_rate": 3.5405524082836e-05,
"loss": 0.5672,
"step": 280
},
{
"epoch": 0.8977635782747604,
"grad_norm": 0.931450449668035,
"learning_rate": 3.5357999345810335e-05,
"loss": 0.5668,
"step": 281
},
{
"epoch": 0.9009584664536742,
"grad_norm": 0.7895639631341859,
"learning_rate": 3.5310262323215774e-05,
"loss": 0.4955,
"step": 282
},
{
"epoch": 0.9041533546325878,
"grad_norm": 0.9607888759719633,
"learning_rate": 3.5262313674896125e-05,
"loss": 0.5147,
"step": 283
},
{
"epoch": 0.9073482428115016,
"grad_norm": 0.9689194114193417,
"learning_rate": 3.521415406362041e-05,
"loss": 0.5062,
"step": 284
},
{
"epoch": 0.9105431309904153,
"grad_norm": 0.9647692491089708,
"learning_rate": 3.5165784155073665e-05,
"loss": 0.5625,
"step": 285
},
{
"epoch": 0.9137380191693291,
"grad_norm": 0.8493612979149395,
"learning_rate": 3.511720461784778e-05,
"loss": 0.5424,
"step": 286
},
{
"epoch": 0.9169329073482428,
"grad_norm": 0.8442771355018583,
"learning_rate": 3.50684161234322e-05,
"loss": 0.5632,
"step": 287
},
{
"epoch": 0.9201277955271565,
"grad_norm": 0.8552088751974364,
"learning_rate": 3.50194193462047e-05,
"loss": 0.5372,
"step": 288
},
{
"epoch": 0.9233226837060703,
"grad_norm": 0.8152267999689093,
"learning_rate": 3.497021496342203e-05,
"loss": 0.4861,
"step": 289
},
{
"epoch": 0.9265175718849841,
"grad_norm": 0.9398979894292626,
"learning_rate": 3.4920803655210553e-05,
"loss": 0.5801,
"step": 290
},
{
"epoch": 0.9297124600638977,
"grad_norm": 0.977293682261694,
"learning_rate": 3.4871186104556874e-05,
"loss": 0.5207,
"step": 291
},
{
"epoch": 0.9329073482428115,
"grad_norm": 0.8549875703308473,
"learning_rate": 3.482136299729836e-05,
"loss": 0.5349,
"step": 292
},
{
"epoch": 0.9361022364217252,
"grad_norm": 0.8819868669919189,
"learning_rate": 3.4771335022113705e-05,
"loss": 0.5597,
"step": 293
},
{
"epoch": 0.939297124600639,
"grad_norm": 0.9666325301004642,
"learning_rate": 3.4721102870513345e-05,
"loss": 0.5329,
"step": 294
},
{
"epoch": 0.9424920127795527,
"grad_norm": 0.8203876358079187,
"learning_rate": 3.467066723682998e-05,
"loss": 0.5246,
"step": 295
},
{
"epoch": 0.9456869009584664,
"grad_norm": 1.1228342280176937,
"learning_rate": 3.462002881820891e-05,
"loss": 0.6097,
"step": 296
},
{
"epoch": 0.9488817891373802,
"grad_norm": 0.7732572275271596,
"learning_rate": 3.456918831459844e-05,
"loss": 0.5253,
"step": 297
},
{
"epoch": 0.952076677316294,
"grad_norm": 0.9752827068051154,
"learning_rate": 3.451814642874017e-05,
"loss": 0.5539,
"step": 298
},
{
"epoch": 0.9552715654952076,
"grad_norm": 0.7900396864367236,
"learning_rate": 3.4466903866159326e-05,
"loss": 0.5457,
"step": 299
},
{
"epoch": 0.9584664536741214,
"grad_norm": 0.8596711318037582,
"learning_rate": 3.441546133515496e-05,
"loss": 0.5266,
"step": 300
},
{
"epoch": 0.9616613418530351,
"grad_norm": 0.9071723802112778,
"learning_rate": 3.4363819546790216e-05,
"loss": 0.5189,
"step": 301
},
{
"epoch": 0.9648562300319489,
"grad_norm": 0.6897307625929432,
"learning_rate": 3.431197921488242e-05,
"loss": 0.5258,
"step": 302
},
{
"epoch": 0.9680511182108626,
"grad_norm": 0.8072267951218067,
"learning_rate": 3.425994105599331e-05,
"loss": 0.5025,
"step": 303
},
{
"epoch": 0.9712460063897763,
"grad_norm": 0.7299045123280457,
"learning_rate": 3.4207705789419035e-05,
"loss": 0.4942,
"step": 304
},
{
"epoch": 0.9744408945686901,
"grad_norm": 0.811210391135453,
"learning_rate": 3.41552741371803e-05,
"loss": 0.5128,
"step": 305
},
{
"epoch": 0.9776357827476039,
"grad_norm": 0.6833163220999185,
"learning_rate": 3.4102646824012333e-05,
"loss": 0.5036,
"step": 306
},
{
"epoch": 0.9808306709265175,
"grad_norm": 0.7318928742301355,
"learning_rate": 3.404982457735487e-05,
"loss": 0.5248,
"step": 307
},
{
"epoch": 0.9840255591054313,
"grad_norm": 0.8151408628855044,
"learning_rate": 3.399680812734213e-05,
"loss": 0.5244,
"step": 308
},
{
"epoch": 0.987220447284345,
"grad_norm": 0.7365970167922717,
"learning_rate": 3.3943598206792665e-05,
"loss": 0.5334,
"step": 309
},
{
"epoch": 0.9904153354632588,
"grad_norm": 0.6444531685595024,
"learning_rate": 3.3890195551199334e-05,
"loss": 0.506,
"step": 310
},
{
"epoch": 0.9936102236421726,
"grad_norm": 0.7379917525512831,
"learning_rate": 3.3836600898719e-05,
"loss": 0.4884,
"step": 311
},
{
"epoch": 0.9968051118210862,
"grad_norm": 0.8006414034782756,
"learning_rate": 3.3782814990162457e-05,
"loss": 0.6063,
"step": 312
},
{
"epoch": 1.0,
"grad_norm": 0.8714839087095215,
"learning_rate": 3.372883856898408e-05,
"loss": 0.5957,
"step": 313
},
{
"epoch": 1.0031948881789137,
"grad_norm": 0.8608076101674021,
"learning_rate": 3.367467238127165e-05,
"loss": 0.4153,
"step": 314
},
{
"epoch": 1.0063897763578276,
"grad_norm": 0.7863968465089736,
"learning_rate": 3.3620317175735945e-05,
"loss": 0.4178,
"step": 315
},
{
"epoch": 1.0095846645367412,
"grad_norm": 0.6819742061303111,
"learning_rate": 3.3565773703700474e-05,
"loss": 0.3475,
"step": 316
},
{
"epoch": 1.012779552715655,
"grad_norm": 0.7870011705239355,
"learning_rate": 3.351104271909104e-05,
"loss": 0.3629,
"step": 317
},
{
"epoch": 1.0159744408945688,
"grad_norm": 0.8316632267191167,
"learning_rate": 3.345612497842532e-05,
"loss": 0.3761,
"step": 318
},
{
"epoch": 1.0191693290734825,
"grad_norm": 0.7862604900294438,
"learning_rate": 3.3401021240802446e-05,
"loss": 0.3627,
"step": 319
},
{
"epoch": 1.0223642172523961,
"grad_norm": 1.0061847359485523,
"learning_rate": 3.334573226789249e-05,
"loss": 0.4051,
"step": 320
},
{
"epoch": 1.0255591054313098,
"grad_norm": 0.8807904375499824,
"learning_rate": 3.32902588239259e-05,
"loss": 0.3968,
"step": 321
},
{
"epoch": 1.0287539936102237,
"grad_norm": 1.1981019094258039,
"learning_rate": 3.3234601675683005e-05,
"loss": 0.4202,
"step": 322
},
{
"epoch": 1.0319488817891374,
"grad_norm": 0.844697383840708,
"learning_rate": 3.317876159248337e-05,
"loss": 0.3743,
"step": 323
},
{
"epoch": 1.035143769968051,
"grad_norm": 0.9126433986445898,
"learning_rate": 3.3122739346175176e-05,
"loss": 0.3855,
"step": 324
},
{
"epoch": 1.038338658146965,
"grad_norm": 0.6598463168728887,
"learning_rate": 3.306653571112454e-05,
"loss": 0.3476,
"step": 325
},
{
"epoch": 1.0415335463258786,
"grad_norm": 0.8158289993131012,
"learning_rate": 3.301015146420484e-05,
"loss": 0.3718,
"step": 326
},
{
"epoch": 1.0447284345047922,
"grad_norm": 0.9397053531896098,
"learning_rate": 3.295358738478593e-05,
"loss": 0.4497,
"step": 327
},
{
"epoch": 1.0479233226837061,
"grad_norm": 0.6428708472951121,
"learning_rate": 3.2896844254723414e-05,
"loss": 0.3422,
"step": 328
},
{
"epoch": 1.0511182108626198,
"grad_norm": 0.9006480199213088,
"learning_rate": 3.283992285834782e-05,
"loss": 0.3803,
"step": 329
},
{
"epoch": 1.0543130990415335,
"grad_norm": 0.804430943445223,
"learning_rate": 3.2782823982453746e-05,
"loss": 0.3999,
"step": 330
},
{
"epoch": 1.0575079872204474,
"grad_norm": 0.9270904209069851,
"learning_rate": 3.272554841628901e-05,
"loss": 0.4319,
"step": 331
},
{
"epoch": 1.060702875399361,
"grad_norm": 0.845825639588173,
"learning_rate": 3.266809695154371e-05,
"loss": 0.3746,
"step": 332
},
{
"epoch": 1.0638977635782747,
"grad_norm": 0.8494398683413655,
"learning_rate": 3.261047038233931e-05,
"loss": 0.3969,
"step": 333
},
{
"epoch": 1.0670926517571886,
"grad_norm": 0.7126024738944513,
"learning_rate": 3.2552669505217646e-05,
"loss": 0.3474,
"step": 334
},
{
"epoch": 1.0702875399361023,
"grad_norm": 0.9359004672009658,
"learning_rate": 3.2494695119129924e-05,
"loss": 0.3707,
"step": 335
},
{
"epoch": 1.073482428115016,
"grad_norm": 0.6005459131808137,
"learning_rate": 3.243654802542568e-05,
"loss": 0.3063,
"step": 336
},
{
"epoch": 1.0766773162939298,
"grad_norm": 0.8842790772014463,
"learning_rate": 3.2378229027841675e-05,
"loss": 0.3765,
"step": 337
},
{
"epoch": 1.0798722044728435,
"grad_norm": 0.9070675817695492,
"learning_rate": 3.231973893249083e-05,
"loss": 0.3779,
"step": 338
},
{
"epoch": 1.0830670926517572,
"grad_norm": 0.670447746767721,
"learning_rate": 3.226107854785106e-05,
"loss": 0.4082,
"step": 339
},
{
"epoch": 1.0862619808306708,
"grad_norm": 0.9213283733651502,
"learning_rate": 3.220224868475408e-05,
"loss": 0.4237,
"step": 340
},
{
"epoch": 1.0894568690095847,
"grad_norm": 0.6839594328860654,
"learning_rate": 3.2143250156374226e-05,
"loss": 0.4307,
"step": 341
},
{
"epoch": 1.0926517571884984,
"grad_norm": 0.7489428445729561,
"learning_rate": 3.208408377821722e-05,
"loss": 0.3652,
"step": 342
},
{
"epoch": 1.095846645367412,
"grad_norm": 0.7550186524407567,
"learning_rate": 3.202475036810886e-05,
"loss": 0.406,
"step": 343
},
{
"epoch": 1.099041533546326,
"grad_norm": 0.6395054008269865,
"learning_rate": 3.1965250746183755e-05,
"loss": 0.3711,
"step": 344
},
{
"epoch": 1.1022364217252396,
"grad_norm": 0.657977446052051,
"learning_rate": 3.190558573487397e-05,
"loss": 0.3542,
"step": 345
},
{
"epoch": 1.1054313099041533,
"grad_norm": 0.8206988613968245,
"learning_rate": 3.1845756158897654e-05,
"loss": 0.3985,
"step": 346
},
{
"epoch": 1.1086261980830672,
"grad_norm": 0.7479778299255093,
"learning_rate": 3.178576284524765e-05,
"loss": 0.3371,
"step": 347
},
{
"epoch": 1.1118210862619808,
"grad_norm": 0.753877520359999,
"learning_rate": 3.1725606623180086e-05,
"loss": 0.3699,
"step": 348
},
{
"epoch": 1.1150159744408945,
"grad_norm": 0.8504076245127729,
"learning_rate": 3.166528832420283e-05,
"loss": 0.3912,
"step": 349
},
{
"epoch": 1.1182108626198084,
"grad_norm": 0.6964611574953985,
"learning_rate": 3.160480878206412e-05,
"loss": 0.3386,
"step": 350
},
{
"epoch": 1.121405750798722,
"grad_norm": 0.6824499798649496,
"learning_rate": 3.154416883274092e-05,
"loss": 0.3709,
"step": 351
},
{
"epoch": 1.1246006389776357,
"grad_norm": 1.0185089841293544,
"learning_rate": 3.148336931442745e-05,
"loss": 0.3634,
"step": 352
},
{
"epoch": 1.1277955271565494,
"grad_norm": 0.7083393549747615,
"learning_rate": 3.142241106752356e-05,
"loss": 0.3941,
"step": 353
},
{
"epoch": 1.1309904153354633,
"grad_norm": 0.8028674737087654,
"learning_rate": 3.136129493462312e-05,
"loss": 0.3424,
"step": 354
},
{
"epoch": 1.134185303514377,
"grad_norm": 0.9169589595364791,
"learning_rate": 3.130002176050238e-05,
"loss": 0.37,
"step": 355
},
{
"epoch": 1.1373801916932909,
"grad_norm": 0.6610294457775537,
"learning_rate": 3.123859239210827e-05,
"loss": 0.3673,
"step": 356
},
{
"epoch": 1.1405750798722045,
"grad_norm": 0.7218101821218883,
"learning_rate": 3.1177007678546746e-05,
"loss": 0.4232,
"step": 357
},
{
"epoch": 1.1437699680511182,
"grad_norm": 0.9486057955199187,
"learning_rate": 3.111526847107099e-05,
"loss": 0.3852,
"step": 358
},
{
"epoch": 1.1469648562300319,
"grad_norm": 0.6192218269870875,
"learning_rate": 3.105337562306968e-05,
"loss": 0.3301,
"step": 359
},
{
"epoch": 1.1501597444089458,
"grad_norm": 0.6912746341979487,
"learning_rate": 3.099132999005519e-05,
"loss": 0.3625,
"step": 360
},
{
"epoch": 1.1533546325878594,
"grad_norm": 0.8493378937993168,
"learning_rate": 3.092913242965175e-05,
"loss": 0.3951,
"step": 361
},
{
"epoch": 1.156549520766773,
"grad_norm": 0.833993863731902,
"learning_rate": 3.086678380158364e-05,
"loss": 0.3902,
"step": 362
},
{
"epoch": 1.159744408945687,
"grad_norm": 0.7398039881016663,
"learning_rate": 3.0804284967663214e-05,
"loss": 0.3924,
"step": 363
},
{
"epoch": 1.1629392971246006,
"grad_norm": 0.7703232633649562,
"learning_rate": 3.074163679177907e-05,
"loss": 0.3761,
"step": 364
},
{
"epoch": 1.1661341853035143,
"grad_norm": 0.89436430995006,
"learning_rate": 3.06788401398841e-05,
"loss": 0.3701,
"step": 365
},
{
"epoch": 1.1693290734824282,
"grad_norm": 0.7039513259201169,
"learning_rate": 3.061589587998346e-05,
"loss": 0.3622,
"step": 366
},
{
"epoch": 1.1725239616613419,
"grad_norm": 0.7355381764642869,
"learning_rate": 3.055280488212266e-05,
"loss": 0.3969,
"step": 367
},
{
"epoch": 1.1757188498402555,
"grad_norm": 0.8100722031529548,
"learning_rate": 3.0489568018375447e-05,
"loss": 0.3718,
"step": 368
},
{
"epoch": 1.1789137380191694,
"grad_norm": 0.7026117502103958,
"learning_rate": 3.042618616283184e-05,
"loss": 0.3405,
"step": 369
},
{
"epoch": 1.182108626198083,
"grad_norm": 0.7207118281643955,
"learning_rate": 3.036266019158596e-05,
"loss": 0.3889,
"step": 370
},
{
"epoch": 1.1853035143769968,
"grad_norm": 0.79060661451023,
"learning_rate": 3.0298990982724e-05,
"loss": 0.3994,
"step": 371
},
{
"epoch": 1.1884984025559104,
"grad_norm": 0.8441043300302222,
"learning_rate": 3.0235179416312025e-05,
"loss": 0.3508,
"step": 372
},
{
"epoch": 1.1916932907348243,
"grad_norm": 0.7349501648718484,
"learning_rate": 3.017122637438385e-05,
"loss": 0.3847,
"step": 373
},
{
"epoch": 1.194888178913738,
"grad_norm": 0.8725781608132315,
"learning_rate": 3.0107132740928832e-05,
"loss": 0.3926,
"step": 374
},
{
"epoch": 1.1980830670926517,
"grad_norm": 0.8327406117293235,
"learning_rate": 3.004289940187964e-05,
"loss": 0.3802,
"step": 375
},
{
"epoch": 1.2012779552715656,
"grad_norm": 0.7199915365896178,
"learning_rate": 2.9978527245100034e-05,
"loss": 0.354,
"step": 376
},
{
"epoch": 1.2044728434504792,
"grad_norm": 0.7419030308089841,
"learning_rate": 2.991401716037255e-05,
"loss": 0.3884,
"step": 377
},
{
"epoch": 1.207667731629393,
"grad_norm": 0.8062260851883908,
"learning_rate": 2.9849370039386284e-05,
"loss": 0.366,
"step": 378
},
{
"epoch": 1.2108626198083068,
"grad_norm": 0.7591181232522807,
"learning_rate": 2.9784586775724443e-05,
"loss": 0.3579,
"step": 379
},
{
"epoch": 1.2140575079872205,
"grad_norm": 0.8434345898198293,
"learning_rate": 2.971966826485212e-05,
"loss": 0.4524,
"step": 380
},
{
"epoch": 1.2172523961661341,
"grad_norm": 0.8495573303145608,
"learning_rate": 2.9654615404103837e-05,
"loss": 0.434,
"step": 381
},
{
"epoch": 1.220447284345048,
"grad_norm": 0.6933721175439074,
"learning_rate": 2.9589429092671155e-05,
"loss": 0.4343,
"step": 382
},
{
"epoch": 1.2236421725239617,
"grad_norm": 0.7274608975530785,
"learning_rate": 2.952411023159027e-05,
"loss": 0.3298,
"step": 383
},
{
"epoch": 1.2268370607028753,
"grad_norm": 0.8168975435659065,
"learning_rate": 2.945865972372954e-05,
"loss": 0.4002,
"step": 384
},
{
"epoch": 1.230031948881789,
"grad_norm": 0.8873260212581557,
"learning_rate": 2.939307847377699e-05,
"loss": 0.4397,
"step": 385
},
{
"epoch": 1.233226837060703,
"grad_norm": 0.6916592999050895,
"learning_rate": 2.9327367388227847e-05,
"loss": 0.391,
"step": 386
},
{
"epoch": 1.2364217252396166,
"grad_norm": 0.7553082417173358,
"learning_rate": 2.926152737537198e-05,
"loss": 0.3466,
"step": 387
},
{
"epoch": 1.2396166134185305,
"grad_norm": 0.8650485170070908,
"learning_rate": 2.9195559345281336e-05,
"loss": 0.4146,
"step": 388
},
{
"epoch": 1.2428115015974441,
"grad_norm": 0.7330543803583387,
"learning_rate": 2.9129464209797404e-05,
"loss": 0.3898,
"step": 389
},
{
"epoch": 1.2460063897763578,
"grad_norm": 0.7363439527629848,
"learning_rate": 2.906324288251857e-05,
"loss": 0.4112,
"step": 390
},
{
"epoch": 1.2492012779552715,
"grad_norm": 0.8327177625635779,
"learning_rate": 2.8996896278787504e-05,
"loss": 0.3905,
"step": 391
},
{
"epoch": 1.2523961661341854,
"grad_norm": 0.6537335351551286,
"learning_rate": 2.893042531567851e-05,
"loss": 0.4207,
"step": 392
},
{
"epoch": 1.255591054313099,
"grad_norm": 0.7146002049281741,
"learning_rate": 2.886383091198483e-05,
"loss": 0.3441,
"step": 393
},
{
"epoch": 1.2587859424920127,
"grad_norm": 0.8078428165110145,
"learning_rate": 2.8797113988205992e-05,
"loss": 0.4221,
"step": 394
},
{
"epoch": 1.2619808306709266,
"grad_norm": 0.5972404381082699,
"learning_rate": 2.8730275466535027e-05,
"loss": 0.3144,
"step": 395
},
{
"epoch": 1.2651757188498403,
"grad_norm": 0.7706819426110447,
"learning_rate": 2.866331627084576e-05,
"loss": 0.3822,
"step": 396
},
{
"epoch": 1.268370607028754,
"grad_norm": 0.6908783947369732,
"learning_rate": 2.8596237326680035e-05,
"loss": 0.3535,
"step": 397
},
{
"epoch": 1.2715654952076676,
"grad_norm": 0.712295172157205,
"learning_rate": 2.8529039561234904e-05,
"loss": 0.3748,
"step": 398
},
{
"epoch": 1.2747603833865815,
"grad_norm": 0.842337102034422,
"learning_rate": 2.846172390334983e-05,
"loss": 0.3949,
"step": 399
},
{
"epoch": 1.2779552715654952,
"grad_norm": 0.7287013629663512,
"learning_rate": 2.8394291283493846e-05,
"loss": 0.419,
"step": 400
},
{
"epoch": 1.281150159744409,
"grad_norm": 0.7190834632935403,
"learning_rate": 2.8326742633752693e-05,
"loss": 0.3852,
"step": 401
},
{
"epoch": 1.2843450479233227,
"grad_norm": 0.76075306510952,
"learning_rate": 2.82590788878159e-05,
"loss": 0.4172,
"step": 402
},
{
"epoch": 1.2875399361022364,
"grad_norm": 0.7039618344228508,
"learning_rate": 2.8191300980963956e-05,
"loss": 0.4121,
"step": 403
},
{
"epoch": 1.29073482428115,
"grad_norm": 0.6460440389338991,
"learning_rate": 2.8123409850055307e-05,
"loss": 0.3896,
"step": 404
},
{
"epoch": 1.293929712460064,
"grad_norm": 0.6974323179340803,
"learning_rate": 2.8055406433513437e-05,
"loss": 0.3549,
"step": 405
},
{
"epoch": 1.2971246006389776,
"grad_norm": 0.8173306197186939,
"learning_rate": 2.798729167131391e-05,
"loss": 0.4078,
"step": 406
},
{
"epoch": 1.3003194888178915,
"grad_norm": 0.7127151827524344,
"learning_rate": 2.7919066504971355e-05,
"loss": 0.3622,
"step": 407
},
{
"epoch": 1.3035143769968052,
"grad_norm": 0.732663954477486,
"learning_rate": 2.7850731877526454e-05,
"loss": 0.3845,
"step": 408
},
{
"epoch": 1.3067092651757188,
"grad_norm": 0.6128280361598326,
"learning_rate": 2.7782288733532915e-05,
"loss": 0.3877,
"step": 409
},
{
"epoch": 1.3099041533546325,
"grad_norm": 0.6950127994352941,
"learning_rate": 2.7713738019044424e-05,
"loss": 0.3538,
"step": 410
},
{
"epoch": 1.3130990415335464,
"grad_norm": 0.686187825577798,
"learning_rate": 2.764508068160154e-05,
"loss": 0.387,
"step": 411
},
{
"epoch": 1.31629392971246,
"grad_norm": 0.6478458382514949,
"learning_rate": 2.7576317670218626e-05,
"loss": 0.3751,
"step": 412
},
{
"epoch": 1.3194888178913737,
"grad_norm": 0.6815550771368388,
"learning_rate": 2.7507449935370726e-05,
"loss": 0.3475,
"step": 413
},
{
"epoch": 1.3226837060702876,
"grad_norm": 0.6741037656638967,
"learning_rate": 2.7438478428980407e-05,
"loss": 0.396,
"step": 414
},
{
"epoch": 1.3258785942492013,
"grad_norm": 0.7167174755449621,
"learning_rate": 2.736940410440462e-05,
"loss": 0.3964,
"step": 415
},
{
"epoch": 1.329073482428115,
"grad_norm": 0.6842795899926827,
"learning_rate": 2.7300227916421528e-05,
"loss": 0.3973,
"step": 416
},
{
"epoch": 1.3322683706070286,
"grad_norm": 0.702058426076705,
"learning_rate": 2.7230950821217294e-05,
"loss": 0.387,
"step": 417
},
{
"epoch": 1.3354632587859425,
"grad_norm": 0.6368932760300601,
"learning_rate": 2.7161573776372856e-05,
"loss": 0.3964,
"step": 418
},
{
"epoch": 1.3386581469648562,
"grad_norm": 0.6782570721877241,
"learning_rate": 2.7092097740850712e-05,
"loss": 0.4042,
"step": 419
},
{
"epoch": 1.34185303514377,
"grad_norm": 0.5931978671155632,
"learning_rate": 2.7022523674981674e-05,
"loss": 0.395,
"step": 420
},
{
"epoch": 1.3450479233226837,
"grad_norm": 0.7201174578306515,
"learning_rate": 2.6952852540451536e-05,
"loss": 0.3921,
"step": 421
},
{
"epoch": 1.3482428115015974,
"grad_norm": 0.6946900695747145,
"learning_rate": 2.688308530028786e-05,
"loss": 0.3652,
"step": 422
},
{
"epoch": 1.351437699680511,
"grad_norm": 0.698534785763629,
"learning_rate": 2.6813222918846613e-05,
"loss": 0.3741,
"step": 423
},
{
"epoch": 1.354632587859425,
"grad_norm": 0.7564830310668963,
"learning_rate": 2.6743266361798833e-05,
"loss": 0.4067,
"step": 424
},
{
"epoch": 1.3578274760383386,
"grad_norm": 0.8625137509946531,
"learning_rate": 2.6673216596117324e-05,
"loss": 0.3512,
"step": 425
},
{
"epoch": 1.3610223642172525,
"grad_norm": 0.6004241242239579,
"learning_rate": 2.660307459006325e-05,
"loss": 0.3885,
"step": 426
},
{
"epoch": 1.3642172523961662,
"grad_norm": 0.8493675462357329,
"learning_rate": 2.653284131317276e-05,
"loss": 0.359,
"step": 427
},
{
"epoch": 1.3674121405750799,
"grad_norm": 0.7054484310915905,
"learning_rate": 2.6462517736243612e-05,
"loss": 0.3776,
"step": 428
},
{
"epoch": 1.3706070287539935,
"grad_norm": 0.7635212688991575,
"learning_rate": 2.639210483132171e-05,
"loss": 0.3678,
"step": 429
},
{
"epoch": 1.3738019169329074,
"grad_norm": 0.7551414066116379,
"learning_rate": 2.6321603571687714e-05,
"loss": 0.3538,
"step": 430
},
{
"epoch": 1.376996805111821,
"grad_norm": 0.5933801420526407,
"learning_rate": 2.625101493184355e-05,
"loss": 0.3608,
"step": 431
},
{
"epoch": 1.3801916932907348,
"grad_norm": 0.7485716718685856,
"learning_rate": 2.618033988749895e-05,
"loss": 0.3392,
"step": 432
},
{
"epoch": 1.3833865814696487,
"grad_norm": 0.6856872085204931,
"learning_rate": 2.6109579415557997e-05,
"loss": 0.3696,
"step": 433
},
{
"epoch": 1.3865814696485623,
"grad_norm": 0.6906273049790326,
"learning_rate": 2.6038734494105562e-05,
"loss": 0.3949,
"step": 434
},
{
"epoch": 1.389776357827476,
"grad_norm": 0.7317141874141749,
"learning_rate": 2.5967806102393844e-05,
"loss": 0.3961,
"step": 435
},
{
"epoch": 1.3929712460063897,
"grad_norm": 0.7084024455483174,
"learning_rate": 2.5896795220828786e-05,
"loss": 0.3729,
"step": 436
},
{
"epoch": 1.3961661341853036,
"grad_norm": 0.6735234384239845,
"learning_rate": 2.582570283095656e-05,
"loss": 0.3755,
"step": 437
},
{
"epoch": 1.3993610223642172,
"grad_norm": 0.639545495865673,
"learning_rate": 2.575452991544998e-05,
"loss": 0.3461,
"step": 438
},
{
"epoch": 1.4025559105431311,
"grad_norm": 0.8314980113052899,
"learning_rate": 2.5683277458094926e-05,
"loss": 0.3766,
"step": 439
},
{
"epoch": 1.4057507987220448,
"grad_norm": 0.5774548863952043,
"learning_rate": 2.5611946443776733e-05,
"loss": 0.3822,
"step": 440
},
{
"epoch": 1.4089456869009584,
"grad_norm": 0.5808224894717059,
"learning_rate": 2.5540537858466596e-05,
"loss": 0.3936,
"step": 441
},
{
"epoch": 1.4121405750798721,
"grad_norm": 0.6123085104187193,
"learning_rate": 2.546905268920794e-05,
"loss": 0.344,
"step": 442
},
{
"epoch": 1.415335463258786,
"grad_norm": 0.5345986142478568,
"learning_rate": 2.5397491924102758e-05,
"loss": 0.3681,
"step": 443
},
{
"epoch": 1.4185303514376997,
"grad_norm": 0.7318254435855026,
"learning_rate": 2.532585655229797e-05,
"loss": 0.3677,
"step": 444
},
{
"epoch": 1.4217252396166133,
"grad_norm": 0.6012026977631383,
"learning_rate": 2.525414756397174e-05,
"loss": 0.334,
"step": 445
},
{
"epoch": 1.4249201277955272,
"grad_norm": 0.7187336200880879,
"learning_rate": 2.51823659503198e-05,
"loss": 0.4127,
"step": 446
},
{
"epoch": 1.428115015974441,
"grad_norm": 0.657446388301636,
"learning_rate": 2.5110512703541747e-05,
"loss": 0.367,
"step": 447
},
{
"epoch": 1.4313099041533546,
"grad_norm": 0.6399302817983334,
"learning_rate": 2.503858881682731e-05,
"loss": 0.4096,
"step": 448
},
{
"epoch": 1.4345047923322682,
"grad_norm": 0.6063474923650863,
"learning_rate": 2.4966595284342664e-05,
"loss": 0.3701,
"step": 449
},
{
"epoch": 1.4376996805111821,
"grad_norm": 0.6983262454628201,
"learning_rate": 2.489453310121663e-05,
"loss": 0.3796,
"step": 450
},
{
"epoch": 1.4408945686900958,
"grad_norm": 0.7119999272751214,
"learning_rate": 2.4822403263526966e-05,
"loss": 0.3553,
"step": 451
},
{
"epoch": 1.4440894568690097,
"grad_norm": 0.67185797833669,
"learning_rate": 2.4750206768286593e-05,
"loss": 0.3517,
"step": 452
},
{
"epoch": 1.4472843450479234,
"grad_norm": 0.6486337996555229,
"learning_rate": 2.4677944613429778e-05,
"loss": 0.3287,
"step": 453
},
{
"epoch": 1.450479233226837,
"grad_norm": 0.6516969882558993,
"learning_rate": 2.46056177977984e-05,
"loss": 0.3514,
"step": 454
},
{
"epoch": 1.4536741214057507,
"grad_norm": 0.7388191757920071,
"learning_rate": 2.4533227321128084e-05,
"loss": 0.4362,
"step": 455
},
{
"epoch": 1.4568690095846646,
"grad_norm": 0.6119479848006957,
"learning_rate": 2.4460774184034408e-05,
"loss": 0.3825,
"step": 456
},
{
"epoch": 1.4600638977635783,
"grad_norm": 0.7425050306965432,
"learning_rate": 2.4388259387999097e-05,
"loss": 0.3759,
"step": 457
},
{
"epoch": 1.4632587859424921,
"grad_norm": 0.6502397685694954,
"learning_rate": 2.4315683935356127e-05,
"loss": 0.3829,
"step": 458
},
{
"epoch": 1.4664536741214058,
"grad_norm": 0.6725716076372529,
"learning_rate": 2.4243048829277916e-05,
"loss": 0.3861,
"step": 459
},
{
"epoch": 1.4696485623003195,
"grad_norm": 0.6219046409113833,
"learning_rate": 2.4170355073761433e-05,
"loss": 0.3736,
"step": 460
},
{
"epoch": 1.4728434504792332,
"grad_norm": 0.5835078991741417,
"learning_rate": 2.4097603673614325e-05,
"loss": 0.3531,
"step": 461
},
{
"epoch": 1.476038338658147,
"grad_norm": 0.6550851854107704,
"learning_rate": 2.4024795634441044e-05,
"loss": 0.3262,
"step": 462
},
{
"epoch": 1.4792332268370607,
"grad_norm": 0.7675785572629827,
"learning_rate": 2.3951931962628918e-05,
"loss": 0.392,
"step": 463
},
{
"epoch": 1.4824281150159744,
"grad_norm": 0.6411841347495657,
"learning_rate": 2.3879013665334258e-05,
"loss": 0.4024,
"step": 464
},
{
"epoch": 1.4856230031948883,
"grad_norm": 0.6007938608835436,
"learning_rate": 2.380604175046844e-05,
"loss": 0.3661,
"step": 465
},
{
"epoch": 1.488817891373802,
"grad_norm": 0.6525253798215147,
"learning_rate": 2.373301722668398e-05,
"loss": 0.3746,
"step": 466
},
{
"epoch": 1.4920127795527156,
"grad_norm": 0.6446114375585952,
"learning_rate": 2.365994110336054e-05,
"loss": 0.3889,
"step": 467
},
{
"epoch": 1.4952076677316293,
"grad_norm": 0.6653896661247665,
"learning_rate": 2.358681439059106e-05,
"loss": 0.3594,
"step": 468
},
{
"epoch": 1.4984025559105432,
"grad_norm": 0.7011886854730449,
"learning_rate": 2.3513638099167723e-05,
"loss": 0.3889,
"step": 469
},
{
"epoch": 1.5015974440894568,
"grad_norm": 0.7538883021599542,
"learning_rate": 2.3440413240568022e-05,
"loss": 0.3642,
"step": 470
},
{
"epoch": 1.5047923322683707,
"grad_norm": 0.6733515586731865,
"learning_rate": 2.3367140826940768e-05,
"loss": 0.3482,
"step": 471
},
{
"epoch": 1.5079872204472844,
"grad_norm": 0.600416095532099,
"learning_rate": 2.329382187109211e-05,
"loss": 0.3399,
"step": 472
},
{
"epoch": 1.511182108626198,
"grad_norm": 0.687374773018976,
"learning_rate": 2.3220457386471496e-05,
"loss": 0.3754,
"step": 473
},
{
"epoch": 1.5143769968051117,
"grad_norm": 0.748016995785705,
"learning_rate": 2.3147048387157725e-05,
"loss": 0.3648,
"step": 474
},
{
"epoch": 1.5175718849840254,
"grad_norm": 0.7005861674092242,
"learning_rate": 2.3073595887844884e-05,
"loss": 0.305,
"step": 475
},
{
"epoch": 1.5207667731629393,
"grad_norm": 0.608521420596584,
"learning_rate": 2.3000100903828343e-05,
"loss": 0.3601,
"step": 476
},
{
"epoch": 1.5239616613418532,
"grad_norm": 0.6464475093135752,
"learning_rate": 2.2926564450990716e-05,
"loss": 0.3746,
"step": 477
},
{
"epoch": 1.5271565495207668,
"grad_norm": 0.6448444881128504,
"learning_rate": 2.2852987545787815e-05,
"loss": 0.3714,
"step": 478
},
{
"epoch": 1.5303514376996805,
"grad_norm": 0.6107451967926957,
"learning_rate": 2.2779371205234604e-05,
"loss": 0.3796,
"step": 479
},
{
"epoch": 1.5335463258785942,
"grad_norm": 0.660686053273227,
"learning_rate": 2.2705716446891143e-05,
"loss": 0.3822,
"step": 480
},
{
"epoch": 1.5367412140575079,
"grad_norm": 0.7750842489034678,
"learning_rate": 2.263202428884853e-05,
"loss": 0.4105,
"step": 481
},
{
"epoch": 1.5399361022364217,
"grad_norm": 0.6055959124966067,
"learning_rate": 2.2558295749714794e-05,
"loss": 0.4151,
"step": 482
},
{
"epoch": 1.5431309904153354,
"grad_norm": 0.6183198946717873,
"learning_rate": 2.2484531848600866e-05,
"loss": 0.3262,
"step": 483
},
{
"epoch": 1.5463258785942493,
"grad_norm": 0.6370711793501769,
"learning_rate": 2.2410733605106462e-05,
"loss": 0.3857,
"step": 484
},
{
"epoch": 1.549520766773163,
"grad_norm": 0.8015901036246532,
"learning_rate": 2.233690203930599e-05,
"loss": 0.3496,
"step": 485
},
{
"epoch": 1.5527156549520766,
"grad_norm": 0.7615585707403518,
"learning_rate": 2.2263038171734447e-05,
"loss": 0.3672,
"step": 486
},
{
"epoch": 1.5559105431309903,
"grad_norm": 0.5663892496577331,
"learning_rate": 2.2189143023373337e-05,
"loss": 0.3761,
"step": 487
},
{
"epoch": 1.5591054313099042,
"grad_norm": 0.7878946252561335,
"learning_rate": 2.2115217615636534e-05,
"loss": 0.3588,
"step": 488
},
{
"epoch": 1.5623003194888179,
"grad_norm": 0.7764697921864392,
"learning_rate": 2.204126297035617e-05,
"loss": 0.4196,
"step": 489
},
{
"epoch": 1.5654952076677318,
"grad_norm": 0.6542418496260201,
"learning_rate": 2.1967280109768505e-05,
"loss": 0.3408,
"step": 490
},
{
"epoch": 1.5686900958466454,
"grad_norm": 0.6223599528533879,
"learning_rate": 2.1893270056499832e-05,
"loss": 0.3777,
"step": 491
},
{
"epoch": 1.571884984025559,
"grad_norm": 0.661700414509121,
"learning_rate": 2.1819233833552275e-05,
"loss": 0.3128,
"step": 492
},
{
"epoch": 1.5750798722044728,
"grad_norm": 0.6234974404457322,
"learning_rate": 2.1745172464289722e-05,
"loss": 0.3962,
"step": 493
},
{
"epoch": 1.5782747603833864,
"grad_norm": 0.6676512384075517,
"learning_rate": 2.167108697242363e-05,
"loss": 0.3468,
"step": 494
},
{
"epoch": 1.5814696485623003,
"grad_norm": 0.6209137519763187,
"learning_rate": 2.1596978381998883e-05,
"loss": 0.344,
"step": 495
},
{
"epoch": 1.5846645367412142,
"grad_norm": 0.6215059716975698,
"learning_rate": 2.152284771737966e-05,
"loss": 0.3742,
"step": 496
},
{
"epoch": 1.5878594249201279,
"grad_norm": 0.6023037671559589,
"learning_rate": 2.1448696003235252e-05,
"loss": 0.3752,
"step": 497
},
{
"epoch": 1.5910543130990416,
"grad_norm": 0.6369188120702737,
"learning_rate": 2.1374524264525905e-05,
"loss": 0.3796,
"step": 498
},
{
"epoch": 1.5942492012779552,
"grad_norm": 0.5900720442619971,
"learning_rate": 2.130033352648866e-05,
"loss": 0.3535,
"step": 499
},
{
"epoch": 1.5974440894568689,
"grad_norm": 0.6255668983966362,
"learning_rate": 2.122612481462316e-05,
"loss": 0.4114,
"step": 500
},
{
"epoch": 1.6006389776357828,
"grad_norm": 0.6798385259233033,
"learning_rate": 2.115189915467752e-05,
"loss": 0.389,
"step": 501
},
{
"epoch": 1.6038338658146964,
"grad_norm": 0.6905443213231501,
"learning_rate": 2.1077657572634092e-05,
"loss": 0.3246,
"step": 502
},
{
"epoch": 1.6070287539936103,
"grad_norm": 0.571423682418217,
"learning_rate": 2.1003401094695325e-05,
"loss": 0.3344,
"step": 503
},
{
"epoch": 1.610223642172524,
"grad_norm": 0.6504514480344465,
"learning_rate": 2.0929130747269567e-05,
"loss": 0.3621,
"step": 504
},
{
"epoch": 1.6134185303514377,
"grad_norm": 0.6411322199210792,
"learning_rate": 2.0854847556956856e-05,
"loss": 0.3734,
"step": 505
},
{
"epoch": 1.6166134185303513,
"grad_norm": 0.6380721378438481,
"learning_rate": 2.078055255053478e-05,
"loss": 0.4034,
"step": 506
},
{
"epoch": 1.619808306709265,
"grad_norm": 0.6249192416083079,
"learning_rate": 2.070624675494424e-05,
"loss": 0.3504,
"step": 507
},
{
"epoch": 1.623003194888179,
"grad_norm": 0.6741471873600642,
"learning_rate": 2.0631931197275267e-05,
"loss": 0.3197,
"step": 508
},
{
"epoch": 1.6261980830670928,
"grad_norm": 0.6125040165749199,
"learning_rate": 2.0557606904752833e-05,
"loss": 0.3419,
"step": 509
},
{
"epoch": 1.6293929712460065,
"grad_norm": 0.6665831610245562,
"learning_rate": 2.0483274904722647e-05,
"loss": 0.3399,
"step": 510
},
{
"epoch": 1.6325878594249201,
"grad_norm": 0.6782509805533894,
"learning_rate": 2.0408936224636958e-05,
"loss": 0.384,
"step": 511
},
{
"epoch": 1.6357827476038338,
"grad_norm": 0.7436644924935581,
"learning_rate": 2.033459189204034e-05,
"loss": 0.3595,
"step": 512
},
{
"epoch": 1.6389776357827475,
"grad_norm": 0.7574508093688915,
"learning_rate": 2.026024293455551e-05,
"loss": 0.403,
"step": 513
},
{
"epoch": 1.6421725239616614,
"grad_norm": 0.6776067254300645,
"learning_rate": 2.0185890379869115e-05,
"loss": 0.3563,
"step": 514
},
{
"epoch": 1.645367412140575,
"grad_norm": 0.5691981666919255,
"learning_rate": 2.0111535255717496e-05,
"loss": 0.3613,
"step": 515
},
{
"epoch": 1.648562300319489,
"grad_norm": 0.6501032885266717,
"learning_rate": 2.0037178589872547e-05,
"loss": 0.3553,
"step": 516
},
{
"epoch": 1.6517571884984026,
"grad_norm": 0.5894185843432012,
"learning_rate": 1.9962821410127456e-05,
"loss": 0.3335,
"step": 517
},
{
"epoch": 1.6549520766773163,
"grad_norm": 0.639710192390572,
"learning_rate": 1.9888464744282504e-05,
"loss": 0.3627,
"step": 518
},
{
"epoch": 1.65814696485623,
"grad_norm": 0.6216821494956912,
"learning_rate": 1.981410962013089e-05,
"loss": 0.3344,
"step": 519
},
{
"epoch": 1.6613418530351438,
"grad_norm": 0.6345005257067161,
"learning_rate": 1.9739757065444492e-05,
"loss": 0.3698,
"step": 520
},
{
"epoch": 1.6645367412140575,
"grad_norm": 0.8101905044732475,
"learning_rate": 1.9665408107959657e-05,
"loss": 0.3861,
"step": 521
},
{
"epoch": 1.6677316293929714,
"grad_norm": 0.5961761807889937,
"learning_rate": 1.9591063775363045e-05,
"loss": 0.3535,
"step": 522
},
{
"epoch": 1.670926517571885,
"grad_norm": 0.7163037162115128,
"learning_rate": 1.951672509527736e-05,
"loss": 0.3573,
"step": 523
},
{
"epoch": 1.6741214057507987,
"grad_norm": 0.7626145562758659,
"learning_rate": 1.944239309524717e-05,
"loss": 0.3943,
"step": 524
},
{
"epoch": 1.6773162939297124,
"grad_norm": 0.6369019346205154,
"learning_rate": 1.936806880272474e-05,
"loss": 0.3311,
"step": 525
},
{
"epoch": 1.680511182108626,
"grad_norm": 0.6564798194303639,
"learning_rate": 1.9293753245055772e-05,
"loss": 0.4014,
"step": 526
},
{
"epoch": 1.68370607028754,
"grad_norm": 0.6312445430308768,
"learning_rate": 1.9219447449465222e-05,
"loss": 0.3123,
"step": 527
},
{
"epoch": 1.6869009584664538,
"grad_norm": 0.674163910217299,
"learning_rate": 1.9145152443043147e-05,
"loss": 0.4069,
"step": 528
},
{
"epoch": 1.6900958466453675,
"grad_norm": 0.5512026479165967,
"learning_rate": 1.9070869252730443e-05,
"loss": 0.3823,
"step": 529
},
{
"epoch": 1.6932907348242812,
"grad_norm": 0.5459303135589316,
"learning_rate": 1.899659890530468e-05,
"loss": 0.3801,
"step": 530
},
{
"epoch": 1.6964856230031948,
"grad_norm": 0.7107489428061275,
"learning_rate": 1.8922342427365915e-05,
"loss": 0.3743,
"step": 531
},
{
"epoch": 1.6996805111821085,
"grad_norm": 0.6821024548908552,
"learning_rate": 1.8848100845322486e-05,
"loss": 0.4001,
"step": 532
},
{
"epoch": 1.7028753993610224,
"grad_norm": 0.7194530880466413,
"learning_rate": 1.8773875185376845e-05,
"loss": 0.3967,
"step": 533
},
{
"epoch": 1.706070287539936,
"grad_norm": 0.619789910277277,
"learning_rate": 1.869966647351135e-05,
"loss": 0.3914,
"step": 534
},
{
"epoch": 1.70926517571885,
"grad_norm": 0.6514123622117488,
"learning_rate": 1.86254757354741e-05,
"loss": 0.355,
"step": 535
},
{
"epoch": 1.7124600638977636,
"grad_norm": 0.5831800498454969,
"learning_rate": 1.8551303996764755e-05,
"loss": 0.3715,
"step": 536
},
{
"epoch": 1.7156549520766773,
"grad_norm": 0.7041038447903819,
"learning_rate": 1.8477152282620344e-05,
"loss": 0.3452,
"step": 537
},
{
"epoch": 1.718849840255591,
"grad_norm": 0.6489046638136863,
"learning_rate": 1.8403021618001124e-05,
"loss": 0.328,
"step": 538
},
{
"epoch": 1.7220447284345048,
"grad_norm": 0.6220134083521842,
"learning_rate": 1.8328913027576373e-05,
"loss": 0.3644,
"step": 539
},
{
"epoch": 1.7252396166134185,
"grad_norm": 0.691164548614104,
"learning_rate": 1.825482753571028e-05,
"loss": 0.386,
"step": 540
},
{
"epoch": 1.7284345047923324,
"grad_norm": 0.6471216980585122,
"learning_rate": 1.818076616644773e-05,
"loss": 0.3863,
"step": 541
},
{
"epoch": 1.731629392971246,
"grad_norm": 0.5955048340238702,
"learning_rate": 1.8106729943500174e-05,
"loss": 0.3813,
"step": 542
},
{
"epoch": 1.7348242811501597,
"grad_norm": 0.5557497123983334,
"learning_rate": 1.80327198902315e-05,
"loss": 0.4207,
"step": 543
},
{
"epoch": 1.7380191693290734,
"grad_norm": 0.6140635892910118,
"learning_rate": 1.7958737029643835e-05,
"loss": 0.352,
"step": 544
},
{
"epoch": 1.741214057507987,
"grad_norm": 0.654044064069364,
"learning_rate": 1.788478238436347e-05,
"loss": 0.3887,
"step": 545
},
{
"epoch": 1.744408945686901,
"grad_norm": 0.5807508632491186,
"learning_rate": 1.781085697662667e-05,
"loss": 0.3833,
"step": 546
},
{
"epoch": 1.7476038338658149,
"grad_norm": 0.7236056668219373,
"learning_rate": 1.7736961828265553e-05,
"loss": 0.387,
"step": 547
},
{
"epoch": 1.7507987220447285,
"grad_norm": 0.6286981610831269,
"learning_rate": 1.7663097960694017e-05,
"loss": 0.413,
"step": 548
},
{
"epoch": 1.7539936102236422,
"grad_norm": 0.6471275268399443,
"learning_rate": 1.758926639489354e-05,
"loss": 0.3265,
"step": 549
},
{
"epoch": 1.7571884984025559,
"grad_norm": 0.6634839205028399,
"learning_rate": 1.7515468151399134e-05,
"loss": 0.3959,
"step": 550
},
{
"epoch": 1.7603833865814695,
"grad_norm": 0.6755904835423869,
"learning_rate": 1.7441704250285212e-05,
"loss": 0.3606,
"step": 551
},
{
"epoch": 1.7635782747603834,
"grad_norm": 0.6500797325645201,
"learning_rate": 1.7367975711151483e-05,
"loss": 0.3876,
"step": 552
},
{
"epoch": 1.766773162939297,
"grad_norm": 0.6477835743216911,
"learning_rate": 1.729428355310886e-05,
"loss": 0.3158,
"step": 553
},
{
"epoch": 1.769968051118211,
"grad_norm": 0.6950215837197072,
"learning_rate": 1.7220628794765403e-05,
"loss": 0.3578,
"step": 554
},
{
"epoch": 1.7731629392971247,
"grad_norm": 0.7161970787305121,
"learning_rate": 1.7147012454212195e-05,
"loss": 0.4181,
"step": 555
},
{
"epoch": 1.7763578274760383,
"grad_norm": 0.56303267169658,
"learning_rate": 1.7073435549009288e-05,
"loss": 0.3609,
"step": 556
},
{
"epoch": 1.779552715654952,
"grad_norm": 0.6914199086511422,
"learning_rate": 1.699989909617166e-05,
"loss": 0.3109,
"step": 557
},
{
"epoch": 1.7827476038338657,
"grad_norm": 0.6547054751902353,
"learning_rate": 1.6926404112155123e-05,
"loss": 0.3595,
"step": 558
},
{
"epoch": 1.7859424920127795,
"grad_norm": 0.7444527094326194,
"learning_rate": 1.6852951612842278e-05,
"loss": 0.3476,
"step": 559
},
{
"epoch": 1.7891373801916934,
"grad_norm": 0.8274662111243524,
"learning_rate": 1.677954261352851e-05,
"loss": 0.3673,
"step": 560
},
{
"epoch": 1.792332268370607,
"grad_norm": 0.7793207248626209,
"learning_rate": 1.6706178128907897e-05,
"loss": 0.3756,
"step": 561
},
{
"epoch": 1.7955271565495208,
"grad_norm": 0.6411675936700109,
"learning_rate": 1.6632859173059232e-05,
"loss": 0.3573,
"step": 562
},
{
"epoch": 1.7987220447284344,
"grad_norm": 0.7225443818014319,
"learning_rate": 1.655958675943198e-05,
"loss": 0.3443,
"step": 563
},
{
"epoch": 1.8019169329073481,
"grad_norm": 0.7764335703579314,
"learning_rate": 1.6486361900832284e-05,
"loss": 0.3644,
"step": 564
},
{
"epoch": 1.805111821086262,
"grad_norm": 0.7643738155044116,
"learning_rate": 1.6413185609408946e-05,
"loss": 0.3814,
"step": 565
},
{
"epoch": 1.8083067092651757,
"grad_norm": 0.6797205069953752,
"learning_rate": 1.6340058896639464e-05,
"loss": 0.3431,
"step": 566
},
{
"epoch": 1.8115015974440896,
"grad_norm": 0.6487569782925131,
"learning_rate": 1.6266982773316032e-05,
"loss": 0.3827,
"step": 567
},
{
"epoch": 1.8146964856230032,
"grad_norm": 0.7645835950918118,
"learning_rate": 1.6193958249531562e-05,
"loss": 0.4112,
"step": 568
},
{
"epoch": 1.817891373801917,
"grad_norm": 0.7113029044269039,
"learning_rate": 1.612098633466575e-05,
"loss": 0.3779,
"step": 569
},
{
"epoch": 1.8210862619808306,
"grad_norm": 0.5948671080899526,
"learning_rate": 1.6048068037371092e-05,
"loss": 0.3195,
"step": 570
},
{
"epoch": 1.8242811501597445,
"grad_norm": 0.70421201984334,
"learning_rate": 1.597520436555896e-05,
"loss": 0.3776,
"step": 571
},
{
"epoch": 1.8274760383386581,
"grad_norm": 0.6726359632379854,
"learning_rate": 1.590239632638568e-05,
"loss": 0.4225,
"step": 572
},
{
"epoch": 1.830670926517572,
"grad_norm": 0.681506981858163,
"learning_rate": 1.582964492623857e-05,
"loss": 0.3627,
"step": 573
},
{
"epoch": 1.8338658146964857,
"grad_norm": 0.6579916459695835,
"learning_rate": 1.575695117072209e-05,
"loss": 0.3566,
"step": 574
},
{
"epoch": 1.8370607028753994,
"grad_norm": 0.5767726433601224,
"learning_rate": 1.568431606464388e-05,
"loss": 0.3582,
"step": 575
},
{
"epoch": 1.840255591054313,
"grad_norm": 0.6950140418517404,
"learning_rate": 1.5611740612000906e-05,
"loss": 0.3861,
"step": 576
},
{
"epoch": 1.8434504792332267,
"grad_norm": 0.5791968225578055,
"learning_rate": 1.5539225815965595e-05,
"loss": 0.3383,
"step": 577
},
{
"epoch": 1.8466453674121406,
"grad_norm": 0.6049662279870999,
"learning_rate": 1.546677267887193e-05,
"loss": 0.3719,
"step": 578
},
{
"epoch": 1.8498402555910545,
"grad_norm": 0.6161946790988828,
"learning_rate": 1.5394382202201605e-05,
"loss": 0.382,
"step": 579
},
{
"epoch": 1.8530351437699681,
"grad_norm": 0.6325260524024612,
"learning_rate": 1.5322055386570225e-05,
"loss": 0.3587,
"step": 580
},
{
"epoch": 1.8562300319488818,
"grad_norm": 0.6241897380088132,
"learning_rate": 1.5249793231713418e-05,
"loss": 0.3386,
"step": 581
},
{
"epoch": 1.8594249201277955,
"grad_norm": 0.565220565027428,
"learning_rate": 1.5177596736473034e-05,
"loss": 0.3774,
"step": 582
},
{
"epoch": 1.8626198083067091,
"grad_norm": 0.5845662723376586,
"learning_rate": 1.5105466898783379e-05,
"loss": 0.3235,
"step": 583
},
{
"epoch": 1.865814696485623,
"grad_norm": 0.6433701307468581,
"learning_rate": 1.5033404715657344e-05,
"loss": 0.3728,
"step": 584
},
{
"epoch": 1.8690095846645367,
"grad_norm": 0.6619778705276104,
"learning_rate": 1.4961411183172686e-05,
"loss": 0.346,
"step": 585
},
{
"epoch": 1.8722044728434506,
"grad_norm": 0.7410247957434669,
"learning_rate": 1.4889487296458258e-05,
"loss": 0.367,
"step": 586
},
{
"epoch": 1.8753993610223643,
"grad_norm": 0.6248658361645802,
"learning_rate": 1.4817634049680207e-05,
"loss": 0.3377,
"step": 587
},
{
"epoch": 1.878594249201278,
"grad_norm": 0.5927935921175941,
"learning_rate": 1.4745852436028262e-05,
"loss": 0.3355,
"step": 588
},
{
"epoch": 1.8817891373801916,
"grad_norm": 0.6031296090039979,
"learning_rate": 1.4674143447702036e-05,
"loss": 0.3432,
"step": 589
},
{
"epoch": 1.8849840255591053,
"grad_norm": 0.6978415390998038,
"learning_rate": 1.4602508075897249e-05,
"loss": 0.4307,
"step": 590
},
{
"epoch": 1.8881789137380192,
"grad_norm": 0.6802206273285568,
"learning_rate": 1.453094731079206e-05,
"loss": 0.3337,
"step": 591
},
{
"epoch": 1.891373801916933,
"grad_norm": 0.6075920734911536,
"learning_rate": 1.4459462141533407e-05,
"loss": 0.3959,
"step": 592
},
{
"epoch": 1.8945686900958467,
"grad_norm": 0.5500279524690617,
"learning_rate": 1.4388053556223274e-05,
"loss": 0.3456,
"step": 593
},
{
"epoch": 1.8977635782747604,
"grad_norm": 0.598113110586812,
"learning_rate": 1.4316722541905081e-05,
"loss": 0.3581,
"step": 594
},
{
"epoch": 1.900958466453674,
"grad_norm": 0.5879717265811937,
"learning_rate": 1.4245470084550026e-05,
"loss": 0.3484,
"step": 595
},
{
"epoch": 1.9041533546325877,
"grad_norm": 0.5983260797622781,
"learning_rate": 1.4174297169043447e-05,
"loss": 0.3968,
"step": 596
},
{
"epoch": 1.9073482428115016,
"grad_norm": 0.5893645004999872,
"learning_rate": 1.410320477917122e-05,
"loss": 0.3377,
"step": 597
},
{
"epoch": 1.9105431309904153,
"grad_norm": 0.5981250674431736,
"learning_rate": 1.4032193897606164e-05,
"loss": 0.3728,
"step": 598
},
{
"epoch": 1.9137380191693292,
"grad_norm": 0.7033342157204642,
"learning_rate": 1.3961265505894442e-05,
"loss": 0.3986,
"step": 599
},
{
"epoch": 1.9169329073482428,
"grad_norm": 0.6193784424182153,
"learning_rate": 1.3890420584442007e-05,
"loss": 0.3833,
"step": 600
},
{
"epoch": 1.9201277955271565,
"grad_norm": 0.6378469529500934,
"learning_rate": 1.3819660112501054e-05,
"loss": 0.4048,
"step": 601
},
{
"epoch": 1.9233226837060702,
"grad_norm": 0.6260896725715146,
"learning_rate": 1.374898506815646e-05,
"loss": 0.3259,
"step": 602
},
{
"epoch": 1.926517571884984,
"grad_norm": 0.6830049362505771,
"learning_rate": 1.3678396428312291e-05,
"loss": 0.3824,
"step": 603
},
{
"epoch": 1.9297124600638977,
"grad_norm": 0.6210066067190025,
"learning_rate": 1.3607895168678296e-05,
"loss": 0.3612,
"step": 604
},
{
"epoch": 1.9329073482428116,
"grad_norm": 0.6413094034332517,
"learning_rate": 1.3537482263756391e-05,
"loss": 0.3548,
"step": 605
},
{
"epoch": 1.9361022364217253,
"grad_norm": 0.620634478966929,
"learning_rate": 1.3467158686827242e-05,
"loss": 0.3884,
"step": 606
},
{
"epoch": 1.939297124600639,
"grad_norm": 0.6407808790372046,
"learning_rate": 1.339692540993676e-05,
"loss": 0.3948,
"step": 607
},
{
"epoch": 1.9424920127795526,
"grad_norm": 0.7276368275858982,
"learning_rate": 1.332678340388268e-05,
"loss": 0.347,
"step": 608
},
{
"epoch": 1.9456869009584663,
"grad_norm": 0.6880632837814936,
"learning_rate": 1.3256733638201172e-05,
"loss": 0.3811,
"step": 609
},
{
"epoch": 1.9488817891373802,
"grad_norm": 0.568414071034355,
"learning_rate": 1.3186777081153398e-05,
"loss": 0.3852,
"step": 610
},
{
"epoch": 1.952076677316294,
"grad_norm": 0.6125392745773798,
"learning_rate": 1.311691469971214e-05,
"loss": 0.3314,
"step": 611
},
{
"epoch": 1.9552715654952078,
"grad_norm": 0.6154906870246765,
"learning_rate": 1.3047147459548469e-05,
"loss": 0.3983,
"step": 612
},
{
"epoch": 1.9584664536741214,
"grad_norm": 0.7090879134482768,
"learning_rate": 1.297747632501834e-05,
"loss": 0.3547,
"step": 613
},
{
"epoch": 1.961661341853035,
"grad_norm": 0.7919803140094102,
"learning_rate": 1.2907902259149287e-05,
"loss": 0.3884,
"step": 614
},
{
"epoch": 1.9648562300319488,
"grad_norm": 0.6135804624651449,
"learning_rate": 1.2838426223627152e-05,
"loss": 0.3292,
"step": 615
},
{
"epoch": 1.9680511182108626,
"grad_norm": 0.627406953051245,
"learning_rate": 1.2769049178782716e-05,
"loss": 0.3379,
"step": 616
},
{
"epoch": 1.9712460063897763,
"grad_norm": 0.7149906660956054,
"learning_rate": 1.2699772083578472e-05,
"loss": 0.3727,
"step": 617
},
{
"epoch": 1.9744408945686902,
"grad_norm": 0.7249915937043523,
"learning_rate": 1.2630595895595383e-05,
"loss": 0.3424,
"step": 618
},
{
"epoch": 1.9776357827476039,
"grad_norm": 0.7589039306193895,
"learning_rate": 1.2561521571019603e-05,
"loss": 0.3637,
"step": 619
},
{
"epoch": 1.9808306709265175,
"grad_norm": 0.6176823766792565,
"learning_rate": 1.249255006462928e-05,
"loss": 0.3495,
"step": 620
},
{
"epoch": 1.9840255591054312,
"grad_norm": 0.7150454220639018,
"learning_rate": 1.2423682329781378e-05,
"loss": 0.3628,
"step": 621
},
{
"epoch": 1.9872204472843449,
"grad_norm": 0.6869892242638056,
"learning_rate": 1.2354919318398473e-05,
"loss": 0.3528,
"step": 622
},
{
"epoch": 1.9904153354632588,
"grad_norm": 0.7412388678758558,
"learning_rate": 1.2286261980955583e-05,
"loss": 0.3744,
"step": 623
},
{
"epoch": 1.9936102236421727,
"grad_norm": 0.7562821838566286,
"learning_rate": 1.2217711266467092e-05,
"loss": 0.3947,
"step": 624
},
{
"epoch": 1.9968051118210863,
"grad_norm": 0.5867017695210721,
"learning_rate": 1.2149268122473554e-05,
"loss": 0.327,
"step": 625
},
{
"epoch": 2.0,
"grad_norm": 0.5635062124393114,
"learning_rate": 1.2080933495028648e-05,
"loss": 0.3089,
"step": 626
},
{
"epoch": 2.0031948881789137,
"grad_norm": 0.6715285391032262,
"learning_rate": 1.2012708328686093e-05,
"loss": 0.2396,
"step": 627
},
{
"epoch": 2.0063897763578273,
"grad_norm": 0.5999774675489589,
"learning_rate": 1.1944593566486562e-05,
"loss": 0.2402,
"step": 628
},
{
"epoch": 2.009584664536741,
"grad_norm": 0.6523644403725044,
"learning_rate": 1.18765901499447e-05,
"loss": 0.2595,
"step": 629
},
{
"epoch": 2.012779552715655,
"grad_norm": 0.6688113012671898,
"learning_rate": 1.1808699019036047e-05,
"loss": 0.2063,
"step": 630
},
{
"epoch": 2.015974440894569,
"grad_norm": 0.8690136893873772,
"learning_rate": 1.17409211121841e-05,
"loss": 0.2628,
"step": 631
},
{
"epoch": 2.0191693290734825,
"grad_norm": 0.5849734190846696,
"learning_rate": 1.1673257366247319e-05,
"loss": 0.2178,
"step": 632
},
{
"epoch": 2.022364217252396,
"grad_norm": 0.6360648521093288,
"learning_rate": 1.1605708716506161e-05,
"loss": 0.2374,
"step": 633
},
{
"epoch": 2.02555910543131,
"grad_norm": 0.5809260418478515,
"learning_rate": 1.1538276096650175e-05,
"loss": 0.1988,
"step": 634
},
{
"epoch": 2.0287539936102235,
"grad_norm": 0.6364023975071599,
"learning_rate": 1.1470960438765108e-05,
"loss": 0.253,
"step": 635
},
{
"epoch": 2.0319488817891376,
"grad_norm": 0.608832086457546,
"learning_rate": 1.1403762673319983e-05,
"loss": 0.2254,
"step": 636
},
{
"epoch": 2.0351437699680512,
"grad_norm": 0.6196453859771298,
"learning_rate": 1.133668372915425e-05,
"loss": 0.2477,
"step": 637
},
{
"epoch": 2.038338658146965,
"grad_norm": 0.49320636326890754,
"learning_rate": 1.1269724533464984e-05,
"loss": 0.2344,
"step": 638
},
{
"epoch": 2.0415335463258786,
"grad_norm": 0.5893706817555677,
"learning_rate": 1.1202886011794023e-05,
"loss": 0.284,
"step": 639
},
{
"epoch": 2.0447284345047922,
"grad_norm": 0.5927365358551429,
"learning_rate": 1.1136169088015177e-05,
"loss": 0.2446,
"step": 640
},
{
"epoch": 2.047923322683706,
"grad_norm": 0.5912266488563296,
"learning_rate": 1.1069574684321505e-05,
"loss": 0.249,
"step": 641
},
{
"epoch": 2.0511182108626196,
"grad_norm": 0.5068898035040869,
"learning_rate": 1.1003103721212503e-05,
"loss": 0.2404,
"step": 642
},
{
"epoch": 2.0543130990415337,
"grad_norm": 0.49830277604732454,
"learning_rate": 1.0936757117481438e-05,
"loss": 0.2226,
"step": 643
},
{
"epoch": 2.0575079872204474,
"grad_norm": 0.5137466675546016,
"learning_rate": 1.0870535790202606e-05,
"loss": 0.2246,
"step": 644
},
{
"epoch": 2.060702875399361,
"grad_norm": 0.5579716315671657,
"learning_rate": 1.080444065471867e-05,
"loss": 0.218,
"step": 645
},
{
"epoch": 2.0638977635782747,
"grad_norm": 0.5528250590936209,
"learning_rate": 1.0738472624628034e-05,
"loss": 0.2485,
"step": 646
},
{
"epoch": 2.0670926517571884,
"grad_norm": 0.5493618613823972,
"learning_rate": 1.0672632611772156e-05,
"loss": 0.2425,
"step": 647
},
{
"epoch": 2.070287539936102,
"grad_norm": 0.5673228300389772,
"learning_rate": 1.0606921526223016e-05,
"loss": 0.2497,
"step": 648
},
{
"epoch": 2.073482428115016,
"grad_norm": 0.5203097618726312,
"learning_rate": 1.0541340276270468e-05,
"loss": 0.212,
"step": 649
},
{
"epoch": 2.07667731629393,
"grad_norm": 0.5534033378725121,
"learning_rate": 1.0475889768409729e-05,
"loss": 0.1971,
"step": 650
},
{
"epoch": 2.0798722044728435,
"grad_norm": 0.5300132980491199,
"learning_rate": 1.0410570907328848e-05,
"loss": 0.2576,
"step": 651
},
{
"epoch": 2.083067092651757,
"grad_norm": 0.5690239162135191,
"learning_rate": 1.0345384595896161e-05,
"loss": 0.2366,
"step": 652
},
{
"epoch": 2.086261980830671,
"grad_norm": 0.5256402067518808,
"learning_rate": 1.028033173514788e-05,
"loss": 0.235,
"step": 653
},
{
"epoch": 2.0894568690095845,
"grad_norm": 0.48537333431940805,
"learning_rate": 1.0215413224275552e-05,
"loss": 0.2154,
"step": 654
},
{
"epoch": 2.0926517571884986,
"grad_norm": 0.5161821862609771,
"learning_rate": 1.0150629960613721e-05,
"loss": 0.21,
"step": 655
},
{
"epoch": 2.0958466453674123,
"grad_norm": 0.5281317540190527,
"learning_rate": 1.0085982839627445e-05,
"loss": 0.1899,
"step": 656
},
{
"epoch": 2.099041533546326,
"grad_norm": 0.4709200890585819,
"learning_rate": 1.0021472754899966e-05,
"loss": 0.2292,
"step": 657
},
{
"epoch": 2.1022364217252396,
"grad_norm": 0.45322842520766593,
"learning_rate": 9.957100598120357e-06,
"loss": 0.2147,
"step": 658
},
{
"epoch": 2.1054313099041533,
"grad_norm": 0.5018911122380977,
"learning_rate": 9.89286725907117e-06,
"loss": 0.2665,
"step": 659
},
{
"epoch": 2.108626198083067,
"grad_norm": 0.5090958885022994,
"learning_rate": 9.828773625616145e-06,
"loss": 0.2071,
"step": 660
},
{
"epoch": 2.1118210862619806,
"grad_norm": 0.5042008645673148,
"learning_rate": 9.764820583687978e-06,
"loss": 0.22,
"step": 661
},
{
"epoch": 2.1150159744408947,
"grad_norm": 0.5636525223827146,
"learning_rate": 9.701009017276008e-06,
"loss": 0.2255,
"step": 662
},
{
"epoch": 2.1182108626198084,
"grad_norm": 0.4968534545435798,
"learning_rate": 9.637339808414042e-06,
"loss": 0.2187,
"step": 663
},
{
"epoch": 2.121405750798722,
"grad_norm": 0.536209126935731,
"learning_rate": 9.573813837168166e-06,
"loss": 0.2212,
"step": 664
},
{
"epoch": 2.1246006389776357,
"grad_norm": 0.477893192356411,
"learning_rate": 9.510431981624554e-06,
"loss": 0.2084,
"step": 665
},
{
"epoch": 2.1277955271565494,
"grad_norm": 0.47741193568130846,
"learning_rate": 9.447195117877343e-06,
"loss": 0.216,
"step": 666
},
{
"epoch": 2.130990415335463,
"grad_norm": 0.5101367545264079,
"learning_rate": 9.384104120016542e-06,
"loss": 0.2634,
"step": 667
},
{
"epoch": 2.134185303514377,
"grad_norm": 0.49187172517640576,
"learning_rate": 9.321159860115909e-06,
"loss": 0.224,
"step": 668
},
{
"epoch": 2.137380191693291,
"grad_norm": 0.5056192356778784,
"learning_rate": 9.258363208220929e-06,
"loss": 0.2228,
"step": 669
},
{
"epoch": 2.1405750798722045,
"grad_norm": 0.5445137543547477,
"learning_rate": 9.195715032336794e-06,
"loss": 0.2252,
"step": 670
},
{
"epoch": 2.143769968051118,
"grad_norm": 0.5060926192022697,
"learning_rate": 9.13321619841637e-06,
"loss": 0.2774,
"step": 671
},
{
"epoch": 2.146964856230032,
"grad_norm": 0.47010736862307156,
"learning_rate": 9.070867570348247e-06,
"loss": 0.22,
"step": 672
},
{
"epoch": 2.1501597444089455,
"grad_norm": 0.5094958330325633,
"learning_rate": 9.00867000994482e-06,
"loss": 0.2256,
"step": 673
},
{
"epoch": 2.1533546325878596,
"grad_norm": 0.5259589614128072,
"learning_rate": 8.946624376930333e-06,
"loss": 0.2755,
"step": 674
},
{
"epoch": 2.1565495207667733,
"grad_norm": 0.5139540383730451,
"learning_rate": 8.884731528929019e-06,
"loss": 0.2659,
"step": 675
},
{
"epoch": 2.159744408945687,
"grad_norm": 0.4832097809114296,
"learning_rate": 8.822992321453264e-06,
"loss": 0.215,
"step": 676
},
{
"epoch": 2.1629392971246006,
"grad_norm": 0.44217298247605424,
"learning_rate": 8.76140760789174e-06,
"loss": 0.2212,
"step": 677
},
{
"epoch": 2.1661341853035143,
"grad_norm": 0.5003409469817744,
"learning_rate": 8.69997823949763e-06,
"loss": 0.1728,
"step": 678
},
{
"epoch": 2.169329073482428,
"grad_norm": 0.4819137577229696,
"learning_rate": 8.638705065376887e-06,
"loss": 0.2091,
"step": 679
},
{
"epoch": 2.1725239616613417,
"grad_norm": 0.48347484950099356,
"learning_rate": 8.577588932476448e-06,
"loss": 0.208,
"step": 680
},
{
"epoch": 2.1757188498402558,
"grad_norm": 0.48841924781203694,
"learning_rate": 8.516630685572553e-06,
"loss": 0.2097,
"step": 681
},
{
"epoch": 2.1789137380191694,
"grad_norm": 0.5540389648785453,
"learning_rate": 8.455831167259086e-06,
"loss": 0.2343,
"step": 682
},
{
"epoch": 2.182108626198083,
"grad_norm": 0.45425575230845544,
"learning_rate": 8.395191217935883e-06,
"loss": 0.2278,
"step": 683
},
{
"epoch": 2.1853035143769968,
"grad_norm": 0.4779637593364659,
"learning_rate": 8.33471167579717e-06,
"loss": 0.2637,
"step": 684
},
{
"epoch": 2.1884984025559104,
"grad_norm": 0.5285221690710397,
"learning_rate": 8.274393376819924e-06,
"loss": 0.2347,
"step": 685
},
{
"epoch": 2.191693290734824,
"grad_norm": 0.473579878257591,
"learning_rate": 8.214237154752345e-06,
"loss": 0.2159,
"step": 686
},
{
"epoch": 2.194888178913738,
"grad_norm": 0.501799232462877,
"learning_rate": 8.154243841102351e-06,
"loss": 0.2476,
"step": 687
},
{
"epoch": 2.198083067092652,
"grad_norm": 0.4627453575153537,
"learning_rate": 8.09441426512604e-06,
"loss": 0.2466,
"step": 688
},
{
"epoch": 2.2012779552715656,
"grad_norm": 0.5185845450190892,
"learning_rate": 8.03474925381625e-06,
"loss": 0.2418,
"step": 689
},
{
"epoch": 2.2044728434504792,
"grad_norm": 0.49421035355847515,
"learning_rate": 7.97524963189115e-06,
"loss": 0.2903,
"step": 690
},
{
"epoch": 2.207667731629393,
"grad_norm": 0.4884543130989985,
"learning_rate": 7.91591622178279e-06,
"loss": 0.2604,
"step": 691
},
{
"epoch": 2.2108626198083066,
"grad_norm": 0.421393965123483,
"learning_rate": 7.856749843625777e-06,
"loss": 0.2027,
"step": 692
},
{
"epoch": 2.2140575079872207,
"grad_norm": 0.49505325003001727,
"learning_rate": 7.797751315245927e-06,
"loss": 0.2265,
"step": 693
},
{
"epoch": 2.2172523961661343,
"grad_norm": 0.5048775790142346,
"learning_rate": 7.738921452148949e-06,
"loss": 0.2624,
"step": 694
},
{
"epoch": 2.220447284345048,
"grad_norm": 0.48026767805561044,
"learning_rate": 7.68026106750917e-06,
"loss": 0.267,
"step": 695
},
{
"epoch": 2.2236421725239617,
"grad_norm": 0.5099721248485902,
"learning_rate": 7.621770972158331e-06,
"loss": 0.2275,
"step": 696
},
{
"epoch": 2.2268370607028753,
"grad_norm": 0.478130569524219,
"learning_rate": 7.563451974574332e-06,
"loss": 0.2135,
"step": 697
},
{
"epoch": 2.230031948881789,
"grad_norm": 0.4333719232122216,
"learning_rate": 7.5053048808700814e-06,
"loss": 0.2028,
"step": 698
},
{
"epoch": 2.2332268370607027,
"grad_norm": 0.4907979978569055,
"learning_rate": 7.447330494782363e-06,
"loss": 0.2448,
"step": 699
},
{
"epoch": 2.236421725239617,
"grad_norm": 0.6087204868248344,
"learning_rate": 7.389529617660705e-06,
"loss": 0.2514,
"step": 700
},
{
"epoch": 2.2396166134185305,
"grad_norm": 0.4779517559418497,
"learning_rate": 7.331903048456299e-06,
"loss": 0.1698,
"step": 701
},
{
"epoch": 2.242811501597444,
"grad_norm": 0.48682990012078686,
"learning_rate": 7.274451583711e-06,
"loss": 0.2456,
"step": 702
},
{
"epoch": 2.246006389776358,
"grad_norm": 0.5696452459133338,
"learning_rate": 7.217176017546263e-06,
"loss": 0.23,
"step": 703
},
{
"epoch": 2.2492012779552715,
"grad_norm": 0.47827230908811474,
"learning_rate": 7.160077141652186e-06,
"loss": 0.2121,
"step": 704
},
{
"epoch": 2.252396166134185,
"grad_norm": 0.49227009280747874,
"learning_rate": 7.1031557452765934e-06,
"loss": 0.2662,
"step": 705
},
{
"epoch": 2.255591054313099,
"grad_norm": 0.5350393375681801,
"learning_rate": 7.046412615214075e-06,
"loss": 0.2493,
"step": 706
},
{
"epoch": 2.258785942492013,
"grad_norm": 0.4684828298726711,
"learning_rate": 6.98984853579517e-06,
"loss": 0.1838,
"step": 707
},
{
"epoch": 2.2619808306709266,
"grad_norm": 0.4784754257674382,
"learning_rate": 6.933464288875467e-06,
"loss": 0.1974,
"step": 708
},
{
"epoch": 2.2651757188498403,
"grad_norm": 0.4997974355017908,
"learning_rate": 6.8772606538248285e-06,
"loss": 0.2016,
"step": 709
},
{
"epoch": 2.268370607028754,
"grad_norm": 0.47011903064932126,
"learning_rate": 6.821238407516635e-06,
"loss": 0.2346,
"step": 710
},
{
"epoch": 2.2715654952076676,
"grad_norm": 0.45084286894131964,
"learning_rate": 6.765398324316996e-06,
"loss": 0.2261,
"step": 711
},
{
"epoch": 2.2747603833865817,
"grad_norm": 0.4586308317736137,
"learning_rate": 6.7097411760741075e-06,
"loss": 0.2141,
"step": 712
},
{
"epoch": 2.2779552715654954,
"grad_norm": 0.452772588205902,
"learning_rate": 6.654267732107516e-06,
"loss": 0.1987,
"step": 713
},
{
"epoch": 2.281150159744409,
"grad_norm": 0.5116763949149057,
"learning_rate": 6.598978759197554e-06,
"loss": 0.2392,
"step": 714
},
{
"epoch": 2.2843450479233227,
"grad_norm": 0.498901942763996,
"learning_rate": 6.543875021574686e-06,
"loss": 0.1921,
"step": 715
},
{
"epoch": 2.2875399361022364,
"grad_norm": 0.49554234705109035,
"learning_rate": 6.4889572809089655e-06,
"loss": 0.2205,
"step": 716
},
{
"epoch": 2.29073482428115,
"grad_norm": 0.4845867709952482,
"learning_rate": 6.43422629629953e-06,
"loss": 0.2497,
"step": 717
},
{
"epoch": 2.2939297124600637,
"grad_norm": 0.47026482069183134,
"learning_rate": 6.379682824264055e-06,
"loss": 0.2295,
"step": 718
},
{
"epoch": 2.297124600638978,
"grad_norm": 0.4649585300155169,
"learning_rate": 6.325327618728356e-06,
"loss": 0.1516,
"step": 719
},
{
"epoch": 2.3003194888178915,
"grad_norm": 0.40206850228054203,
"learning_rate": 6.271161431015922e-06,
"loss": 0.2417,
"step": 720
},
{
"epoch": 2.303514376996805,
"grad_norm": 0.430228031696727,
"learning_rate": 6.2171850098375475e-06,
"loss": 0.2368,
"step": 721
},
{
"epoch": 2.306709265175719,
"grad_norm": 0.42538480611909024,
"learning_rate": 6.163399101281e-06,
"loss": 0.1991,
"step": 722
},
{
"epoch": 2.3099041533546325,
"grad_norm": 0.5044088982415396,
"learning_rate": 6.1098044488006735e-06,
"loss": 0.1973,
"step": 723
},
{
"epoch": 2.313099041533546,
"grad_norm": 0.4668130698688352,
"learning_rate": 6.056401793207329e-06,
"loss": 0.2262,
"step": 724
},
{
"epoch": 2.31629392971246,
"grad_norm": 0.5258923718903504,
"learning_rate": 6.003191872657878e-06,
"loss": 0.2634,
"step": 725
},
{
"epoch": 2.319488817891374,
"grad_norm": 0.47646849150208387,
"learning_rate": 5.950175422645134e-06,
"loss": 0.2301,
"step": 726
},
{
"epoch": 2.3226837060702876,
"grad_norm": 0.4597740271176972,
"learning_rate": 5.897353175987668e-06,
"loss": 0.2313,
"step": 727
},
{
"epoch": 2.3258785942492013,
"grad_norm": 0.4428572782581877,
"learning_rate": 5.844725862819703e-06,
"loss": 0.2174,
"step": 728
},
{
"epoch": 2.329073482428115,
"grad_norm": 0.5173021855990265,
"learning_rate": 5.792294210580971e-06,
"loss": 0.2068,
"step": 729
},
{
"epoch": 2.3322683706070286,
"grad_norm": 0.4883764779635357,
"learning_rate": 5.740058944006697e-06,
"loss": 0.194,
"step": 730
},
{
"epoch": 2.3354632587859427,
"grad_norm": 0.5032508220310962,
"learning_rate": 5.688020785117581e-06,
"loss": 0.2653,
"step": 731
},
{
"epoch": 2.3386581469648564,
"grad_norm": 0.5190580775432023,
"learning_rate": 5.636180453209789e-06,
"loss": 0.2323,
"step": 732
},
{
"epoch": 2.34185303514377,
"grad_norm": 0.4545099042298369,
"learning_rate": 5.584538664845034e-06,
"loss": 0.2282,
"step": 733
},
{
"epoch": 2.3450479233226837,
"grad_norm": 0.46066006930429154,
"learning_rate": 5.533096133840677e-06,
"loss": 0.2454,
"step": 734
},
{
"epoch": 2.3482428115015974,
"grad_norm": 0.49707563335332516,
"learning_rate": 5.48185357125983e-06,
"loss": 0.2457,
"step": 735
},
{
"epoch": 2.351437699680511,
"grad_norm": 0.4739619495394498,
"learning_rate": 5.4308116854015644e-06,
"loss": 0.2192,
"step": 736
},
{
"epoch": 2.3546325878594248,
"grad_norm": 0.46153270987931605,
"learning_rate": 5.379971181791093e-06,
"loss": 0.2727,
"step": 737
},
{
"epoch": 2.357827476038339,
"grad_norm": 0.44872264474740164,
"learning_rate": 5.3293327631700185e-06,
"loss": 0.2112,
"step": 738
},
{
"epoch": 2.3610223642172525,
"grad_norm": 0.46169938248569553,
"learning_rate": 5.278897129486656e-06,
"loss": 0.2021,
"step": 739
},
{
"epoch": 2.364217252396166,
"grad_norm": 0.4614793999691915,
"learning_rate": 5.228664977886304e-06,
"loss": 0.201,
"step": 740
},
{
"epoch": 2.36741214057508,
"grad_norm": 0.48601214815606647,
"learning_rate": 5.178637002701639e-06,
"loss": 0.2016,
"step": 741
},
{
"epoch": 2.3706070287539935,
"grad_norm": 0.4448925101508038,
"learning_rate": 5.128813895443132e-06,
"loss": 0.2411,
"step": 742
},
{
"epoch": 2.373801916932907,
"grad_norm": 0.45678562943215706,
"learning_rate": 5.079196344789454e-06,
"loss": 0.2397,
"step": 743
},
{
"epoch": 2.376996805111821,
"grad_norm": 0.47737064374073584,
"learning_rate": 5.029785036577976e-06,
"loss": 0.2168,
"step": 744
},
{
"epoch": 2.380191693290735,
"grad_norm": 0.4330785555805196,
"learning_rate": 4.980580653795306e-06,
"loss": 0.2206,
"step": 745
},
{
"epoch": 2.3833865814696487,
"grad_norm": 0.5282508944251668,
"learning_rate": 4.931583876567807e-06,
"loss": 0.2209,
"step": 746
},
{
"epoch": 2.3865814696485623,
"grad_norm": 0.48191470832355815,
"learning_rate": 4.882795382152223e-06,
"loss": 0.2604,
"step": 747
},
{
"epoch": 2.389776357827476,
"grad_norm": 0.45329891295506547,
"learning_rate": 4.834215844926338e-06,
"loss": 0.2662,
"step": 748
},
{
"epoch": 2.3929712460063897,
"grad_norm": 0.5020462913719165,
"learning_rate": 4.785845936379601e-06,
"loss": 0.1824,
"step": 749
},
{
"epoch": 2.3961661341853033,
"grad_norm": 0.5125956468296781,
"learning_rate": 4.737686325103883e-06,
"loss": 0.2388,
"step": 750
},
{
"epoch": 2.3993610223642174,
"grad_norm": 0.4610736608378653,
"learning_rate": 4.6897376767842365e-06,
"loss": 0.2184,
"step": 751
},
{
"epoch": 2.402555910543131,
"grad_norm": 0.4215976064237638,
"learning_rate": 4.642000654189673e-06,
"loss": 0.2239,
"step": 752
},
{
"epoch": 2.405750798722045,
"grad_norm": 0.4605285704227125,
"learning_rate": 4.59447591716401e-06,
"loss": 0.2298,
"step": 753
},
{
"epoch": 2.4089456869009584,
"grad_norm": 0.4485924651231439,
"learning_rate": 4.547164122616767e-06,
"loss": 0.2197,
"step": 754
},
{
"epoch": 2.412140575079872,
"grad_norm": 0.4611809765297732,
"learning_rate": 4.500065924514059e-06,
"loss": 0.2405,
"step": 755
},
{
"epoch": 2.415335463258786,
"grad_norm": 0.44198322300277526,
"learning_rate": 4.453181973869565e-06,
"loss": 0.261,
"step": 756
},
{
"epoch": 2.4185303514377,
"grad_norm": 0.4422002397029462,
"learning_rate": 4.406512918735555e-06,
"loss": 0.2086,
"step": 757
},
{
"epoch": 2.4217252396166136,
"grad_norm": 0.44126751575207934,
"learning_rate": 4.360059404193892e-06,
"loss": 0.238,
"step": 758
},
{
"epoch": 2.4249201277955272,
"grad_norm": 0.4639687021386926,
"learning_rate": 4.313822072347136e-06,
"loss": 0.215,
"step": 759
},
{
"epoch": 2.428115015974441,
"grad_norm": 0.44442484361823825,
"learning_rate": 4.267801562309679e-06,
"loss": 0.1696,
"step": 760
},
{
"epoch": 2.4313099041533546,
"grad_norm": 0.45364102082678376,
"learning_rate": 4.221998510198888e-06,
"loss": 0.2313,
"step": 761
},
{
"epoch": 2.4345047923322682,
"grad_norm": 0.5633393868088448,
"learning_rate": 4.176413549126322e-06,
"loss": 0.1996,
"step": 762
},
{
"epoch": 2.437699680511182,
"grad_norm": 0.4842473998169101,
"learning_rate": 4.131047309188994e-06,
"loss": 0.2374,
"step": 763
},
{
"epoch": 2.440894568690096,
"grad_norm": 0.49271155780265713,
"learning_rate": 4.085900417460633e-06,
"loss": 0.2428,
"step": 764
},
{
"epoch": 2.4440894568690097,
"grad_norm": 0.4474969801362191,
"learning_rate": 4.040973497983052e-06,
"loss": 0.2149,
"step": 765
},
{
"epoch": 2.4472843450479234,
"grad_norm": 0.49449842276919154,
"learning_rate": 3.996267171757486e-06,
"loss": 0.2221,
"step": 766
},
{
"epoch": 2.450479233226837,
"grad_norm": 0.4045769885160123,
"learning_rate": 3.951782056736027e-06,
"loss": 0.2088,
"step": 767
},
{
"epoch": 2.4536741214057507,
"grad_norm": 0.420197638385653,
"learning_rate": 3.907518767813097e-06,
"loss": 0.2508,
"step": 768
},
{
"epoch": 2.4568690095846644,
"grad_norm": 0.4712286531119528,
"learning_rate": 3.863477916816914e-06,
"loss": 0.2012,
"step": 769
},
{
"epoch": 2.460063897763578,
"grad_norm": 0.4427542368579534,
"learning_rate": 3.819660112501053e-06,
"loss": 0.2397,
"step": 770
},
{
"epoch": 2.463258785942492,
"grad_norm": 0.49326258562927866,
"learning_rate": 3.7760659605360506e-06,
"loss": 0.2334,
"step": 771
},
{
"epoch": 2.466453674121406,
"grad_norm": 0.4601236932926047,
"learning_rate": 3.732696063500998e-06,
"loss": 0.2034,
"step": 772
},
{
"epoch": 2.4696485623003195,
"grad_norm": 0.4472250977974353,
"learning_rate": 3.689551020875226e-06,
"loss": 0.2271,
"step": 773
},
{
"epoch": 2.472843450479233,
"grad_norm": 0.46936589235681647,
"learning_rate": 3.6466314290300366e-06,
"loss": 0.1636,
"step": 774
},
{
"epoch": 2.476038338658147,
"grad_norm": 0.4357541144936099,
"learning_rate": 3.603937881220425e-06,
"loss": 0.2292,
"step": 775
},
{
"epoch": 2.479233226837061,
"grad_norm": 0.6172961003964446,
"learning_rate": 3.5614709675769166e-06,
"loss": 0.2371,
"step": 776
},
{
"epoch": 2.4824281150159746,
"grad_norm": 0.42287295079164994,
"learning_rate": 3.519231275097372e-06,
"loss": 0.192,
"step": 777
},
{
"epoch": 2.4856230031948883,
"grad_norm": 0.44907721421445357,
"learning_rate": 3.477219387638917e-06,
"loss": 0.275,
"step": 778
},
{
"epoch": 2.488817891373802,
"grad_norm": 0.5148479668458735,
"learning_rate": 3.435435885909828e-06,
"loss": 0.2505,
"step": 779
},
{
"epoch": 2.4920127795527156,
"grad_norm": 0.4708481234430973,
"learning_rate": 3.393881347461525e-06,
"loss": 0.2337,
"step": 780
},
{
"epoch": 2.4952076677316293,
"grad_norm": 0.4652941463915953,
"learning_rate": 3.3525563466806068e-06,
"loss": 0.2068,
"step": 781
},
{
"epoch": 2.498402555910543,
"grad_norm": 0.45937429044470846,
"learning_rate": 3.311461454780871e-06,
"loss": 0.2616,
"step": 782
},
{
"epoch": 2.501597444089457,
"grad_norm": 0.4540154081882579,
"learning_rate": 3.2705972397954655e-06,
"loss": 0.2004,
"step": 783
},
{
"epoch": 2.5047923322683707,
"grad_norm": 0.44687303635857867,
"learning_rate": 3.22996426656899e-06,
"loss": 0.2137,
"step": 784
},
{
"epoch": 2.5079872204472844,
"grad_norm": 0.42549399369528,
"learning_rate": 3.1895630967497147e-06,
"loss": 0.237,
"step": 785
},
{
"epoch": 2.511182108626198,
"grad_norm": 0.4686919863303311,
"learning_rate": 3.1493942887818287e-06,
"loss": 0.1818,
"step": 786
},
{
"epoch": 2.5143769968051117,
"grad_norm": 0.43180677830115527,
"learning_rate": 3.1094583978976887e-06,
"loss": 0.2135,
"step": 787
},
{
"epoch": 2.5175718849840254,
"grad_norm": 0.41051305687717876,
"learning_rate": 3.0697559761101623e-06,
"loss": 0.2362,
"step": 788
},
{
"epoch": 2.520766773162939,
"grad_norm": 0.4339764538651385,
"learning_rate": 3.0302875722050064e-06,
"loss": 0.1871,
"step": 789
},
{
"epoch": 2.523961661341853,
"grad_norm": 0.44537108832627426,
"learning_rate": 2.99105373173326e-06,
"loss": 0.2317,
"step": 790
},
{
"epoch": 2.527156549520767,
"grad_norm": 0.48453456082844887,
"learning_rate": 2.9520549970037238e-06,
"loss": 0.1899,
"step": 791
},
{
"epoch": 2.5303514376996805,
"grad_norm": 0.4280884923970404,
"learning_rate": 2.913291907075451e-06,
"loss": 0.2116,
"step": 792
},
{
"epoch": 2.533546325878594,
"grad_norm": 0.4503925387506874,
"learning_rate": 2.8747649977502945e-06,
"loss": 0.266,
"step": 793
},
{
"epoch": 2.536741214057508,
"grad_norm": 0.44028462915181193,
"learning_rate": 2.836474801565521e-06,
"loss": 0.216,
"step": 794
},
{
"epoch": 2.539936102236422,
"grad_norm": 0.4484069903506647,
"learning_rate": 2.7984218477864213e-06,
"loss": 0.2081,
"step": 795
},
{
"epoch": 2.543130990415335,
"grad_norm": 0.4412193065052018,
"learning_rate": 2.7606066623990145e-06,
"loss": 0.2943,
"step": 796
},
{
"epoch": 2.5463258785942493,
"grad_norm": 0.4281448882843457,
"learning_rate": 2.723029768102776e-06,
"loss": 0.1912,
"step": 797
},
{
"epoch": 2.549520766773163,
"grad_norm": 0.48266966006465656,
"learning_rate": 2.6856916843034062e-06,
"loss": 0.2363,
"step": 798
},
{
"epoch": 2.5527156549520766,
"grad_norm": 0.4313024255011029,
"learning_rate": 2.648592927105642e-06,
"loss": 0.2356,
"step": 799
},
{
"epoch": 2.5559105431309903,
"grad_norm": 0.42876561566865096,
"learning_rate": 2.611734009306155e-06,
"loss": 0.2222,
"step": 800
},
{
"epoch": 2.559105431309904,
"grad_norm": 0.39838879945078226,
"learning_rate": 2.5751154403864264e-06,
"loss": 0.2119,
"step": 801
},
{
"epoch": 2.562300319488818,
"grad_norm": 0.4755341593418832,
"learning_rate": 2.5387377265057246e-06,
"loss": 0.235,
"step": 802
},
{
"epoch": 2.5654952076677318,
"grad_norm": 0.45820320153330835,
"learning_rate": 2.502601370494111e-06,
"loss": 0.2016,
"step": 803
},
{
"epoch": 2.5686900958466454,
"grad_norm": 0.49185493187997026,
"learning_rate": 2.4667068718454766e-06,
"loss": 0.2038,
"step": 804
},
{
"epoch": 2.571884984025559,
"grad_norm": 0.4346051571719638,
"learning_rate": 2.4310547267106443e-06,
"loss": 0.168,
"step": 805
},
{
"epoch": 2.5750798722044728,
"grad_norm": 0.47891912799119724,
"learning_rate": 2.395645427890525e-06,
"loss": 0.192,
"step": 806
},
{
"epoch": 2.5782747603833864,
"grad_norm": 0.4957624567636564,
"learning_rate": 2.360479464829275e-06,
"loss": 0.2015,
"step": 807
},
{
"epoch": 2.5814696485623,
"grad_norm": 0.43031345754147116,
"learning_rate": 2.3255573236075523e-06,
"loss": 0.2332,
"step": 808
},
{
"epoch": 2.584664536741214,
"grad_norm": 0.4493876332117976,
"learning_rate": 2.2908794869358044e-06,
"loss": 0.1779,
"step": 809
},
{
"epoch": 2.587859424920128,
"grad_norm": 0.48046582003978555,
"learning_rate": 2.2564464341475724e-06,
"loss": 0.2085,
"step": 810
},
{
"epoch": 2.5910543130990416,
"grad_norm": 0.4610477157344099,
"learning_rate": 2.2222586411928826e-06,
"loss": 0.2733,
"step": 811
},
{
"epoch": 2.594249201277955,
"grad_norm": 0.42491586645922075,
"learning_rate": 2.1883165806316688e-06,
"loss": 0.2045,
"step": 812
},
{
"epoch": 2.597444089456869,
"grad_norm": 0.4558620955090394,
"learning_rate": 2.154620721627225e-06,
"loss": 0.2348,
"step": 813
},
{
"epoch": 2.600638977635783,
"grad_norm": 0.4177140807716825,
"learning_rate": 2.121171529939734e-06,
"loss": 0.2154,
"step": 814
},
{
"epoch": 2.6038338658146962,
"grad_norm": 0.44671878373727514,
"learning_rate": 2.0879694679198346e-06,
"loss": 0.2339,
"step": 815
},
{
"epoch": 2.6070287539936103,
"grad_norm": 0.5421104481464741,
"learning_rate": 2.055014994502207e-06,
"loss": 0.2628,
"step": 816
},
{
"epoch": 2.610223642172524,
"grad_norm": 0.4183342876241132,
"learning_rate": 2.022308565199249e-06,
"loss": 0.2308,
"step": 817
},
{
"epoch": 2.6134185303514377,
"grad_norm": 0.4076706224784102,
"learning_rate": 1.989850632094783e-06,
"loss": 0.2697,
"step": 818
},
{
"epoch": 2.6166134185303513,
"grad_norm": 0.4327417815919417,
"learning_rate": 1.9576416438377864e-06,
"loss": 0.2204,
"step": 819
},
{
"epoch": 2.619808306709265,
"grad_norm": 0.4465688214774834,
"learning_rate": 1.925682045636217e-06,
"loss": 0.2326,
"step": 820
},
{
"epoch": 2.623003194888179,
"grad_norm": 0.4574212377076256,
"learning_rate": 1.8939722792508307e-06,
"loss": 0.2263,
"step": 821
},
{
"epoch": 2.626198083067093,
"grad_norm": 0.4455916657403554,
"learning_rate": 1.8625127829890922e-06,
"loss": 0.2387,
"step": 822
},
{
"epoch": 2.6293929712460065,
"grad_norm": 0.436528550029091,
"learning_rate": 1.8313039916991204e-06,
"loss": 0.2384,
"step": 823
},
{
"epoch": 2.63258785942492,
"grad_norm": 0.44219679143144724,
"learning_rate": 1.8003463367636676e-06,
"loss": 0.2269,
"step": 824
},
{
"epoch": 2.635782747603834,
"grad_norm": 0.44101913923028035,
"learning_rate": 1.7696402460941554e-06,
"loss": 0.2712,
"step": 825
},
{
"epoch": 2.6389776357827475,
"grad_norm": 0.43323308580675207,
"learning_rate": 1.7391861441247715e-06,
"loss": 0.2645,
"step": 826
},
{
"epoch": 2.642172523961661,
"grad_norm": 0.451593663495069,
"learning_rate": 1.7089844518065902e-06,
"loss": 0.2218,
"step": 827
},
{
"epoch": 2.6453674121405752,
"grad_norm": 0.42165221052628815,
"learning_rate": 1.6790355866017604e-06,
"loss": 0.2272,
"step": 828
},
{
"epoch": 2.648562300319489,
"grad_norm": 0.45660435100783486,
"learning_rate": 1.6493399624777428e-06,
"loss": 0.2322,
"step": 829
},
{
"epoch": 2.6517571884984026,
"grad_norm": 0.46289265194890444,
"learning_rate": 1.6198979899015687e-06,
"loss": 0.2469,
"step": 830
},
{
"epoch": 2.6549520766773163,
"grad_norm": 0.42669559181390876,
"learning_rate": 1.5907100758341787e-06,
"loss": 0.1998,
"step": 831
},
{
"epoch": 2.65814696485623,
"grad_norm": 0.4190815694114573,
"learning_rate": 1.5617766237248023e-06,
"loss": 0.2103,
"step": 832
},
{
"epoch": 2.661341853035144,
"grad_norm": 0.41531478233240604,
"learning_rate": 1.5330980335053714e-06,
"loss": 0.2039,
"step": 833
},
{
"epoch": 2.6645367412140573,
"grad_norm": 0.4357539666161216,
"learning_rate": 1.5046747015849893e-06,
"loss": 0.2375,
"step": 834
},
{
"epoch": 2.6677316293929714,
"grad_norm": 0.44449107969992363,
"learning_rate": 1.4765070208444732e-06,
"loss": 0.2807,
"step": 835
},
{
"epoch": 2.670926517571885,
"grad_norm": 0.43149390691820894,
"learning_rate": 1.4485953806308883e-06,
"loss": 0.2307,
"step": 836
},
{
"epoch": 2.6741214057507987,
"grad_norm": 0.43315728774154344,
"learning_rate": 1.4209401667522028e-06,
"loss": 0.2276,
"step": 837
},
{
"epoch": 2.6773162939297124,
"grad_norm": 0.4499922141030728,
"learning_rate": 1.3935417614719327e-06,
"loss": 0.2079,
"step": 838
},
{
"epoch": 2.680511182108626,
"grad_norm": 0.49094900978907974,
"learning_rate": 1.366400543503854e-06,
"loss": 0.1824,
"step": 839
},
{
"epoch": 2.68370607028754,
"grad_norm": 0.47322551913300975,
"learning_rate": 1.3395168880067978e-06,
"loss": 0.2501,
"step": 840
},
{
"epoch": 2.686900958466454,
"grad_norm": 0.43241827743072775,
"learning_rate": 1.3128911665794198e-06,
"loss": 0.2489,
"step": 841
},
{
"epoch": 2.6900958466453675,
"grad_norm": 0.5782151533369344,
"learning_rate": 1.2865237472551106e-06,
"loss": 0.2477,
"step": 842
},
{
"epoch": 2.693290734824281,
"grad_norm": 0.4494764243991699,
"learning_rate": 1.2604149944968725e-06,
"loss": 0.2111,
"step": 843
},
{
"epoch": 2.696485623003195,
"grad_norm": 0.45156798802844117,
"learning_rate": 1.234565269192296e-06,
"loss": 0.176,
"step": 844
},
{
"epoch": 2.6996805111821085,
"grad_norm": 0.4390858768011781,
"learning_rate": 1.2089749286485808e-06,
"loss": 0.2475,
"step": 845
},
{
"epoch": 2.702875399361022,
"grad_norm": 0.4190824848284048,
"learning_rate": 1.183644326587574e-06,
"loss": 0.2275,
"step": 846
},
{
"epoch": 2.7060702875399363,
"grad_norm": 0.4919591533349322,
"learning_rate": 1.1585738131409107e-06,
"loss": 0.2096,
"step": 847
},
{
"epoch": 2.70926517571885,
"grad_norm": 0.4983722601382295,
"learning_rate": 1.1337637348451369e-06,
"loss": 0.2353,
"step": 848
},
{
"epoch": 2.7124600638977636,
"grad_norm": 0.394892627520411,
"learning_rate": 1.1092144346369581e-06,
"loss": 0.2215,
"step": 849
},
{
"epoch": 2.7156549520766773,
"grad_norm": 0.4103602053022109,
"learning_rate": 1.0849262518484704e-06,
"loss": 0.195,
"step": 850
},
{
"epoch": 2.718849840255591,
"grad_norm": 0.4478481118516963,
"learning_rate": 1.060899522202483e-06,
"loss": 0.2243,
"step": 851
},
{
"epoch": 2.722044728434505,
"grad_norm": 0.44779585136229183,
"learning_rate": 1.037134577807879e-06,
"loss": 0.1981,
"step": 852
},
{
"epoch": 2.7252396166134183,
"grad_norm": 0.4309440677606521,
"learning_rate": 1.0136317471550195e-06,
"loss": 0.2119,
"step": 853
},
{
"epoch": 2.7284345047923324,
"grad_norm": 0.44538470312104783,
"learning_rate": 9.903913551112e-07,
"loss": 0.2343,
"step": 854
},
{
"epoch": 2.731629392971246,
"grad_norm": 0.46088845916460575,
"learning_rate": 9.67413722916175e-07,
"loss": 0.2384,
"step": 855
},
{
"epoch": 2.7348242811501597,
"grad_norm": 0.4435462990703814,
"learning_rate": 9.446991681776985e-07,
"loss": 0.2338,
"step": 856
},
{
"epoch": 2.7380191693290734,
"grad_norm": 0.4083232549654808,
"learning_rate": 9.222480048671412e-07,
"loss": 0.2039,
"step": 857
},
{
"epoch": 2.741214057507987,
"grad_norm": 0.4751109728713644,
"learning_rate": 9.000605433151643e-07,
"loss": 0.2202,
"step": 858
},
{
"epoch": 2.744408945686901,
"grad_norm": 0.42315898167185495,
"learning_rate": 8.781370902074049e-07,
"loss": 0.2429,
"step": 859
},
{
"epoch": 2.747603833865815,
"grad_norm": 0.4389817241489918,
"learning_rate": 8.564779485802566e-07,
"loss": 0.2523,
"step": 860
},
{
"epoch": 2.7507987220447285,
"grad_norm": 0.40824675808125727,
"learning_rate": 8.350834178166755e-07,
"loss": 0.2317,
"step": 861
},
{
"epoch": 2.753993610223642,
"grad_norm": 0.44644266395927934,
"learning_rate": 8.139537936420372e-07,
"loss": 0.2198,
"step": 862
},
{
"epoch": 2.757188498402556,
"grad_norm": 0.4364831760939215,
"learning_rate": 7.93089368120048e-07,
"loss": 0.2424,
"step": 863
},
{
"epoch": 2.7603833865814695,
"grad_norm": 0.47326054771389603,
"learning_rate": 7.724904296487246e-07,
"loss": 0.2386,
"step": 864
},
{
"epoch": 2.763578274760383,
"grad_norm": 0.4571089142274389,
"learning_rate": 7.521572629563834e-07,
"loss": 0.1619,
"step": 865
},
{
"epoch": 2.7667731629392973,
"grad_norm": 0.4221090741397694,
"learning_rate": 7.320901490977217e-07,
"loss": 0.2486,
"step": 866
},
{
"epoch": 2.769968051118211,
"grad_norm": 0.44759152235134214,
"learning_rate": 7.122893654499318e-07,
"loss": 0.2376,
"step": 867
},
{
"epoch": 2.7731629392971247,
"grad_norm": 0.4559649616233877,
"learning_rate": 6.927551857088576e-07,
"loss": 0.2216,
"step": 868
},
{
"epoch": 2.7763578274760383,
"grad_norm": 0.3938914848222988,
"learning_rate": 6.734878798852174e-07,
"loss": 0.2331,
"step": 869
},
{
"epoch": 2.779552715654952,
"grad_norm": 0.44967947965212385,
"learning_rate": 6.544877143008777e-07,
"loss": 0.2303,
"step": 870
},
{
"epoch": 2.7827476038338657,
"grad_norm": 0.4204688969968103,
"learning_rate": 6.357549515851525e-07,
"loss": 0.2497,
"step": 871
},
{
"epoch": 2.7859424920127793,
"grad_norm": 0.42708869477683215,
"learning_rate": 6.172898506712033e-07,
"loss": 0.2502,
"step": 872
},
{
"epoch": 2.7891373801916934,
"grad_norm": 0.4289703544035532,
"learning_rate": 5.990926667924313e-07,
"loss": 0.2637,
"step": 873
},
{
"epoch": 2.792332268370607,
"grad_norm": 0.43987128340382603,
"learning_rate": 5.811636514789598e-07,
"loss": 0.193,
"step": 874
},
{
"epoch": 2.7955271565495208,
"grad_norm": 0.5004291788721248,
"learning_rate": 5.635030525541685e-07,
"loss": 0.2105,
"step": 875
},
{
"epoch": 2.7987220447284344,
"grad_norm": 0.44122554674258213,
"learning_rate": 5.461111141312492e-07,
"loss": 0.1874,
"step": 876
},
{
"epoch": 2.801916932907348,
"grad_norm": 0.4230374283981073,
"learning_rate": 5.289880766098421e-07,
"loss": 0.2113,
"step": 877
},
{
"epoch": 2.8051118210862622,
"grad_norm": 0.4219143270150449,
"learning_rate": 5.121341766727184e-07,
"loss": 0.1856,
"step": 878
},
{
"epoch": 2.8083067092651754,
"grad_norm": 0.4280311905897577,
"learning_rate": 4.955496472824939e-07,
"loss": 0.2479,
"step": 879
},
{
"epoch": 2.8115015974440896,
"grad_norm": 0.4275114109142309,
"learning_rate": 4.79234717678414e-07,
"loss": 0.2109,
"step": 880
},
{
"epoch": 2.8146964856230032,
"grad_norm": 0.41231768670069835,
"learning_rate": 4.631896133732006e-07,
"loss": 0.1914,
"step": 881
},
{
"epoch": 2.817891373801917,
"grad_norm": 0.4046385798085887,
"learning_rate": 4.474145561499099e-07,
"loss": 0.2497,
"step": 882
},
{
"epoch": 2.8210862619808306,
"grad_norm": 0.4337798293798453,
"learning_rate": 4.319097640588821e-07,
"loss": 0.2105,
"step": 883
},
{
"epoch": 2.8242811501597442,
"grad_norm": 0.4367363776983278,
"learning_rate": 4.166754514147275e-07,
"loss": 0.2541,
"step": 884
},
{
"epoch": 2.8274760383386583,
"grad_norm": 0.4121592143726346,
"learning_rate": 4.0171182879335856e-07,
"loss": 0.2934,
"step": 885
},
{
"epoch": 2.830670926517572,
"grad_norm": 0.45692427171461536,
"learning_rate": 3.870191030290782e-07,
"loss": 0.2123,
"step": 886
},
{
"epoch": 2.8338658146964857,
"grad_norm": 0.44515522805300783,
"learning_rate": 3.7259747721173134e-07,
"loss": 0.1926,
"step": 887
},
{
"epoch": 2.8370607028753994,
"grad_norm": 0.41210623411640795,
"learning_rate": 3.584471506838871e-07,
"loss": 0.2355,
"step": 888
},
{
"epoch": 2.840255591054313,
"grad_norm": 0.7278276221930801,
"learning_rate": 3.445683190380833e-07,
"loss": 0.2734,
"step": 889
},
{
"epoch": 2.8434504792332267,
"grad_norm": 0.4293466579902048,
"learning_rate": 3.3096117411413056e-07,
"loss": 0.2084,
"step": 890
},
{
"epoch": 2.8466453674121404,
"grad_norm": 0.3945276989591356,
"learning_rate": 3.1762590399645907e-07,
"loss": 0.2355,
"step": 891
},
{
"epoch": 2.8498402555910545,
"grad_norm": 0.4169674719101088,
"learning_rate": 3.045626930115053e-07,
"loss": 0.2556,
"step": 892
},
{
"epoch": 2.853035143769968,
"grad_norm": 0.4677238531453402,
"learning_rate": 2.917717217251914e-07,
"loss": 0.2067,
"step": 893
},
{
"epoch": 2.856230031948882,
"grad_norm": 0.4508849526360919,
"learning_rate": 2.7925316694039637e-07,
"loss": 0.2264,
"step": 894
},
{
"epoch": 2.8594249201277955,
"grad_norm": 1.3406369405711993,
"learning_rate": 2.670072016945402e-07,
"loss": 0.3042,
"step": 895
},
{
"epoch": 2.862619808306709,
"grad_norm": 0.3953393489058587,
"learning_rate": 2.5503399525717674e-07,
"loss": 0.2038,
"step": 896
},
{
"epoch": 2.8658146964856233,
"grad_norm": 0.3991610340225267,
"learning_rate": 2.433337131276581e-07,
"loss": 0.2806,
"step": 897
},
{
"epoch": 2.8690095846645365,
"grad_norm": 0.42671305379955443,
"learning_rate": 2.3190651703284273e-07,
"loss": 0.2369,
"step": 898
},
{
"epoch": 2.8722044728434506,
"grad_norm": 0.41224602176747227,
"learning_rate": 2.207525649248754e-07,
"loss": 0.2171,
"step": 899
},
{
"epoch": 2.8753993610223643,
"grad_norm": 0.509647560563868,
"learning_rate": 2.0987201097897757e-07,
"loss": 0.2097,
"step": 900
},
{
"epoch": 2.878594249201278,
"grad_norm": 0.4724203634577862,
"learning_rate": 1.9926500559134477e-07,
"loss": 0.24,
"step": 901
},
{
"epoch": 2.8817891373801916,
"grad_norm": 0.44222221500216696,
"learning_rate": 1.8893169537704813e-07,
"loss": 0.2815,
"step": 902
},
{
"epoch": 2.8849840255591053,
"grad_norm": 0.43359238337650113,
"learning_rate": 1.7887222316800957e-07,
"loss": 0.2058,
"step": 903
},
{
"epoch": 2.8881789137380194,
"grad_norm": 0.4309496609438018,
"learning_rate": 1.690867280110431e-07,
"loss": 0.2481,
"step": 904
},
{
"epoch": 2.891373801916933,
"grad_norm": 0.4330580118936636,
"learning_rate": 1.5957534516590988e-07,
"loss": 0.2267,
"step": 905
},
{
"epoch": 2.8945686900958467,
"grad_norm": 0.41196459499404176,
"learning_rate": 1.503382061034686e-07,
"loss": 0.2471,
"step": 906
},
{
"epoch": 2.8977635782747604,
"grad_norm": 0.46467452130408804,
"learning_rate": 1.4137543850384572e-07,
"loss": 0.2321,
"step": 907
},
{
"epoch": 2.900958466453674,
"grad_norm": 0.430788659047838,
"learning_rate": 1.3268716625467914e-07,
"loss": 0.2805,
"step": 908
},
{
"epoch": 2.9041533546325877,
"grad_norm": 0.4431748531892815,
"learning_rate": 1.242735094493952e-07,
"loss": 0.2397,
"step": 909
},
{
"epoch": 2.9073482428115014,
"grad_norm": 0.4435883937921819,
"learning_rate": 1.1613458438556102e-07,
"loss": 0.2752,
"step": 910
},
{
"epoch": 2.9105431309904155,
"grad_norm": 0.4304268716716866,
"learning_rate": 1.0827050356326585e-07,
"loss": 0.26,
"step": 911
},
{
"epoch": 2.913738019169329,
"grad_norm": 0.4399673032424353,
"learning_rate": 1.0068137568357783e-07,
"loss": 0.2205,
"step": 912
},
{
"epoch": 2.916932907348243,
"grad_norm": 0.4445872824299983,
"learning_rate": 9.336730564702745e-08,
"loss": 0.1941,
"step": 913
},
{
"epoch": 2.9201277955271565,
"grad_norm": 0.4433121701499639,
"learning_rate": 8.632839455216869e-08,
"loss": 0.2012,
"step": 914
},
{
"epoch": 2.92332268370607,
"grad_norm": 0.445710760902897,
"learning_rate": 7.956473969417789e-08,
"loss": 0.1946,
"step": 915
},
{
"epoch": 2.9265175718849843,
"grad_norm": 0.44031338022101135,
"learning_rate": 7.307643456351044e-08,
"loss": 0.236,
"step": 916
},
{
"epoch": 2.9297124600638975,
"grad_norm": 0.42316443693466144,
"learning_rate": 6.686356884460177e-08,
"loss": 0.2314,
"step": 917
},
{
"epoch": 2.9329073482428116,
"grad_norm": 0.4449659893027522,
"learning_rate": 6.092622841463502e-08,
"loss": 0.1657,
"step": 918
},
{
"epoch": 2.9361022364217253,
"grad_norm": 0.4225339222713818,
"learning_rate": 5.526449534235534e-08,
"loss": 0.2542,
"step": 919
},
{
"epoch": 2.939297124600639,
"grad_norm": 0.44649236358142536,
"learning_rate": 4.9878447886926305e-08,
"loss": 0.2343,
"step": 920
},
{
"epoch": 2.9424920127795526,
"grad_norm": 0.4096672005948774,
"learning_rate": 4.4768160496859725e-08,
"loss": 0.2727,
"step": 921
},
{
"epoch": 2.9456869009584663,
"grad_norm": 0.4393531135600368,
"learning_rate": 3.993370380897421e-08,
"loss": 0.225,
"step": 922
},
{
"epoch": 2.9488817891373804,
"grad_norm": 0.47244217125437676,
"learning_rate": 3.537514464743152e-08,
"loss": 0.2135,
"step": 923
},
{
"epoch": 2.952076677316294,
"grad_norm": 0.4328106188446803,
"learning_rate": 3.109254602280398e-08,
"loss": 0.218,
"step": 924
},
{
"epoch": 2.9552715654952078,
"grad_norm": 0.4358995352547161,
"learning_rate": 2.7085967131201818e-08,
"loss": 0.2325,
"step": 925
},
{
"epoch": 2.9584664536741214,
"grad_norm": 0.4376093029807377,
"learning_rate": 2.3355463353467168e-08,
"loss": 0.2082,
"step": 926
},
{
"epoch": 2.961661341853035,
"grad_norm": 0.42933069783001954,
"learning_rate": 1.9901086254396908e-08,
"loss": 0.2006,
"step": 927
},
{
"epoch": 2.9648562300319488,
"grad_norm": 0.43253728612411796,
"learning_rate": 1.672288358203211e-08,
"loss": 0.2086,
"step": 928
},
{
"epoch": 2.9680511182108624,
"grad_norm": 0.49147199258587176,
"learning_rate": 1.382089926700303e-08,
"loss": 0.2226,
"step": 929
},
{
"epoch": 2.9712460063897765,
"grad_norm": 0.4073568057815397,
"learning_rate": 1.1195173421914007e-08,
"loss": 0.2687,
"step": 930
},
{
"epoch": 2.97444089456869,
"grad_norm": 0.392034693925328,
"learning_rate": 8.84574234079727e-09,
"loss": 0.2252,
"step": 931
},
{
"epoch": 2.977635782747604,
"grad_norm": 0.44649691262839863,
"learning_rate": 6.772638498606654e-09,
"loss": 0.241,
"step": 932
},
{
"epoch": 2.9808306709265175,
"grad_norm": 0.4212064663206084,
"learning_rate": 4.97589055076908e-09,
"loss": 0.2497,
"step": 933
},
{
"epoch": 2.984025559105431,
"grad_norm": 0.4438386239006071,
"learning_rate": 3.4555233327893124e-09,
"loss": 0.2348,
"step": 934
},
{
"epoch": 2.987220447284345,
"grad_norm": 0.43882394327805996,
"learning_rate": 2.2115578599035683e-09,
"loss": 0.2373,
"step": 935
},
{
"epoch": 2.9904153354632586,
"grad_norm": 0.4544247984602708,
"learning_rate": 1.244011326797523e-09,
"loss": 0.2172,
"step": 936
},
{
"epoch": 2.9936102236421727,
"grad_norm": 0.4700550963913251,
"learning_rate": 5.52897107355399e-10,
"loss": 0.2571,
"step": 937
},
{
"epoch": 2.9968051118210863,
"grad_norm": 0.4038371285545047,
"learning_rate": 1.3822475449121186e-10,
"loss": 0.2118,
"step": 938
},
{
"epoch": 3.0,
"grad_norm": 0.3831820442811552,
"learning_rate": 0.0,
"loss": 0.2001,
"step": 939
},
{
"epoch": 3.0,
"step": 939,
"total_flos": 406391461183488.0,
"train_loss": 0.3877937021696022,
"train_runtime": 10098.5367,
"train_samples_per_second": 2.97,
"train_steps_per_second": 0.093
}
],
"logging_steps": 1.0,
"max_steps": 939,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 406391461183488.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}