{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998877035373386, "eval_steps": 500, "global_step": 2670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011229646266142617, "grad_norm": 5.786720275878906, "learning_rate": 3.7453183520599254e-08, "loss": 0.8193, "step": 1 }, { "epoch": 0.0022459292532285235, "grad_norm": 5.650145530700684, "learning_rate": 7.490636704119851e-08, "loss": 0.8203, "step": 2 }, { "epoch": 0.003368893879842785, "grad_norm": 5.858572959899902, "learning_rate": 1.1235955056179776e-07, "loss": 0.8396, "step": 3 }, { "epoch": 0.004491858506457047, "grad_norm": 5.87903356552124, "learning_rate": 1.4981273408239702e-07, "loss": 0.8554, "step": 4 }, { "epoch": 0.005614823133071308, "grad_norm": 6.044643402099609, "learning_rate": 1.8726591760299626e-07, "loss": 0.9053, "step": 5 }, { "epoch": 0.00673778775968557, "grad_norm": 5.962353706359863, "learning_rate": 2.247191011235955e-07, "loss": 0.8647, "step": 6 }, { "epoch": 0.007860752386299831, "grad_norm": 5.850592136383057, "learning_rate": 2.621722846441948e-07, "loss": 0.854, "step": 7 }, { "epoch": 0.008983717012914094, "grad_norm": 5.849573135375977, "learning_rate": 2.9962546816479403e-07, "loss": 0.8568, "step": 8 }, { "epoch": 0.010106681639528355, "grad_norm": 5.729428768157959, "learning_rate": 3.3707865168539325e-07, "loss": 0.8484, "step": 9 }, { "epoch": 0.011229646266142616, "grad_norm": 5.72395133972168, "learning_rate": 3.7453183520599253e-07, "loss": 0.8656, "step": 10 }, { "epoch": 0.012352610892756879, "grad_norm": 5.83106803894043, "learning_rate": 4.1198501872659175e-07, "loss": 0.866, "step": 11 }, { "epoch": 0.01347557551937114, "grad_norm": 5.251598834991455, "learning_rate": 4.49438202247191e-07, "loss": 0.8067, "step": 12 }, { "epoch": 0.014598540145985401, "grad_norm": 5.573467254638672, "learning_rate": 4.868913857677903e-07, "loss": 0.8643, "step": 13 }, { "epoch": 0.015721504772599662, "grad_norm": 5.2251811027526855, "learning_rate": 5.243445692883896e-07, "loss": 0.8576, "step": 14 }, { "epoch": 0.016844469399213923, "grad_norm": 5.4005818367004395, "learning_rate": 5.617977528089888e-07, "loss": 0.8292, "step": 15 }, { "epoch": 0.017967434025828188, "grad_norm": 4.468184471130371, "learning_rate": 5.992509363295881e-07, "loss": 0.8054, "step": 16 }, { "epoch": 0.01909039865244245, "grad_norm": 4.3659257888793945, "learning_rate": 6.367041198501874e-07, "loss": 0.7998, "step": 17 }, { "epoch": 0.02021336327905671, "grad_norm": 4.347238063812256, "learning_rate": 6.741573033707865e-07, "loss": 0.8135, "step": 18 }, { "epoch": 0.02133632790567097, "grad_norm": 4.2649946212768555, "learning_rate": 7.116104868913857e-07, "loss": 0.826, "step": 19 }, { "epoch": 0.022459292532285232, "grad_norm": 4.177603244781494, "learning_rate": 7.490636704119851e-07, "loss": 0.8331, "step": 20 }, { "epoch": 0.023582257158899493, "grad_norm": 4.032388687133789, "learning_rate": 7.865168539325843e-07, "loss": 0.794, "step": 21 }, { "epoch": 0.024705221785513758, "grad_norm": 2.4847230911254883, "learning_rate": 8.239700374531835e-07, "loss": 0.754, "step": 22 }, { "epoch": 0.02582818641212802, "grad_norm": 2.2987992763519287, "learning_rate": 8.614232209737828e-07, "loss": 0.7698, "step": 23 }, { "epoch": 0.02695115103874228, "grad_norm": 2.2334015369415283, "learning_rate": 8.98876404494382e-07, "loss": 0.7573, "step": 24 }, { "epoch": 0.02807411566535654, "grad_norm": 2.054539442062378, "learning_rate": 9.363295880149814e-07, "loss": 0.748, "step": 25 }, { "epoch": 0.029197080291970802, "grad_norm": 2.1349334716796875, "learning_rate": 9.737827715355806e-07, "loss": 0.7509, "step": 26 }, { "epoch": 0.030320044918585063, "grad_norm": 1.9073206186294556, "learning_rate": 1.01123595505618e-06, "loss": 0.7585, "step": 27 }, { "epoch": 0.031443009545199324, "grad_norm": 1.8453083038330078, "learning_rate": 1.0486891385767792e-06, "loss": 0.7391, "step": 28 }, { "epoch": 0.03256597417181359, "grad_norm": 1.8017247915267944, "learning_rate": 1.0861423220973784e-06, "loss": 0.7791, "step": 29 }, { "epoch": 0.033688938798427846, "grad_norm": 1.4344351291656494, "learning_rate": 1.1235955056179777e-06, "loss": 0.7192, "step": 30 }, { "epoch": 0.03481190342504211, "grad_norm": 2.0251548290252686, "learning_rate": 1.161048689138577e-06, "loss": 0.7372, "step": 31 }, { "epoch": 0.035934868051656375, "grad_norm": 2.315131902694702, "learning_rate": 1.1985018726591761e-06, "loss": 0.6972, "step": 32 }, { "epoch": 0.03705783267827063, "grad_norm": 2.2830312252044678, "learning_rate": 1.2359550561797752e-06, "loss": 0.708, "step": 33 }, { "epoch": 0.0381807973048849, "grad_norm": 2.374768018722534, "learning_rate": 1.2734082397003748e-06, "loss": 0.7061, "step": 34 }, { "epoch": 0.039303761931499155, "grad_norm": 2.1877286434173584, "learning_rate": 1.3108614232209737e-06, "loss": 0.7134, "step": 35 }, { "epoch": 0.04042672655811342, "grad_norm": 1.9931354522705078, "learning_rate": 1.348314606741573e-06, "loss": 0.7047, "step": 36 }, { "epoch": 0.041549691184727684, "grad_norm": 1.8176078796386719, "learning_rate": 1.3857677902621726e-06, "loss": 0.6934, "step": 37 }, { "epoch": 0.04267265581134194, "grad_norm": 1.46376633644104, "learning_rate": 1.4232209737827715e-06, "loss": 0.6741, "step": 38 }, { "epoch": 0.043795620437956206, "grad_norm": 1.2667194604873657, "learning_rate": 1.4606741573033708e-06, "loss": 0.6995, "step": 39 }, { "epoch": 0.044918585064570464, "grad_norm": 0.946598470211029, "learning_rate": 1.4981273408239701e-06, "loss": 0.7111, "step": 40 }, { "epoch": 0.04604154969118473, "grad_norm": 1.0059950351715088, "learning_rate": 1.5355805243445692e-06, "loss": 0.7207, "step": 41 }, { "epoch": 0.047164514317798986, "grad_norm": 1.1553947925567627, "learning_rate": 1.5730337078651686e-06, "loss": 0.6705, "step": 42 }, { "epoch": 0.04828747894441325, "grad_norm": 1.222509503364563, "learning_rate": 1.6104868913857679e-06, "loss": 0.668, "step": 43 }, { "epoch": 0.049410443571027515, "grad_norm": 1.1623841524124146, "learning_rate": 1.647940074906367e-06, "loss": 0.6247, "step": 44 }, { "epoch": 0.05053340819764177, "grad_norm": 1.059621810913086, "learning_rate": 1.6853932584269663e-06, "loss": 0.6777, "step": 45 }, { "epoch": 0.05165637282425604, "grad_norm": 0.832984983921051, "learning_rate": 1.7228464419475657e-06, "loss": 0.6389, "step": 46 }, { "epoch": 0.052779337450870295, "grad_norm": 0.9247879981994629, "learning_rate": 1.760299625468165e-06, "loss": 0.6584, "step": 47 }, { "epoch": 0.05390230207748456, "grad_norm": 0.7802162766456604, "learning_rate": 1.797752808988764e-06, "loss": 0.6171, "step": 48 }, { "epoch": 0.055025266704098824, "grad_norm": 0.7509079575538635, "learning_rate": 1.8352059925093634e-06, "loss": 0.6203, "step": 49 }, { "epoch": 0.05614823133071308, "grad_norm": 0.7340830564498901, "learning_rate": 1.8726591760299627e-06, "loss": 0.6386, "step": 50 }, { "epoch": 0.057271195957327346, "grad_norm": 0.6697390079498291, "learning_rate": 1.910112359550562e-06, "loss": 0.6102, "step": 51 }, { "epoch": 0.058394160583941604, "grad_norm": 0.6812121272087097, "learning_rate": 1.947565543071161e-06, "loss": 0.6276, "step": 52 }, { "epoch": 0.05951712521055587, "grad_norm": 0.677414059638977, "learning_rate": 1.9850187265917605e-06, "loss": 0.6123, "step": 53 }, { "epoch": 0.060640089837170126, "grad_norm": 0.7848894596099854, "learning_rate": 2.02247191011236e-06, "loss": 0.6257, "step": 54 }, { "epoch": 0.06176305446378439, "grad_norm": 0.6357181668281555, "learning_rate": 2.059925093632959e-06, "loss": 0.6229, "step": 55 }, { "epoch": 0.06288601909039865, "grad_norm": 0.5423211455345154, "learning_rate": 2.0973782771535585e-06, "loss": 0.611, "step": 56 }, { "epoch": 0.06400898371701291, "grad_norm": 0.48002859950065613, "learning_rate": 2.1348314606741574e-06, "loss": 0.576, "step": 57 }, { "epoch": 0.06513194834362718, "grad_norm": 0.5418688654899597, "learning_rate": 2.1722846441947567e-06, "loss": 0.6142, "step": 58 }, { "epoch": 0.06625491297024144, "grad_norm": 0.5699586868286133, "learning_rate": 2.209737827715356e-06, "loss": 0.6134, "step": 59 }, { "epoch": 0.06737787759685569, "grad_norm": 0.5669326186180115, "learning_rate": 2.2471910112359554e-06, "loss": 0.6083, "step": 60 }, { "epoch": 0.06850084222346996, "grad_norm": 0.5413917899131775, "learning_rate": 2.2846441947565547e-06, "loss": 0.6081, "step": 61 }, { "epoch": 0.06962380685008422, "grad_norm": 0.49015897512435913, "learning_rate": 2.322097378277154e-06, "loss": 0.5794, "step": 62 }, { "epoch": 0.07074677147669849, "grad_norm": 0.4820050299167633, "learning_rate": 2.359550561797753e-06, "loss": 0.5998, "step": 63 }, { "epoch": 0.07186973610331275, "grad_norm": 0.4758567810058594, "learning_rate": 2.3970037453183523e-06, "loss": 0.5968, "step": 64 }, { "epoch": 0.072992700729927, "grad_norm": 0.48817014694213867, "learning_rate": 2.4344569288389516e-06, "loss": 0.5952, "step": 65 }, { "epoch": 0.07411566535654127, "grad_norm": 0.4600358307361603, "learning_rate": 2.4719101123595505e-06, "loss": 0.6172, "step": 66 }, { "epoch": 0.07523862998315553, "grad_norm": 0.44970738887786865, "learning_rate": 2.5093632958801502e-06, "loss": 0.5996, "step": 67 }, { "epoch": 0.0763615946097698, "grad_norm": 0.4442385137081146, "learning_rate": 2.5468164794007496e-06, "loss": 0.614, "step": 68 }, { "epoch": 0.07748455923638406, "grad_norm": 0.4561745822429657, "learning_rate": 2.584269662921349e-06, "loss": 0.57, "step": 69 }, { "epoch": 0.07860752386299831, "grad_norm": 0.4757951498031616, "learning_rate": 2.6217228464419474e-06, "loss": 0.5979, "step": 70 }, { "epoch": 0.07973048848961257, "grad_norm": 0.4367990791797638, "learning_rate": 2.6591760299625467e-06, "loss": 0.5975, "step": 71 }, { "epoch": 0.08085345311622684, "grad_norm": 0.41188302636146545, "learning_rate": 2.696629213483146e-06, "loss": 0.571, "step": 72 }, { "epoch": 0.0819764177428411, "grad_norm": 0.40649649500846863, "learning_rate": 2.7340823970037454e-06, "loss": 0.5889, "step": 73 }, { "epoch": 0.08309938236945537, "grad_norm": 0.4055359959602356, "learning_rate": 2.771535580524345e-06, "loss": 0.5723, "step": 74 }, { "epoch": 0.08422234699606962, "grad_norm": 0.3926713764667511, "learning_rate": 2.8089887640449444e-06, "loss": 0.5885, "step": 75 }, { "epoch": 0.08534531162268388, "grad_norm": 0.3989022970199585, "learning_rate": 2.846441947565543e-06, "loss": 0.576, "step": 76 }, { "epoch": 0.08646827624929815, "grad_norm": 0.4056854844093323, "learning_rate": 2.8838951310861422e-06, "loss": 0.5938, "step": 77 }, { "epoch": 0.08759124087591241, "grad_norm": 0.36081668734550476, "learning_rate": 2.9213483146067416e-06, "loss": 0.5888, "step": 78 }, { "epoch": 0.08871420550252666, "grad_norm": 0.3615940809249878, "learning_rate": 2.958801498127341e-06, "loss": 0.5651, "step": 79 }, { "epoch": 0.08983717012914093, "grad_norm": 0.37395480275154114, "learning_rate": 2.9962546816479402e-06, "loss": 0.5888, "step": 80 }, { "epoch": 0.09096013475575519, "grad_norm": 0.4041288197040558, "learning_rate": 3.03370786516854e-06, "loss": 0.5643, "step": 81 }, { "epoch": 0.09208309938236946, "grad_norm": 0.3966233730316162, "learning_rate": 3.0711610486891385e-06, "loss": 0.5741, "step": 82 }, { "epoch": 0.09320606400898372, "grad_norm": 0.35737988352775574, "learning_rate": 3.1086142322097378e-06, "loss": 0.5776, "step": 83 }, { "epoch": 0.09432902863559797, "grad_norm": 0.3632304072380066, "learning_rate": 3.146067415730337e-06, "loss": 0.5604, "step": 84 }, { "epoch": 0.09545199326221224, "grad_norm": 0.42090266942977905, "learning_rate": 3.1835205992509364e-06, "loss": 0.558, "step": 85 }, { "epoch": 0.0965749578888265, "grad_norm": 0.36558839678764343, "learning_rate": 3.2209737827715358e-06, "loss": 0.5836, "step": 86 }, { "epoch": 0.09769792251544077, "grad_norm": 0.3869830369949341, "learning_rate": 3.258426966292135e-06, "loss": 0.5559, "step": 87 }, { "epoch": 0.09882088714205503, "grad_norm": 0.39099591970443726, "learning_rate": 3.295880149812734e-06, "loss": 0.5659, "step": 88 }, { "epoch": 0.09994385176866928, "grad_norm": 0.3590041697025299, "learning_rate": 3.3333333333333333e-06, "loss": 0.5612, "step": 89 }, { "epoch": 0.10106681639528355, "grad_norm": 0.3642204999923706, "learning_rate": 3.3707865168539327e-06, "loss": 0.5632, "step": 90 }, { "epoch": 0.10218978102189781, "grad_norm": 0.38678011298179626, "learning_rate": 3.408239700374532e-06, "loss": 0.5874, "step": 91 }, { "epoch": 0.10331274564851207, "grad_norm": 0.3339954614639282, "learning_rate": 3.4456928838951313e-06, "loss": 0.5399, "step": 92 }, { "epoch": 0.10443571027512634, "grad_norm": 0.3532693088054657, "learning_rate": 3.4831460674157306e-06, "loss": 0.5302, "step": 93 }, { "epoch": 0.10555867490174059, "grad_norm": 0.34916800260543823, "learning_rate": 3.52059925093633e-06, "loss": 0.5622, "step": 94 }, { "epoch": 0.10668163952835485, "grad_norm": 0.35121726989746094, "learning_rate": 3.558052434456929e-06, "loss": 0.5779, "step": 95 }, { "epoch": 0.10780460415496912, "grad_norm": 0.3479062616825104, "learning_rate": 3.595505617977528e-06, "loss": 0.536, "step": 96 }, { "epoch": 0.10892756878158338, "grad_norm": 0.35582488775253296, "learning_rate": 3.6329588014981275e-06, "loss": 0.5565, "step": 97 }, { "epoch": 0.11005053340819765, "grad_norm": 0.3760950565338135, "learning_rate": 3.670411985018727e-06, "loss": 0.5708, "step": 98 }, { "epoch": 0.1111734980348119, "grad_norm": 0.35329577326774597, "learning_rate": 3.707865168539326e-06, "loss": 0.5323, "step": 99 }, { "epoch": 0.11229646266142616, "grad_norm": 0.3792141377925873, "learning_rate": 3.7453183520599255e-06, "loss": 0.5591, "step": 100 }, { "epoch": 0.11341942728804043, "grad_norm": 0.37652456760406494, "learning_rate": 3.7827715355805244e-06, "loss": 0.5427, "step": 101 }, { "epoch": 0.11454239191465469, "grad_norm": 0.3674143850803375, "learning_rate": 3.820224719101124e-06, "loss": 0.5513, "step": 102 }, { "epoch": 0.11566535654126894, "grad_norm": 0.3615972101688385, "learning_rate": 3.857677902621723e-06, "loss": 0.5268, "step": 103 }, { "epoch": 0.11678832116788321, "grad_norm": 0.433125376701355, "learning_rate": 3.895131086142322e-06, "loss": 0.5327, "step": 104 }, { "epoch": 0.11791128579449747, "grad_norm": 0.39159590005874634, "learning_rate": 3.932584269662922e-06, "loss": 0.5458, "step": 105 }, { "epoch": 0.11903425042111174, "grad_norm": 0.3455137610435486, "learning_rate": 3.970037453183521e-06, "loss": 0.5443, "step": 106 }, { "epoch": 0.120157215047726, "grad_norm": 0.3411164879798889, "learning_rate": 4.00749063670412e-06, "loss": 0.5416, "step": 107 }, { "epoch": 0.12128017967434025, "grad_norm": 0.33674782514572144, "learning_rate": 4.04494382022472e-06, "loss": 0.55, "step": 108 }, { "epoch": 0.12240314430095452, "grad_norm": 0.4221669137477875, "learning_rate": 4.082397003745319e-06, "loss": 0.5481, "step": 109 }, { "epoch": 0.12352610892756878, "grad_norm": 0.35815510153770447, "learning_rate": 4.119850187265918e-06, "loss": 0.5499, "step": 110 }, { "epoch": 0.12464907355418305, "grad_norm": 0.3904813826084137, "learning_rate": 4.157303370786518e-06, "loss": 0.5356, "step": 111 }, { "epoch": 0.1257720381807973, "grad_norm": 0.3945789933204651, "learning_rate": 4.194756554307117e-06, "loss": 0.5542, "step": 112 }, { "epoch": 0.12689500280741156, "grad_norm": 0.38726410269737244, "learning_rate": 4.2322097378277155e-06, "loss": 0.5449, "step": 113 }, { "epoch": 0.12801796743402583, "grad_norm": 0.37713056802749634, "learning_rate": 4.269662921348315e-06, "loss": 0.536, "step": 114 }, { "epoch": 0.1291409320606401, "grad_norm": 0.3399564325809479, "learning_rate": 4.307116104868914e-06, "loss": 0.5502, "step": 115 }, { "epoch": 0.13026389668725435, "grad_norm": 0.4132470488548279, "learning_rate": 4.3445692883895135e-06, "loss": 0.5464, "step": 116 }, { "epoch": 0.13138686131386862, "grad_norm": 0.3973707854747772, "learning_rate": 4.382022471910113e-06, "loss": 0.5442, "step": 117 }, { "epoch": 0.13250982594048288, "grad_norm": 0.384389728307724, "learning_rate": 4.419475655430712e-06, "loss": 0.5541, "step": 118 }, { "epoch": 0.13363279056709715, "grad_norm": 0.4049537479877472, "learning_rate": 4.456928838951311e-06, "loss": 0.5416, "step": 119 }, { "epoch": 0.13475575519371139, "grad_norm": 0.40603429079055786, "learning_rate": 4.494382022471911e-06, "loss": 0.5508, "step": 120 }, { "epoch": 0.13587871982032565, "grad_norm": 0.432468980550766, "learning_rate": 4.53183520599251e-06, "loss": 0.5498, "step": 121 }, { "epoch": 0.13700168444693991, "grad_norm": 0.37769705057144165, "learning_rate": 4.569288389513109e-06, "loss": 0.5269, "step": 122 }, { "epoch": 0.13812464907355418, "grad_norm": 0.3807169497013092, "learning_rate": 4.606741573033709e-06, "loss": 0.5629, "step": 123 }, { "epoch": 0.13924761370016844, "grad_norm": 0.4441196024417877, "learning_rate": 4.644194756554308e-06, "loss": 0.5433, "step": 124 }, { "epoch": 0.1403705783267827, "grad_norm": 0.3748587965965271, "learning_rate": 4.6816479400749066e-06, "loss": 0.5458, "step": 125 }, { "epoch": 0.14149354295339697, "grad_norm": 0.41672688722610474, "learning_rate": 4.719101123595506e-06, "loss": 0.5572, "step": 126 }, { "epoch": 0.14261650758001124, "grad_norm": 0.4214284121990204, "learning_rate": 4.756554307116105e-06, "loss": 0.5546, "step": 127 }, { "epoch": 0.1437394722066255, "grad_norm": 0.3590875566005707, "learning_rate": 4.7940074906367045e-06, "loss": 0.5188, "step": 128 }, { "epoch": 0.14486243683323977, "grad_norm": 0.4446124732494354, "learning_rate": 4.831460674157304e-06, "loss": 0.5483, "step": 129 }, { "epoch": 0.145985401459854, "grad_norm": 0.416648805141449, "learning_rate": 4.868913857677903e-06, "loss": 0.5336, "step": 130 }, { "epoch": 0.14710836608646827, "grad_norm": 0.3914474844932556, "learning_rate": 4.906367041198502e-06, "loss": 0.5236, "step": 131 }, { "epoch": 0.14823133071308253, "grad_norm": 0.3826431632041931, "learning_rate": 4.943820224719101e-06, "loss": 0.5129, "step": 132 }, { "epoch": 0.1493542953396968, "grad_norm": 0.4325251281261444, "learning_rate": 4.9812734082397e-06, "loss": 0.5361, "step": 133 }, { "epoch": 0.15047725996631106, "grad_norm": 0.38540413975715637, "learning_rate": 5.0187265917603005e-06, "loss": 0.5431, "step": 134 }, { "epoch": 0.15160022459292533, "grad_norm": 0.372889906167984, "learning_rate": 5.0561797752809e-06, "loss": 0.5348, "step": 135 }, { "epoch": 0.1527231892195396, "grad_norm": 0.39501506090164185, "learning_rate": 5.093632958801499e-06, "loss": 0.5224, "step": 136 }, { "epoch": 0.15384615384615385, "grad_norm": 0.40757691860198975, "learning_rate": 5.1310861423220985e-06, "loss": 0.5246, "step": 137 }, { "epoch": 0.15496911847276812, "grad_norm": 0.4316727817058563, "learning_rate": 5.168539325842698e-06, "loss": 0.5395, "step": 138 }, { "epoch": 0.15609208309938236, "grad_norm": 0.40537285804748535, "learning_rate": 5.205992509363297e-06, "loss": 0.5573, "step": 139 }, { "epoch": 0.15721504772599662, "grad_norm": 0.42079925537109375, "learning_rate": 5.243445692883895e-06, "loss": 0.5131, "step": 140 }, { "epoch": 0.15833801235261089, "grad_norm": 0.4106050729751587, "learning_rate": 5.280898876404494e-06, "loss": 0.513, "step": 141 }, { "epoch": 0.15946097697922515, "grad_norm": 0.4220394492149353, "learning_rate": 5.318352059925093e-06, "loss": 0.5501, "step": 142 }, { "epoch": 0.16058394160583941, "grad_norm": 0.3774314224720001, "learning_rate": 5.355805243445693e-06, "loss": 0.5293, "step": 143 }, { "epoch": 0.16170690623245368, "grad_norm": 0.41277751326560974, "learning_rate": 5.393258426966292e-06, "loss": 0.5041, "step": 144 }, { "epoch": 0.16282987085906794, "grad_norm": 0.4056062698364258, "learning_rate": 5.430711610486891e-06, "loss": 0.5135, "step": 145 }, { "epoch": 0.1639528354856822, "grad_norm": 0.39523497223854065, "learning_rate": 5.468164794007491e-06, "loss": 0.5227, "step": 146 }, { "epoch": 0.16507580011229647, "grad_norm": 0.4095844030380249, "learning_rate": 5.50561797752809e-06, "loss": 0.5143, "step": 147 }, { "epoch": 0.16619876473891074, "grad_norm": 0.3901519179344177, "learning_rate": 5.54307116104869e-06, "loss": 0.5275, "step": 148 }, { "epoch": 0.16732172936552497, "grad_norm": 0.44281473755836487, "learning_rate": 5.5805243445692896e-06, "loss": 0.5551, "step": 149 }, { "epoch": 0.16844469399213924, "grad_norm": 0.3832061290740967, "learning_rate": 5.617977528089889e-06, "loss": 0.5277, "step": 150 }, { "epoch": 0.1695676586187535, "grad_norm": 0.37243837118148804, "learning_rate": 5.655430711610488e-06, "loss": 0.5155, "step": 151 }, { "epoch": 0.17069062324536777, "grad_norm": 0.3837786018848419, "learning_rate": 5.692883895131086e-06, "loss": 0.5303, "step": 152 }, { "epoch": 0.17181358787198203, "grad_norm": 0.3514273166656494, "learning_rate": 5.730337078651685e-06, "loss": 0.5408, "step": 153 }, { "epoch": 0.1729365524985963, "grad_norm": 0.3498665988445282, "learning_rate": 5.7677902621722845e-06, "loss": 0.5025, "step": 154 }, { "epoch": 0.17405951712521056, "grad_norm": 0.3799659013748169, "learning_rate": 5.805243445692884e-06, "loss": 0.5287, "step": 155 }, { "epoch": 0.17518248175182483, "grad_norm": 0.4551142454147339, "learning_rate": 5.842696629213483e-06, "loss": 0.5303, "step": 156 }, { "epoch": 0.1763054463784391, "grad_norm": 0.3917481303215027, "learning_rate": 5.8801498127340825e-06, "loss": 0.5205, "step": 157 }, { "epoch": 0.17742841100505333, "grad_norm": 0.40373384952545166, "learning_rate": 5.917602996254682e-06, "loss": 0.5576, "step": 158 }, { "epoch": 0.1785513756316676, "grad_norm": 0.4142207205295563, "learning_rate": 5.955056179775281e-06, "loss": 0.4933, "step": 159 }, { "epoch": 0.17967434025828186, "grad_norm": 0.3689754903316498, "learning_rate": 5.9925093632958805e-06, "loss": 0.5419, "step": 160 }, { "epoch": 0.18079730488489612, "grad_norm": 0.3970787823200226, "learning_rate": 6.02996254681648e-06, "loss": 0.5347, "step": 161 }, { "epoch": 0.18192026951151039, "grad_norm": 0.3900080621242523, "learning_rate": 6.06741573033708e-06, "loss": 0.5585, "step": 162 }, { "epoch": 0.18304323413812465, "grad_norm": 0.4213597774505615, "learning_rate": 6.104868913857679e-06, "loss": 0.5133, "step": 163 }, { "epoch": 0.18416619876473891, "grad_norm": 0.38274073600769043, "learning_rate": 6.142322097378277e-06, "loss": 0.5261, "step": 164 }, { "epoch": 0.18528916339135318, "grad_norm": 0.3762882649898529, "learning_rate": 6.179775280898876e-06, "loss": 0.5222, "step": 165 }, { "epoch": 0.18641212801796744, "grad_norm": 0.4253355860710144, "learning_rate": 6.2172284644194756e-06, "loss": 0.5154, "step": 166 }, { "epoch": 0.1875350926445817, "grad_norm": 0.3748941123485565, "learning_rate": 6.254681647940075e-06, "loss": 0.5337, "step": 167 }, { "epoch": 0.18865805727119594, "grad_norm": 0.37981370091438293, "learning_rate": 6.292134831460674e-06, "loss": 0.5004, "step": 168 }, { "epoch": 0.1897810218978102, "grad_norm": 0.4112354516983032, "learning_rate": 6.3295880149812736e-06, "loss": 0.511, "step": 169 }, { "epoch": 0.19090398652442447, "grad_norm": 0.3982425034046173, "learning_rate": 6.367041198501873e-06, "loss": 0.5214, "step": 170 }, { "epoch": 0.19202695115103874, "grad_norm": 0.3644613027572632, "learning_rate": 6.404494382022472e-06, "loss": 0.5184, "step": 171 }, { "epoch": 0.193149915777653, "grad_norm": 0.3823944330215454, "learning_rate": 6.4419475655430715e-06, "loss": 0.5093, "step": 172 }, { "epoch": 0.19427288040426727, "grad_norm": 0.40740829706192017, "learning_rate": 6.479400749063671e-06, "loss": 0.5393, "step": 173 }, { "epoch": 0.19539584503088153, "grad_norm": 0.3820866048336029, "learning_rate": 6.51685393258427e-06, "loss": 0.4906, "step": 174 }, { "epoch": 0.1965188096574958, "grad_norm": 0.4176633656024933, "learning_rate": 6.5543071161048695e-06, "loss": 0.559, "step": 175 }, { "epoch": 0.19764177428411006, "grad_norm": 0.4170495569705963, "learning_rate": 6.591760299625468e-06, "loss": 0.4918, "step": 176 }, { "epoch": 0.19876473891072433, "grad_norm": 0.45308277010917664, "learning_rate": 6.629213483146067e-06, "loss": 0.5161, "step": 177 }, { "epoch": 0.19988770353733856, "grad_norm": 0.4047456681728363, "learning_rate": 6.666666666666667e-06, "loss": 0.515, "step": 178 }, { "epoch": 0.20101066816395283, "grad_norm": 0.44204241037368774, "learning_rate": 6.704119850187266e-06, "loss": 0.5093, "step": 179 }, { "epoch": 0.2021336327905671, "grad_norm": 0.41985079646110535, "learning_rate": 6.741573033707865e-06, "loss": 0.5074, "step": 180 }, { "epoch": 0.20325659741718136, "grad_norm": 0.444957435131073, "learning_rate": 6.779026217228465e-06, "loss": 0.5094, "step": 181 }, { "epoch": 0.20437956204379562, "grad_norm": 0.4080573320388794, "learning_rate": 6.816479400749064e-06, "loss": 0.5359, "step": 182 }, { "epoch": 0.20550252667040989, "grad_norm": 0.4934269189834595, "learning_rate": 6.853932584269663e-06, "loss": 0.5162, "step": 183 }, { "epoch": 0.20662549129702415, "grad_norm": 0.3958436846733093, "learning_rate": 6.891385767790263e-06, "loss": 0.4982, "step": 184 }, { "epoch": 0.20774845592363841, "grad_norm": 0.5202816128730774, "learning_rate": 6.928838951310862e-06, "loss": 0.5254, "step": 185 }, { "epoch": 0.20887142055025268, "grad_norm": 0.4014663100242615, "learning_rate": 6.966292134831461e-06, "loss": 0.513, "step": 186 }, { "epoch": 0.20999438517686692, "grad_norm": 0.47242704033851624, "learning_rate": 7.003745318352061e-06, "loss": 0.504, "step": 187 }, { "epoch": 0.21111734980348118, "grad_norm": 0.4094894528388977, "learning_rate": 7.04119850187266e-06, "loss": 0.497, "step": 188 }, { "epoch": 0.21224031443009544, "grad_norm": 0.4766428470611572, "learning_rate": 7.078651685393258e-06, "loss": 0.5013, "step": 189 }, { "epoch": 0.2133632790567097, "grad_norm": 0.41666924953460693, "learning_rate": 7.116104868913858e-06, "loss": 0.5161, "step": 190 }, { "epoch": 0.21448624368332397, "grad_norm": 0.4991929829120636, "learning_rate": 7.153558052434457e-06, "loss": 0.5342, "step": 191 }, { "epoch": 0.21560920830993824, "grad_norm": 0.4565524458885193, "learning_rate": 7.191011235955056e-06, "loss": 0.5196, "step": 192 }, { "epoch": 0.2167321729365525, "grad_norm": 0.45647987723350525, "learning_rate": 7.228464419475656e-06, "loss": 0.52, "step": 193 }, { "epoch": 0.21785513756316677, "grad_norm": 0.41998425126075745, "learning_rate": 7.265917602996255e-06, "loss": 0.4998, "step": 194 }, { "epoch": 0.21897810218978103, "grad_norm": 0.41214245557785034, "learning_rate": 7.303370786516854e-06, "loss": 0.4884, "step": 195 }, { "epoch": 0.2201010668163953, "grad_norm": 0.4146271049976349, "learning_rate": 7.340823970037454e-06, "loss": 0.4952, "step": 196 }, { "epoch": 0.22122403144300953, "grad_norm": 0.4441204369068146, "learning_rate": 7.378277153558053e-06, "loss": 0.5273, "step": 197 }, { "epoch": 0.2223469960696238, "grad_norm": 0.4331224262714386, "learning_rate": 7.415730337078652e-06, "loss": 0.5296, "step": 198 }, { "epoch": 0.22346996069623806, "grad_norm": 0.43818145990371704, "learning_rate": 7.453183520599252e-06, "loss": 0.5204, "step": 199 }, { "epoch": 0.22459292532285233, "grad_norm": 0.41158199310302734, "learning_rate": 7.490636704119851e-06, "loss": 0.5173, "step": 200 }, { "epoch": 0.2257158899494666, "grad_norm": 0.4884057939052582, "learning_rate": 7.5280898876404495e-06, "loss": 0.5133, "step": 201 }, { "epoch": 0.22683885457608086, "grad_norm": 0.4119909703731537, "learning_rate": 7.565543071161049e-06, "loss": 0.4814, "step": 202 }, { "epoch": 0.22796181920269512, "grad_norm": 0.4749114215373993, "learning_rate": 7.602996254681648e-06, "loss": 0.527, "step": 203 }, { "epoch": 0.22908478382930939, "grad_norm": 0.48893845081329346, "learning_rate": 7.640449438202247e-06, "loss": 0.5197, "step": 204 }, { "epoch": 0.23020774845592365, "grad_norm": 0.515061616897583, "learning_rate": 7.677902621722848e-06, "loss": 0.5066, "step": 205 }, { "epoch": 0.2313307130825379, "grad_norm": 0.45863911509513855, "learning_rate": 7.715355805243446e-06, "loss": 0.5016, "step": 206 }, { "epoch": 0.23245367770915215, "grad_norm": 0.5267060399055481, "learning_rate": 7.752808988764046e-06, "loss": 0.5195, "step": 207 }, { "epoch": 0.23357664233576642, "grad_norm": 0.4649311900138855, "learning_rate": 7.790262172284645e-06, "loss": 0.5343, "step": 208 }, { "epoch": 0.23469960696238068, "grad_norm": 0.4683702886104584, "learning_rate": 7.827715355805245e-06, "loss": 0.532, "step": 209 }, { "epoch": 0.23582257158899494, "grad_norm": 0.43623611330986023, "learning_rate": 7.865168539325843e-06, "loss": 0.4906, "step": 210 }, { "epoch": 0.2369455362156092, "grad_norm": 0.45316627621650696, "learning_rate": 7.902621722846444e-06, "loss": 0.5187, "step": 211 }, { "epoch": 0.23806850084222347, "grad_norm": 0.43184012174606323, "learning_rate": 7.940074906367042e-06, "loss": 0.5037, "step": 212 }, { "epoch": 0.23919146546883774, "grad_norm": 0.42945393919944763, "learning_rate": 7.97752808988764e-06, "loss": 0.5004, "step": 213 }, { "epoch": 0.240314430095452, "grad_norm": 0.43841272592544556, "learning_rate": 8.01498127340824e-06, "loss": 0.521, "step": 214 }, { "epoch": 0.24143739472206627, "grad_norm": 0.43215683102607727, "learning_rate": 8.05243445692884e-06, "loss": 0.5116, "step": 215 }, { "epoch": 0.2425603593486805, "grad_norm": 0.41102921962738037, "learning_rate": 8.08988764044944e-06, "loss": 0.5224, "step": 216 }, { "epoch": 0.24368332397529477, "grad_norm": 0.42310720682144165, "learning_rate": 8.127340823970038e-06, "loss": 0.483, "step": 217 }, { "epoch": 0.24480628860190903, "grad_norm": 0.39420217275619507, "learning_rate": 8.164794007490638e-06, "loss": 0.5079, "step": 218 }, { "epoch": 0.2459292532285233, "grad_norm": 0.43512579798698425, "learning_rate": 8.202247191011237e-06, "loss": 0.4838, "step": 219 }, { "epoch": 0.24705221785513756, "grad_norm": 0.4287498891353607, "learning_rate": 8.239700374531837e-06, "loss": 0.5013, "step": 220 }, { "epoch": 0.24817518248175183, "grad_norm": 0.43911269307136536, "learning_rate": 8.277153558052435e-06, "loss": 0.5036, "step": 221 }, { "epoch": 0.2492981471083661, "grad_norm": 0.4549112915992737, "learning_rate": 8.314606741573035e-06, "loss": 0.505, "step": 222 }, { "epoch": 0.25042111173498033, "grad_norm": 0.4515922963619232, "learning_rate": 8.352059925093634e-06, "loss": 0.51, "step": 223 }, { "epoch": 0.2515440763615946, "grad_norm": 0.413474440574646, "learning_rate": 8.389513108614234e-06, "loss": 0.4999, "step": 224 }, { "epoch": 0.25266704098820886, "grad_norm": 0.4102017283439636, "learning_rate": 8.426966292134832e-06, "loss": 0.5081, "step": 225 }, { "epoch": 0.2537900056148231, "grad_norm": 0.46972721815109253, "learning_rate": 8.464419475655431e-06, "loss": 0.5101, "step": 226 }, { "epoch": 0.2549129702414374, "grad_norm": 0.47850480675697327, "learning_rate": 8.501872659176031e-06, "loss": 0.4974, "step": 227 }, { "epoch": 0.25603593486805165, "grad_norm": 0.4696987271308899, "learning_rate": 8.53932584269663e-06, "loss": 0.5054, "step": 228 }, { "epoch": 0.2571588994946659, "grad_norm": 0.5040601491928101, "learning_rate": 8.57677902621723e-06, "loss": 0.5103, "step": 229 }, { "epoch": 0.2582818641212802, "grad_norm": 0.4864058494567871, "learning_rate": 8.614232209737828e-06, "loss": 0.491, "step": 230 }, { "epoch": 0.25940482874789444, "grad_norm": 0.4537062346935272, "learning_rate": 8.651685393258428e-06, "loss": 0.494, "step": 231 }, { "epoch": 0.2605277933745087, "grad_norm": 0.4709954261779785, "learning_rate": 8.689138576779027e-06, "loss": 0.4951, "step": 232 }, { "epoch": 0.261650758001123, "grad_norm": 0.5310398936271667, "learning_rate": 8.726591760299627e-06, "loss": 0.5061, "step": 233 }, { "epoch": 0.26277372262773724, "grad_norm": 0.4751857817173004, "learning_rate": 8.764044943820226e-06, "loss": 0.5098, "step": 234 }, { "epoch": 0.2638966872543515, "grad_norm": 0.43371933698654175, "learning_rate": 8.801498127340826e-06, "loss": 0.5209, "step": 235 }, { "epoch": 0.26501965188096577, "grad_norm": 0.5767494440078735, "learning_rate": 8.838951310861424e-06, "loss": 0.5116, "step": 236 }, { "epoch": 0.26614261650758003, "grad_norm": 0.47821083664894104, "learning_rate": 8.876404494382023e-06, "loss": 0.5016, "step": 237 }, { "epoch": 0.2672655811341943, "grad_norm": 0.4854621887207031, "learning_rate": 8.913857677902621e-06, "loss": 0.4918, "step": 238 }, { "epoch": 0.26838854576080856, "grad_norm": 0.4513131380081177, "learning_rate": 8.951310861423221e-06, "loss": 0.5184, "step": 239 }, { "epoch": 0.26951151038742277, "grad_norm": 0.495781809091568, "learning_rate": 8.988764044943822e-06, "loss": 0.5033, "step": 240 }, { "epoch": 0.27063447501403703, "grad_norm": 0.5072525143623352, "learning_rate": 9.02621722846442e-06, "loss": 0.4934, "step": 241 }, { "epoch": 0.2717574396406513, "grad_norm": 0.45455124974250793, "learning_rate": 9.06367041198502e-06, "loss": 0.5019, "step": 242 }, { "epoch": 0.27288040426726556, "grad_norm": 0.5147728323936462, "learning_rate": 9.101123595505619e-06, "loss": 0.4821, "step": 243 }, { "epoch": 0.27400336889387983, "grad_norm": 0.4444701373577118, "learning_rate": 9.138576779026219e-06, "loss": 0.495, "step": 244 }, { "epoch": 0.2751263335204941, "grad_norm": 0.4710620045661926, "learning_rate": 9.176029962546817e-06, "loss": 0.5116, "step": 245 }, { "epoch": 0.27624929814710836, "grad_norm": 0.4726759195327759, "learning_rate": 9.213483146067417e-06, "loss": 0.4892, "step": 246 }, { "epoch": 0.2773722627737226, "grad_norm": 0.4665619432926178, "learning_rate": 9.250936329588016e-06, "loss": 0.487, "step": 247 }, { "epoch": 0.2784952274003369, "grad_norm": 0.5117336511611938, "learning_rate": 9.288389513108616e-06, "loss": 0.522, "step": 248 }, { "epoch": 0.27961819202695115, "grad_norm": 0.4177057445049286, "learning_rate": 9.325842696629213e-06, "loss": 0.4711, "step": 249 }, { "epoch": 0.2807411566535654, "grad_norm": 0.6065915822982788, "learning_rate": 9.363295880149813e-06, "loss": 0.5325, "step": 250 }, { "epoch": 0.2818641212801797, "grad_norm": 0.40769192576408386, "learning_rate": 9.400749063670412e-06, "loss": 0.5119, "step": 251 }, { "epoch": 0.28298708590679394, "grad_norm": 0.7062539458274841, "learning_rate": 9.438202247191012e-06, "loss": 0.5283, "step": 252 }, { "epoch": 0.2841100505334082, "grad_norm": 0.39617815613746643, "learning_rate": 9.475655430711612e-06, "loss": 0.5017, "step": 253 }, { "epoch": 0.2852330151600225, "grad_norm": 0.5892727375030518, "learning_rate": 9.51310861423221e-06, "loss": 0.4736, "step": 254 }, { "epoch": 0.28635597978663674, "grad_norm": 0.5093287229537964, "learning_rate": 9.55056179775281e-06, "loss": 0.5119, "step": 255 }, { "epoch": 0.287478944413251, "grad_norm": 0.47920793294906616, "learning_rate": 9.588014981273409e-06, "loss": 0.496, "step": 256 }, { "epoch": 0.28860190903986527, "grad_norm": 0.48592856526374817, "learning_rate": 9.62546816479401e-06, "loss": 0.5066, "step": 257 }, { "epoch": 0.28972487366647953, "grad_norm": 0.5363805890083313, "learning_rate": 9.662921348314608e-06, "loss": 0.5238, "step": 258 }, { "epoch": 0.29084783829309374, "grad_norm": 0.44914162158966064, "learning_rate": 9.700374531835208e-06, "loss": 0.5068, "step": 259 }, { "epoch": 0.291970802919708, "grad_norm": 0.4731637239456177, "learning_rate": 9.737827715355806e-06, "loss": 0.4704, "step": 260 }, { "epoch": 0.29309376754632227, "grad_norm": 0.5483570098876953, "learning_rate": 9.775280898876405e-06, "loss": 0.4774, "step": 261 }, { "epoch": 0.29421673217293653, "grad_norm": 0.4706237316131592, "learning_rate": 9.812734082397003e-06, "loss": 0.5104, "step": 262 }, { "epoch": 0.2953396967995508, "grad_norm": 0.46471041440963745, "learning_rate": 9.850187265917604e-06, "loss": 0.4832, "step": 263 }, { "epoch": 0.29646266142616506, "grad_norm": 0.5917724967002869, "learning_rate": 9.887640449438202e-06, "loss": 0.4777, "step": 264 }, { "epoch": 0.29758562605277933, "grad_norm": 0.5033347606658936, "learning_rate": 9.925093632958802e-06, "loss": 0.5223, "step": 265 }, { "epoch": 0.2987085906793936, "grad_norm": 0.5690016746520996, "learning_rate": 9.9625468164794e-06, "loss": 0.4906, "step": 266 }, { "epoch": 0.29983155530600786, "grad_norm": 0.48196834325790405, "learning_rate": 1e-05, "loss": 0.4958, "step": 267 }, { "epoch": 0.3009545199326221, "grad_norm": 0.55958092212677, "learning_rate": 9.999995727007303e-06, "loss": 0.4993, "step": 268 }, { "epoch": 0.3020774845592364, "grad_norm": 0.46213364601135254, "learning_rate": 9.999982908036516e-06, "loss": 0.516, "step": 269 }, { "epoch": 0.30320044918585065, "grad_norm": 0.5498700141906738, "learning_rate": 9.999961543109546e-06, "loss": 0.4796, "step": 270 }, { "epoch": 0.3043234138124649, "grad_norm": 0.46868792176246643, "learning_rate": 9.99993163226291e-06, "loss": 0.4781, "step": 271 }, { "epoch": 0.3054463784390792, "grad_norm": 0.4110560119152069, "learning_rate": 9.999893175547737e-06, "loss": 0.4992, "step": 272 }, { "epoch": 0.30656934306569344, "grad_norm": 0.515194296836853, "learning_rate": 9.999846173029752e-06, "loss": 0.5137, "step": 273 }, { "epoch": 0.3076923076923077, "grad_norm": 0.44064363837242126, "learning_rate": 9.999790624789291e-06, "loss": 0.497, "step": 274 }, { "epoch": 0.308815272318922, "grad_norm": 0.45706939697265625, "learning_rate": 9.999726530921301e-06, "loss": 0.4938, "step": 275 }, { "epoch": 0.30993823694553624, "grad_norm": 0.4459260106086731, "learning_rate": 9.99965389153533e-06, "loss": 0.4966, "step": 276 }, { "epoch": 0.3110612015721505, "grad_norm": 0.457116037607193, "learning_rate": 9.99957270675553e-06, "loss": 0.5002, "step": 277 }, { "epoch": 0.3121841661987647, "grad_norm": 0.4143843352794647, "learning_rate": 9.999482976720665e-06, "loss": 0.4746, "step": 278 }, { "epoch": 0.313307130825379, "grad_norm": 0.5233191251754761, "learning_rate": 9.999384701584098e-06, "loss": 0.5099, "step": 279 }, { "epoch": 0.31443009545199324, "grad_norm": 0.5287202000617981, "learning_rate": 9.999277881513805e-06, "loss": 0.493, "step": 280 }, { "epoch": 0.3155530600786075, "grad_norm": 0.42509064078330994, "learning_rate": 9.999162516692358e-06, "loss": 0.4666, "step": 281 }, { "epoch": 0.31667602470522177, "grad_norm": 0.461577445268631, "learning_rate": 9.999038607316942e-06, "loss": 0.5123, "step": 282 }, { "epoch": 0.31779898933183603, "grad_norm": 0.4394127130508423, "learning_rate": 9.99890615359934e-06, "loss": 0.4802, "step": 283 }, { "epoch": 0.3189219539584503, "grad_norm": 0.47414135932922363, "learning_rate": 9.998765155765945e-06, "loss": 0.5124, "step": 284 }, { "epoch": 0.32004491858506456, "grad_norm": 0.491132527589798, "learning_rate": 9.998615614057743e-06, "loss": 0.4982, "step": 285 }, { "epoch": 0.32116788321167883, "grad_norm": 0.40194958448410034, "learning_rate": 9.998457528730337e-06, "loss": 0.4967, "step": 286 }, { "epoch": 0.3222908478382931, "grad_norm": 0.529348611831665, "learning_rate": 9.998290900053925e-06, "loss": 0.5268, "step": 287 }, { "epoch": 0.32341381246490736, "grad_norm": 0.4462125599384308, "learning_rate": 9.998115728313305e-06, "loss": 0.5172, "step": 288 }, { "epoch": 0.3245367770915216, "grad_norm": 0.4175768196582794, "learning_rate": 9.997932013807883e-06, "loss": 0.4635, "step": 289 }, { "epoch": 0.3256597417181359, "grad_norm": 0.45502564311027527, "learning_rate": 9.997739756851663e-06, "loss": 0.5023, "step": 290 }, { "epoch": 0.32678270634475015, "grad_norm": 0.44981124997138977, "learning_rate": 9.997538957773248e-06, "loss": 0.5195, "step": 291 }, { "epoch": 0.3279056709713644, "grad_norm": 0.4389380216598511, "learning_rate": 9.997329616915845e-06, "loss": 0.4713, "step": 292 }, { "epoch": 0.3290286355979787, "grad_norm": 0.4270534813404083, "learning_rate": 9.997111734637258e-06, "loss": 0.4875, "step": 293 }, { "epoch": 0.33015160022459294, "grad_norm": 0.4399762451648712, "learning_rate": 9.996885311309892e-06, "loss": 0.5235, "step": 294 }, { "epoch": 0.3312745648512072, "grad_norm": 0.49100926518440247, "learning_rate": 9.996650347320748e-06, "loss": 0.5094, "step": 295 }, { "epoch": 0.3323975294778215, "grad_norm": 0.4316697120666504, "learning_rate": 9.996406843071424e-06, "loss": 0.4839, "step": 296 }, { "epoch": 0.3335204941044357, "grad_norm": 0.4970281422138214, "learning_rate": 9.996154798978122e-06, "loss": 0.4986, "step": 297 }, { "epoch": 0.33464345873104995, "grad_norm": 0.5157686471939087, "learning_rate": 9.99589421547163e-06, "loss": 0.4895, "step": 298 }, { "epoch": 0.3357664233576642, "grad_norm": 0.43213075399398804, "learning_rate": 9.99562509299734e-06, "loss": 0.534, "step": 299 }, { "epoch": 0.3368893879842785, "grad_norm": 0.5215768218040466, "learning_rate": 9.99534743201523e-06, "loss": 0.4947, "step": 300 }, { "epoch": 0.33801235261089274, "grad_norm": 0.4859315752983093, "learning_rate": 9.995061232999884e-06, "loss": 0.4666, "step": 301 }, { "epoch": 0.339135317237507, "grad_norm": 0.5052666664123535, "learning_rate": 9.994766496440467e-06, "loss": 0.5007, "step": 302 }, { "epoch": 0.34025828186412127, "grad_norm": 0.5719942450523376, "learning_rate": 9.994463222840748e-06, "loss": 0.4699, "step": 303 }, { "epoch": 0.34138124649073553, "grad_norm": 0.4965064227581024, "learning_rate": 9.994151412719073e-06, "loss": 0.5127, "step": 304 }, { "epoch": 0.3425042111173498, "grad_norm": 0.41176483035087585, "learning_rate": 9.993831066608395e-06, "loss": 0.4928, "step": 305 }, { "epoch": 0.34362717574396406, "grad_norm": 0.47889184951782227, "learning_rate": 9.993502185056244e-06, "loss": 0.4992, "step": 306 }, { "epoch": 0.34475014037057833, "grad_norm": 0.4367619454860687, "learning_rate": 9.993164768624746e-06, "loss": 0.5037, "step": 307 }, { "epoch": 0.3458731049971926, "grad_norm": 0.5818862318992615, "learning_rate": 9.992818817890609e-06, "loss": 0.5113, "step": 308 }, { "epoch": 0.34699606962380686, "grad_norm": 0.4597448408603668, "learning_rate": 9.992464333445134e-06, "loss": 0.4843, "step": 309 }, { "epoch": 0.3481190342504211, "grad_norm": 0.51938796043396, "learning_rate": 9.992101315894204e-06, "loss": 0.4789, "step": 310 }, { "epoch": 0.3492419988770354, "grad_norm": 0.5540185570716858, "learning_rate": 9.991729765858287e-06, "loss": 0.4828, "step": 311 }, { "epoch": 0.35036496350364965, "grad_norm": 0.413429319858551, "learning_rate": 9.991349683972435e-06, "loss": 0.4879, "step": 312 }, { "epoch": 0.3514879281302639, "grad_norm": 0.47350990772247314, "learning_rate": 9.990961070886283e-06, "loss": 0.4824, "step": 313 }, { "epoch": 0.3526108927568782, "grad_norm": 0.4968799352645874, "learning_rate": 9.990563927264048e-06, "loss": 0.5102, "step": 314 }, { "epoch": 0.35373385738349244, "grad_norm": 0.4722319543361664, "learning_rate": 9.990158253784525e-06, "loss": 0.5134, "step": 315 }, { "epoch": 0.35485682201010665, "grad_norm": 0.4976036846637726, "learning_rate": 9.989744051141092e-06, "loss": 0.4866, "step": 316 }, { "epoch": 0.3559797866367209, "grad_norm": 0.4209980368614197, "learning_rate": 9.989321320041704e-06, "loss": 0.4935, "step": 317 }, { "epoch": 0.3571027512633352, "grad_norm": 0.48900121450424194, "learning_rate": 9.988890061208889e-06, "loss": 0.5202, "step": 318 }, { "epoch": 0.35822571588994945, "grad_norm": 0.40544068813323975, "learning_rate": 9.988450275379753e-06, "loss": 0.4827, "step": 319 }, { "epoch": 0.3593486805165637, "grad_norm": 0.47542792558670044, "learning_rate": 9.98800196330598e-06, "loss": 0.4798, "step": 320 }, { "epoch": 0.360471645143178, "grad_norm": 0.4969334006309509, "learning_rate": 9.987545125753818e-06, "loss": 0.5081, "step": 321 }, { "epoch": 0.36159460976979224, "grad_norm": 0.4858170747756958, "learning_rate": 9.9870797635041e-06, "loss": 0.5089, "step": 322 }, { "epoch": 0.3627175743964065, "grad_norm": 0.4791705906391144, "learning_rate": 9.986605877352216e-06, "loss": 0.4676, "step": 323 }, { "epoch": 0.36384053902302077, "grad_norm": 0.4257008135318756, "learning_rate": 9.986123468108134e-06, "loss": 0.4967, "step": 324 }, { "epoch": 0.36496350364963503, "grad_norm": 0.5351857542991638, "learning_rate": 9.985632536596384e-06, "loss": 0.5057, "step": 325 }, { "epoch": 0.3660864682762493, "grad_norm": 0.4519592225551605, "learning_rate": 9.985133083656066e-06, "loss": 0.4928, "step": 326 }, { "epoch": 0.36720943290286356, "grad_norm": 0.5511761903762817, "learning_rate": 9.984625110140844e-06, "loss": 0.4971, "step": 327 }, { "epoch": 0.36833239752947783, "grad_norm": 0.5177006125450134, "learning_rate": 9.984108616918944e-06, "loss": 0.5077, "step": 328 }, { "epoch": 0.3694553621560921, "grad_norm": 0.5029638409614563, "learning_rate": 9.983583604873154e-06, "loss": 0.4784, "step": 329 }, { "epoch": 0.37057832678270636, "grad_norm": 0.4522557854652405, "learning_rate": 9.983050074900824e-06, "loss": 0.4897, "step": 330 }, { "epoch": 0.3717012914093206, "grad_norm": 0.4449736773967743, "learning_rate": 9.982508027913862e-06, "loss": 0.5049, "step": 331 }, { "epoch": 0.3728242560359349, "grad_norm": 0.447950541973114, "learning_rate": 9.981957464838735e-06, "loss": 0.4978, "step": 332 }, { "epoch": 0.37394722066254915, "grad_norm": 0.40548381209373474, "learning_rate": 9.98139838661646e-06, "loss": 0.4946, "step": 333 }, { "epoch": 0.3750701852891634, "grad_norm": 0.420398473739624, "learning_rate": 9.980830794202611e-06, "loss": 0.4897, "step": 334 }, { "epoch": 0.3761931499157777, "grad_norm": 0.4002024829387665, "learning_rate": 9.98025468856732e-06, "loss": 0.4897, "step": 335 }, { "epoch": 0.3773161145423919, "grad_norm": 0.5054208040237427, "learning_rate": 9.979670070695265e-06, "loss": 0.4875, "step": 336 }, { "epoch": 0.37843907916900615, "grad_norm": 0.4543408751487732, "learning_rate": 9.97907694158567e-06, "loss": 0.5109, "step": 337 }, { "epoch": 0.3795620437956204, "grad_norm": 0.3982942998409271, "learning_rate": 9.97847530225231e-06, "loss": 0.4766, "step": 338 }, { "epoch": 0.3806850084222347, "grad_norm": 0.4317050278186798, "learning_rate": 9.977865153723508e-06, "loss": 0.4793, "step": 339 }, { "epoch": 0.38180797304884895, "grad_norm": 0.44591253995895386, "learning_rate": 9.977246497042124e-06, "loss": 0.493, "step": 340 }, { "epoch": 0.3829309376754632, "grad_norm": 0.4473872482776642, "learning_rate": 9.976619333265564e-06, "loss": 0.4758, "step": 341 }, { "epoch": 0.3840539023020775, "grad_norm": 0.40184497833251953, "learning_rate": 9.97598366346578e-06, "loss": 0.4612, "step": 342 }, { "epoch": 0.38517686692869174, "grad_norm": 0.4235111474990845, "learning_rate": 9.975339488729251e-06, "loss": 0.4724, "step": 343 }, { "epoch": 0.386299831555306, "grad_norm": 0.43712371587753296, "learning_rate": 9.974686810157003e-06, "loss": 0.5038, "step": 344 }, { "epoch": 0.38742279618192027, "grad_norm": 0.45501983165740967, "learning_rate": 9.974025628864592e-06, "loss": 0.4979, "step": 345 }, { "epoch": 0.38854576080853453, "grad_norm": 0.4766309857368469, "learning_rate": 9.973355945982103e-06, "loss": 0.4685, "step": 346 }, { "epoch": 0.3896687254351488, "grad_norm": 0.41206714510917664, "learning_rate": 9.97267776265416e-06, "loss": 0.4773, "step": 347 }, { "epoch": 0.39079169006176306, "grad_norm": 0.4275694191455841, "learning_rate": 9.971991080039912e-06, "loss": 0.4812, "step": 348 }, { "epoch": 0.39191465468837733, "grad_norm": 0.3967951536178589, "learning_rate": 9.97129589931303e-06, "loss": 0.4986, "step": 349 }, { "epoch": 0.3930376193149916, "grad_norm": 0.42633628845214844, "learning_rate": 9.970592221661721e-06, "loss": 0.474, "step": 350 }, { "epoch": 0.39416058394160586, "grad_norm": 0.42876726388931274, "learning_rate": 9.969880048288704e-06, "loss": 0.4847, "step": 351 }, { "epoch": 0.3952835485682201, "grad_norm": 0.3759877681732178, "learning_rate": 9.969159380411228e-06, "loss": 0.4782, "step": 352 }, { "epoch": 0.3964065131948344, "grad_norm": 0.4307141900062561, "learning_rate": 9.968430219261054e-06, "loss": 0.5089, "step": 353 }, { "epoch": 0.39752947782144865, "grad_norm": 0.42869722843170166, "learning_rate": 9.96769256608446e-06, "loss": 0.4801, "step": 354 }, { "epoch": 0.39865244244806286, "grad_norm": 0.43098029494285583, "learning_rate": 9.966946422142248e-06, "loss": 0.5035, "step": 355 }, { "epoch": 0.3997754070746771, "grad_norm": 0.5097415447235107, "learning_rate": 9.966191788709716e-06, "loss": 0.4723, "step": 356 }, { "epoch": 0.4008983717012914, "grad_norm": 0.4017908275127411, "learning_rate": 9.965428667076687e-06, "loss": 0.5107, "step": 357 }, { "epoch": 0.40202133632790565, "grad_norm": 0.44839635491371155, "learning_rate": 9.964657058547483e-06, "loss": 0.4834, "step": 358 }, { "epoch": 0.4031443009545199, "grad_norm": 0.5015233755111694, "learning_rate": 9.963876964440937e-06, "loss": 0.4682, "step": 359 }, { "epoch": 0.4042672655811342, "grad_norm": 0.43770015239715576, "learning_rate": 9.963088386090386e-06, "loss": 0.4863, "step": 360 }, { "epoch": 0.40539023020774845, "grad_norm": 0.41875240206718445, "learning_rate": 9.96229132484366e-06, "loss": 0.4469, "step": 361 }, { "epoch": 0.4065131948343627, "grad_norm": 0.47245386242866516, "learning_rate": 9.961485782063098e-06, "loss": 0.5096, "step": 362 }, { "epoch": 0.407636159460977, "grad_norm": 0.4714481830596924, "learning_rate": 9.960671759125529e-06, "loss": 0.5091, "step": 363 }, { "epoch": 0.40875912408759124, "grad_norm": 0.4381459653377533, "learning_rate": 9.95984925742228e-06, "loss": 0.5007, "step": 364 }, { "epoch": 0.4098820887142055, "grad_norm": 0.4770166873931885, "learning_rate": 9.959018278359169e-06, "loss": 0.4778, "step": 365 }, { "epoch": 0.41100505334081977, "grad_norm": 0.5017430186271667, "learning_rate": 9.958178823356503e-06, "loss": 0.4915, "step": 366 }, { "epoch": 0.41212801796743403, "grad_norm": 0.4692360460758209, "learning_rate": 9.957330893849074e-06, "loss": 0.4756, "step": 367 }, { "epoch": 0.4132509825940483, "grad_norm": 0.5453149080276489, "learning_rate": 9.956474491286163e-06, "loss": 0.4872, "step": 368 }, { "epoch": 0.41437394722066256, "grad_norm": 0.37699589133262634, "learning_rate": 9.95560961713153e-06, "loss": 0.4817, "step": 369 }, { "epoch": 0.41549691184727683, "grad_norm": 0.6338233351707458, "learning_rate": 9.954736272863414e-06, "loss": 0.4735, "step": 370 }, { "epoch": 0.4166198764738911, "grad_norm": 0.5061712265014648, "learning_rate": 9.953854459974534e-06, "loss": 0.4685, "step": 371 }, { "epoch": 0.41774284110050536, "grad_norm": 0.595748782157898, "learning_rate": 9.95296417997208e-06, "loss": 0.5137, "step": 372 }, { "epoch": 0.4188658057271196, "grad_norm": 0.525424599647522, "learning_rate": 9.95206543437772e-06, "loss": 0.4933, "step": 373 }, { "epoch": 0.41998877035373383, "grad_norm": 0.49247491359710693, "learning_rate": 9.951158224727584e-06, "loss": 0.4857, "step": 374 }, { "epoch": 0.4211117349803481, "grad_norm": 0.5255704522132874, "learning_rate": 9.950242552572272e-06, "loss": 0.4914, "step": 375 }, { "epoch": 0.42223469960696236, "grad_norm": 0.46349164843559265, "learning_rate": 9.94931841947685e-06, "loss": 0.4914, "step": 376 }, { "epoch": 0.4233576642335766, "grad_norm": 0.45132696628570557, "learning_rate": 9.948385827020844e-06, "loss": 0.4964, "step": 377 }, { "epoch": 0.4244806288601909, "grad_norm": 0.5173887014389038, "learning_rate": 9.947444776798235e-06, "loss": 0.4792, "step": 378 }, { "epoch": 0.42560359348680515, "grad_norm": 0.4404643177986145, "learning_rate": 9.946495270417467e-06, "loss": 0.4849, "step": 379 }, { "epoch": 0.4267265581134194, "grad_norm": 0.4796120226383209, "learning_rate": 9.94553730950143e-06, "loss": 0.499, "step": 380 }, { "epoch": 0.4278495227400337, "grad_norm": 0.5392577052116394, "learning_rate": 9.944570895687471e-06, "loss": 0.4769, "step": 381 }, { "epoch": 0.42897248736664795, "grad_norm": 0.5313969254493713, "learning_rate": 9.94359603062738e-06, "loss": 0.4708, "step": 382 }, { "epoch": 0.4300954519932622, "grad_norm": 0.41659438610076904, "learning_rate": 9.942612715987396e-06, "loss": 0.4611, "step": 383 }, { "epoch": 0.4312184166198765, "grad_norm": 0.5265209674835205, "learning_rate": 9.941620953448195e-06, "loss": 0.4787, "step": 384 }, { "epoch": 0.43234138124649074, "grad_norm": 0.48242077231407166, "learning_rate": 9.940620744704893e-06, "loss": 0.4909, "step": 385 }, { "epoch": 0.433464345873105, "grad_norm": 0.4732156991958618, "learning_rate": 9.939612091467048e-06, "loss": 0.4937, "step": 386 }, { "epoch": 0.43458731049971927, "grad_norm": 0.3784290552139282, "learning_rate": 9.938594995458644e-06, "loss": 0.4776, "step": 387 }, { "epoch": 0.43571027512633353, "grad_norm": 0.4370846748352051, "learning_rate": 9.9375694584181e-06, "loss": 0.4948, "step": 388 }, { "epoch": 0.4368332397529478, "grad_norm": 0.39102810621261597, "learning_rate": 9.936535482098261e-06, "loss": 0.4614, "step": 389 }, { "epoch": 0.43795620437956206, "grad_norm": 0.4884539544582367, "learning_rate": 9.935493068266396e-06, "loss": 0.4874, "step": 390 }, { "epoch": 0.43907916900617633, "grad_norm": 0.4176507294178009, "learning_rate": 9.934442218704196e-06, "loss": 0.4902, "step": 391 }, { "epoch": 0.4402021336327906, "grad_norm": 0.4882607161998749, "learning_rate": 9.933382935207769e-06, "loss": 0.4954, "step": 392 }, { "epoch": 0.4413250982594048, "grad_norm": 0.4433613419532776, "learning_rate": 9.932315219587641e-06, "loss": 0.4663, "step": 393 }, { "epoch": 0.44244806288601907, "grad_norm": 0.48430201411247253, "learning_rate": 9.931239073668747e-06, "loss": 0.4562, "step": 394 }, { "epoch": 0.44357102751263333, "grad_norm": 0.4654388427734375, "learning_rate": 9.930154499290431e-06, "loss": 0.4804, "step": 395 }, { "epoch": 0.4446939921392476, "grad_norm": 0.4363792836666107, "learning_rate": 9.929061498306448e-06, "loss": 0.4703, "step": 396 }, { "epoch": 0.44581695676586186, "grad_norm": 0.46808966994285583, "learning_rate": 9.92796007258495e-06, "loss": 0.4784, "step": 397 }, { "epoch": 0.4469399213924761, "grad_norm": 0.4248290956020355, "learning_rate": 9.926850224008491e-06, "loss": 0.4602, "step": 398 }, { "epoch": 0.4480628860190904, "grad_norm": 0.4573725163936615, "learning_rate": 9.92573195447402e-06, "loss": 0.4812, "step": 399 }, { "epoch": 0.44918585064570465, "grad_norm": 0.5233784914016724, "learning_rate": 9.924605265892882e-06, "loss": 0.478, "step": 400 }, { "epoch": 0.4503088152723189, "grad_norm": 0.4338049292564392, "learning_rate": 9.923470160190807e-06, "loss": 0.4705, "step": 401 }, { "epoch": 0.4514317798989332, "grad_norm": 0.5290100574493408, "learning_rate": 9.922326639307918e-06, "loss": 0.4715, "step": 402 }, { "epoch": 0.45255474452554745, "grad_norm": 0.5152130722999573, "learning_rate": 9.921174705198715e-06, "loss": 0.4974, "step": 403 }, { "epoch": 0.4536777091521617, "grad_norm": 0.5322993397712708, "learning_rate": 9.92001435983208e-06, "loss": 0.5052, "step": 404 }, { "epoch": 0.454800673778776, "grad_norm": 0.5136135816574097, "learning_rate": 9.918845605191274e-06, "loss": 0.4904, "step": 405 }, { "epoch": 0.45592363840539024, "grad_norm": 0.5039328336715698, "learning_rate": 9.917668443273926e-06, "loss": 0.4693, "step": 406 }, { "epoch": 0.4570466030320045, "grad_norm": 0.5538428425788879, "learning_rate": 9.916482876092042e-06, "loss": 0.4717, "step": 407 }, { "epoch": 0.45816956765861877, "grad_norm": 0.5264353156089783, "learning_rate": 9.915288905671986e-06, "loss": 0.5011, "step": 408 }, { "epoch": 0.45929253228523303, "grad_norm": 0.43789029121398926, "learning_rate": 9.91408653405449e-06, "loss": 0.4537, "step": 409 }, { "epoch": 0.4604154969118473, "grad_norm": 0.4361424744129181, "learning_rate": 9.912875763294646e-06, "loss": 0.5039, "step": 410 }, { "epoch": 0.46153846153846156, "grad_norm": 0.568766713142395, "learning_rate": 9.911656595461899e-06, "loss": 0.5094, "step": 411 }, { "epoch": 0.4626614261650758, "grad_norm": 0.4496559798717499, "learning_rate": 9.910429032640044e-06, "loss": 0.4821, "step": 412 }, { "epoch": 0.46378439079169004, "grad_norm": 0.5127695798873901, "learning_rate": 9.90919307692723e-06, "loss": 0.4616, "step": 413 }, { "epoch": 0.4649073554183043, "grad_norm": 0.4671713411808014, "learning_rate": 9.90794873043595e-06, "loss": 0.4794, "step": 414 }, { "epoch": 0.46603032004491857, "grad_norm": 0.6197286248207092, "learning_rate": 9.906695995293036e-06, "loss": 0.4822, "step": 415 }, { "epoch": 0.46715328467153283, "grad_norm": 0.46139782667160034, "learning_rate": 9.905434873639661e-06, "loss": 0.473, "step": 416 }, { "epoch": 0.4682762492981471, "grad_norm": 0.5810860991477966, "learning_rate": 9.904165367631329e-06, "loss": 0.482, "step": 417 }, { "epoch": 0.46939921392476136, "grad_norm": 0.4518386125564575, "learning_rate": 9.902887479437874e-06, "loss": 0.4559, "step": 418 }, { "epoch": 0.4705221785513756, "grad_norm": 0.42068353295326233, "learning_rate": 9.901601211243462e-06, "loss": 0.4689, "step": 419 }, { "epoch": 0.4716451431779899, "grad_norm": 0.5219129920005798, "learning_rate": 9.900306565246579e-06, "loss": 0.4748, "step": 420 }, { "epoch": 0.47276810780460415, "grad_norm": 0.41824042797088623, "learning_rate": 9.899003543660026e-06, "loss": 0.4754, "step": 421 }, { "epoch": 0.4738910724312184, "grad_norm": 0.4802229702472687, "learning_rate": 9.897692148710928e-06, "loss": 0.503, "step": 422 }, { "epoch": 0.4750140370578327, "grad_norm": 0.38377711176872253, "learning_rate": 9.896372382640718e-06, "loss": 0.4626, "step": 423 }, { "epoch": 0.47613700168444695, "grad_norm": 0.4462909400463104, "learning_rate": 9.895044247705132e-06, "loss": 0.4761, "step": 424 }, { "epoch": 0.4772599663110612, "grad_norm": 0.4134484529495239, "learning_rate": 9.893707746174217e-06, "loss": 0.487, "step": 425 }, { "epoch": 0.4783829309376755, "grad_norm": 0.5016562938690186, "learning_rate": 9.892362880332316e-06, "loss": 0.4955, "step": 426 }, { "epoch": 0.47950589556428974, "grad_norm": 0.44626879692077637, "learning_rate": 9.891009652478072e-06, "loss": 0.4561, "step": 427 }, { "epoch": 0.480628860190904, "grad_norm": 0.43035081028938293, "learning_rate": 9.889648064924417e-06, "loss": 0.469, "step": 428 }, { "epoch": 0.48175182481751827, "grad_norm": 0.48790451884269714, "learning_rate": 9.888278119998573e-06, "loss": 0.482, "step": 429 }, { "epoch": 0.48287478944413254, "grad_norm": 0.4052470624446869, "learning_rate": 9.886899820042043e-06, "loss": 0.4808, "step": 430 }, { "epoch": 0.48399775407074674, "grad_norm": 0.48452579975128174, "learning_rate": 9.885513167410618e-06, "loss": 0.4999, "step": 431 }, { "epoch": 0.485120718697361, "grad_norm": 0.4259363114833832, "learning_rate": 9.884118164474359e-06, "loss": 0.4704, "step": 432 }, { "epoch": 0.4862436833239753, "grad_norm": 0.39419764280319214, "learning_rate": 9.882714813617597e-06, "loss": 0.5034, "step": 433 }, { "epoch": 0.48736664795058954, "grad_norm": 0.3929540514945984, "learning_rate": 9.881303117238941e-06, "loss": 0.4621, "step": 434 }, { "epoch": 0.4884896125772038, "grad_norm": 0.4056868851184845, "learning_rate": 9.879883077751255e-06, "loss": 0.4922, "step": 435 }, { "epoch": 0.48961257720381807, "grad_norm": 0.3715357780456543, "learning_rate": 9.878454697581666e-06, "loss": 0.484, "step": 436 }, { "epoch": 0.49073554183043233, "grad_norm": 0.41796091198921204, "learning_rate": 9.87701797917156e-06, "loss": 0.4828, "step": 437 }, { "epoch": 0.4918585064570466, "grad_norm": 0.4155077040195465, "learning_rate": 9.875572924976568e-06, "loss": 0.4761, "step": 438 }, { "epoch": 0.49298147108366086, "grad_norm": 0.4175276458263397, "learning_rate": 9.874119537466575e-06, "loss": 0.4696, "step": 439 }, { "epoch": 0.4941044357102751, "grad_norm": 0.45004788041114807, "learning_rate": 9.872657819125706e-06, "loss": 0.4949, "step": 440 }, { "epoch": 0.4952274003368894, "grad_norm": 0.3748813569545746, "learning_rate": 9.871187772452327e-06, "loss": 0.4689, "step": 441 }, { "epoch": 0.49635036496350365, "grad_norm": 0.4662182927131653, "learning_rate": 9.869709399959037e-06, "loss": 0.4985, "step": 442 }, { "epoch": 0.4974733295901179, "grad_norm": 0.36327266693115234, "learning_rate": 9.868222704172663e-06, "loss": 0.4706, "step": 443 }, { "epoch": 0.4985962942167322, "grad_norm": 0.398894339799881, "learning_rate": 9.866727687634266e-06, "loss": 0.4866, "step": 444 }, { "epoch": 0.49971925884334645, "grad_norm": 0.41571396589279175, "learning_rate": 9.86522435289912e-06, "loss": 0.4652, "step": 445 }, { "epoch": 0.5008422234699607, "grad_norm": 0.3947865068912506, "learning_rate": 9.863712702536722e-06, "loss": 0.4791, "step": 446 }, { "epoch": 0.501965188096575, "grad_norm": 0.3808096945285797, "learning_rate": 9.86219273913078e-06, "loss": 0.4814, "step": 447 }, { "epoch": 0.5030881527231892, "grad_norm": 0.46112746000289917, "learning_rate": 9.860664465279211e-06, "loss": 0.463, "step": 448 }, { "epoch": 0.5042111173498035, "grad_norm": 0.4132797420024872, "learning_rate": 9.859127883594138e-06, "loss": 0.4704, "step": 449 }, { "epoch": 0.5053340819764177, "grad_norm": 0.4653143584728241, "learning_rate": 9.857582996701878e-06, "loss": 0.4899, "step": 450 }, { "epoch": 0.506457046603032, "grad_norm": 0.4484460949897766, "learning_rate": 9.856029807242949e-06, "loss": 0.5024, "step": 451 }, { "epoch": 0.5075800112296462, "grad_norm": 0.4624413549900055, "learning_rate": 9.854468317872059e-06, "loss": 0.4779, "step": 452 }, { "epoch": 0.5087029758562606, "grad_norm": 0.46307650208473206, "learning_rate": 9.852898531258102e-06, "loss": 0.4915, "step": 453 }, { "epoch": 0.5098259404828748, "grad_norm": 0.46623823046684265, "learning_rate": 9.851320450084148e-06, "loss": 0.4917, "step": 454 }, { "epoch": 0.5109489051094891, "grad_norm": 0.4382672607898712, "learning_rate": 9.849734077047455e-06, "loss": 0.4804, "step": 455 }, { "epoch": 0.5120718697361033, "grad_norm": 0.42378270626068115, "learning_rate": 9.848139414859441e-06, "loss": 0.4741, "step": 456 }, { "epoch": 0.5131948343627176, "grad_norm": 0.4479183256626129, "learning_rate": 9.846536466245703e-06, "loss": 0.4841, "step": 457 }, { "epoch": 0.5143177989893318, "grad_norm": 0.4019947350025177, "learning_rate": 9.844925233945993e-06, "loss": 0.4705, "step": 458 }, { "epoch": 0.5154407636159462, "grad_norm": 0.41949713230133057, "learning_rate": 9.843305720714227e-06, "loss": 0.4653, "step": 459 }, { "epoch": 0.5165637282425604, "grad_norm": 0.4161226749420166, "learning_rate": 9.84167792931847e-06, "loss": 0.4641, "step": 460 }, { "epoch": 0.5176866928691746, "grad_norm": 0.4596388638019562, "learning_rate": 9.840041862540936e-06, "loss": 0.5037, "step": 461 }, { "epoch": 0.5188096574957889, "grad_norm": 0.3896803855895996, "learning_rate": 9.838397523177993e-06, "loss": 0.4659, "step": 462 }, { "epoch": 0.5199326221224031, "grad_norm": 0.47495409846305847, "learning_rate": 9.836744914040135e-06, "loss": 0.5005, "step": 463 }, { "epoch": 0.5210555867490174, "grad_norm": 0.4093184769153595, "learning_rate": 9.835084037951999e-06, "loss": 0.4894, "step": 464 }, { "epoch": 0.5221785513756316, "grad_norm": 0.4255260229110718, "learning_rate": 9.833414897752346e-06, "loss": 0.4676, "step": 465 }, { "epoch": 0.523301516002246, "grad_norm": 0.42729464173316956, "learning_rate": 9.831737496294072e-06, "loss": 0.4673, "step": 466 }, { "epoch": 0.5244244806288602, "grad_norm": 0.4746253192424774, "learning_rate": 9.830051836444184e-06, "loss": 0.4812, "step": 467 }, { "epoch": 0.5255474452554745, "grad_norm": 0.5302431583404541, "learning_rate": 9.828357921083803e-06, "loss": 0.502, "step": 468 }, { "epoch": 0.5266704098820887, "grad_norm": 0.4195486903190613, "learning_rate": 9.826655753108168e-06, "loss": 0.4645, "step": 469 }, { "epoch": 0.527793374508703, "grad_norm": 0.554396390914917, "learning_rate": 9.82494533542662e-06, "loss": 0.4893, "step": 470 }, { "epoch": 0.5289163391353172, "grad_norm": 0.5582112669944763, "learning_rate": 9.823226670962598e-06, "loss": 0.4954, "step": 471 }, { "epoch": 0.5300393037619315, "grad_norm": 0.5307537913322449, "learning_rate": 9.821499762653639e-06, "loss": 0.4803, "step": 472 }, { "epoch": 0.5311622683885457, "grad_norm": 0.4711509943008423, "learning_rate": 9.81976461345137e-06, "loss": 0.4605, "step": 473 }, { "epoch": 0.5322852330151601, "grad_norm": 0.45335400104522705, "learning_rate": 9.818021226321502e-06, "loss": 0.4901, "step": 474 }, { "epoch": 0.5334081976417743, "grad_norm": 0.4417211413383484, "learning_rate": 9.816269604243828e-06, "loss": 0.4441, "step": 475 }, { "epoch": 0.5345311622683886, "grad_norm": 0.4154834747314453, "learning_rate": 9.814509750212215e-06, "loss": 0.4992, "step": 476 }, { "epoch": 0.5356541268950028, "grad_norm": 0.4024500846862793, "learning_rate": 9.812741667234599e-06, "loss": 0.4946, "step": 477 }, { "epoch": 0.5367770915216171, "grad_norm": 0.40121862292289734, "learning_rate": 9.810965358332986e-06, "loss": 0.4575, "step": 478 }, { "epoch": 0.5379000561482313, "grad_norm": 0.4214293360710144, "learning_rate": 9.809180826543435e-06, "loss": 0.4664, "step": 479 }, { "epoch": 0.5390230207748455, "grad_norm": 0.41472771763801575, "learning_rate": 9.807388074916064e-06, "loss": 0.4826, "step": 480 }, { "epoch": 0.5401459854014599, "grad_norm": 0.43654873967170715, "learning_rate": 9.805587106515036e-06, "loss": 0.4683, "step": 481 }, { "epoch": 0.5412689500280741, "grad_norm": 0.42597058415412903, "learning_rate": 9.803777924418565e-06, "loss": 0.49, "step": 482 }, { "epoch": 0.5423919146546884, "grad_norm": 0.39001351594924927, "learning_rate": 9.801960531718898e-06, "loss": 0.4717, "step": 483 }, { "epoch": 0.5435148792813026, "grad_norm": 0.4286368489265442, "learning_rate": 9.800134931522316e-06, "loss": 0.4946, "step": 484 }, { "epoch": 0.5446378439079169, "grad_norm": 0.44233402609825134, "learning_rate": 9.79830112694913e-06, "loss": 0.4614, "step": 485 }, { "epoch": 0.5457608085345311, "grad_norm": 0.4762893319129944, "learning_rate": 9.796459121133675e-06, "loss": 0.491, "step": 486 }, { "epoch": 0.5468837731611454, "grad_norm": 0.5094893574714661, "learning_rate": 9.7946089172243e-06, "loss": 0.49, "step": 487 }, { "epoch": 0.5480067377877597, "grad_norm": 0.45378559827804565, "learning_rate": 9.79275051838337e-06, "loss": 0.471, "step": 488 }, { "epoch": 0.549129702414374, "grad_norm": 0.5100117325782776, "learning_rate": 9.790883927787254e-06, "loss": 0.4634, "step": 489 }, { "epoch": 0.5502526670409882, "grad_norm": 0.4314429461956024, "learning_rate": 9.789009148626324e-06, "loss": 0.4508, "step": 490 }, { "epoch": 0.5513756316676025, "grad_norm": 0.481952041387558, "learning_rate": 9.787126184104943e-06, "loss": 0.4521, "step": 491 }, { "epoch": 0.5524985962942167, "grad_norm": 0.4315899908542633, "learning_rate": 9.785235037441473e-06, "loss": 0.4774, "step": 492 }, { "epoch": 0.553621560920831, "grad_norm": 0.45052534341812134, "learning_rate": 9.783335711868258e-06, "loss": 0.4815, "step": 493 }, { "epoch": 0.5547445255474452, "grad_norm": 0.4916282296180725, "learning_rate": 9.781428210631614e-06, "loss": 0.4743, "step": 494 }, { "epoch": 0.5558674901740596, "grad_norm": 0.4346478283405304, "learning_rate": 9.779512536991839e-06, "loss": 0.4849, "step": 495 }, { "epoch": 0.5569904548006738, "grad_norm": 0.45673269033432007, "learning_rate": 9.7775886942232e-06, "loss": 0.5058, "step": 496 }, { "epoch": 0.5581134194272881, "grad_norm": 0.4402593970298767, "learning_rate": 9.775656685613917e-06, "loss": 0.4691, "step": 497 }, { "epoch": 0.5592363840539023, "grad_norm": 0.45581209659576416, "learning_rate": 9.773716514466179e-06, "loss": 0.4756, "step": 498 }, { "epoch": 0.5603593486805165, "grad_norm": 0.5606263279914856, "learning_rate": 9.771768184096117e-06, "loss": 0.5196, "step": 499 }, { "epoch": 0.5614823133071308, "grad_norm": 0.448047399520874, "learning_rate": 9.769811697833815e-06, "loss": 0.4675, "step": 500 }, { "epoch": 0.562605277933745, "grad_norm": 0.5028826594352722, "learning_rate": 9.767847059023292e-06, "loss": 0.4577, "step": 501 }, { "epoch": 0.5637282425603594, "grad_norm": 0.5141559839248657, "learning_rate": 9.765874271022503e-06, "loss": 0.4768, "step": 502 }, { "epoch": 0.5648512071869736, "grad_norm": 0.4652795195579529, "learning_rate": 9.76389333720333e-06, "loss": 0.4783, "step": 503 }, { "epoch": 0.5659741718135879, "grad_norm": 0.49386098980903625, "learning_rate": 9.761904260951583e-06, "loss": 0.4482, "step": 504 }, { "epoch": 0.5670971364402021, "grad_norm": 0.4713054895401001, "learning_rate": 9.75990704566698e-06, "loss": 0.4592, "step": 505 }, { "epoch": 0.5682201010668164, "grad_norm": 0.3959769010543823, "learning_rate": 9.75790169476316e-06, "loss": 0.4668, "step": 506 }, { "epoch": 0.5693430656934306, "grad_norm": 0.4947628974914551, "learning_rate": 9.755888211667663e-06, "loss": 0.4728, "step": 507 }, { "epoch": 0.570466030320045, "grad_norm": 0.4038761556148529, "learning_rate": 9.753866599821926e-06, "loss": 0.4691, "step": 508 }, { "epoch": 0.5715889949466592, "grad_norm": 0.4167007803916931, "learning_rate": 9.751836862681283e-06, "loss": 0.4716, "step": 509 }, { "epoch": 0.5727119595732735, "grad_norm": 0.4152282774448395, "learning_rate": 9.749799003714954e-06, "loss": 0.4787, "step": 510 }, { "epoch": 0.5738349241998877, "grad_norm": 0.43793606758117676, "learning_rate": 9.747753026406044e-06, "loss": 0.4927, "step": 511 }, { "epoch": 0.574957888826502, "grad_norm": 0.4520910978317261, "learning_rate": 9.745698934251528e-06, "loss": 0.4574, "step": 512 }, { "epoch": 0.5760808534531162, "grad_norm": 0.4276700019836426, "learning_rate": 9.743636730762259e-06, "loss": 0.4763, "step": 513 }, { "epoch": 0.5772038180797305, "grad_norm": 0.4442545175552368, "learning_rate": 9.741566419462942e-06, "loss": 0.4855, "step": 514 }, { "epoch": 0.5783267827063447, "grad_norm": 0.4595250189304352, "learning_rate": 9.739488003892155e-06, "loss": 0.4835, "step": 515 }, { "epoch": 0.5794497473329591, "grad_norm": 0.4206981956958771, "learning_rate": 9.737401487602314e-06, "loss": 0.4621, "step": 516 }, { "epoch": 0.5805727119595733, "grad_norm": 0.430006206035614, "learning_rate": 9.735306874159689e-06, "loss": 0.4836, "step": 517 }, { "epoch": 0.5816956765861875, "grad_norm": 0.4312647879123688, "learning_rate": 9.733204167144388e-06, "loss": 0.4406, "step": 518 }, { "epoch": 0.5828186412128018, "grad_norm": 0.4748629927635193, "learning_rate": 9.731093370150349e-06, "loss": 0.4563, "step": 519 }, { "epoch": 0.583941605839416, "grad_norm": 0.45944279432296753, "learning_rate": 9.728974486785342e-06, "loss": 0.471, "step": 520 }, { "epoch": 0.5850645704660303, "grad_norm": 0.48992931842803955, "learning_rate": 9.726847520670956e-06, "loss": 0.4861, "step": 521 }, { "epoch": 0.5861875350926445, "grad_norm": 0.47786641120910645, "learning_rate": 9.724712475442597e-06, "loss": 0.4729, "step": 522 }, { "epoch": 0.5873104997192589, "grad_norm": 0.5171506404876709, "learning_rate": 9.722569354749475e-06, "loss": 0.4759, "step": 523 }, { "epoch": 0.5884334643458731, "grad_norm": 0.543573796749115, "learning_rate": 9.720418162254604e-06, "loss": 0.4744, "step": 524 }, { "epoch": 0.5895564289724874, "grad_norm": 0.45456403493881226, "learning_rate": 9.718258901634802e-06, "loss": 0.4577, "step": 525 }, { "epoch": 0.5906793935991016, "grad_norm": 0.6439238786697388, "learning_rate": 9.716091576580666e-06, "loss": 0.4714, "step": 526 }, { "epoch": 0.5918023582257159, "grad_norm": 0.4170093536376953, "learning_rate": 9.713916190796584e-06, "loss": 0.4526, "step": 527 }, { "epoch": 0.5929253228523301, "grad_norm": 0.4833886921405792, "learning_rate": 9.71173274800072e-06, "loss": 0.454, "step": 528 }, { "epoch": 0.5940482874789444, "grad_norm": 0.43355032801628113, "learning_rate": 9.709541251925003e-06, "loss": 0.4747, "step": 529 }, { "epoch": 0.5951712521055587, "grad_norm": 0.4202955961227417, "learning_rate": 9.707341706315138e-06, "loss": 0.4942, "step": 530 }, { "epoch": 0.596294216732173, "grad_norm": 0.5362517833709717, "learning_rate": 9.70513411493058e-06, "loss": 0.4535, "step": 531 }, { "epoch": 0.5974171813587872, "grad_norm": 0.40542495250701904, "learning_rate": 9.702918481544533e-06, "loss": 0.4558, "step": 532 }, { "epoch": 0.5985401459854015, "grad_norm": 0.44505777955055237, "learning_rate": 9.700694809943957e-06, "loss": 0.4732, "step": 533 }, { "epoch": 0.5996631106120157, "grad_norm": 0.5400472283363342, "learning_rate": 9.698463103929542e-06, "loss": 0.4872, "step": 534 }, { "epoch": 0.60078607523863, "grad_norm": 0.4192606508731842, "learning_rate": 9.696223367315716e-06, "loss": 0.4432, "step": 535 }, { "epoch": 0.6019090398652442, "grad_norm": 0.548052191734314, "learning_rate": 9.693975603930628e-06, "loss": 0.4863, "step": 536 }, { "epoch": 0.6030320044918585, "grad_norm": 0.49575239419937134, "learning_rate": 9.691719817616148e-06, "loss": 0.4925, "step": 537 }, { "epoch": 0.6041549691184728, "grad_norm": 0.5150744915008545, "learning_rate": 9.689456012227863e-06, "loss": 0.4731, "step": 538 }, { "epoch": 0.605277933745087, "grad_norm": 0.48065418004989624, "learning_rate": 9.687184191635057e-06, "loss": 0.4571, "step": 539 }, { "epoch": 0.6064008983717013, "grad_norm": 0.5033710598945618, "learning_rate": 9.684904359720724e-06, "loss": 0.4538, "step": 540 }, { "epoch": 0.6075238629983155, "grad_norm": 0.5239039659500122, "learning_rate": 9.682616520381545e-06, "loss": 0.4711, "step": 541 }, { "epoch": 0.6086468276249298, "grad_norm": 0.5570626258850098, "learning_rate": 9.680320677527886e-06, "loss": 0.4942, "step": 542 }, { "epoch": 0.609769792251544, "grad_norm": 0.5582017302513123, "learning_rate": 9.678016835083798e-06, "loss": 0.4606, "step": 543 }, { "epoch": 0.6108927568781584, "grad_norm": 0.5023290514945984, "learning_rate": 9.675704996986999e-06, "loss": 0.4713, "step": 544 }, { "epoch": 0.6120157215047726, "grad_norm": 0.5260736346244812, "learning_rate": 9.673385167188878e-06, "loss": 0.4653, "step": 545 }, { "epoch": 0.6131386861313869, "grad_norm": 0.4239555597305298, "learning_rate": 9.671057349654481e-06, "loss": 0.4714, "step": 546 }, { "epoch": 0.6142616507580011, "grad_norm": 0.5315463542938232, "learning_rate": 9.668721548362505e-06, "loss": 0.468, "step": 547 }, { "epoch": 0.6153846153846154, "grad_norm": 0.46136072278022766, "learning_rate": 9.666377767305297e-06, "loss": 0.4748, "step": 548 }, { "epoch": 0.6165075800112296, "grad_norm": 0.4815479516983032, "learning_rate": 9.66402601048884e-06, "loss": 0.4858, "step": 549 }, { "epoch": 0.617630544637844, "grad_norm": 0.45313891768455505, "learning_rate": 9.661666281932751e-06, "loss": 0.4499, "step": 550 }, { "epoch": 0.6187535092644582, "grad_norm": 0.4435279369354248, "learning_rate": 9.659298585670268e-06, "loss": 0.4727, "step": 551 }, { "epoch": 0.6198764738910725, "grad_norm": 0.44515541195869446, "learning_rate": 9.656922925748254e-06, "loss": 0.4636, "step": 552 }, { "epoch": 0.6209994385176867, "grad_norm": 0.47983822226524353, "learning_rate": 9.654539306227178e-06, "loss": 0.4662, "step": 553 }, { "epoch": 0.622122403144301, "grad_norm": 0.4637977182865143, "learning_rate": 9.652147731181116e-06, "loss": 0.4746, "step": 554 }, { "epoch": 0.6232453677709152, "grad_norm": 0.45143234729766846, "learning_rate": 9.649748204697741e-06, "loss": 0.4816, "step": 555 }, { "epoch": 0.6243683323975294, "grad_norm": 0.48032164573669434, "learning_rate": 9.647340730878316e-06, "loss": 0.4633, "step": 556 }, { "epoch": 0.6254912970241437, "grad_norm": 0.4569777846336365, "learning_rate": 9.644925313837689e-06, "loss": 0.4667, "step": 557 }, { "epoch": 0.626614261650758, "grad_norm": 0.44239336252212524, "learning_rate": 9.642501957704287e-06, "loss": 0.4758, "step": 558 }, { "epoch": 0.6277372262773723, "grad_norm": 0.5049344897270203, "learning_rate": 9.640070666620095e-06, "loss": 0.4473, "step": 559 }, { "epoch": 0.6288601909039865, "grad_norm": 0.419763445854187, "learning_rate": 9.637631444740679e-06, "loss": 0.4685, "step": 560 }, { "epoch": 0.6299831555306008, "grad_norm": 0.49944445490837097, "learning_rate": 9.63518429623514e-06, "loss": 0.4793, "step": 561 }, { "epoch": 0.631106120157215, "grad_norm": 0.5042561292648315, "learning_rate": 9.632729225286144e-06, "loss": 0.4991, "step": 562 }, { "epoch": 0.6322290847838293, "grad_norm": 0.45684516429901123, "learning_rate": 9.630266236089889e-06, "loss": 0.479, "step": 563 }, { "epoch": 0.6333520494104435, "grad_norm": 0.4554424285888672, "learning_rate": 9.627795332856107e-06, "loss": 0.4601, "step": 564 }, { "epoch": 0.6344750140370579, "grad_norm": 0.49570131301879883, "learning_rate": 9.62531651980806e-06, "loss": 0.4707, "step": 565 }, { "epoch": 0.6355979786636721, "grad_norm": 0.46659091114997864, "learning_rate": 9.62282980118253e-06, "loss": 0.4538, "step": 566 }, { "epoch": 0.6367209432902864, "grad_norm": 0.4619212746620178, "learning_rate": 9.620335181229805e-06, "loss": 0.4556, "step": 567 }, { "epoch": 0.6378439079169006, "grad_norm": 0.49733150005340576, "learning_rate": 9.617832664213686e-06, "loss": 0.4833, "step": 568 }, { "epoch": 0.6389668725435149, "grad_norm": 0.5193385481834412, "learning_rate": 9.615322254411468e-06, "loss": 0.4696, "step": 569 }, { "epoch": 0.6400898371701291, "grad_norm": 0.4174236059188843, "learning_rate": 9.612803956113932e-06, "loss": 0.4533, "step": 570 }, { "epoch": 0.6412128017967434, "grad_norm": 0.48638105392456055, "learning_rate": 9.610277773625349e-06, "loss": 0.4362, "step": 571 }, { "epoch": 0.6423357664233577, "grad_norm": 0.3890558183193207, "learning_rate": 9.607743711263463e-06, "loss": 0.4759, "step": 572 }, { "epoch": 0.643458731049972, "grad_norm": 0.5165433883666992, "learning_rate": 9.605201773359485e-06, "loss": 0.4435, "step": 573 }, { "epoch": 0.6445816956765862, "grad_norm": 0.39229169487953186, "learning_rate": 9.602651964258087e-06, "loss": 0.474, "step": 574 }, { "epoch": 0.6457046603032004, "grad_norm": 0.4285760521888733, "learning_rate": 9.600094288317398e-06, "loss": 0.471, "step": 575 }, { "epoch": 0.6468276249298147, "grad_norm": 0.482292115688324, "learning_rate": 9.59752874990899e-06, "loss": 0.4645, "step": 576 }, { "epoch": 0.6479505895564289, "grad_norm": 0.4191385805606842, "learning_rate": 9.594955353417869e-06, "loss": 0.4752, "step": 577 }, { "epoch": 0.6490735541830432, "grad_norm": 0.4393656849861145, "learning_rate": 9.592374103242481e-06, "loss": 0.4717, "step": 578 }, { "epoch": 0.6501965188096575, "grad_norm": 0.4367274343967438, "learning_rate": 9.589785003794692e-06, "loss": 0.4737, "step": 579 }, { "epoch": 0.6513194834362718, "grad_norm": 0.372048556804657, "learning_rate": 9.58718805949978e-06, "loss": 0.4794, "step": 580 }, { "epoch": 0.652442448062886, "grad_norm": 0.404815673828125, "learning_rate": 9.584583274796438e-06, "loss": 0.4576, "step": 581 }, { "epoch": 0.6535654126895003, "grad_norm": 0.44855591654777527, "learning_rate": 9.581970654136752e-06, "loss": 0.4885, "step": 582 }, { "epoch": 0.6546883773161145, "grad_norm": 0.3628772497177124, "learning_rate": 9.57935020198621e-06, "loss": 0.4537, "step": 583 }, { "epoch": 0.6558113419427288, "grad_norm": 0.41862308979034424, "learning_rate": 9.576721922823678e-06, "loss": 0.4814, "step": 584 }, { "epoch": 0.656934306569343, "grad_norm": 0.4539453089237213, "learning_rate": 9.574085821141406e-06, "loss": 0.4493, "step": 585 }, { "epoch": 0.6580572711959574, "grad_norm": 0.4036634862422943, "learning_rate": 9.571441901445009e-06, "loss": 0.4639, "step": 586 }, { "epoch": 0.6591802358225716, "grad_norm": 0.44412878155708313, "learning_rate": 9.568790168253468e-06, "loss": 0.4537, "step": 587 }, { "epoch": 0.6603032004491859, "grad_norm": 0.4187055230140686, "learning_rate": 9.566130626099118e-06, "loss": 0.4939, "step": 588 }, { "epoch": 0.6614261650758001, "grad_norm": 0.46620312333106995, "learning_rate": 9.563463279527638e-06, "loss": 0.4562, "step": 589 }, { "epoch": 0.6625491297024144, "grad_norm": 0.43926718831062317, "learning_rate": 9.560788133098052e-06, "loss": 0.4763, "step": 590 }, { "epoch": 0.6636720943290286, "grad_norm": 0.41630467772483826, "learning_rate": 9.55810519138271e-06, "loss": 0.4696, "step": 591 }, { "epoch": 0.664795058955643, "grad_norm": 0.45322662591934204, "learning_rate": 9.555414458967291e-06, "loss": 0.467, "step": 592 }, { "epoch": 0.6659180235822572, "grad_norm": 0.4815731346607208, "learning_rate": 9.552715940450785e-06, "loss": 0.4663, "step": 593 }, { "epoch": 0.6670409882088714, "grad_norm": 0.48741331696510315, "learning_rate": 9.550009640445492e-06, "loss": 0.4915, "step": 594 }, { "epoch": 0.6681639528354857, "grad_norm": 0.45795246958732605, "learning_rate": 9.547295563577013e-06, "loss": 0.4571, "step": 595 }, { "epoch": 0.6692869174620999, "grad_norm": 0.49801135063171387, "learning_rate": 9.544573714484239e-06, "loss": 0.4703, "step": 596 }, { "epoch": 0.6704098820887142, "grad_norm": 0.41725873947143555, "learning_rate": 9.541844097819347e-06, "loss": 0.467, "step": 597 }, { "epoch": 0.6715328467153284, "grad_norm": 0.41955646872520447, "learning_rate": 9.539106718247791e-06, "loss": 0.4672, "step": 598 }, { "epoch": 0.6726558113419427, "grad_norm": 0.420707106590271, "learning_rate": 9.53636158044829e-06, "loss": 0.4752, "step": 599 }, { "epoch": 0.673778775968557, "grad_norm": 0.3712999224662781, "learning_rate": 9.533608689112827e-06, "loss": 0.4517, "step": 600 }, { "epoch": 0.6749017405951713, "grad_norm": 0.39783045649528503, "learning_rate": 9.530848048946637e-06, "loss": 0.4831, "step": 601 }, { "epoch": 0.6760247052217855, "grad_norm": 0.44775089621543884, "learning_rate": 9.528079664668197e-06, "loss": 0.4887, "step": 602 }, { "epoch": 0.6771476698483998, "grad_norm": 0.39643368124961853, "learning_rate": 9.525303541009218e-06, "loss": 0.4868, "step": 603 }, { "epoch": 0.678270634475014, "grad_norm": 0.44490957260131836, "learning_rate": 9.522519682714648e-06, "loss": 0.4599, "step": 604 }, { "epoch": 0.6793935991016283, "grad_norm": 0.4353307783603668, "learning_rate": 9.519728094542646e-06, "loss": 0.4948, "step": 605 }, { "epoch": 0.6805165637282425, "grad_norm": 0.43816569447517395, "learning_rate": 9.516928781264588e-06, "loss": 0.4967, "step": 606 }, { "epoch": 0.6816395283548569, "grad_norm": 0.3810655474662781, "learning_rate": 9.51412174766505e-06, "loss": 0.4657, "step": 607 }, { "epoch": 0.6827624929814711, "grad_norm": 0.3616602420806885, "learning_rate": 9.51130699854181e-06, "loss": 0.4831, "step": 608 }, { "epoch": 0.6838854576080854, "grad_norm": 0.39401665329933167, "learning_rate": 9.508484538705823e-06, "loss": 0.4598, "step": 609 }, { "epoch": 0.6850084222346996, "grad_norm": 0.39725714921951294, "learning_rate": 9.505654372981233e-06, "loss": 0.4608, "step": 610 }, { "epoch": 0.6861313868613139, "grad_norm": 0.4143022298812866, "learning_rate": 9.50281650620535e-06, "loss": 0.4804, "step": 611 }, { "epoch": 0.6872543514879281, "grad_norm": 0.393466055393219, "learning_rate": 9.499970943228646e-06, "loss": 0.4936, "step": 612 }, { "epoch": 0.6883773161145423, "grad_norm": 0.3683865964412689, "learning_rate": 9.497117688914753e-06, "loss": 0.4625, "step": 613 }, { "epoch": 0.6895002807411567, "grad_norm": 0.36151397228240967, "learning_rate": 9.494256748140442e-06, "loss": 0.454, "step": 614 }, { "epoch": 0.6906232453677709, "grad_norm": 0.3920838534832001, "learning_rate": 9.491388125795623e-06, "loss": 0.4792, "step": 615 }, { "epoch": 0.6917462099943852, "grad_norm": 0.37149983644485474, "learning_rate": 9.488511826783341e-06, "loss": 0.4739, "step": 616 }, { "epoch": 0.6928691746209994, "grad_norm": 0.46957194805145264, "learning_rate": 9.485627856019757e-06, "loss": 0.4806, "step": 617 }, { "epoch": 0.6939921392476137, "grad_norm": 0.39553332328796387, "learning_rate": 9.482736218434144e-06, "loss": 0.493, "step": 618 }, { "epoch": 0.6951151038742279, "grad_norm": 0.37923356890678406, "learning_rate": 9.47983691896888e-06, "loss": 0.4612, "step": 619 }, { "epoch": 0.6962380685008422, "grad_norm": 0.38691797852516174, "learning_rate": 9.476929962579439e-06, "loss": 0.4521, "step": 620 }, { "epoch": 0.6973610331274565, "grad_norm": 0.455996036529541, "learning_rate": 9.474015354234385e-06, "loss": 0.4779, "step": 621 }, { "epoch": 0.6984839977540708, "grad_norm": 0.4309120178222656, "learning_rate": 9.471093098915356e-06, "loss": 0.477, "step": 622 }, { "epoch": 0.699606962380685, "grad_norm": 0.4526803493499756, "learning_rate": 9.468163201617063e-06, "loss": 0.4639, "step": 623 }, { "epoch": 0.7007299270072993, "grad_norm": 0.46890005469322205, "learning_rate": 9.465225667347275e-06, "loss": 0.4571, "step": 624 }, { "epoch": 0.7018528916339135, "grad_norm": 0.37757250666618347, "learning_rate": 9.462280501126822e-06, "loss": 0.4675, "step": 625 }, { "epoch": 0.7029758562605278, "grad_norm": 0.43785524368286133, "learning_rate": 9.459327707989572e-06, "loss": 0.4703, "step": 626 }, { "epoch": 0.704098820887142, "grad_norm": 0.4286290407180786, "learning_rate": 9.45636729298243e-06, "loss": 0.4514, "step": 627 }, { "epoch": 0.7052217855137564, "grad_norm": 0.43789517879486084, "learning_rate": 9.453399261165325e-06, "loss": 0.456, "step": 628 }, { "epoch": 0.7063447501403706, "grad_norm": 0.40469712018966675, "learning_rate": 9.450423617611214e-06, "loss": 0.4716, "step": 629 }, { "epoch": 0.7074677147669849, "grad_norm": 0.4312828481197357, "learning_rate": 9.447440367406053e-06, "loss": 0.4569, "step": 630 }, { "epoch": 0.7085906793935991, "grad_norm": 0.4970250725746155, "learning_rate": 9.444449515648811e-06, "loss": 0.4588, "step": 631 }, { "epoch": 0.7097136440202133, "grad_norm": 0.44374823570251465, "learning_rate": 9.441451067451438e-06, "loss": 0.4861, "step": 632 }, { "epoch": 0.7108366086468276, "grad_norm": 0.49838149547576904, "learning_rate": 9.438445027938873e-06, "loss": 0.4639, "step": 633 }, { "epoch": 0.7119595732734418, "grad_norm": 0.49401262402534485, "learning_rate": 9.435431402249032e-06, "loss": 0.4781, "step": 634 }, { "epoch": 0.7130825379000562, "grad_norm": 0.44120165705680847, "learning_rate": 9.432410195532796e-06, "loss": 0.4757, "step": 635 }, { "epoch": 0.7142055025266704, "grad_norm": 0.502472460269928, "learning_rate": 9.429381412954e-06, "loss": 0.4885, "step": 636 }, { "epoch": 0.7153284671532847, "grad_norm": 0.42427709698677063, "learning_rate": 9.42634505968943e-06, "loss": 0.4727, "step": 637 }, { "epoch": 0.7164514317798989, "grad_norm": 0.4140471816062927, "learning_rate": 9.423301140928816e-06, "loss": 0.482, "step": 638 }, { "epoch": 0.7175743964065132, "grad_norm": 0.4578076899051666, "learning_rate": 9.420249661874812e-06, "loss": 0.4735, "step": 639 }, { "epoch": 0.7186973610331274, "grad_norm": 0.4082004725933075, "learning_rate": 9.417190627742998e-06, "loss": 0.4729, "step": 640 }, { "epoch": 0.7198203256597417, "grad_norm": 0.42905890941619873, "learning_rate": 9.414124043761865e-06, "loss": 0.4423, "step": 641 }, { "epoch": 0.720943290286356, "grad_norm": 0.4341091811656952, "learning_rate": 9.41104991517281e-06, "loss": 0.4802, "step": 642 }, { "epoch": 0.7220662549129703, "grad_norm": 0.394256591796875, "learning_rate": 9.407968247230126e-06, "loss": 0.4507, "step": 643 }, { "epoch": 0.7231892195395845, "grad_norm": 0.3790942132472992, "learning_rate": 9.404879045200991e-06, "loss": 0.4708, "step": 644 }, { "epoch": 0.7243121841661988, "grad_norm": 0.4054070711135864, "learning_rate": 9.401782314365458e-06, "loss": 0.463, "step": 645 }, { "epoch": 0.725435148792813, "grad_norm": 0.377355694770813, "learning_rate": 9.39867806001645e-06, "loss": 0.4769, "step": 646 }, { "epoch": 0.7265581134194273, "grad_norm": 0.3509123623371124, "learning_rate": 9.395566287459752e-06, "loss": 0.4496, "step": 647 }, { "epoch": 0.7276810780460415, "grad_norm": 0.3638748228549957, "learning_rate": 9.392447002013996e-06, "loss": 0.4747, "step": 648 }, { "epoch": 0.7288040426726559, "grad_norm": 0.4374319612979889, "learning_rate": 9.389320209010653e-06, "loss": 0.4562, "step": 649 }, { "epoch": 0.7299270072992701, "grad_norm": 0.41521188616752625, "learning_rate": 9.38618591379403e-06, "loss": 0.456, "step": 650 }, { "epoch": 0.7310499719258844, "grad_norm": 0.3689146041870117, "learning_rate": 9.383044121721257e-06, "loss": 0.4465, "step": 651 }, { "epoch": 0.7321729365524986, "grad_norm": 0.39518049359321594, "learning_rate": 9.379894838162273e-06, "loss": 0.4542, "step": 652 }, { "epoch": 0.7332959011791128, "grad_norm": 0.40690284967422485, "learning_rate": 9.376738068499827e-06, "loss": 0.4736, "step": 653 }, { "epoch": 0.7344188658057271, "grad_norm": 0.3676255941390991, "learning_rate": 9.37357381812946e-06, "loss": 0.4418, "step": 654 }, { "epoch": 0.7355418304323413, "grad_norm": 0.4409908354282379, "learning_rate": 9.370402092459496e-06, "loss": 0.4519, "step": 655 }, { "epoch": 0.7366647950589557, "grad_norm": 0.3692103326320648, "learning_rate": 9.367222896911044e-06, "loss": 0.4549, "step": 656 }, { "epoch": 0.7377877596855699, "grad_norm": 0.4028908312320709, "learning_rate": 9.364036236917972e-06, "loss": 0.456, "step": 657 }, { "epoch": 0.7389107243121842, "grad_norm": 0.4350546896457672, "learning_rate": 9.360842117926912e-06, "loss": 0.4764, "step": 658 }, { "epoch": 0.7400336889387984, "grad_norm": 0.38644105195999146, "learning_rate": 9.357640545397242e-06, "loss": 0.4782, "step": 659 }, { "epoch": 0.7411566535654127, "grad_norm": 0.3976083993911743, "learning_rate": 9.354431524801082e-06, "loss": 0.4742, "step": 660 }, { "epoch": 0.7422796181920269, "grad_norm": 0.40654605627059937, "learning_rate": 9.351215061623277e-06, "loss": 0.481, "step": 661 }, { "epoch": 0.7434025828186412, "grad_norm": 0.37744253873825073, "learning_rate": 9.347991161361402e-06, "loss": 0.452, "step": 662 }, { "epoch": 0.7445255474452555, "grad_norm": 0.43675747513771057, "learning_rate": 9.344759829525734e-06, "loss": 0.4652, "step": 663 }, { "epoch": 0.7456485120718698, "grad_norm": 0.3949107825756073, "learning_rate": 9.341521071639254e-06, "loss": 0.4958, "step": 664 }, { "epoch": 0.746771476698484, "grad_norm": 0.4594654142856598, "learning_rate": 9.338274893237641e-06, "loss": 0.4825, "step": 665 }, { "epoch": 0.7478944413250983, "grad_norm": 0.42419153451919556, "learning_rate": 9.335021299869256e-06, "loss": 0.471, "step": 666 }, { "epoch": 0.7490174059517125, "grad_norm": 0.5200890302658081, "learning_rate": 9.331760297095127e-06, "loss": 0.4926, "step": 667 }, { "epoch": 0.7501403705783268, "grad_norm": 0.4090116024017334, "learning_rate": 9.32849189048895e-06, "loss": 0.4581, "step": 668 }, { "epoch": 0.751263335204941, "grad_norm": 0.41746464371681213, "learning_rate": 9.32521608563708e-06, "loss": 0.4777, "step": 669 }, { "epoch": 0.7523862998315554, "grad_norm": 0.44327256083488464, "learning_rate": 9.321932888138508e-06, "loss": 0.4544, "step": 670 }, { "epoch": 0.7535092644581696, "grad_norm": 0.3926316499710083, "learning_rate": 9.31864230360487e-06, "loss": 0.4583, "step": 671 }, { "epoch": 0.7546322290847838, "grad_norm": 0.396666556596756, "learning_rate": 9.315344337660422e-06, "loss": 0.4517, "step": 672 }, { "epoch": 0.7557551937113981, "grad_norm": 0.48439329862594604, "learning_rate": 9.312038995942036e-06, "loss": 0.4681, "step": 673 }, { "epoch": 0.7568781583380123, "grad_norm": 0.44877126812934875, "learning_rate": 9.308726284099195e-06, "loss": 0.4475, "step": 674 }, { "epoch": 0.7580011229646266, "grad_norm": 0.44032377004623413, "learning_rate": 9.305406207793974e-06, "loss": 0.4452, "step": 675 }, { "epoch": 0.7591240875912408, "grad_norm": 0.5642189979553223, "learning_rate": 9.302078772701043e-06, "loss": 0.4963, "step": 676 }, { "epoch": 0.7602470522178552, "grad_norm": 0.4791392982006073, "learning_rate": 9.298743984507635e-06, "loss": 0.4559, "step": 677 }, { "epoch": 0.7613700168444694, "grad_norm": 0.5497483611106873, "learning_rate": 9.295401848913569e-06, "loss": 0.4838, "step": 678 }, { "epoch": 0.7624929814710837, "grad_norm": 0.4850218892097473, "learning_rate": 9.292052371631209e-06, "loss": 0.4682, "step": 679 }, { "epoch": 0.7636159460976979, "grad_norm": 0.5227738618850708, "learning_rate": 9.288695558385472e-06, "loss": 0.4606, "step": 680 }, { "epoch": 0.7647389107243122, "grad_norm": 0.5289008021354675, "learning_rate": 9.285331414913816e-06, "loss": 0.4888, "step": 681 }, { "epoch": 0.7658618753509264, "grad_norm": 0.39174798130989075, "learning_rate": 9.281959946966223e-06, "loss": 0.4839, "step": 682 }, { "epoch": 0.7669848399775407, "grad_norm": 0.4334009885787964, "learning_rate": 9.278581160305196e-06, "loss": 0.4363, "step": 683 }, { "epoch": 0.768107804604155, "grad_norm": 0.4722071588039398, "learning_rate": 9.275195060705749e-06, "loss": 0.4839, "step": 684 }, { "epoch": 0.7692307692307693, "grad_norm": 0.36978498101234436, "learning_rate": 9.27180165395539e-06, "loss": 0.4556, "step": 685 }, { "epoch": 0.7703537338573835, "grad_norm": 0.5473568439483643, "learning_rate": 9.268400945854124e-06, "loss": 0.4545, "step": 686 }, { "epoch": 0.7714766984839978, "grad_norm": 0.44910505414009094, "learning_rate": 9.264992942214427e-06, "loss": 0.4794, "step": 687 }, { "epoch": 0.772599663110612, "grad_norm": 0.3840465247631073, "learning_rate": 9.261577648861254e-06, "loss": 0.4738, "step": 688 }, { "epoch": 0.7737226277372263, "grad_norm": 0.46789997816085815, "learning_rate": 9.258155071632014e-06, "loss": 0.4458, "step": 689 }, { "epoch": 0.7748455923638405, "grad_norm": 0.4122277498245239, "learning_rate": 9.254725216376562e-06, "loss": 0.4451, "step": 690 }, { "epoch": 0.7759685569904547, "grad_norm": 0.38088029623031616, "learning_rate": 9.251288088957197e-06, "loss": 0.4576, "step": 691 }, { "epoch": 0.7770915216170691, "grad_norm": 0.5478758811950684, "learning_rate": 9.24784369524865e-06, "loss": 0.4804, "step": 692 }, { "epoch": 0.7782144862436833, "grad_norm": 0.4150387942790985, "learning_rate": 9.244392041138068e-06, "loss": 0.5041, "step": 693 }, { "epoch": 0.7793374508702976, "grad_norm": 0.4786434769630432, "learning_rate": 9.240933132525007e-06, "loss": 0.4437, "step": 694 }, { "epoch": 0.7804604154969118, "grad_norm": 0.38800302147865295, "learning_rate": 9.237466975321423e-06, "loss": 0.4756, "step": 695 }, { "epoch": 0.7815833801235261, "grad_norm": 0.39296966791152954, "learning_rate": 9.233993575451663e-06, "loss": 0.452, "step": 696 }, { "epoch": 0.7827063447501403, "grad_norm": 0.5156542658805847, "learning_rate": 9.230512938852452e-06, "loss": 0.4615, "step": 697 }, { "epoch": 0.7838293093767547, "grad_norm": 0.4073236882686615, "learning_rate": 9.227025071472884e-06, "loss": 0.455, "step": 698 }, { "epoch": 0.7849522740033689, "grad_norm": 0.4636685848236084, "learning_rate": 9.223529979274411e-06, "loss": 0.4993, "step": 699 }, { "epoch": 0.7860752386299832, "grad_norm": 0.4379482865333557, "learning_rate": 9.220027668230835e-06, "loss": 0.461, "step": 700 }, { "epoch": 0.7871982032565974, "grad_norm": 0.41735711693763733, "learning_rate": 9.216518144328295e-06, "loss": 0.4623, "step": 701 }, { "epoch": 0.7883211678832117, "grad_norm": 0.45664846897125244, "learning_rate": 9.213001413565259e-06, "loss": 0.4704, "step": 702 }, { "epoch": 0.7894441325098259, "grad_norm": 0.4406040906906128, "learning_rate": 9.209477481952514e-06, "loss": 0.4777, "step": 703 }, { "epoch": 0.7905670971364402, "grad_norm": 0.5026876330375671, "learning_rate": 9.205946355513154e-06, "loss": 0.4762, "step": 704 }, { "epoch": 0.7916900617630545, "grad_norm": 0.41113269329071045, "learning_rate": 9.202408040282567e-06, "loss": 0.4365, "step": 705 }, { "epoch": 0.7928130263896688, "grad_norm": 0.49789881706237793, "learning_rate": 9.198862542308433e-06, "loss": 0.439, "step": 706 }, { "epoch": 0.793935991016283, "grad_norm": 0.4446854889392853, "learning_rate": 9.19530986765071e-06, "loss": 0.4661, "step": 707 }, { "epoch": 0.7950589556428973, "grad_norm": 0.4721910357475281, "learning_rate": 9.191750022381613e-06, "loss": 0.4679, "step": 708 }, { "epoch": 0.7961819202695115, "grad_norm": 0.455388605594635, "learning_rate": 9.188183012585624e-06, "loss": 0.47, "step": 709 }, { "epoch": 0.7973048848961257, "grad_norm": 0.4587063491344452, "learning_rate": 9.184608844359461e-06, "loss": 0.4772, "step": 710 }, { "epoch": 0.79842784952274, "grad_norm": 0.4085964858531952, "learning_rate": 9.181027523812088e-06, "loss": 0.4493, "step": 711 }, { "epoch": 0.7995508141493542, "grad_norm": 0.45541882514953613, "learning_rate": 9.177439057064684e-06, "loss": 0.452, "step": 712 }, { "epoch": 0.8006737787759686, "grad_norm": 0.44390204548835754, "learning_rate": 9.173843450250644e-06, "loss": 0.4536, "step": 713 }, { "epoch": 0.8017967434025828, "grad_norm": 0.47058311104774475, "learning_rate": 9.170240709515573e-06, "loss": 0.4733, "step": 714 }, { "epoch": 0.8029197080291971, "grad_norm": 0.5059202909469604, "learning_rate": 9.166630841017262e-06, "loss": 0.4675, "step": 715 }, { "epoch": 0.8040426726558113, "grad_norm": 0.4196977913379669, "learning_rate": 9.163013850925688e-06, "loss": 0.4677, "step": 716 }, { "epoch": 0.8051656372824256, "grad_norm": 0.5431621074676514, "learning_rate": 9.159389745423003e-06, "loss": 0.4826, "step": 717 }, { "epoch": 0.8062886019090398, "grad_norm": 0.48227277398109436, "learning_rate": 9.155758530703512e-06, "loss": 0.4622, "step": 718 }, { "epoch": 0.8074115665356542, "grad_norm": 0.4410320520401001, "learning_rate": 9.152120212973681e-06, "loss": 0.4822, "step": 719 }, { "epoch": 0.8085345311622684, "grad_norm": 0.508107602596283, "learning_rate": 9.14847479845211e-06, "loss": 0.4474, "step": 720 }, { "epoch": 0.8096574957888827, "grad_norm": 0.42198458313941956, "learning_rate": 9.144822293369534e-06, "loss": 0.4865, "step": 721 }, { "epoch": 0.8107804604154969, "grad_norm": 0.4074525237083435, "learning_rate": 9.141162703968797e-06, "loss": 0.4653, "step": 722 }, { "epoch": 0.8119034250421112, "grad_norm": 0.531233012676239, "learning_rate": 9.137496036504868e-06, "loss": 0.4993, "step": 723 }, { "epoch": 0.8130263896687254, "grad_norm": 0.41961681842803955, "learning_rate": 9.133822297244794e-06, "loss": 0.4527, "step": 724 }, { "epoch": 0.8141493542953397, "grad_norm": 0.4582473337650299, "learning_rate": 9.130141492467728e-06, "loss": 0.4548, "step": 725 }, { "epoch": 0.815272318921954, "grad_norm": 0.585796594619751, "learning_rate": 9.126453628464889e-06, "loss": 0.46, "step": 726 }, { "epoch": 0.8163952835485683, "grad_norm": 0.36879703402519226, "learning_rate": 9.122758711539558e-06, "loss": 0.4533, "step": 727 }, { "epoch": 0.8175182481751825, "grad_norm": 0.6463472843170166, "learning_rate": 9.119056748007083e-06, "loss": 0.4854, "step": 728 }, { "epoch": 0.8186412128017967, "grad_norm": 0.4350607097148895, "learning_rate": 9.115347744194844e-06, "loss": 0.4627, "step": 729 }, { "epoch": 0.819764177428411, "grad_norm": 0.47857823967933655, "learning_rate": 9.111631706442264e-06, "loss": 0.4708, "step": 730 }, { "epoch": 0.8208871420550252, "grad_norm": 0.5093725323677063, "learning_rate": 9.107908641100782e-06, "loss": 0.4639, "step": 731 }, { "epoch": 0.8220101066816395, "grad_norm": 0.5324406027793884, "learning_rate": 9.10417855453385e-06, "loss": 0.4702, "step": 732 }, { "epoch": 0.8231330713082537, "grad_norm": 0.4593227803707123, "learning_rate": 9.10044145311692e-06, "loss": 0.4563, "step": 733 }, { "epoch": 0.8242560359348681, "grad_norm": 0.5262898206710815, "learning_rate": 9.096697343237434e-06, "loss": 0.4695, "step": 734 }, { "epoch": 0.8253790005614823, "grad_norm": 0.4905157685279846, "learning_rate": 9.09294623129482e-06, "loss": 0.4543, "step": 735 }, { "epoch": 0.8265019651880966, "grad_norm": 0.46078696846961975, "learning_rate": 9.089188123700461e-06, "loss": 0.4507, "step": 736 }, { "epoch": 0.8276249298147108, "grad_norm": 0.5077294707298279, "learning_rate": 9.085423026877706e-06, "loss": 0.4652, "step": 737 }, { "epoch": 0.8287478944413251, "grad_norm": 0.4637020528316498, "learning_rate": 9.081650947261847e-06, "loss": 0.4642, "step": 738 }, { "epoch": 0.8298708590679393, "grad_norm": 0.44405990839004517, "learning_rate": 9.077871891300113e-06, "loss": 0.436, "step": 739 }, { "epoch": 0.8309938236945537, "grad_norm": 0.457200825214386, "learning_rate": 9.074085865451652e-06, "loss": 0.4712, "step": 740 }, { "epoch": 0.8321167883211679, "grad_norm": 0.3929470181465149, "learning_rate": 9.070292876187532e-06, "loss": 0.4581, "step": 741 }, { "epoch": 0.8332397529477822, "grad_norm": 0.4121647775173187, "learning_rate": 9.066492929990717e-06, "loss": 0.4464, "step": 742 }, { "epoch": 0.8343627175743964, "grad_norm": 0.4228673279285431, "learning_rate": 9.062686033356065e-06, "loss": 0.4979, "step": 743 }, { "epoch": 0.8354856822010107, "grad_norm": 0.43014732003211975, "learning_rate": 9.058872192790314e-06, "loss": 0.4582, "step": 744 }, { "epoch": 0.8366086468276249, "grad_norm": 0.43572112917900085, "learning_rate": 9.055051414812065e-06, "loss": 0.4615, "step": 745 }, { "epoch": 0.8377316114542392, "grad_norm": 0.45476120710372925, "learning_rate": 9.051223705951784e-06, "loss": 0.4766, "step": 746 }, { "epoch": 0.8388545760808535, "grad_norm": 0.450595498085022, "learning_rate": 9.047389072751777e-06, "loss": 0.4826, "step": 747 }, { "epoch": 0.8399775407074677, "grad_norm": 0.4859873354434967, "learning_rate": 9.043547521766191e-06, "loss": 0.4394, "step": 748 }, { "epoch": 0.841100505334082, "grad_norm": 0.4938810169696808, "learning_rate": 9.039699059560992e-06, "loss": 0.4617, "step": 749 }, { "epoch": 0.8422234699606962, "grad_norm": 0.507832944393158, "learning_rate": 9.035843692713961e-06, "loss": 0.4549, "step": 750 }, { "epoch": 0.8433464345873105, "grad_norm": 0.43733423948287964, "learning_rate": 9.031981427814679e-06, "loss": 0.4558, "step": 751 }, { "epoch": 0.8444693992139247, "grad_norm": 0.46081140637397766, "learning_rate": 9.028112271464517e-06, "loss": 0.4711, "step": 752 }, { "epoch": 0.845592363840539, "grad_norm": 0.5297753214836121, "learning_rate": 9.02423623027663e-06, "loss": 0.4541, "step": 753 }, { "epoch": 0.8467153284671532, "grad_norm": 0.4310184717178345, "learning_rate": 9.02035331087593e-06, "loss": 0.4571, "step": 754 }, { "epoch": 0.8478382930937676, "grad_norm": 0.42384177446365356, "learning_rate": 9.016463519899097e-06, "loss": 0.4619, "step": 755 }, { "epoch": 0.8489612577203818, "grad_norm": 0.4305708706378937, "learning_rate": 9.012566863994548e-06, "loss": 0.4551, "step": 756 }, { "epoch": 0.8500842223469961, "grad_norm": 0.4533427655696869, "learning_rate": 9.008663349822435e-06, "loss": 0.4829, "step": 757 }, { "epoch": 0.8512071869736103, "grad_norm": 0.4272121489048004, "learning_rate": 9.004752984054636e-06, "loss": 0.4694, "step": 758 }, { "epoch": 0.8523301516002246, "grad_norm": 0.4167126417160034, "learning_rate": 9.000835773374733e-06, "loss": 0.4793, "step": 759 }, { "epoch": 0.8534531162268388, "grad_norm": 0.4358173906803131, "learning_rate": 8.996911724478014e-06, "loss": 0.464, "step": 760 }, { "epoch": 0.8545760808534532, "grad_norm": 0.39343827962875366, "learning_rate": 8.992980844071451e-06, "loss": 0.4557, "step": 761 }, { "epoch": 0.8556990454800674, "grad_norm": 0.48488444089889526, "learning_rate": 8.98904313887369e-06, "loss": 0.4536, "step": 762 }, { "epoch": 0.8568220101066817, "grad_norm": 0.403816819190979, "learning_rate": 8.985098615615051e-06, "loss": 0.4767, "step": 763 }, { "epoch": 0.8579449747332959, "grad_norm": 0.46225622296333313, "learning_rate": 8.981147281037498e-06, "loss": 0.4956, "step": 764 }, { "epoch": 0.8590679393599102, "grad_norm": 0.4198470115661621, "learning_rate": 8.977189141894645e-06, "loss": 0.4624, "step": 765 }, { "epoch": 0.8601909039865244, "grad_norm": 0.4320054054260254, "learning_rate": 8.973224204951725e-06, "loss": 0.4831, "step": 766 }, { "epoch": 0.8613138686131386, "grad_norm": 0.40329208970069885, "learning_rate": 8.969252476985599e-06, "loss": 0.4427, "step": 767 }, { "epoch": 0.862436833239753, "grad_norm": 0.44967222213745117, "learning_rate": 8.965273964784735e-06, "loss": 0.4708, "step": 768 }, { "epoch": 0.8635597978663672, "grad_norm": 0.42187657952308655, "learning_rate": 8.961288675149195e-06, "loss": 0.4594, "step": 769 }, { "epoch": 0.8646827624929815, "grad_norm": 0.3917265832424164, "learning_rate": 8.95729661489062e-06, "loss": 0.4443, "step": 770 }, { "epoch": 0.8658057271195957, "grad_norm": 0.5050042867660522, "learning_rate": 8.953297790832231e-06, "loss": 0.4678, "step": 771 }, { "epoch": 0.86692869174621, "grad_norm": 0.3722599446773529, "learning_rate": 8.949292209808808e-06, "loss": 0.4806, "step": 772 }, { "epoch": 0.8680516563728242, "grad_norm": 0.414469838142395, "learning_rate": 8.945279878666673e-06, "loss": 0.4551, "step": 773 }, { "epoch": 0.8691746209994385, "grad_norm": 0.4813458323478699, "learning_rate": 8.941260804263697e-06, "loss": 0.4733, "step": 774 }, { "epoch": 0.8702975856260527, "grad_norm": 0.44529399275779724, "learning_rate": 8.937234993469263e-06, "loss": 0.4921, "step": 775 }, { "epoch": 0.8714205502526671, "grad_norm": 0.46331220865249634, "learning_rate": 8.933202453164282e-06, "loss": 0.4739, "step": 776 }, { "epoch": 0.8725435148792813, "grad_norm": 0.4155137240886688, "learning_rate": 8.929163190241157e-06, "loss": 0.4564, "step": 777 }, { "epoch": 0.8736664795058956, "grad_norm": 0.46986615657806396, "learning_rate": 8.925117211603784e-06, "loss": 0.4536, "step": 778 }, { "epoch": 0.8747894441325098, "grad_norm": 0.46992066502571106, "learning_rate": 8.92106452416754e-06, "loss": 0.4506, "step": 779 }, { "epoch": 0.8759124087591241, "grad_norm": 0.4021821618080139, "learning_rate": 8.917005134859263e-06, "loss": 0.4445, "step": 780 }, { "epoch": 0.8770353733857383, "grad_norm": 0.4424058198928833, "learning_rate": 8.912939050617253e-06, "loss": 0.468, "step": 781 }, { "epoch": 0.8781583380123527, "grad_norm": 0.3984222412109375, "learning_rate": 8.908866278391246e-06, "loss": 0.4305, "step": 782 }, { "epoch": 0.8792813026389669, "grad_norm": 0.4618794918060303, "learning_rate": 8.904786825142416e-06, "loss": 0.4648, "step": 783 }, { "epoch": 0.8804042672655812, "grad_norm": 0.44797205924987793, "learning_rate": 8.900700697843348e-06, "loss": 0.4577, "step": 784 }, { "epoch": 0.8815272318921954, "grad_norm": 0.3909663259983063, "learning_rate": 8.896607903478043e-06, "loss": 0.4605, "step": 785 }, { "epoch": 0.8826501965188096, "grad_norm": 0.5488174557685852, "learning_rate": 8.892508449041893e-06, "loss": 0.4443, "step": 786 }, { "epoch": 0.8837731611454239, "grad_norm": 0.3824789524078369, "learning_rate": 8.88840234154167e-06, "loss": 0.469, "step": 787 }, { "epoch": 0.8848961257720381, "grad_norm": 0.4581597149372101, "learning_rate": 8.884289587995524e-06, "loss": 0.4607, "step": 788 }, { "epoch": 0.8860190903986525, "grad_norm": 0.4235505163669586, "learning_rate": 8.88017019543296e-06, "loss": 0.4547, "step": 789 }, { "epoch": 0.8871420550252667, "grad_norm": 0.37747853994369507, "learning_rate": 8.876044170894833e-06, "loss": 0.4646, "step": 790 }, { "epoch": 0.888265019651881, "grad_norm": 0.45242834091186523, "learning_rate": 8.871911521433332e-06, "loss": 0.489, "step": 791 }, { "epoch": 0.8893879842784952, "grad_norm": 0.4243738353252411, "learning_rate": 8.867772254111966e-06, "loss": 0.4701, "step": 792 }, { "epoch": 0.8905109489051095, "grad_norm": 0.3728593587875366, "learning_rate": 8.863626376005563e-06, "loss": 0.4411, "step": 793 }, { "epoch": 0.8916339135317237, "grad_norm": 0.387367308139801, "learning_rate": 8.859473894200246e-06, "loss": 0.4445, "step": 794 }, { "epoch": 0.892756878158338, "grad_norm": 0.3663690388202667, "learning_rate": 8.85531481579342e-06, "loss": 0.4844, "step": 795 }, { "epoch": 0.8938798427849522, "grad_norm": 0.4169730842113495, "learning_rate": 8.851149147893773e-06, "loss": 0.4613, "step": 796 }, { "epoch": 0.8950028074115666, "grad_norm": 0.40635716915130615, "learning_rate": 8.846976897621253e-06, "loss": 0.4576, "step": 797 }, { "epoch": 0.8961257720381808, "grad_norm": 0.42888256907463074, "learning_rate": 8.842798072107055e-06, "loss": 0.4455, "step": 798 }, { "epoch": 0.8972487366647951, "grad_norm": 0.42872321605682373, "learning_rate": 8.83861267849362e-06, "loss": 0.4614, "step": 799 }, { "epoch": 0.8983717012914093, "grad_norm": 0.4714615046977997, "learning_rate": 8.834420723934606e-06, "loss": 0.4336, "step": 800 }, { "epoch": 0.8994946659180236, "grad_norm": 0.38186824321746826, "learning_rate": 8.83022221559489e-06, "loss": 0.4425, "step": 801 }, { "epoch": 0.9006176305446378, "grad_norm": 0.3903937041759491, "learning_rate": 8.826017160650554e-06, "loss": 0.4708, "step": 802 }, { "epoch": 0.9017405951712522, "grad_norm": 0.39862531423568726, "learning_rate": 8.821805566288862e-06, "loss": 0.4539, "step": 803 }, { "epoch": 0.9028635597978664, "grad_norm": 0.4129088222980499, "learning_rate": 8.81758743970826e-06, "loss": 0.4664, "step": 804 }, { "epoch": 0.9039865244244806, "grad_norm": 0.39205819368362427, "learning_rate": 8.813362788118359e-06, "loss": 0.4683, "step": 805 }, { "epoch": 0.9051094890510949, "grad_norm": 0.4299173355102539, "learning_rate": 8.809131618739917e-06, "loss": 0.4864, "step": 806 }, { "epoch": 0.9062324536777091, "grad_norm": 0.44812437891960144, "learning_rate": 8.804893938804839e-06, "loss": 0.4661, "step": 807 }, { "epoch": 0.9073554183043234, "grad_norm": 0.4919585585594177, "learning_rate": 8.800649755556156e-06, "loss": 0.4735, "step": 808 }, { "epoch": 0.9084783829309376, "grad_norm": 0.4439605176448822, "learning_rate": 8.796399076248015e-06, "loss": 0.4641, "step": 809 }, { "epoch": 0.909601347557552, "grad_norm": 0.46329575777053833, "learning_rate": 8.79214190814566e-06, "loss": 0.4556, "step": 810 }, { "epoch": 0.9107243121841662, "grad_norm": 0.49694690108299255, "learning_rate": 8.787878258525432e-06, "loss": 0.4436, "step": 811 }, { "epoch": 0.9118472768107805, "grad_norm": 0.46536824107170105, "learning_rate": 8.78360813467475e-06, "loss": 0.4467, "step": 812 }, { "epoch": 0.9129702414373947, "grad_norm": 0.5356235504150391, "learning_rate": 8.779331543892097e-06, "loss": 0.4559, "step": 813 }, { "epoch": 0.914093206064009, "grad_norm": 0.45210355520248413, "learning_rate": 8.775048493487009e-06, "loss": 0.4459, "step": 814 }, { "epoch": 0.9152161706906232, "grad_norm": 0.4621727168560028, "learning_rate": 8.770758990780064e-06, "loss": 0.4757, "step": 815 }, { "epoch": 0.9163391353172375, "grad_norm": 0.4892253279685974, "learning_rate": 8.766463043102864e-06, "loss": 0.4595, "step": 816 }, { "epoch": 0.9174620999438517, "grad_norm": 0.5432390570640564, "learning_rate": 8.762160657798036e-06, "loss": 0.4522, "step": 817 }, { "epoch": 0.9185850645704661, "grad_norm": 0.44073566794395447, "learning_rate": 8.757851842219199e-06, "loss": 0.4588, "step": 818 }, { "epoch": 0.9197080291970803, "grad_norm": 0.5464689135551453, "learning_rate": 8.75353660373097e-06, "loss": 0.4695, "step": 819 }, { "epoch": 0.9208309938236946, "grad_norm": 0.4968947768211365, "learning_rate": 8.749214949708944e-06, "loss": 0.4748, "step": 820 }, { "epoch": 0.9219539584503088, "grad_norm": 0.4484274387359619, "learning_rate": 8.744886887539677e-06, "loss": 0.4924, "step": 821 }, { "epoch": 0.9230769230769231, "grad_norm": 0.421745628118515, "learning_rate": 8.740552424620679e-06, "loss": 0.4661, "step": 822 }, { "epoch": 0.9241998877035373, "grad_norm": 0.4423162341117859, "learning_rate": 8.736211568360405e-06, "loss": 0.4655, "step": 823 }, { "epoch": 0.9253228523301515, "grad_norm": 0.41479671001434326, "learning_rate": 8.731864326178232e-06, "loss": 0.4603, "step": 824 }, { "epoch": 0.9264458169567659, "grad_norm": 0.4318062365055084, "learning_rate": 8.727510705504453e-06, "loss": 0.4522, "step": 825 }, { "epoch": 0.9275687815833801, "grad_norm": 0.4035295844078064, "learning_rate": 8.723150713780266e-06, "loss": 0.4562, "step": 826 }, { "epoch": 0.9286917462099944, "grad_norm": 0.45890599489212036, "learning_rate": 8.718784358457753e-06, "loss": 0.4515, "step": 827 }, { "epoch": 0.9298147108366086, "grad_norm": 0.44467824697494507, "learning_rate": 8.714411646999878e-06, "loss": 0.4601, "step": 828 }, { "epoch": 0.9309376754632229, "grad_norm": 0.39522966742515564, "learning_rate": 8.710032586880468e-06, "loss": 0.4489, "step": 829 }, { "epoch": 0.9320606400898371, "grad_norm": 0.4257168471813202, "learning_rate": 8.705647185584196e-06, "loss": 0.4582, "step": 830 }, { "epoch": 0.9331836047164515, "grad_norm": 0.3933909833431244, "learning_rate": 8.701255450606579e-06, "loss": 0.4672, "step": 831 }, { "epoch": 0.9343065693430657, "grad_norm": 0.4096648097038269, "learning_rate": 8.696857389453957e-06, "loss": 0.4465, "step": 832 }, { "epoch": 0.93542953396968, "grad_norm": 0.4873373806476593, "learning_rate": 8.692453009643482e-06, "loss": 0.5004, "step": 833 }, { "epoch": 0.9365524985962942, "grad_norm": 0.449069082736969, "learning_rate": 8.688042318703111e-06, "loss": 0.4642, "step": 834 }, { "epoch": 0.9376754632229085, "grad_norm": 0.4785894453525543, "learning_rate": 8.683625324171584e-06, "loss": 0.4541, "step": 835 }, { "epoch": 0.9387984278495227, "grad_norm": 0.48974478244781494, "learning_rate": 8.679202033598411e-06, "loss": 0.4478, "step": 836 }, { "epoch": 0.939921392476137, "grad_norm": 0.38954824209213257, "learning_rate": 8.674772454543869e-06, "loss": 0.4841, "step": 837 }, { "epoch": 0.9410443571027512, "grad_norm": 0.4328992962837219, "learning_rate": 8.670336594578981e-06, "loss": 0.4696, "step": 838 }, { "epoch": 0.9421673217293656, "grad_norm": 0.43887534737586975, "learning_rate": 8.665894461285508e-06, "loss": 0.4478, "step": 839 }, { "epoch": 0.9432902863559798, "grad_norm": 0.40982890129089355, "learning_rate": 8.661446062255931e-06, "loss": 0.4714, "step": 840 }, { "epoch": 0.9444132509825941, "grad_norm": 0.43676480650901794, "learning_rate": 8.656991405093438e-06, "loss": 0.4471, "step": 841 }, { "epoch": 0.9455362156092083, "grad_norm": 0.4660939574241638, "learning_rate": 8.65253049741192e-06, "loss": 0.457, "step": 842 }, { "epoch": 0.9466591802358225, "grad_norm": 0.4767362177371979, "learning_rate": 8.648063346835943e-06, "loss": 0.472, "step": 843 }, { "epoch": 0.9477821448624368, "grad_norm": 0.4302798807621002, "learning_rate": 8.64358996100075e-06, "loss": 0.4678, "step": 844 }, { "epoch": 0.948905109489051, "grad_norm": 0.47011104226112366, "learning_rate": 8.63911034755224e-06, "loss": 0.4412, "step": 845 }, { "epoch": 0.9500280741156654, "grad_norm": 0.47140833735466003, "learning_rate": 8.634624514146954e-06, "loss": 0.4525, "step": 846 }, { "epoch": 0.9511510387422796, "grad_norm": 0.3836977183818817, "learning_rate": 8.630132468452064e-06, "loss": 0.4534, "step": 847 }, { "epoch": 0.9522740033688939, "grad_norm": 0.4999038279056549, "learning_rate": 8.625634218145364e-06, "loss": 0.4546, "step": 848 }, { "epoch": 0.9533969679955081, "grad_norm": 0.4407668709754944, "learning_rate": 8.621129770915248e-06, "loss": 0.4693, "step": 849 }, { "epoch": 0.9545199326221224, "grad_norm": 0.5317721366882324, "learning_rate": 8.616619134460707e-06, "loss": 0.4375, "step": 850 }, { "epoch": 0.9556428972487366, "grad_norm": 0.40472543239593506, "learning_rate": 8.612102316491305e-06, "loss": 0.432, "step": 851 }, { "epoch": 0.956765861875351, "grad_norm": 0.5662089586257935, "learning_rate": 8.607579324727175e-06, "loss": 0.4656, "step": 852 }, { "epoch": 0.9578888265019652, "grad_norm": 0.36155351996421814, "learning_rate": 8.603050166899002e-06, "loss": 0.4658, "step": 853 }, { "epoch": 0.9590117911285795, "grad_norm": 0.4601685404777527, "learning_rate": 8.598514850748006e-06, "loss": 0.4549, "step": 854 }, { "epoch": 0.9601347557551937, "grad_norm": 0.4658159911632538, "learning_rate": 8.59397338402594e-06, "loss": 0.4887, "step": 855 }, { "epoch": 0.961257720381808, "grad_norm": 0.44334855675697327, "learning_rate": 8.589425774495064e-06, "loss": 0.4389, "step": 856 }, { "epoch": 0.9623806850084222, "grad_norm": 0.4052773416042328, "learning_rate": 8.58487202992814e-06, "loss": 0.4276, "step": 857 }, { "epoch": 0.9635036496350365, "grad_norm": 0.4454156458377838, "learning_rate": 8.580312158108413e-06, "loss": 0.4635, "step": 858 }, { "epoch": 0.9646266142616507, "grad_norm": 0.4994664490222931, "learning_rate": 8.575746166829604e-06, "loss": 0.444, "step": 859 }, { "epoch": 0.9657495788882651, "grad_norm": 0.3760167956352234, "learning_rate": 8.571174063895892e-06, "loss": 0.4396, "step": 860 }, { "epoch": 0.9668725435148793, "grad_norm": 0.46264317631721497, "learning_rate": 8.566595857121902e-06, "loss": 0.4602, "step": 861 }, { "epoch": 0.9679955081414935, "grad_norm": 0.4161238670349121, "learning_rate": 8.562011554332691e-06, "loss": 0.4593, "step": 862 }, { "epoch": 0.9691184727681078, "grad_norm": 0.41577526926994324, "learning_rate": 8.557421163363736e-06, "loss": 0.4486, "step": 863 }, { "epoch": 0.970241437394722, "grad_norm": 0.531201958656311, "learning_rate": 8.55282469206092e-06, "loss": 0.4503, "step": 864 }, { "epoch": 0.9713644020213363, "grad_norm": 0.40528520941734314, "learning_rate": 8.548222148280518e-06, "loss": 0.432, "step": 865 }, { "epoch": 0.9724873666479505, "grad_norm": 0.3790910542011261, "learning_rate": 8.543613539889186e-06, "loss": 0.4353, "step": 866 }, { "epoch": 0.9736103312745649, "grad_norm": 0.437468558549881, "learning_rate": 8.538998874763942e-06, "loss": 0.4654, "step": 867 }, { "epoch": 0.9747332959011791, "grad_norm": 0.4615422189235687, "learning_rate": 8.53437816079216e-06, "loss": 0.4854, "step": 868 }, { "epoch": 0.9758562605277934, "grad_norm": 0.39278513193130493, "learning_rate": 8.529751405871548e-06, "loss": 0.4662, "step": 869 }, { "epoch": 0.9769792251544076, "grad_norm": 0.41838669776916504, "learning_rate": 8.525118617910144e-06, "loss": 0.4693, "step": 870 }, { "epoch": 0.9781021897810219, "grad_norm": 0.40234148502349854, "learning_rate": 8.520479804826297e-06, "loss": 0.4529, "step": 871 }, { "epoch": 0.9792251544076361, "grad_norm": 0.4124075472354889, "learning_rate": 8.515834974548649e-06, "loss": 0.4721, "step": 872 }, { "epoch": 0.9803481190342505, "grad_norm": 0.40332093834877014, "learning_rate": 8.511184135016134e-06, "loss": 0.4638, "step": 873 }, { "epoch": 0.9814710836608647, "grad_norm": 0.4441043436527252, "learning_rate": 8.506527294177952e-06, "loss": 0.4659, "step": 874 }, { "epoch": 0.982594048287479, "grad_norm": 0.38291001319885254, "learning_rate": 8.50186445999356e-06, "loss": 0.4536, "step": 875 }, { "epoch": 0.9837170129140932, "grad_norm": 0.35899412631988525, "learning_rate": 8.497195640432664e-06, "loss": 0.4545, "step": 876 }, { "epoch": 0.9848399775407075, "grad_norm": 0.3821329176425934, "learning_rate": 8.492520843475194e-06, "loss": 0.4499, "step": 877 }, { "epoch": 0.9859629421673217, "grad_norm": 0.4021531343460083, "learning_rate": 8.4878400771113e-06, "loss": 0.476, "step": 878 }, { "epoch": 0.987085906793936, "grad_norm": 0.3828616142272949, "learning_rate": 8.483153349341336e-06, "loss": 0.4689, "step": 879 }, { "epoch": 0.9882088714205502, "grad_norm": 0.41049909591674805, "learning_rate": 8.478460668175841e-06, "loss": 0.4782, "step": 880 }, { "epoch": 0.9893318360471645, "grad_norm": 0.3825310170650482, "learning_rate": 8.473762041635531e-06, "loss": 0.4675, "step": 881 }, { "epoch": 0.9904548006737788, "grad_norm": 0.358169287443161, "learning_rate": 8.46905747775129e-06, "loss": 0.4354, "step": 882 }, { "epoch": 0.991577765300393, "grad_norm": 0.36973676085472107, "learning_rate": 8.464346984564137e-06, "loss": 0.4575, "step": 883 }, { "epoch": 0.9927007299270073, "grad_norm": 0.3814185559749603, "learning_rate": 8.45963057012524e-06, "loss": 0.4757, "step": 884 }, { "epoch": 0.9938236945536215, "grad_norm": 0.3659909963607788, "learning_rate": 8.45490824249588e-06, "loss": 0.4624, "step": 885 }, { "epoch": 0.9949466591802358, "grad_norm": 0.3677615821361542, "learning_rate": 8.450180009747441e-06, "loss": 0.4455, "step": 886 }, { "epoch": 0.99606962380685, "grad_norm": 0.39688825607299805, "learning_rate": 8.445445879961411e-06, "loss": 0.4471, "step": 887 }, { "epoch": 0.9971925884334644, "grad_norm": 0.4257163107395172, "learning_rate": 8.440705861229344e-06, "loss": 0.4847, "step": 888 }, { "epoch": 0.9983155530600786, "grad_norm": 0.39355745911598206, "learning_rate": 8.435959961652871e-06, "loss": 0.4517, "step": 889 }, { "epoch": 0.9994385176866929, "grad_norm": 0.41853901743888855, "learning_rate": 8.43120818934367e-06, "loss": 0.4466, "step": 890 }, { "epoch": 1.0008422234699608, "grad_norm": 0.8588258624076843, "learning_rate": 8.426450552423451e-06, "loss": 0.7567, "step": 891 }, { "epoch": 1.001965188096575, "grad_norm": 0.47723883390426636, "learning_rate": 8.421687059023958e-06, "loss": 0.4216, "step": 892 }, { "epoch": 1.0030881527231892, "grad_norm": 0.5316928029060364, "learning_rate": 8.41691771728694e-06, "loss": 0.4262, "step": 893 }, { "epoch": 1.0042111173498034, "grad_norm": 0.4583548307418823, "learning_rate": 8.412142535364139e-06, "loss": 0.4521, "step": 894 }, { "epoch": 1.0053340819764178, "grad_norm": 0.44341006875038147, "learning_rate": 8.407361521417286e-06, "loss": 0.4414, "step": 895 }, { "epoch": 1.006457046603032, "grad_norm": 0.4063226580619812, "learning_rate": 8.402574683618073e-06, "loss": 0.4191, "step": 896 }, { "epoch": 1.0075800112296462, "grad_norm": 0.39591512084007263, "learning_rate": 8.397782030148147e-06, "loss": 0.4281, "step": 897 }, { "epoch": 1.0087029758562605, "grad_norm": 0.4632072150707245, "learning_rate": 8.392983569199103e-06, "loss": 0.4247, "step": 898 }, { "epoch": 1.0098259404828749, "grad_norm": 0.44243383407592773, "learning_rate": 8.388179308972453e-06, "loss": 0.4034, "step": 899 }, { "epoch": 1.010948905109489, "grad_norm": 0.4932941198348999, "learning_rate": 8.383369257679625e-06, "loss": 0.4455, "step": 900 }, { "epoch": 1.0120718697361033, "grad_norm": 0.5202175378799438, "learning_rate": 8.378553423541945e-06, "loss": 0.4563, "step": 901 }, { "epoch": 1.0131948343627175, "grad_norm": 0.4869171977043152, "learning_rate": 8.373731814790623e-06, "loss": 0.4205, "step": 902 }, { "epoch": 1.014317798989332, "grad_norm": 0.44867074489593506, "learning_rate": 8.368904439666739e-06, "loss": 0.4336, "step": 903 }, { "epoch": 1.0154407636159462, "grad_norm": 0.4573555588722229, "learning_rate": 8.364071306421224e-06, "loss": 0.4195, "step": 904 }, { "epoch": 1.0165637282425604, "grad_norm": 0.4247390031814575, "learning_rate": 8.359232423314863e-06, "loss": 0.4047, "step": 905 }, { "epoch": 1.0176866928691746, "grad_norm": 0.44303765892982483, "learning_rate": 8.354387798618254e-06, "loss": 0.4349, "step": 906 }, { "epoch": 1.0188096574957888, "grad_norm": 0.39964333176612854, "learning_rate": 8.349537440611818e-06, "loss": 0.4118, "step": 907 }, { "epoch": 1.0199326221224032, "grad_norm": 0.48327821493148804, "learning_rate": 8.344681357585773e-06, "loss": 0.46, "step": 908 }, { "epoch": 1.0210555867490174, "grad_norm": 0.38615813851356506, "learning_rate": 8.339819557840124e-06, "loss": 0.3914, "step": 909 }, { "epoch": 1.0221785513756316, "grad_norm": 0.5401507616043091, "learning_rate": 8.33495204968464e-06, "loss": 0.4429, "step": 910 }, { "epoch": 1.0233015160022458, "grad_norm": 0.4302870035171509, "learning_rate": 8.330078841438854e-06, "loss": 0.4745, "step": 911 }, { "epoch": 1.0244244806288603, "grad_norm": 0.45851269364356995, "learning_rate": 8.32519994143204e-06, "loss": 0.4083, "step": 912 }, { "epoch": 1.0255474452554745, "grad_norm": 0.3537365198135376, "learning_rate": 8.3203153580032e-06, "loss": 0.4466, "step": 913 }, { "epoch": 1.0266704098820887, "grad_norm": 0.4725887179374695, "learning_rate": 8.315425099501049e-06, "loss": 0.4441, "step": 914 }, { "epoch": 1.027793374508703, "grad_norm": 0.38913580775260925, "learning_rate": 8.310529174284004e-06, "loss": 0.4186, "step": 915 }, { "epoch": 1.0289163391353173, "grad_norm": 0.5014906525611877, "learning_rate": 8.305627590720162e-06, "loss": 0.4257, "step": 916 }, { "epoch": 1.0300393037619315, "grad_norm": 0.3730778694152832, "learning_rate": 8.300720357187299e-06, "loss": 0.429, "step": 917 }, { "epoch": 1.0311622683885457, "grad_norm": 0.5048668384552002, "learning_rate": 8.295807482072842e-06, "loss": 0.4419, "step": 918 }, { "epoch": 1.03228523301516, "grad_norm": 0.44999366998672485, "learning_rate": 8.290888973773865e-06, "loss": 0.4826, "step": 919 }, { "epoch": 1.0334081976417744, "grad_norm": 0.39779308438301086, "learning_rate": 8.285964840697068e-06, "loss": 0.4161, "step": 920 }, { "epoch": 1.0345311622683886, "grad_norm": 0.4838612675666809, "learning_rate": 8.281035091258762e-06, "loss": 0.399, "step": 921 }, { "epoch": 1.0356541268950028, "grad_norm": 0.4550621211528778, "learning_rate": 8.276099733884864e-06, "loss": 0.4989, "step": 922 }, { "epoch": 1.036777091521617, "grad_norm": 0.4486664831638336, "learning_rate": 8.271158777010868e-06, "loss": 0.4227, "step": 923 }, { "epoch": 1.0379000561482314, "grad_norm": 0.404275119304657, "learning_rate": 8.266212229081846e-06, "loss": 0.4281, "step": 924 }, { "epoch": 1.0390230207748457, "grad_norm": 0.4675980508327484, "learning_rate": 8.261260098552426e-06, "loss": 0.4392, "step": 925 }, { "epoch": 1.0401459854014599, "grad_norm": 0.41775909066200256, "learning_rate": 8.25630239388677e-06, "loss": 0.4507, "step": 926 }, { "epoch": 1.041268950028074, "grad_norm": 0.42524078488349915, "learning_rate": 8.251339123558573e-06, "loss": 0.446, "step": 927 }, { "epoch": 1.0423919146546883, "grad_norm": 0.37980592250823975, "learning_rate": 8.246370296051045e-06, "loss": 0.4097, "step": 928 }, { "epoch": 1.0435148792813027, "grad_norm": 0.4534863829612732, "learning_rate": 8.24139591985689e-06, "loss": 0.4697, "step": 929 }, { "epoch": 1.044637843907917, "grad_norm": 0.4243798851966858, "learning_rate": 8.236416003478295e-06, "loss": 0.4168, "step": 930 }, { "epoch": 1.0457608085345311, "grad_norm": 0.47612181305885315, "learning_rate": 8.231430555426923e-06, "loss": 0.4433, "step": 931 }, { "epoch": 1.0468837731611453, "grad_norm": 0.3903225362300873, "learning_rate": 8.226439584223885e-06, "loss": 0.4062, "step": 932 }, { "epoch": 1.0480067377877598, "grad_norm": 0.5543419718742371, "learning_rate": 8.221443098399733e-06, "loss": 0.4739, "step": 933 }, { "epoch": 1.049129702414374, "grad_norm": 0.378098726272583, "learning_rate": 8.216441106494447e-06, "loss": 0.4157, "step": 934 }, { "epoch": 1.0502526670409882, "grad_norm": 0.4766242206096649, "learning_rate": 8.21143361705742e-06, "loss": 0.4559, "step": 935 }, { "epoch": 1.0513756316676024, "grad_norm": 0.410483181476593, "learning_rate": 8.206420638647433e-06, "loss": 0.4463, "step": 936 }, { "epoch": 1.0524985962942168, "grad_norm": 0.454330176115036, "learning_rate": 8.201402179832657e-06, "loss": 0.4205, "step": 937 }, { "epoch": 1.053621560920831, "grad_norm": 0.3885717988014221, "learning_rate": 8.196378249190627e-06, "loss": 0.4379, "step": 938 }, { "epoch": 1.0547445255474452, "grad_norm": 0.5017076730728149, "learning_rate": 8.191348855308229e-06, "loss": 0.4517, "step": 939 }, { "epoch": 1.0558674901740595, "grad_norm": 0.3875944912433624, "learning_rate": 8.186314006781693e-06, "loss": 0.4275, "step": 940 }, { "epoch": 1.0569904548006739, "grad_norm": 0.5399092435836792, "learning_rate": 8.181273712216561e-06, "loss": 0.4603, "step": 941 }, { "epoch": 1.058113419427288, "grad_norm": 0.42408376932144165, "learning_rate": 8.176227980227693e-06, "loss": 0.4273, "step": 942 }, { "epoch": 1.0592363840539023, "grad_norm": 0.4705450236797333, "learning_rate": 8.17117681943924e-06, "loss": 0.4421, "step": 943 }, { "epoch": 1.0603593486805165, "grad_norm": 0.3933738172054291, "learning_rate": 8.166120238484631e-06, "loss": 0.4297, "step": 944 }, { "epoch": 1.0614823133071307, "grad_norm": 0.3634945750236511, "learning_rate": 8.161058246006558e-06, "loss": 0.4257, "step": 945 }, { "epoch": 1.0626052779337452, "grad_norm": 0.4203615188598633, "learning_rate": 8.155990850656965e-06, "loss": 0.4399, "step": 946 }, { "epoch": 1.0637282425603594, "grad_norm": 0.43164652585983276, "learning_rate": 8.15091806109703e-06, "loss": 0.4362, "step": 947 }, { "epoch": 1.0648512071869736, "grad_norm": 0.349311888217926, "learning_rate": 8.145839885997146e-06, "loss": 0.4083, "step": 948 }, { "epoch": 1.0659741718135878, "grad_norm": 0.46893933415412903, "learning_rate": 8.14075633403692e-06, "loss": 0.4279, "step": 949 }, { "epoch": 1.0670971364402022, "grad_norm": 0.39896371960639954, "learning_rate": 8.135667413905144e-06, "loss": 0.4424, "step": 950 }, { "epoch": 1.0682201010668164, "grad_norm": 0.42672649025917053, "learning_rate": 8.130573134299782e-06, "loss": 0.4435, "step": 951 }, { "epoch": 1.0693430656934306, "grad_norm": 0.40876173973083496, "learning_rate": 8.125473503927962e-06, "loss": 0.4185, "step": 952 }, { "epoch": 1.0704660303200448, "grad_norm": 0.36616051197052, "learning_rate": 8.120368531505961e-06, "loss": 0.4312, "step": 953 }, { "epoch": 1.0715889949466593, "grad_norm": 0.3864782154560089, "learning_rate": 8.11525822575918e-06, "loss": 0.4385, "step": 954 }, { "epoch": 1.0727119595732735, "grad_norm": 0.38266775012016296, "learning_rate": 8.11014259542214e-06, "loss": 0.4581, "step": 955 }, { "epoch": 1.0738349241998877, "grad_norm": 0.3610396981239319, "learning_rate": 8.105021649238459e-06, "loss": 0.4262, "step": 956 }, { "epoch": 1.074957888826502, "grad_norm": 0.3450765609741211, "learning_rate": 8.099895395960847e-06, "loss": 0.4034, "step": 957 }, { "epoch": 1.0760808534531163, "grad_norm": 0.3827979266643524, "learning_rate": 8.094763844351078e-06, "loss": 0.4619, "step": 958 }, { "epoch": 1.0772038180797305, "grad_norm": 0.39662814140319824, "learning_rate": 8.089627003179987e-06, "loss": 0.4486, "step": 959 }, { "epoch": 1.0783267827063447, "grad_norm": 0.35309290885925293, "learning_rate": 8.084484881227449e-06, "loss": 0.4081, "step": 960 }, { "epoch": 1.079449747332959, "grad_norm": 0.39437198638916016, "learning_rate": 8.079337487282358e-06, "loss": 0.4369, "step": 961 }, { "epoch": 1.0805727119595732, "grad_norm": 0.3816572427749634, "learning_rate": 8.07418483014263e-06, "loss": 0.4205, "step": 962 }, { "epoch": 1.0816956765861876, "grad_norm": 0.38968396186828613, "learning_rate": 8.069026918615173e-06, "loss": 0.4958, "step": 963 }, { "epoch": 1.0828186412128018, "grad_norm": 0.36063000559806824, "learning_rate": 8.063863761515869e-06, "loss": 0.3736, "step": 964 }, { "epoch": 1.083941605839416, "grad_norm": 0.437924861907959, "learning_rate": 8.058695367669573e-06, "loss": 0.4176, "step": 965 }, { "epoch": 1.0850645704660302, "grad_norm": 0.5171123147010803, "learning_rate": 8.05352174591009e-06, "loss": 0.46, "step": 966 }, { "epoch": 1.0861875350926447, "grad_norm": 0.397216260433197, "learning_rate": 8.04834290508016e-06, "loss": 0.4404, "step": 967 }, { "epoch": 1.0873104997192589, "grad_norm": 0.44285154342651367, "learning_rate": 8.04315885403144e-06, "loss": 0.4624, "step": 968 }, { "epoch": 1.088433464345873, "grad_norm": 0.4045272469520569, "learning_rate": 8.037969601624495e-06, "loss": 0.4196, "step": 969 }, { "epoch": 1.0895564289724873, "grad_norm": 0.3854396343231201, "learning_rate": 8.032775156728783e-06, "loss": 0.4229, "step": 970 }, { "epoch": 1.0906793935991017, "grad_norm": 0.46753114461898804, "learning_rate": 8.02757552822263e-06, "loss": 0.4241, "step": 971 }, { "epoch": 1.091802358225716, "grad_norm": 0.33733659982681274, "learning_rate": 8.022370724993229e-06, "loss": 0.4068, "step": 972 }, { "epoch": 1.0929253228523301, "grad_norm": 0.4303451180458069, "learning_rate": 8.017160755936614e-06, "loss": 0.4482, "step": 973 }, { "epoch": 1.0940482874789443, "grad_norm": 0.376144140958786, "learning_rate": 8.011945629957648e-06, "loss": 0.4097, "step": 974 }, { "epoch": 1.0951712521055588, "grad_norm": 0.38351157307624817, "learning_rate": 8.006725355970008e-06, "loss": 0.4621, "step": 975 }, { "epoch": 1.096294216732173, "grad_norm": 0.37270107865333557, "learning_rate": 8.001499942896174e-06, "loss": 0.4028, "step": 976 }, { "epoch": 1.0974171813587872, "grad_norm": 0.3308415114879608, "learning_rate": 7.996269399667404e-06, "loss": 0.4145, "step": 977 }, { "epoch": 1.0985401459854014, "grad_norm": 0.4011605381965637, "learning_rate": 7.99103373522373e-06, "loss": 0.459, "step": 978 }, { "epoch": 1.0996631106120156, "grad_norm": 0.33609649538993835, "learning_rate": 7.985792958513932e-06, "loss": 0.3984, "step": 979 }, { "epoch": 1.10078607523863, "grad_norm": 0.3980567157268524, "learning_rate": 7.98054707849553e-06, "loss": 0.4402, "step": 980 }, { "epoch": 1.1019090398652442, "grad_norm": 0.4374927282333374, "learning_rate": 7.975296104134768e-06, "loss": 0.4405, "step": 981 }, { "epoch": 1.1030320044918585, "grad_norm": 0.462056428194046, "learning_rate": 7.970040044406598e-06, "loss": 0.4586, "step": 982 }, { "epoch": 1.1041549691184729, "grad_norm": 0.38060250878334045, "learning_rate": 7.964778908294656e-06, "loss": 0.4143, "step": 983 }, { "epoch": 1.105277933745087, "grad_norm": 0.44803741574287415, "learning_rate": 7.959512704791269e-06, "loss": 0.4877, "step": 984 }, { "epoch": 1.1064008983717013, "grad_norm": 0.36589327454566956, "learning_rate": 7.95424144289741e-06, "loss": 0.4363, "step": 985 }, { "epoch": 1.1075238629983155, "grad_norm": 0.37528514862060547, "learning_rate": 7.948965131622705e-06, "loss": 0.4208, "step": 986 }, { "epoch": 1.1086468276249297, "grad_norm": 0.36528536677360535, "learning_rate": 7.943683779985412e-06, "loss": 0.4208, "step": 987 }, { "epoch": 1.1097697922515442, "grad_norm": 0.3558456599712372, "learning_rate": 7.938397397012401e-06, "loss": 0.4097, "step": 988 }, { "epoch": 1.1108927568781584, "grad_norm": 0.34500452876091003, "learning_rate": 7.933105991739144e-06, "loss": 0.4085, "step": 989 }, { "epoch": 1.1120157215047726, "grad_norm": 0.365953654050827, "learning_rate": 7.927809573209691e-06, "loss": 0.4365, "step": 990 }, { "epoch": 1.1131386861313868, "grad_norm": 0.35540658235549927, "learning_rate": 7.92250815047667e-06, "loss": 0.4347, "step": 991 }, { "epoch": 1.1142616507580012, "grad_norm": 0.36899635195732117, "learning_rate": 7.917201732601255e-06, "loss": 0.4402, "step": 992 }, { "epoch": 1.1153846153846154, "grad_norm": 0.32902079820632935, "learning_rate": 7.911890328653156e-06, "loss": 0.4292, "step": 993 }, { "epoch": 1.1165075800112296, "grad_norm": 0.40688687562942505, "learning_rate": 7.906573947710617e-06, "loss": 0.4764, "step": 994 }, { "epoch": 1.1176305446378438, "grad_norm": 0.32126760482788086, "learning_rate": 7.901252598860377e-06, "loss": 0.3762, "step": 995 }, { "epoch": 1.1187535092644583, "grad_norm": 0.37490561604499817, "learning_rate": 7.895926291197667e-06, "loss": 0.4453, "step": 996 }, { "epoch": 1.1198764738910725, "grad_norm": 0.3791968524456024, "learning_rate": 7.890595033826203e-06, "loss": 0.4352, "step": 997 }, { "epoch": 1.1209994385176867, "grad_norm": 0.3715962767601013, "learning_rate": 7.885258835858149e-06, "loss": 0.4477, "step": 998 }, { "epoch": 1.122122403144301, "grad_norm": 0.379116028547287, "learning_rate": 7.87991770641412e-06, "loss": 0.3995, "step": 999 }, { "epoch": 1.1232453677709153, "grad_norm": 0.4016701877117157, "learning_rate": 7.87457165462316e-06, "loss": 0.4791, "step": 1000 }, { "epoch": 1.1243683323975295, "grad_norm": 0.4579463601112366, "learning_rate": 7.869220689622725e-06, "loss": 0.4358, "step": 1001 }, { "epoch": 1.1254912970241437, "grad_norm": 0.4629487991333008, "learning_rate": 7.863864820558669e-06, "loss": 0.4379, "step": 1002 }, { "epoch": 1.126614261650758, "grad_norm": 0.4131915867328644, "learning_rate": 7.858504056585227e-06, "loss": 0.429, "step": 1003 }, { "epoch": 1.1277372262773722, "grad_norm": 0.3739740252494812, "learning_rate": 7.853138406865e-06, "loss": 0.4144, "step": 1004 }, { "epoch": 1.1288601909039866, "grad_norm": 0.35549238324165344, "learning_rate": 7.847767880568944e-06, "loss": 0.465, "step": 1005 }, { "epoch": 1.1299831555306008, "grad_norm": 0.384567528963089, "learning_rate": 7.842392486876345e-06, "loss": 0.4139, "step": 1006 }, { "epoch": 1.131106120157215, "grad_norm": 0.3801855444908142, "learning_rate": 7.83701223497481e-06, "loss": 0.4399, "step": 1007 }, { "epoch": 1.1322290847838292, "grad_norm": 0.3242621421813965, "learning_rate": 7.831627134060249e-06, "loss": 0.3875, "step": 1008 }, { "epoch": 1.1333520494104437, "grad_norm": 0.3935926854610443, "learning_rate": 7.826237193336864e-06, "loss": 0.474, "step": 1009 }, { "epoch": 1.1344750140370579, "grad_norm": 0.35517528653144836, "learning_rate": 7.820842422017122e-06, "loss": 0.4528, "step": 1010 }, { "epoch": 1.135597978663672, "grad_norm": 0.39165619015693665, "learning_rate": 7.815442829321754e-06, "loss": 0.4328, "step": 1011 }, { "epoch": 1.1367209432902863, "grad_norm": 0.40437912940979004, "learning_rate": 7.810038424479723e-06, "loss": 0.4251, "step": 1012 }, { "epoch": 1.1378439079169007, "grad_norm": 0.4088122844696045, "learning_rate": 7.804629216728227e-06, "loss": 0.4636, "step": 1013 }, { "epoch": 1.138966872543515, "grad_norm": 0.35680320858955383, "learning_rate": 7.799215215312667e-06, "loss": 0.4289, "step": 1014 }, { "epoch": 1.1400898371701291, "grad_norm": 0.38441506028175354, "learning_rate": 7.793796429486637e-06, "loss": 0.4428, "step": 1015 }, { "epoch": 1.1412128017967433, "grad_norm": 0.37961602210998535, "learning_rate": 7.78837286851191e-06, "loss": 0.421, "step": 1016 }, { "epoch": 1.1423357664233578, "grad_norm": 0.4144695997238159, "learning_rate": 7.782944541658423e-06, "loss": 0.4101, "step": 1017 }, { "epoch": 1.143458731049972, "grad_norm": 0.4553064703941345, "learning_rate": 7.777511458204253e-06, "loss": 0.4276, "step": 1018 }, { "epoch": 1.1445816956765862, "grad_norm": 0.45754116773605347, "learning_rate": 7.772073627435613e-06, "loss": 0.4342, "step": 1019 }, { "epoch": 1.1457046603032004, "grad_norm": 0.36387041211128235, "learning_rate": 7.766631058646826e-06, "loss": 0.4062, "step": 1020 }, { "epoch": 1.1468276249298146, "grad_norm": 0.4608931839466095, "learning_rate": 7.761183761140315e-06, "loss": 0.4601, "step": 1021 }, { "epoch": 1.147950589556429, "grad_norm": 0.44602611660957336, "learning_rate": 7.755731744226584e-06, "loss": 0.4412, "step": 1022 }, { "epoch": 1.1490735541830432, "grad_norm": 0.35595354437828064, "learning_rate": 7.750275017224208e-06, "loss": 0.4025, "step": 1023 }, { "epoch": 1.1501965188096575, "grad_norm": 0.4195128083229065, "learning_rate": 7.744813589459806e-06, "loss": 0.4269, "step": 1024 }, { "epoch": 1.1513194834362717, "grad_norm": 0.38575872778892517, "learning_rate": 7.739347470268031e-06, "loss": 0.4412, "step": 1025 }, { "epoch": 1.152442448062886, "grad_norm": 0.5020630955696106, "learning_rate": 7.733876668991565e-06, "loss": 0.4563, "step": 1026 }, { "epoch": 1.1535654126895003, "grad_norm": 0.41644996404647827, "learning_rate": 7.72840119498108e-06, "loss": 0.4109, "step": 1027 }, { "epoch": 1.1546883773161145, "grad_norm": 0.43252256512641907, "learning_rate": 7.722921057595245e-06, "loss": 0.4366, "step": 1028 }, { "epoch": 1.1558113419427287, "grad_norm": 0.4760952889919281, "learning_rate": 7.71743626620069e-06, "loss": 0.4338, "step": 1029 }, { "epoch": 1.1569343065693432, "grad_norm": 0.43482813239097595, "learning_rate": 7.711946830172008e-06, "loss": 0.4348, "step": 1030 }, { "epoch": 1.1580572711959574, "grad_norm": 0.5196707248687744, "learning_rate": 7.706452758891726e-06, "loss": 0.4042, "step": 1031 }, { "epoch": 1.1591802358225716, "grad_norm": 0.42039549350738525, "learning_rate": 7.700954061750295e-06, "loss": 0.4573, "step": 1032 }, { "epoch": 1.1603032004491858, "grad_norm": 0.4364170432090759, "learning_rate": 7.69545074814607e-06, "loss": 0.4248, "step": 1033 }, { "epoch": 1.1614261650758002, "grad_norm": 0.46823257207870483, "learning_rate": 7.689942827485298e-06, "loss": 0.4864, "step": 1034 }, { "epoch": 1.1625491297024144, "grad_norm": 0.37183937430381775, "learning_rate": 7.684430309182106e-06, "loss": 0.4036, "step": 1035 }, { "epoch": 1.1636720943290286, "grad_norm": 0.5225604176521301, "learning_rate": 7.678913202658468e-06, "loss": 0.4922, "step": 1036 }, { "epoch": 1.1647950589556428, "grad_norm": 0.36254826188087463, "learning_rate": 7.673391517344211e-06, "loss": 0.4048, "step": 1037 }, { "epoch": 1.165918023582257, "grad_norm": 0.4319021701812744, "learning_rate": 7.667865262676981e-06, "loss": 0.4528, "step": 1038 }, { "epoch": 1.1670409882088715, "grad_norm": 0.4071805775165558, "learning_rate": 7.662334448102238e-06, "loss": 0.418, "step": 1039 }, { "epoch": 1.1681639528354857, "grad_norm": 0.368012934923172, "learning_rate": 7.656799083073232e-06, "loss": 0.4083, "step": 1040 }, { "epoch": 1.1692869174621, "grad_norm": 0.35020434856414795, "learning_rate": 7.651259177050996e-06, "loss": 0.4111, "step": 1041 }, { "epoch": 1.1704098820887143, "grad_norm": 0.3707600235939026, "learning_rate": 7.645714739504317e-06, "loss": 0.4087, "step": 1042 }, { "epoch": 1.1715328467153285, "grad_norm": 0.3652622103691101, "learning_rate": 7.640165779909734e-06, "loss": 0.4119, "step": 1043 }, { "epoch": 1.1726558113419427, "grad_norm": 0.4057026207447052, "learning_rate": 7.634612307751513e-06, "loss": 0.445, "step": 1044 }, { "epoch": 1.173778775968557, "grad_norm": 0.3881671130657196, "learning_rate": 7.629054332521631e-06, "loss": 0.4276, "step": 1045 }, { "epoch": 1.1749017405951712, "grad_norm": 0.3558051884174347, "learning_rate": 7.623491863719764e-06, "loss": 0.4409, "step": 1046 }, { "epoch": 1.1760247052217856, "grad_norm": 0.373041033744812, "learning_rate": 7.617924910853266e-06, "loss": 0.4175, "step": 1047 }, { "epoch": 1.1771476698483998, "grad_norm": 0.407465398311615, "learning_rate": 7.612353483437158e-06, "loss": 0.4795, "step": 1048 }, { "epoch": 1.178270634475014, "grad_norm": 0.37277528643608093, "learning_rate": 7.606777590994107e-06, "loss": 0.4262, "step": 1049 }, { "epoch": 1.1793935991016282, "grad_norm": 0.4178366959095001, "learning_rate": 7.601197243054411e-06, "loss": 0.4525, "step": 1050 }, { "epoch": 1.1805165637282427, "grad_norm": 0.37373918294906616, "learning_rate": 7.5956124491559865e-06, "loss": 0.4218, "step": 1051 }, { "epoch": 1.1816395283548569, "grad_norm": 0.4093683063983917, "learning_rate": 7.5900232188443465e-06, "loss": 0.4235, "step": 1052 }, { "epoch": 1.182762492981471, "grad_norm": 0.33583369851112366, "learning_rate": 7.584429561672586e-06, "loss": 0.4532, "step": 1053 }, { "epoch": 1.1838854576080853, "grad_norm": 0.5235519409179688, "learning_rate": 7.578831487201368e-06, "loss": 0.4549, "step": 1054 }, { "epoch": 1.1850084222346995, "grad_norm": 0.34572046995162964, "learning_rate": 7.573229004998905e-06, "loss": 0.4322, "step": 1055 }, { "epoch": 1.186131386861314, "grad_norm": 0.40512773394584656, "learning_rate": 7.567622124640942e-06, "loss": 0.4195, "step": 1056 }, { "epoch": 1.1872543514879281, "grad_norm": 0.3984316885471344, "learning_rate": 7.562010855710745e-06, "loss": 0.4275, "step": 1057 }, { "epoch": 1.1883773161145423, "grad_norm": 0.4172515571117401, "learning_rate": 7.556395207799078e-06, "loss": 0.4382, "step": 1058 }, { "epoch": 1.1895002807411568, "grad_norm": 0.40150734782218933, "learning_rate": 7.5507751905041885e-06, "loss": 0.4405, "step": 1059 }, { "epoch": 1.190623245367771, "grad_norm": 0.3426018953323364, "learning_rate": 7.545150813431794e-06, "loss": 0.4473, "step": 1060 }, { "epoch": 1.1917462099943852, "grad_norm": 0.3653811812400818, "learning_rate": 7.5395220861950635e-06, "loss": 0.418, "step": 1061 }, { "epoch": 1.1928691746209994, "grad_norm": 0.3776378929615021, "learning_rate": 7.533889018414602e-06, "loss": 0.4418, "step": 1062 }, { "epoch": 1.1939921392476136, "grad_norm": 0.44768524169921875, "learning_rate": 7.528251619718431e-06, "loss": 0.438, "step": 1063 }, { "epoch": 1.195115103874228, "grad_norm": 0.3606531620025635, "learning_rate": 7.522609899741977e-06, "loss": 0.4147, "step": 1064 }, { "epoch": 1.1962380685008422, "grad_norm": 0.3950115144252777, "learning_rate": 7.516963868128054e-06, "loss": 0.4196, "step": 1065 }, { "epoch": 1.1973610331274565, "grad_norm": 0.4503757059574127, "learning_rate": 7.5113135345268364e-06, "loss": 0.4499, "step": 1066 }, { "epoch": 1.1984839977540707, "grad_norm": 0.38043591380119324, "learning_rate": 7.505658908595862e-06, "loss": 0.4183, "step": 1067 }, { "epoch": 1.199606962380685, "grad_norm": 0.41554224491119385, "learning_rate": 7.500000000000001e-06, "loss": 0.4245, "step": 1068 }, { "epoch": 1.2007299270072993, "grad_norm": 0.42119014263153076, "learning_rate": 7.494336818411442e-06, "loss": 0.4357, "step": 1069 }, { "epoch": 1.2018528916339135, "grad_norm": 0.4282399117946625, "learning_rate": 7.48866937350968e-06, "loss": 0.4281, "step": 1070 }, { "epoch": 1.2029758562605277, "grad_norm": 0.4194989502429962, "learning_rate": 7.4829976749814935e-06, "loss": 0.4722, "step": 1071 }, { "epoch": 1.2040988208871422, "grad_norm": 0.39283955097198486, "learning_rate": 7.477321732520935e-06, "loss": 0.386, "step": 1072 }, { "epoch": 1.2052217855137564, "grad_norm": 0.4003312587738037, "learning_rate": 7.471641555829307e-06, "loss": 0.436, "step": 1073 }, { "epoch": 1.2063447501403706, "grad_norm": 0.3932701647281647, "learning_rate": 7.46595715461515e-06, "loss": 0.4363, "step": 1074 }, { "epoch": 1.2074677147669848, "grad_norm": 0.39232808351516724, "learning_rate": 7.46026853859423e-06, "loss": 0.4361, "step": 1075 }, { "epoch": 1.2085906793935992, "grad_norm": 0.3874123990535736, "learning_rate": 7.454575717489509e-06, "loss": 0.4351, "step": 1076 }, { "epoch": 1.2097136440202134, "grad_norm": 0.4073023796081543, "learning_rate": 7.4488787010311425e-06, "loss": 0.4068, "step": 1077 }, { "epoch": 1.2108366086468276, "grad_norm": 0.40707796812057495, "learning_rate": 7.443177498956453e-06, "loss": 0.4311, "step": 1078 }, { "epoch": 1.2119595732734418, "grad_norm": 0.4083722233772278, "learning_rate": 7.437472121009919e-06, "loss": 0.4182, "step": 1079 }, { "epoch": 1.213082537900056, "grad_norm": 0.4421122968196869, "learning_rate": 7.431762576943157e-06, "loss": 0.4336, "step": 1080 }, { "epoch": 1.2142055025266705, "grad_norm": 0.46059784293174744, "learning_rate": 7.4260488765149016e-06, "loss": 0.4404, "step": 1081 }, { "epoch": 1.2153284671532847, "grad_norm": 0.42434006929397583, "learning_rate": 7.420331029490992e-06, "loss": 0.4032, "step": 1082 }, { "epoch": 1.216451431779899, "grad_norm": 0.4589911699295044, "learning_rate": 7.414609045644356e-06, "loss": 0.4294, "step": 1083 }, { "epoch": 1.217574396406513, "grad_norm": 0.3857876658439636, "learning_rate": 7.408882934754995e-06, "loss": 0.4654, "step": 1084 }, { "epoch": 1.2186973610331275, "grad_norm": 0.44097352027893066, "learning_rate": 7.403152706609958e-06, "loss": 0.4407, "step": 1085 }, { "epoch": 1.2198203256597417, "grad_norm": 0.436617910861969, "learning_rate": 7.3974183710033334e-06, "loss": 0.4188, "step": 1086 }, { "epoch": 1.220943290286356, "grad_norm": 0.3832577168941498, "learning_rate": 7.391679937736231e-06, "loss": 0.4328, "step": 1087 }, { "epoch": 1.2220662549129702, "grad_norm": 0.4680957496166229, "learning_rate": 7.385937416616767e-06, "loss": 0.4404, "step": 1088 }, { "epoch": 1.2231892195395846, "grad_norm": 0.39007121324539185, "learning_rate": 7.38019081746004e-06, "loss": 0.4183, "step": 1089 }, { "epoch": 1.2243121841661988, "grad_norm": 0.39264169335365295, "learning_rate": 7.3744401500881205e-06, "loss": 0.4079, "step": 1090 }, { "epoch": 1.225435148792813, "grad_norm": 0.41599225997924805, "learning_rate": 7.368685424330031e-06, "loss": 0.4502, "step": 1091 }, { "epoch": 1.2265581134194272, "grad_norm": 0.3772522807121277, "learning_rate": 7.362926650021736e-06, "loss": 0.418, "step": 1092 }, { "epoch": 1.2276810780460417, "grad_norm": 0.44597381353378296, "learning_rate": 7.357163837006112e-06, "loss": 0.4517, "step": 1093 }, { "epoch": 1.2288040426726559, "grad_norm": 0.3640742003917694, "learning_rate": 7.351396995132941e-06, "loss": 0.4408, "step": 1094 }, { "epoch": 1.22992700729927, "grad_norm": 0.4243693947792053, "learning_rate": 7.345626134258897e-06, "loss": 0.404, "step": 1095 }, { "epoch": 1.2310499719258843, "grad_norm": 0.42698222398757935, "learning_rate": 7.339851264247516e-06, "loss": 0.4659, "step": 1096 }, { "epoch": 1.2321729365524985, "grad_norm": 0.3970142900943756, "learning_rate": 7.334072394969188e-06, "loss": 0.4181, "step": 1097 }, { "epoch": 1.233295901179113, "grad_norm": 0.5291920304298401, "learning_rate": 7.3282895363011405e-06, "loss": 0.4617, "step": 1098 }, { "epoch": 1.2344188658057271, "grad_norm": 0.39109349250793457, "learning_rate": 7.322502698127421e-06, "loss": 0.4299, "step": 1099 }, { "epoch": 1.2355418304323413, "grad_norm": 0.4219363033771515, "learning_rate": 7.31671189033887e-06, "loss": 0.4293, "step": 1100 }, { "epoch": 1.2366647950589555, "grad_norm": 0.5007031559944153, "learning_rate": 7.310917122833127e-06, "loss": 0.4479, "step": 1101 }, { "epoch": 1.23778775968557, "grad_norm": 0.3711352050304413, "learning_rate": 7.3051184055145855e-06, "loss": 0.4351, "step": 1102 }, { "epoch": 1.2389107243121842, "grad_norm": 0.41111141443252563, "learning_rate": 7.2993157482943995e-06, "loss": 0.442, "step": 1103 }, { "epoch": 1.2400336889387984, "grad_norm": 0.4259604811668396, "learning_rate": 7.293509161090453e-06, "loss": 0.3963, "step": 1104 }, { "epoch": 1.2411566535654126, "grad_norm": 0.39920124411582947, "learning_rate": 7.2876986538273485e-06, "loss": 0.4285, "step": 1105 }, { "epoch": 1.242279618192027, "grad_norm": 0.3950008153915405, "learning_rate": 7.281884236436388e-06, "loss": 0.4393, "step": 1106 }, { "epoch": 1.2434025828186412, "grad_norm": 0.389010488986969, "learning_rate": 7.276065918855554e-06, "loss": 0.4173, "step": 1107 }, { "epoch": 1.2445255474452555, "grad_norm": 0.37770727276802063, "learning_rate": 7.270243711029501e-06, "loss": 0.42, "step": 1108 }, { "epoch": 1.2456485120718697, "grad_norm": 0.40217700600624084, "learning_rate": 7.26441762290953e-06, "loss": 0.4499, "step": 1109 }, { "epoch": 1.246771476698484, "grad_norm": 0.399287611246109, "learning_rate": 7.2585876644535705e-06, "loss": 0.4367, "step": 1110 }, { "epoch": 1.2478944413250983, "grad_norm": 0.3783414959907532, "learning_rate": 7.252753845626173e-06, "loss": 0.4164, "step": 1111 }, { "epoch": 1.2490174059517125, "grad_norm": 0.3832404911518097, "learning_rate": 7.246916176398484e-06, "loss": 0.4488, "step": 1112 }, { "epoch": 1.2501403705783267, "grad_norm": 0.4496491253376007, "learning_rate": 7.241074666748228e-06, "loss": 0.4048, "step": 1113 }, { "epoch": 1.251263335204941, "grad_norm": 0.43809518218040466, "learning_rate": 7.2352293266596985e-06, "loss": 0.4652, "step": 1114 }, { "epoch": 1.2523862998315554, "grad_norm": 0.3762839436531067, "learning_rate": 7.229380166123734e-06, "loss": 0.4195, "step": 1115 }, { "epoch": 1.2535092644581696, "grad_norm": 0.3966275155544281, "learning_rate": 7.2235271951377005e-06, "loss": 0.4322, "step": 1116 }, { "epoch": 1.2546322290847838, "grad_norm": 0.39928337931632996, "learning_rate": 7.2176704237054805e-06, "loss": 0.4179, "step": 1117 }, { "epoch": 1.2557551937113982, "grad_norm": 0.3793953061103821, "learning_rate": 7.211809861837451e-06, "loss": 0.441, "step": 1118 }, { "epoch": 1.2568781583380124, "grad_norm": 0.3831635117530823, "learning_rate": 7.205945519550467e-06, "loss": 0.4253, "step": 1119 }, { "epoch": 1.2580011229646266, "grad_norm": 0.3771590292453766, "learning_rate": 7.200077406867842e-06, "loss": 0.4321, "step": 1120 }, { "epoch": 1.2591240875912408, "grad_norm": 0.33374708890914917, "learning_rate": 7.194205533819343e-06, "loss": 0.4139, "step": 1121 }, { "epoch": 1.260247052217855, "grad_norm": 0.4226832687854767, "learning_rate": 7.188329910441154e-06, "loss": 0.418, "step": 1122 }, { "epoch": 1.2613700168444695, "grad_norm": 0.40420618653297424, "learning_rate": 7.182450546775874e-06, "loss": 0.4499, "step": 1123 }, { "epoch": 1.2624929814710837, "grad_norm": 0.42476701736450195, "learning_rate": 7.176567452872495e-06, "loss": 0.4365, "step": 1124 }, { "epoch": 1.263615946097698, "grad_norm": 0.4315333068370819, "learning_rate": 7.170680638786383e-06, "loss": 0.4442, "step": 1125 }, { "epoch": 1.264738910724312, "grad_norm": 0.4909651279449463, "learning_rate": 7.164790114579263e-06, "loss": 0.4298, "step": 1126 }, { "epoch": 1.2658618753509265, "grad_norm": 0.3845425844192505, "learning_rate": 7.158895890319203e-06, "loss": 0.422, "step": 1127 }, { "epoch": 1.2669848399775407, "grad_norm": 0.44684287905693054, "learning_rate": 7.1529979760805946e-06, "loss": 0.4316, "step": 1128 }, { "epoch": 1.268107804604155, "grad_norm": 0.3446615934371948, "learning_rate": 7.147096381944134e-06, "loss": 0.4184, "step": 1129 }, { "epoch": 1.2692307692307692, "grad_norm": 0.3838743269443512, "learning_rate": 7.14119111799681e-06, "loss": 0.4132, "step": 1130 }, { "epoch": 1.2703537338573834, "grad_norm": 0.39346039295196533, "learning_rate": 7.135282194331881e-06, "loss": 0.4446, "step": 1131 }, { "epoch": 1.2714766984839978, "grad_norm": 0.4078960716724396, "learning_rate": 7.1293696210488625e-06, "loss": 0.4322, "step": 1132 }, { "epoch": 1.272599663110612, "grad_norm": 0.41379958391189575, "learning_rate": 7.123453408253508e-06, "loss": 0.4198, "step": 1133 }, { "epoch": 1.2737226277372262, "grad_norm": 0.38134801387786865, "learning_rate": 7.1175335660577906e-06, "loss": 0.434, "step": 1134 }, { "epoch": 1.2748455923638407, "grad_norm": 0.44374024868011475, "learning_rate": 7.111610104579889e-06, "loss": 0.4747, "step": 1135 }, { "epoch": 1.2759685569904549, "grad_norm": 0.37146177887916565, "learning_rate": 7.105683033944163e-06, "loss": 0.4152, "step": 1136 }, { "epoch": 1.277091521617069, "grad_norm": 0.4043941795825958, "learning_rate": 7.099752364281147e-06, "loss": 0.4183, "step": 1137 }, { "epoch": 1.2782144862436833, "grad_norm": 0.3893197178840637, "learning_rate": 7.093818105727522e-06, "loss": 0.4293, "step": 1138 }, { "epoch": 1.2793374508702975, "grad_norm": 0.3343549072742462, "learning_rate": 7.08788026842611e-06, "loss": 0.4292, "step": 1139 }, { "epoch": 1.280460415496912, "grad_norm": 0.38400304317474365, "learning_rate": 7.0819388625258385e-06, "loss": 0.4353, "step": 1140 }, { "epoch": 1.2815833801235261, "grad_norm": 0.36342015862464905, "learning_rate": 7.075993898181748e-06, "loss": 0.4707, "step": 1141 }, { "epoch": 1.2827063447501403, "grad_norm": 0.35770684480667114, "learning_rate": 7.070045385554948e-06, "loss": 0.3863, "step": 1142 }, { "epoch": 1.2838293093767548, "grad_norm": 0.38918396830558777, "learning_rate": 7.0640933348126235e-06, "loss": 0.4425, "step": 1143 }, { "epoch": 1.284952274003369, "grad_norm": 0.39483869075775146, "learning_rate": 7.058137756128001e-06, "loss": 0.4506, "step": 1144 }, { "epoch": 1.2860752386299832, "grad_norm": 0.38156113028526306, "learning_rate": 7.052178659680336e-06, "loss": 0.4341, "step": 1145 }, { "epoch": 1.2871982032565974, "grad_norm": 0.40314194560050964, "learning_rate": 7.046216055654902e-06, "loss": 0.4238, "step": 1146 }, { "epoch": 1.2883211678832116, "grad_norm": 0.3670646548271179, "learning_rate": 7.040249954242962e-06, "loss": 0.4275, "step": 1147 }, { "epoch": 1.2894441325098258, "grad_norm": 0.3831973671913147, "learning_rate": 7.034280365641759e-06, "loss": 0.4192, "step": 1148 }, { "epoch": 1.2905670971364402, "grad_norm": 0.44106510281562805, "learning_rate": 7.028307300054499e-06, "loss": 0.4938, "step": 1149 }, { "epoch": 1.2916900617630545, "grad_norm": 0.35511115193367004, "learning_rate": 7.022330767690326e-06, "loss": 0.4048, "step": 1150 }, { "epoch": 1.2928130263896687, "grad_norm": 0.4103480577468872, "learning_rate": 7.016350778764311e-06, "loss": 0.4354, "step": 1151 }, { "epoch": 1.293935991016283, "grad_norm": 0.3761599659919739, "learning_rate": 7.0103673434974375e-06, "loss": 0.4557, "step": 1152 }, { "epoch": 1.2950589556428973, "grad_norm": 0.4244709312915802, "learning_rate": 7.004380472116571e-06, "loss": 0.4188, "step": 1153 }, { "epoch": 1.2961819202695115, "grad_norm": 0.37143412232398987, "learning_rate": 6.998390174854457e-06, "loss": 0.4139, "step": 1154 }, { "epoch": 1.2973048848961257, "grad_norm": 0.4021906554698944, "learning_rate": 6.992396461949693e-06, "loss": 0.4189, "step": 1155 }, { "epoch": 1.29842784952274, "grad_norm": 0.3602926433086395, "learning_rate": 6.986399343646717e-06, "loss": 0.418, "step": 1156 }, { "epoch": 1.2995508141493544, "grad_norm": 0.38307708501815796, "learning_rate": 6.980398830195785e-06, "loss": 0.4398, "step": 1157 }, { "epoch": 1.3006737787759686, "grad_norm": 0.3833075761795044, "learning_rate": 6.974394931852957e-06, "loss": 0.4153, "step": 1158 }, { "epoch": 1.3017967434025828, "grad_norm": 0.35621264576911926, "learning_rate": 6.968387658880079e-06, "loss": 0.4336, "step": 1159 }, { "epoch": 1.3029197080291972, "grad_norm": 0.3272477686405182, "learning_rate": 6.962377021544765e-06, "loss": 0.4068, "step": 1160 }, { "epoch": 1.3040426726558114, "grad_norm": 0.3801352381706238, "learning_rate": 6.956363030120377e-06, "loss": 0.4692, "step": 1161 }, { "epoch": 1.3051656372824256, "grad_norm": 0.3861273527145386, "learning_rate": 6.950345694886013e-06, "loss": 0.4358, "step": 1162 }, { "epoch": 1.3062886019090398, "grad_norm": 0.3499404788017273, "learning_rate": 6.9443250261264846e-06, "loss": 0.4307, "step": 1163 }, { "epoch": 1.307411566535654, "grad_norm": 0.3578338027000427, "learning_rate": 6.9383010341323e-06, "loss": 0.4251, "step": 1164 }, { "epoch": 1.3085345311622683, "grad_norm": 0.38979002833366394, "learning_rate": 6.932273729199651e-06, "loss": 0.4141, "step": 1165 }, { "epoch": 1.3096574957888827, "grad_norm": 0.34255969524383545, "learning_rate": 6.926243121630387e-06, "loss": 0.4399, "step": 1166 }, { "epoch": 1.310780460415497, "grad_norm": 0.37964457273483276, "learning_rate": 6.920209221732007e-06, "loss": 0.4305, "step": 1167 }, { "epoch": 1.311903425042111, "grad_norm": 0.33156418800354004, "learning_rate": 6.914172039817635e-06, "loss": 0.4067, "step": 1168 }, { "epoch": 1.3130263896687255, "grad_norm": 0.4058791697025299, "learning_rate": 6.9081315862060035e-06, "loss": 0.4198, "step": 1169 }, { "epoch": 1.3141493542953397, "grad_norm": 0.38683784008026123, "learning_rate": 6.902087871221439e-06, "loss": 0.4729, "step": 1170 }, { "epoch": 1.315272318921954, "grad_norm": 0.318362295627594, "learning_rate": 6.89604090519384e-06, "loss": 0.3873, "step": 1171 }, { "epoch": 1.3163952835485682, "grad_norm": 0.36207136511802673, "learning_rate": 6.889990698458666e-06, "loss": 0.4413, "step": 1172 }, { "epoch": 1.3175182481751824, "grad_norm": 0.41741979122161865, "learning_rate": 6.88393726135691e-06, "loss": 0.4539, "step": 1173 }, { "epoch": 1.3186412128017968, "grad_norm": 0.3765466511249542, "learning_rate": 6.87788060423509e-06, "loss": 0.406, "step": 1174 }, { "epoch": 1.319764177428411, "grad_norm": 0.37224501371383667, "learning_rate": 6.871820737445227e-06, "loss": 0.4207, "step": 1175 }, { "epoch": 1.3208871420550252, "grad_norm": 0.35307183861732483, "learning_rate": 6.865757671344827e-06, "loss": 0.3896, "step": 1176 }, { "epoch": 1.3220101066816397, "grad_norm": 0.3841562569141388, "learning_rate": 6.859691416296864e-06, "loss": 0.4665, "step": 1177 }, { "epoch": 1.3231330713082539, "grad_norm": 0.41068384051322937, "learning_rate": 6.853621982669766e-06, "loss": 0.4331, "step": 1178 }, { "epoch": 1.324256035934868, "grad_norm": 0.395233690738678, "learning_rate": 6.8475493808373895e-06, "loss": 0.428, "step": 1179 }, { "epoch": 1.3253790005614823, "grad_norm": 0.3667590618133545, "learning_rate": 6.841473621179006e-06, "loss": 0.4579, "step": 1180 }, { "epoch": 1.3265019651880965, "grad_norm": 0.40979325771331787, "learning_rate": 6.835394714079292e-06, "loss": 0.4363, "step": 1181 }, { "epoch": 1.327624929814711, "grad_norm": 0.3612469434738159, "learning_rate": 6.829312669928293e-06, "loss": 0.454, "step": 1182 }, { "epoch": 1.3287478944413251, "grad_norm": 0.36161094903945923, "learning_rate": 6.8232274991214206e-06, "loss": 0.4169, "step": 1183 }, { "epoch": 1.3298708590679393, "grad_norm": 0.4315537214279175, "learning_rate": 6.817139212059434e-06, "loss": 0.4307, "step": 1184 }, { "epoch": 1.3309938236945535, "grad_norm": 0.3633710741996765, "learning_rate": 6.811047819148413e-06, "loss": 0.4153, "step": 1185 }, { "epoch": 1.332116788321168, "grad_norm": 0.42442455887794495, "learning_rate": 6.804953330799751e-06, "loss": 0.4251, "step": 1186 }, { "epoch": 1.3332397529477822, "grad_norm": 0.3617255687713623, "learning_rate": 6.798855757430127e-06, "loss": 0.4341, "step": 1187 }, { "epoch": 1.3343627175743964, "grad_norm": 0.40096455812454224, "learning_rate": 6.792755109461498e-06, "loss": 0.4688, "step": 1188 }, { "epoch": 1.3354856822010106, "grad_norm": 0.3413197994232178, "learning_rate": 6.786651397321073e-06, "loss": 0.39, "step": 1189 }, { "epoch": 1.3366086468276248, "grad_norm": 0.39256274700164795, "learning_rate": 6.780544631441297e-06, "loss": 0.436, "step": 1190 }, { "epoch": 1.3377316114542392, "grad_norm": 0.3553450405597687, "learning_rate": 6.7744348222598386e-06, "loss": 0.4348, "step": 1191 }, { "epoch": 1.3388545760808535, "grad_norm": 0.35636499524116516, "learning_rate": 6.768321980219565e-06, "loss": 0.4279, "step": 1192 }, { "epoch": 1.3399775407074677, "grad_norm": 0.39261358976364136, "learning_rate": 6.76220611576853e-06, "loss": 0.4116, "step": 1193 }, { "epoch": 1.341100505334082, "grad_norm": 0.3832884132862091, "learning_rate": 6.756087239359948e-06, "loss": 0.4386, "step": 1194 }, { "epoch": 1.3422234699606963, "grad_norm": 0.4396374821662903, "learning_rate": 6.749965361452187e-06, "loss": 0.4386, "step": 1195 }, { "epoch": 1.3433464345873105, "grad_norm": 0.4155983030796051, "learning_rate": 6.7438404925087395e-06, "loss": 0.4248, "step": 1196 }, { "epoch": 1.3444693992139247, "grad_norm": 0.3759906589984894, "learning_rate": 6.737712642998219e-06, "loss": 0.4425, "step": 1197 }, { "epoch": 1.345592363840539, "grad_norm": 0.38770052790641785, "learning_rate": 6.731581823394324e-06, "loss": 0.4306, "step": 1198 }, { "epoch": 1.3467153284671534, "grad_norm": 0.38199129700660706, "learning_rate": 6.725448044175835e-06, "loss": 0.4248, "step": 1199 }, { "epoch": 1.3478382930937676, "grad_norm": 0.34937921166419983, "learning_rate": 6.719311315826589e-06, "loss": 0.4151, "step": 1200 }, { "epoch": 1.3489612577203818, "grad_norm": 0.3748745024204254, "learning_rate": 6.713171648835463e-06, "loss": 0.4327, "step": 1201 }, { "epoch": 1.350084222346996, "grad_norm": 0.4248553514480591, "learning_rate": 6.70702905369636e-06, "loss": 0.4508, "step": 1202 }, { "epoch": 1.3512071869736104, "grad_norm": 0.36588263511657715, "learning_rate": 6.700883540908185e-06, "loss": 0.4556, "step": 1203 }, { "epoch": 1.3523301516002246, "grad_norm": 0.3791574537754059, "learning_rate": 6.694735120974829e-06, "loss": 0.4628, "step": 1204 }, { "epoch": 1.3534531162268388, "grad_norm": 0.34273386001586914, "learning_rate": 6.688583804405157e-06, "loss": 0.3961, "step": 1205 }, { "epoch": 1.354576080853453, "grad_norm": 0.348168283700943, "learning_rate": 6.682429601712976e-06, "loss": 0.4415, "step": 1206 }, { "epoch": 1.3556990454800673, "grad_norm": 0.3491176962852478, "learning_rate": 6.676272523417038e-06, "loss": 0.4144, "step": 1207 }, { "epoch": 1.3568220101066817, "grad_norm": 0.3845398426055908, "learning_rate": 6.6701125800409974e-06, "loss": 0.4266, "step": 1208 }, { "epoch": 1.357944974733296, "grad_norm": 0.36513039469718933, "learning_rate": 6.663949782113413e-06, "loss": 0.4464, "step": 1209 }, { "epoch": 1.35906793935991, "grad_norm": 0.33787697553634644, "learning_rate": 6.657784140167722e-06, "loss": 0.4103, "step": 1210 }, { "epoch": 1.3601909039865245, "grad_norm": 0.34767699241638184, "learning_rate": 6.651615664742221e-06, "loss": 0.378, "step": 1211 }, { "epoch": 1.3613138686131387, "grad_norm": 0.40625014901161194, "learning_rate": 6.64544436638005e-06, "loss": 0.4423, "step": 1212 }, { "epoch": 1.362436833239753, "grad_norm": 0.41110461950302124, "learning_rate": 6.6392702556291755e-06, "loss": 0.4592, "step": 1213 }, { "epoch": 1.3635597978663672, "grad_norm": 0.3891259431838989, "learning_rate": 6.633093343042368e-06, "loss": 0.4235, "step": 1214 }, { "epoch": 1.3646827624929814, "grad_norm": 0.38320687413215637, "learning_rate": 6.626913639177189e-06, "loss": 0.4235, "step": 1215 }, { "epoch": 1.3658057271195958, "grad_norm": 0.38389772176742554, "learning_rate": 6.620731154595971e-06, "loss": 0.4264, "step": 1216 }, { "epoch": 1.36692869174621, "grad_norm": 0.45837917923927307, "learning_rate": 6.614545899865796e-06, "loss": 0.4213, "step": 1217 }, { "epoch": 1.3680516563728242, "grad_norm": 0.3501172363758087, "learning_rate": 6.608357885558485e-06, "loss": 0.4028, "step": 1218 }, { "epoch": 1.3691746209994387, "grad_norm": 0.4277402460575104, "learning_rate": 6.602167122250575e-06, "loss": 0.4227, "step": 1219 }, { "epoch": 1.3702975856260529, "grad_norm": 0.40891093015670776, "learning_rate": 6.595973620523302e-06, "loss": 0.4327, "step": 1220 }, { "epoch": 1.371420550252667, "grad_norm": 0.3398943543434143, "learning_rate": 6.589777390962575e-06, "loss": 0.4184, "step": 1221 }, { "epoch": 1.3725435148792813, "grad_norm": 0.5453594326972961, "learning_rate": 6.583578444158978e-06, "loss": 0.4471, "step": 1222 }, { "epoch": 1.3736664795058955, "grad_norm": 0.41003528237342834, "learning_rate": 6.577376790707729e-06, "loss": 0.4042, "step": 1223 }, { "epoch": 1.3747894441325097, "grad_norm": 0.432170033454895, "learning_rate": 6.571172441208678e-06, "loss": 0.4279, "step": 1224 }, { "epoch": 1.3759124087591241, "grad_norm": 0.4721520245075226, "learning_rate": 6.564965406266278e-06, "loss": 0.4327, "step": 1225 }, { "epoch": 1.3770353733857383, "grad_norm": 0.3714945912361145, "learning_rate": 6.558755696489578e-06, "loss": 0.3999, "step": 1226 }, { "epoch": 1.3781583380123525, "grad_norm": 0.4178280234336853, "learning_rate": 6.552543322492195e-06, "loss": 0.4265, "step": 1227 }, { "epoch": 1.379281302638967, "grad_norm": 0.35271939635276794, "learning_rate": 6.5463282948923e-06, "loss": 0.4312, "step": 1228 }, { "epoch": 1.3804042672655812, "grad_norm": 0.3787769377231598, "learning_rate": 6.540110624312601e-06, "loss": 0.4206, "step": 1229 }, { "epoch": 1.3815272318921954, "grad_norm": 0.3512360453605652, "learning_rate": 6.53389032138032e-06, "loss": 0.4321, "step": 1230 }, { "epoch": 1.3826501965188096, "grad_norm": 0.401193767786026, "learning_rate": 6.527667396727182e-06, "loss": 0.4435, "step": 1231 }, { "epoch": 1.3837731611454238, "grad_norm": 0.34978967905044556, "learning_rate": 6.521441860989395e-06, "loss": 0.4262, "step": 1232 }, { "epoch": 1.3848961257720382, "grad_norm": 0.36464646458625793, "learning_rate": 6.515213724807621e-06, "loss": 0.4073, "step": 1233 }, { "epoch": 1.3860190903986525, "grad_norm": 0.45479047298431396, "learning_rate": 6.508982998826975e-06, "loss": 0.4337, "step": 1234 }, { "epoch": 1.3871420550252667, "grad_norm": 0.38626670837402344, "learning_rate": 6.502749693696996e-06, "loss": 0.4305, "step": 1235 }, { "epoch": 1.388265019651881, "grad_norm": 0.40777796506881714, "learning_rate": 6.49651382007163e-06, "loss": 0.4475, "step": 1236 }, { "epoch": 1.3893879842784953, "grad_norm": 0.39584973454475403, "learning_rate": 6.490275388609213e-06, "loss": 0.4188, "step": 1237 }, { "epoch": 1.3905109489051095, "grad_norm": 0.35893332958221436, "learning_rate": 6.484034409972457e-06, "loss": 0.3989, "step": 1238 }, { "epoch": 1.3916339135317237, "grad_norm": 0.3474050760269165, "learning_rate": 6.477790894828422e-06, "loss": 0.4327, "step": 1239 }, { "epoch": 1.392756878158338, "grad_norm": 0.3613208532333374, "learning_rate": 6.471544853848504e-06, "loss": 0.4139, "step": 1240 }, { "epoch": 1.3938798427849521, "grad_norm": 0.3558221757411957, "learning_rate": 6.465296297708423e-06, "loss": 0.4117, "step": 1241 }, { "epoch": 1.3950028074115666, "grad_norm": 0.3806075155735016, "learning_rate": 6.459045237088189e-06, "loss": 0.4625, "step": 1242 }, { "epoch": 1.3961257720381808, "grad_norm": 0.3768899440765381, "learning_rate": 6.452791682672097e-06, "loss": 0.4256, "step": 1243 }, { "epoch": 1.397248736664795, "grad_norm": 0.33480802178382874, "learning_rate": 6.446535645148705e-06, "loss": 0.3783, "step": 1244 }, { "epoch": 1.3983717012914094, "grad_norm": 0.36673128604888916, "learning_rate": 6.440277135210815e-06, "loss": 0.4413, "step": 1245 }, { "epoch": 1.3994946659180236, "grad_norm": 0.33374840021133423, "learning_rate": 6.434016163555452e-06, "loss": 0.4168, "step": 1246 }, { "epoch": 1.4006176305446378, "grad_norm": 0.4072856903076172, "learning_rate": 6.42775274088385e-06, "loss": 0.4399, "step": 1247 }, { "epoch": 1.401740595171252, "grad_norm": 0.3470134735107422, "learning_rate": 6.421486877901436e-06, "loss": 0.4202, "step": 1248 }, { "epoch": 1.4028635597978663, "grad_norm": 0.3497457802295685, "learning_rate": 6.415218585317802e-06, "loss": 0.4178, "step": 1249 }, { "epoch": 1.4039865244244807, "grad_norm": 0.3406730890274048, "learning_rate": 6.408947873846695e-06, "loss": 0.4374, "step": 1250 }, { "epoch": 1.405109489051095, "grad_norm": 0.3473089337348938, "learning_rate": 6.402674754205998e-06, "loss": 0.3926, "step": 1251 }, { "epoch": 1.406232453677709, "grad_norm": 0.3695933222770691, "learning_rate": 6.396399237117709e-06, "loss": 0.4476, "step": 1252 }, { "epoch": 1.4073554183043235, "grad_norm": 0.30525869131088257, "learning_rate": 6.390121333307921e-06, "loss": 0.4076, "step": 1253 }, { "epoch": 1.4084783829309377, "grad_norm": 0.3395358622074127, "learning_rate": 6.383841053506813e-06, "loss": 0.4576, "step": 1254 }, { "epoch": 1.409601347557552, "grad_norm": 0.36095455288887024, "learning_rate": 6.377558408448618e-06, "loss": 0.436, "step": 1255 }, { "epoch": 1.4107243121841662, "grad_norm": 0.35125985741615295, "learning_rate": 6.371273408871614e-06, "loss": 0.4556, "step": 1256 }, { "epoch": 1.4118472768107804, "grad_norm": 0.41034063696861267, "learning_rate": 6.364986065518106e-06, "loss": 0.45, "step": 1257 }, { "epoch": 1.4129702414373948, "grad_norm": 0.36360031366348267, "learning_rate": 6.358696389134402e-06, "loss": 0.4069, "step": 1258 }, { "epoch": 1.414093206064009, "grad_norm": 0.33977606892585754, "learning_rate": 6.352404390470799e-06, "loss": 0.4476, "step": 1259 }, { "epoch": 1.4152161706906232, "grad_norm": 0.39067739248275757, "learning_rate": 6.3461100802815625e-06, "loss": 0.3977, "step": 1260 }, { "epoch": 1.4163391353172374, "grad_norm": 0.42939794063568115, "learning_rate": 6.3398134693249095e-06, "loss": 0.432, "step": 1261 }, { "epoch": 1.4174620999438519, "grad_norm": 0.380805641412735, "learning_rate": 6.333514568362987e-06, "loss": 0.3895, "step": 1262 }, { "epoch": 1.418585064570466, "grad_norm": 0.31445202231407166, "learning_rate": 6.3272133881618596e-06, "loss": 0.4525, "step": 1263 }, { "epoch": 1.4197080291970803, "grad_norm": 0.3775096535682678, "learning_rate": 6.320909939491486e-06, "loss": 0.4223, "step": 1264 }, { "epoch": 1.4208309938236945, "grad_norm": 0.357318639755249, "learning_rate": 6.314604233125703e-06, "loss": 0.3891, "step": 1265 }, { "epoch": 1.4219539584503087, "grad_norm": 0.49865302443504333, "learning_rate": 6.308296279842204e-06, "loss": 0.4571, "step": 1266 }, { "epoch": 1.4230769230769231, "grad_norm": 0.32295238971710205, "learning_rate": 6.301986090422526e-06, "loss": 0.4407, "step": 1267 }, { "epoch": 1.4241998877035373, "grad_norm": 0.40025267004966736, "learning_rate": 6.295673675652024e-06, "loss": 0.4396, "step": 1268 }, { "epoch": 1.4253228523301515, "grad_norm": 0.38461238145828247, "learning_rate": 6.289359046319862e-06, "loss": 0.4301, "step": 1269 }, { "epoch": 1.426445816956766, "grad_norm": 0.3790307939052582, "learning_rate": 6.283042213218983e-06, "loss": 0.4466, "step": 1270 }, { "epoch": 1.4275687815833802, "grad_norm": 0.4022485315799713, "learning_rate": 6.276723187146102e-06, "loss": 0.422, "step": 1271 }, { "epoch": 1.4286917462099944, "grad_norm": 0.34780874848365784, "learning_rate": 6.270401978901678e-06, "loss": 0.4708, "step": 1272 }, { "epoch": 1.4298147108366086, "grad_norm": 0.3148737847805023, "learning_rate": 6.264078599289904e-06, "loss": 0.3829, "step": 1273 }, { "epoch": 1.4309376754632228, "grad_norm": 0.3738420307636261, "learning_rate": 6.25775305911868e-06, "loss": 0.4305, "step": 1274 }, { "epoch": 1.4320606400898372, "grad_norm": 0.36536839604377747, "learning_rate": 6.2514253691996e-06, "loss": 0.432, "step": 1275 }, { "epoch": 1.4331836047164515, "grad_norm": 0.3619723916053772, "learning_rate": 6.245095540347937e-06, "loss": 0.4518, "step": 1276 }, { "epoch": 1.4343065693430657, "grad_norm": 0.34398528933525085, "learning_rate": 6.238763583382611e-06, "loss": 0.4237, "step": 1277 }, { "epoch": 1.43542953396968, "grad_norm": 0.3524113595485687, "learning_rate": 6.2324295091261885e-06, "loss": 0.4499, "step": 1278 }, { "epoch": 1.4365524985962943, "grad_norm": 0.3545290231704712, "learning_rate": 6.226093328404848e-06, "loss": 0.4599, "step": 1279 }, { "epoch": 1.4376754632229085, "grad_norm": 0.3287031352519989, "learning_rate": 6.2197550520483725e-06, "loss": 0.3986, "step": 1280 }, { "epoch": 1.4387984278495227, "grad_norm": 0.3222607672214508, "learning_rate": 6.213414690890125e-06, "loss": 0.3971, "step": 1281 }, { "epoch": 1.439921392476137, "grad_norm": 0.3929697871208191, "learning_rate": 6.207072255767032e-06, "loss": 0.4391, "step": 1282 }, { "epoch": 1.4410443571027511, "grad_norm": 0.4390507638454437, "learning_rate": 6.2007277575195645e-06, "loss": 0.4403, "step": 1283 }, { "epoch": 1.4421673217293656, "grad_norm": 0.3282164931297302, "learning_rate": 6.194381206991723e-06, "loss": 0.4375, "step": 1284 }, { "epoch": 1.4432902863559798, "grad_norm": 0.3594493567943573, "learning_rate": 6.188032615031011e-06, "loss": 0.419, "step": 1285 }, { "epoch": 1.444413250982594, "grad_norm": 0.3734440505504608, "learning_rate": 6.181681992488424e-06, "loss": 0.4206, "step": 1286 }, { "epoch": 1.4455362156092084, "grad_norm": 0.4052230417728424, "learning_rate": 6.175329350218426e-06, "loss": 0.4645, "step": 1287 }, { "epoch": 1.4466591802358226, "grad_norm": 0.2994524836540222, "learning_rate": 6.168974699078937e-06, "loss": 0.3853, "step": 1288 }, { "epoch": 1.4477821448624368, "grad_norm": 0.36587098240852356, "learning_rate": 6.1626180499313075e-06, "loss": 0.4381, "step": 1289 }, { "epoch": 1.448905109489051, "grad_norm": 0.3327069580554962, "learning_rate": 6.156259413640302e-06, "loss": 0.4427, "step": 1290 }, { "epoch": 1.4500280741156653, "grad_norm": 0.3494040369987488, "learning_rate": 6.149898801074084e-06, "loss": 0.4865, "step": 1291 }, { "epoch": 1.4511510387422797, "grad_norm": 0.3384385406970978, "learning_rate": 6.143536223104194e-06, "loss": 0.4338, "step": 1292 }, { "epoch": 1.452274003368894, "grad_norm": 0.2958364188671112, "learning_rate": 6.1371716906055336e-06, "loss": 0.3901, "step": 1293 }, { "epoch": 1.453396967995508, "grad_norm": 0.37118834257125854, "learning_rate": 6.130805214456339e-06, "loss": 0.4647, "step": 1294 }, { "epoch": 1.4545199326221225, "grad_norm": 0.312923789024353, "learning_rate": 6.124436805538176e-06, "loss": 0.4267, "step": 1295 }, { "epoch": 1.4556428972487367, "grad_norm": 0.3563215136528015, "learning_rate": 6.11806647473591e-06, "loss": 0.4104, "step": 1296 }, { "epoch": 1.456765861875351, "grad_norm": 0.31082478165626526, "learning_rate": 6.111694232937691e-06, "loss": 0.4469, "step": 1297 }, { "epoch": 1.4578888265019652, "grad_norm": 0.36104464530944824, "learning_rate": 6.105320091034937e-06, "loss": 0.4213, "step": 1298 }, { "epoch": 1.4590117911285794, "grad_norm": 0.33642077445983887, "learning_rate": 6.098944059922311e-06, "loss": 0.4303, "step": 1299 }, { "epoch": 1.4601347557551936, "grad_norm": 0.3139961361885071, "learning_rate": 6.09256615049771e-06, "loss": 0.4015, "step": 1300 }, { "epoch": 1.461257720381808, "grad_norm": 0.4025218188762665, "learning_rate": 6.086186373662233e-06, "loss": 0.4657, "step": 1301 }, { "epoch": 1.4623806850084222, "grad_norm": 0.303168386220932, "learning_rate": 6.079804740320181e-06, "loss": 0.4011, "step": 1302 }, { "epoch": 1.4635036496350364, "grad_norm": 0.29873350262641907, "learning_rate": 6.073421261379021e-06, "loss": 0.4094, "step": 1303 }, { "epoch": 1.4646266142616509, "grad_norm": 0.34536266326904297, "learning_rate": 6.067035947749376e-06, "loss": 0.4353, "step": 1304 }, { "epoch": 1.465749578888265, "grad_norm": 0.4209367036819458, "learning_rate": 6.060648810345006e-06, "loss": 0.468, "step": 1305 }, { "epoch": 1.4668725435148793, "grad_norm": 0.32413214445114136, "learning_rate": 6.054259860082788e-06, "loss": 0.4136, "step": 1306 }, { "epoch": 1.4679955081414935, "grad_norm": 0.3966822028160095, "learning_rate": 6.0478691078826956e-06, "loss": 0.4294, "step": 1307 }, { "epoch": 1.4691184727681077, "grad_norm": 0.34370726346969604, "learning_rate": 6.041476564667785e-06, "loss": 0.3965, "step": 1308 }, { "epoch": 1.4702414373947221, "grad_norm": 0.350951611995697, "learning_rate": 6.035082241364173e-06, "loss": 0.445, "step": 1309 }, { "epoch": 1.4713644020213363, "grad_norm": 0.33211949467658997, "learning_rate": 6.0286861489010175e-06, "loss": 0.4277, "step": 1310 }, { "epoch": 1.4724873666479505, "grad_norm": 0.34408682584762573, "learning_rate": 6.022288298210502e-06, "loss": 0.4328, "step": 1311 }, { "epoch": 1.473610331274565, "grad_norm": 0.32697176933288574, "learning_rate": 6.0158887002278124e-06, "loss": 0.4669, "step": 1312 }, { "epoch": 1.4747332959011792, "grad_norm": 0.3053934574127197, "learning_rate": 6.009487365891123e-06, "loss": 0.4309, "step": 1313 }, { "epoch": 1.4758562605277934, "grad_norm": 0.3923242390155792, "learning_rate": 6.003084306141579e-06, "loss": 0.4615, "step": 1314 }, { "epoch": 1.4769792251544076, "grad_norm": 0.31487324833869934, "learning_rate": 5.996679531923268e-06, "loss": 0.404, "step": 1315 }, { "epoch": 1.4781021897810218, "grad_norm": 0.4577028751373291, "learning_rate": 5.990273054183212e-06, "loss": 0.4537, "step": 1316 }, { "epoch": 1.479225154407636, "grad_norm": 0.3137701451778412, "learning_rate": 5.983864883871344e-06, "loss": 0.3824, "step": 1317 }, { "epoch": 1.4803481190342505, "grad_norm": 0.363899290561676, "learning_rate": 5.977455031940491e-06, "loss": 0.4449, "step": 1318 }, { "epoch": 1.4814710836608647, "grad_norm": 0.4452219307422638, "learning_rate": 5.971043509346353e-06, "loss": 0.4379, "step": 1319 }, { "epoch": 1.4825940482874789, "grad_norm": 0.3558255732059479, "learning_rate": 5.964630327047485e-06, "loss": 0.4205, "step": 1320 }, { "epoch": 1.4837170129140933, "grad_norm": 0.35796085000038147, "learning_rate": 5.95821549600528e-06, "loss": 0.4719, "step": 1321 }, { "epoch": 1.4848399775407075, "grad_norm": 0.35168972611427307, "learning_rate": 5.951799027183948e-06, "loss": 0.385, "step": 1322 }, { "epoch": 1.4859629421673217, "grad_norm": 0.4219711124897003, "learning_rate": 5.945380931550497e-06, "loss": 0.4582, "step": 1323 }, { "epoch": 1.487085906793936, "grad_norm": 0.34758421778678894, "learning_rate": 5.9389612200747224e-06, "loss": 0.4292, "step": 1324 }, { "epoch": 1.4882088714205501, "grad_norm": 0.4434061646461487, "learning_rate": 5.932539903729173e-06, "loss": 0.4474, "step": 1325 }, { "epoch": 1.4893318360471646, "grad_norm": 0.4019496440887451, "learning_rate": 5.926116993489143e-06, "loss": 0.4179, "step": 1326 }, { "epoch": 1.4904548006737788, "grad_norm": 0.46054908633232117, "learning_rate": 5.919692500332653e-06, "loss": 0.4476, "step": 1327 }, { "epoch": 1.491577765300393, "grad_norm": 0.3225356340408325, "learning_rate": 5.913266435240429e-06, "loss": 0.3749, "step": 1328 }, { "epoch": 1.4927007299270074, "grad_norm": 0.4698438346385956, "learning_rate": 5.906838809195879e-06, "loss": 0.4727, "step": 1329 }, { "epoch": 1.4938236945536216, "grad_norm": 0.3310648798942566, "learning_rate": 5.900409633185088e-06, "loss": 0.4047, "step": 1330 }, { "epoch": 1.4949466591802358, "grad_norm": 0.37283462285995483, "learning_rate": 5.8939789181967825e-06, "loss": 0.4195, "step": 1331 }, { "epoch": 1.49606962380685, "grad_norm": 0.42325764894485474, "learning_rate": 5.887546675222319e-06, "loss": 0.456, "step": 1332 }, { "epoch": 1.4971925884334643, "grad_norm": 0.3257545530796051, "learning_rate": 5.8811129152556725e-06, "loss": 0.4257, "step": 1333 }, { "epoch": 1.4983155530600787, "grad_norm": 0.44540300965309143, "learning_rate": 5.874677649293403e-06, "loss": 0.4374, "step": 1334 }, { "epoch": 1.499438517686693, "grad_norm": 0.3939068913459778, "learning_rate": 5.8682408883346535e-06, "loss": 0.4439, "step": 1335 }, { "epoch": 1.500561482313307, "grad_norm": 0.38308948278427124, "learning_rate": 5.8618026433811105e-06, "loss": 0.4336, "step": 1336 }, { "epoch": 1.5016844469399215, "grad_norm": 0.3991581201553345, "learning_rate": 5.855362925437009e-06, "loss": 0.4242, "step": 1337 }, { "epoch": 1.5028074115665357, "grad_norm": 0.37797030806541443, "learning_rate": 5.848921745509094e-06, "loss": 0.44, "step": 1338 }, { "epoch": 1.50393037619315, "grad_norm": 0.36779412627220154, "learning_rate": 5.842479114606609e-06, "loss": 0.4314, "step": 1339 }, { "epoch": 1.5050533408197642, "grad_norm": 0.40843796730041504, "learning_rate": 5.836035043741285e-06, "loss": 0.4624, "step": 1340 }, { "epoch": 1.5061763054463784, "grad_norm": 0.3803006112575531, "learning_rate": 5.829589543927305e-06, "loss": 0.4044, "step": 1341 }, { "epoch": 1.5072992700729926, "grad_norm": 0.3996165096759796, "learning_rate": 5.823142626181299e-06, "loss": 0.4424, "step": 1342 }, { "epoch": 1.508422234699607, "grad_norm": 0.35693812370300293, "learning_rate": 5.816694301522323e-06, "loss": 0.4518, "step": 1343 }, { "epoch": 1.5095451993262212, "grad_norm": 0.34747445583343506, "learning_rate": 5.8102445809718325e-06, "loss": 0.4182, "step": 1344 }, { "epoch": 1.5106681639528357, "grad_norm": 0.33477890491485596, "learning_rate": 5.803793475553669e-06, "loss": 0.4085, "step": 1345 }, { "epoch": 1.5117911285794499, "grad_norm": 0.39569091796875, "learning_rate": 5.797340996294046e-06, "loss": 0.4451, "step": 1346 }, { "epoch": 1.512914093206064, "grad_norm": 0.301170289516449, "learning_rate": 5.790887154221521e-06, "loss": 0.395, "step": 1347 }, { "epoch": 1.5140370578326783, "grad_norm": 0.3946448564529419, "learning_rate": 5.78443196036698e-06, "loss": 0.4226, "step": 1348 }, { "epoch": 1.5151600224592925, "grad_norm": 0.3706148564815521, "learning_rate": 5.777975425763628e-06, "loss": 0.4324, "step": 1349 }, { "epoch": 1.5162829870859067, "grad_norm": 0.3294577896595001, "learning_rate": 5.771517561446949e-06, "loss": 0.4083, "step": 1350 }, { "epoch": 1.517405951712521, "grad_norm": 0.40533217787742615, "learning_rate": 5.765058378454707e-06, "loss": 0.4314, "step": 1351 }, { "epoch": 1.5185289163391353, "grad_norm": 0.36212074756622314, "learning_rate": 5.75859788782692e-06, "loss": 0.4251, "step": 1352 }, { "epoch": 1.5196518809657495, "grad_norm": 0.3440522253513336, "learning_rate": 5.75213610060584e-06, "loss": 0.4176, "step": 1353 }, { "epoch": 1.520774845592364, "grad_norm": 0.3442220091819763, "learning_rate": 5.745673027835933e-06, "loss": 0.4458, "step": 1354 }, { "epoch": 1.5218978102189782, "grad_norm": 0.3430546522140503, "learning_rate": 5.739208680563863e-06, "loss": 0.4189, "step": 1355 }, { "epoch": 1.5230207748455924, "grad_norm": 0.3303071856498718, "learning_rate": 5.7327430698384775e-06, "loss": 0.44, "step": 1356 }, { "epoch": 1.5241437394722066, "grad_norm": 0.32069525122642517, "learning_rate": 5.726276206710779e-06, "loss": 0.4127, "step": 1357 }, { "epoch": 1.5252667040988208, "grad_norm": 0.33193540573120117, "learning_rate": 5.719808102233907e-06, "loss": 0.4233, "step": 1358 }, { "epoch": 1.526389668725435, "grad_norm": 0.3794214427471161, "learning_rate": 5.713338767463129e-06, "loss": 0.4496, "step": 1359 }, { "epoch": 1.5275126333520495, "grad_norm": 0.351874977350235, "learning_rate": 5.706868213455814e-06, "loss": 0.4168, "step": 1360 }, { "epoch": 1.5286355979786637, "grad_norm": 0.3340453803539276, "learning_rate": 5.7003964512714135e-06, "loss": 0.4199, "step": 1361 }, { "epoch": 1.529758562605278, "grad_norm": 0.37825071811676025, "learning_rate": 5.693923491971445e-06, "loss": 0.4406, "step": 1362 }, { "epoch": 1.5308815272318923, "grad_norm": 0.4037439525127411, "learning_rate": 5.687449346619472e-06, "loss": 0.4522, "step": 1363 }, { "epoch": 1.5320044918585065, "grad_norm": 0.3332884907722473, "learning_rate": 5.680974026281081e-06, "loss": 0.4526, "step": 1364 }, { "epoch": 1.5331274564851207, "grad_norm": 0.37202346324920654, "learning_rate": 5.674497542023875e-06, "loss": 0.3896, "step": 1365 }, { "epoch": 1.534250421111735, "grad_norm": 0.352293461561203, "learning_rate": 5.668019904917441e-06, "loss": 0.4598, "step": 1366 }, { "epoch": 1.5353733857383491, "grad_norm": 0.36421331763267517, "learning_rate": 5.661541126033335e-06, "loss": 0.4415, "step": 1367 }, { "epoch": 1.5364963503649633, "grad_norm": 0.3478181064128876, "learning_rate": 5.65506121644507e-06, "loss": 0.4261, "step": 1368 }, { "epoch": 1.5376193149915778, "grad_norm": 0.3292109966278076, "learning_rate": 5.648580187228088e-06, "loss": 0.4061, "step": 1369 }, { "epoch": 1.538742279618192, "grad_norm": 0.3834448456764221, "learning_rate": 5.642098049459745e-06, "loss": 0.4441, "step": 1370 }, { "epoch": 1.5398652442448064, "grad_norm": 0.33302247524261475, "learning_rate": 5.635614814219289e-06, "loss": 0.4358, "step": 1371 }, { "epoch": 1.5409882088714206, "grad_norm": 0.3573668599128723, "learning_rate": 5.6291304925878494e-06, "loss": 0.4366, "step": 1372 }, { "epoch": 1.5421111734980348, "grad_norm": 0.3712516725063324, "learning_rate": 5.622645095648411e-06, "loss": 0.4192, "step": 1373 }, { "epoch": 1.543234138124649, "grad_norm": 0.3585268557071686, "learning_rate": 5.616158634485793e-06, "loss": 0.4105, "step": 1374 }, { "epoch": 1.5443571027512633, "grad_norm": 0.38892462849617004, "learning_rate": 5.609671120186638e-06, "loss": 0.4396, "step": 1375 }, { "epoch": 1.5454800673778775, "grad_norm": 0.38341668248176575, "learning_rate": 5.6031825638393855e-06, "loss": 0.4642, "step": 1376 }, { "epoch": 1.546603032004492, "grad_norm": 0.37203583121299744, "learning_rate": 5.596692976534256e-06, "loss": 0.4297, "step": 1377 }, { "epoch": 1.547725996631106, "grad_norm": 0.3466741144657135, "learning_rate": 5.590202369363234e-06, "loss": 0.427, "step": 1378 }, { "epoch": 1.5488489612577205, "grad_norm": 0.42893186211586, "learning_rate": 5.583710753420046e-06, "loss": 0.4512, "step": 1379 }, { "epoch": 1.5499719258843347, "grad_norm": 0.3521140515804291, "learning_rate": 5.577218139800143e-06, "loss": 0.4088, "step": 1380 }, { "epoch": 1.551094890510949, "grad_norm": 0.34223607182502747, "learning_rate": 5.570724539600684e-06, "loss": 0.4337, "step": 1381 }, { "epoch": 1.5522178551375632, "grad_norm": 0.40277859568595886, "learning_rate": 5.564229963920507e-06, "loss": 0.418, "step": 1382 }, { "epoch": 1.5533408197641774, "grad_norm": 0.434794157743454, "learning_rate": 5.557734423860122e-06, "loss": 0.4612, "step": 1383 }, { "epoch": 1.5544637843907916, "grad_norm": 0.3233230710029602, "learning_rate": 5.551237930521692e-06, "loss": 0.3858, "step": 1384 }, { "epoch": 1.5555867490174058, "grad_norm": 0.3392625153064728, "learning_rate": 5.544740495009e-06, "loss": 0.4336, "step": 1385 }, { "epoch": 1.5567097136440202, "grad_norm": 0.4092359244823456, "learning_rate": 5.538242128427444e-06, "loss": 0.4674, "step": 1386 }, { "epoch": 1.5578326782706344, "grad_norm": 0.3863389492034912, "learning_rate": 5.531742841884017e-06, "loss": 0.4089, "step": 1387 }, { "epoch": 1.5589556428972489, "grad_norm": 0.3682871162891388, "learning_rate": 5.525242646487278e-06, "loss": 0.442, "step": 1388 }, { "epoch": 1.560078607523863, "grad_norm": 0.38414162397384644, "learning_rate": 5.518741553347341e-06, "loss": 0.3931, "step": 1389 }, { "epoch": 1.5612015721504773, "grad_norm": 0.3929695785045624, "learning_rate": 5.512239573575855e-06, "loss": 0.4227, "step": 1390 }, { "epoch": 1.5623245367770915, "grad_norm": 0.3410951495170593, "learning_rate": 5.505736718285988e-06, "loss": 0.4309, "step": 1391 }, { "epoch": 1.5634475014037057, "grad_norm": 0.3446836769580841, "learning_rate": 5.499232998592399e-06, "loss": 0.418, "step": 1392 }, { "epoch": 1.56457046603032, "grad_norm": 0.3506161570549011, "learning_rate": 5.492728425611225e-06, "loss": 0.4099, "step": 1393 }, { "epoch": 1.5656934306569343, "grad_norm": 0.3618926405906677, "learning_rate": 5.486223010460068e-06, "loss": 0.4024, "step": 1394 }, { "epoch": 1.5668163952835485, "grad_norm": 0.384712278842926, "learning_rate": 5.479716764257961e-06, "loss": 0.4587, "step": 1395 }, { "epoch": 1.567939359910163, "grad_norm": 0.3329257071018219, "learning_rate": 5.47320969812536e-06, "loss": 0.4165, "step": 1396 }, { "epoch": 1.5690623245367772, "grad_norm": 0.34846460819244385, "learning_rate": 5.466701823184127e-06, "loss": 0.4216, "step": 1397 }, { "epoch": 1.5701852891633914, "grad_norm": 0.3781341314315796, "learning_rate": 5.4601931505575e-06, "loss": 0.4579, "step": 1398 }, { "epoch": 1.5713082537900056, "grad_norm": 0.3628864884376526, "learning_rate": 5.453683691370084e-06, "loss": 0.4032, "step": 1399 }, { "epoch": 1.5724312184166198, "grad_norm": 0.3191184997558594, "learning_rate": 5.44717345674783e-06, "loss": 0.4068, "step": 1400 }, { "epoch": 1.573554183043234, "grad_norm": 0.4035561680793762, "learning_rate": 5.44066245781801e-06, "loss": 0.4452, "step": 1401 }, { "epoch": 1.5746771476698485, "grad_norm": 0.37905293703079224, "learning_rate": 5.434150705709203e-06, "loss": 0.458, "step": 1402 }, { "epoch": 1.5758001122964627, "grad_norm": 0.3235073387622833, "learning_rate": 5.427638211551278e-06, "loss": 0.4075, "step": 1403 }, { "epoch": 1.5769230769230769, "grad_norm": 0.3341541886329651, "learning_rate": 5.421124986475371e-06, "loss": 0.4415, "step": 1404 }, { "epoch": 1.5780460415496913, "grad_norm": 0.41193848848342896, "learning_rate": 5.414611041613869e-06, "loss": 0.4184, "step": 1405 }, { "epoch": 1.5791690061763055, "grad_norm": 0.37269216775894165, "learning_rate": 5.4080963881003835e-06, "loss": 0.4593, "step": 1406 }, { "epoch": 1.5802919708029197, "grad_norm": 0.3226201832294464, "learning_rate": 5.4015810370697445e-06, "loss": 0.3785, "step": 1407 }, { "epoch": 1.581414935429534, "grad_norm": 0.3951623737812042, "learning_rate": 5.395064999657971e-06, "loss": 0.4427, "step": 1408 }, { "epoch": 1.5825379000561481, "grad_norm": 0.38956916332244873, "learning_rate": 5.388548287002251e-06, "loss": 0.4464, "step": 1409 }, { "epoch": 1.5836608646827623, "grad_norm": 0.34109026193618774, "learning_rate": 5.382030910240936e-06, "loss": 0.4046, "step": 1410 }, { "epoch": 1.5847838293093768, "grad_norm": 0.39685767889022827, "learning_rate": 5.375512880513505e-06, "loss": 0.4442, "step": 1411 }, { "epoch": 1.585906793935991, "grad_norm": 0.33120906352996826, "learning_rate": 5.368994208960554e-06, "loss": 0.3985, "step": 1412 }, { "epoch": 1.5870297585626054, "grad_norm": 0.33489105105400085, "learning_rate": 5.362474906723781e-06, "loss": 0.4039, "step": 1413 }, { "epoch": 1.5881527231892196, "grad_norm": 0.393152117729187, "learning_rate": 5.355954984945957e-06, "loss": 0.4286, "step": 1414 }, { "epoch": 1.5892756878158338, "grad_norm": 0.35592615604400635, "learning_rate": 5.34943445477091e-06, "loss": 0.4593, "step": 1415 }, { "epoch": 1.590398652442448, "grad_norm": 0.37851378321647644, "learning_rate": 5.342913327343515e-06, "loss": 0.427, "step": 1416 }, { "epoch": 1.5915216170690623, "grad_norm": 0.41923651099205017, "learning_rate": 5.336391613809663e-06, "loss": 0.4369, "step": 1417 }, { "epoch": 1.5926445816956765, "grad_norm": 0.3389125466346741, "learning_rate": 5.329869325316246e-06, "loss": 0.3911, "step": 1418 }, { "epoch": 1.593767546322291, "grad_norm": 0.4326159358024597, "learning_rate": 5.3233464730111426e-06, "loss": 0.4616, "step": 1419 }, { "epoch": 1.594890510948905, "grad_norm": 0.37848082184791565, "learning_rate": 5.316823068043192e-06, "loss": 0.4268, "step": 1420 }, { "epoch": 1.5960134755755195, "grad_norm": 0.36140716075897217, "learning_rate": 5.310299121562178e-06, "loss": 0.4256, "step": 1421 }, { "epoch": 1.5971364402021337, "grad_norm": 0.40580540895462036, "learning_rate": 5.303774644718813e-06, "loss": 0.3993, "step": 1422 }, { "epoch": 1.598259404828748, "grad_norm": 0.38280847668647766, "learning_rate": 5.297249648664712e-06, "loss": 0.4174, "step": 1423 }, { "epoch": 1.5993823694553622, "grad_norm": 0.3684382438659668, "learning_rate": 5.290724144552379e-06, "loss": 0.4364, "step": 1424 }, { "epoch": 1.6005053340819764, "grad_norm": 0.38516560196876526, "learning_rate": 5.284198143535188e-06, "loss": 0.4201, "step": 1425 }, { "epoch": 1.6016282987085906, "grad_norm": 0.323007196187973, "learning_rate": 5.277671656767361e-06, "loss": 0.4219, "step": 1426 }, { "epoch": 1.6027512633352048, "grad_norm": 0.3676532804965973, "learning_rate": 5.27114469540395e-06, "loss": 0.4127, "step": 1427 }, { "epoch": 1.6038742279618192, "grad_norm": 0.40214043855667114, "learning_rate": 5.2646172706008154e-06, "loss": 0.4287, "step": 1428 }, { "epoch": 1.6049971925884334, "grad_norm": 0.33944350481033325, "learning_rate": 5.258089393514617e-06, "loss": 0.4511, "step": 1429 }, { "epoch": 1.6061201572150479, "grad_norm": 0.3740125596523285, "learning_rate": 5.25156107530278e-06, "loss": 0.414, "step": 1430 }, { "epoch": 1.607243121841662, "grad_norm": 0.3978530764579773, "learning_rate": 5.245032327123488e-06, "loss": 0.4119, "step": 1431 }, { "epoch": 1.6083660864682763, "grad_norm": 0.3496831953525543, "learning_rate": 5.238503160135659e-06, "loss": 0.4581, "step": 1432 }, { "epoch": 1.6094890510948905, "grad_norm": 0.3878086507320404, "learning_rate": 5.231973585498924e-06, "loss": 0.425, "step": 1433 }, { "epoch": 1.6106120157215047, "grad_norm": 0.37713873386383057, "learning_rate": 5.225443614373614e-06, "loss": 0.4448, "step": 1434 }, { "epoch": 1.611734980348119, "grad_norm": 0.4469600319862366, "learning_rate": 5.2189132579207395e-06, "loss": 0.4546, "step": 1435 }, { "epoch": 1.6128579449747333, "grad_norm": 0.3673571050167084, "learning_rate": 5.212382527301961e-06, "loss": 0.4271, "step": 1436 }, { "epoch": 1.6139809096013475, "grad_norm": 0.3854893743991852, "learning_rate": 5.20585143367959e-06, "loss": 0.4324, "step": 1437 }, { "epoch": 1.615103874227962, "grad_norm": 0.3742755949497223, "learning_rate": 5.199319988216547e-06, "loss": 0.4039, "step": 1438 }, { "epoch": 1.6162268388545762, "grad_norm": 0.3203883469104767, "learning_rate": 5.192788202076364e-06, "loss": 0.4034, "step": 1439 }, { "epoch": 1.6173498034811904, "grad_norm": 0.36902064085006714, "learning_rate": 5.186256086423148e-06, "loss": 0.4361, "step": 1440 }, { "epoch": 1.6184727681078046, "grad_norm": 0.31773215532302856, "learning_rate": 5.179723652421575e-06, "loss": 0.4499, "step": 1441 }, { "epoch": 1.6195957327344188, "grad_norm": 0.3588159680366516, "learning_rate": 5.17319091123686e-06, "loss": 0.4113, "step": 1442 }, { "epoch": 1.620718697361033, "grad_norm": 0.36533036828041077, "learning_rate": 5.166657874034745e-06, "loss": 0.4287, "step": 1443 }, { "epoch": 1.6218416619876472, "grad_norm": 0.3709407150745392, "learning_rate": 5.160124551981477e-06, "loss": 0.4425, "step": 1444 }, { "epoch": 1.6229646266142617, "grad_norm": 0.31932583451271057, "learning_rate": 5.153590956243795e-06, "loss": 0.4122, "step": 1445 }, { "epoch": 1.6240875912408759, "grad_norm": 0.3121388554573059, "learning_rate": 5.147057097988898e-06, "loss": 0.42, "step": 1446 }, { "epoch": 1.6252105558674903, "grad_norm": 0.3309304416179657, "learning_rate": 5.140522988384438e-06, "loss": 0.4401, "step": 1447 }, { "epoch": 1.6263335204941045, "grad_norm": 0.33190661668777466, "learning_rate": 5.133988638598497e-06, "loss": 0.4284, "step": 1448 }, { "epoch": 1.6274564851207187, "grad_norm": 0.3270394206047058, "learning_rate": 5.127454059799567e-06, "loss": 0.4508, "step": 1449 }, { "epoch": 1.628579449747333, "grad_norm": 0.3277170658111572, "learning_rate": 5.12091926315653e-06, "loss": 0.4264, "step": 1450 }, { "epoch": 1.6297024143739471, "grad_norm": 0.31912049651145935, "learning_rate": 5.114384259838641e-06, "loss": 0.3921, "step": 1451 }, { "epoch": 1.6308253790005613, "grad_norm": 0.350877046585083, "learning_rate": 5.1078490610155105e-06, "loss": 0.4453, "step": 1452 }, { "epoch": 1.6319483436271758, "grad_norm": 0.2982594966888428, "learning_rate": 5.101313677857078e-06, "loss": 0.3807, "step": 1453 }, { "epoch": 1.63307130825379, "grad_norm": 0.3632868230342865, "learning_rate": 5.094778121533606e-06, "loss": 0.4298, "step": 1454 }, { "epoch": 1.6341942728804044, "grad_norm": 0.3501133918762207, "learning_rate": 5.088242403215644e-06, "loss": 0.4112, "step": 1455 }, { "epoch": 1.6353172375070186, "grad_norm": 0.3597886264324188, "learning_rate": 5.081706534074023e-06, "loss": 0.4453, "step": 1456 }, { "epoch": 1.6364402021336328, "grad_norm": 0.321927011013031, "learning_rate": 5.075170525279834e-06, "loss": 0.46, "step": 1457 }, { "epoch": 1.637563166760247, "grad_norm": 0.3139888346195221, "learning_rate": 5.0686343880044044e-06, "loss": 0.3878, "step": 1458 }, { "epoch": 1.6386861313868613, "grad_norm": 0.3926408886909485, "learning_rate": 5.062098133419276e-06, "loss": 0.4253, "step": 1459 }, { "epoch": 1.6398090960134755, "grad_norm": 0.33582693338394165, "learning_rate": 5.055561772696201e-06, "loss": 0.4348, "step": 1460 }, { "epoch": 1.6409320606400897, "grad_norm": 0.319807231426239, "learning_rate": 5.049025317007108e-06, "loss": 0.3948, "step": 1461 }, { "epoch": 1.642055025266704, "grad_norm": 0.3448527455329895, "learning_rate": 5.042488777524084e-06, "loss": 0.4319, "step": 1462 }, { "epoch": 1.6431779898933183, "grad_norm": 0.3412129282951355, "learning_rate": 5.035952165419366e-06, "loss": 0.4318, "step": 1463 }, { "epoch": 1.6443009545199327, "grad_norm": 0.3656712770462036, "learning_rate": 5.029415491865311e-06, "loss": 0.4493, "step": 1464 }, { "epoch": 1.645423919146547, "grad_norm": 0.28737977147102356, "learning_rate": 5.022878768034386e-06, "loss": 0.3881, "step": 1465 }, { "epoch": 1.6465468837731612, "grad_norm": 0.3111880421638489, "learning_rate": 5.016342005099135e-06, "loss": 0.3899, "step": 1466 }, { "epoch": 1.6476698483997754, "grad_norm": 0.3284454047679901, "learning_rate": 5.009805214232177e-06, "loss": 0.437, "step": 1467 }, { "epoch": 1.6487928130263896, "grad_norm": 0.3124236464500427, "learning_rate": 5.0032684066061766e-06, "loss": 0.4145, "step": 1468 }, { "epoch": 1.6499157776530038, "grad_norm": 0.3450501263141632, "learning_rate": 4.996731593393825e-06, "loss": 0.4549, "step": 1469 }, { "epoch": 1.6510387422796182, "grad_norm": 0.3168811798095703, "learning_rate": 4.990194785767824e-06, "loss": 0.4299, "step": 1470 }, { "epoch": 1.6521617069062324, "grad_norm": 0.3595776855945587, "learning_rate": 4.983657994900865e-06, "loss": 0.4454, "step": 1471 }, { "epoch": 1.6532846715328469, "grad_norm": 0.3338601589202881, "learning_rate": 4.977121231965617e-06, "loss": 0.4312, "step": 1472 }, { "epoch": 1.654407636159461, "grad_norm": 0.33009031414985657, "learning_rate": 4.97058450813469e-06, "loss": 0.3699, "step": 1473 }, { "epoch": 1.6555306007860753, "grad_norm": 0.4129428565502167, "learning_rate": 4.964047834580635e-06, "loss": 0.4689, "step": 1474 }, { "epoch": 1.6566535654126895, "grad_norm": 0.3803240656852722, "learning_rate": 4.957511222475918e-06, "loss": 0.4479, "step": 1475 }, { "epoch": 1.6577765300393037, "grad_norm": 0.346652626991272, "learning_rate": 4.950974682992894e-06, "loss": 0.3797, "step": 1476 }, { "epoch": 1.658899494665918, "grad_norm": 0.3267097771167755, "learning_rate": 4.9444382273038e-06, "loss": 0.4363, "step": 1477 }, { "epoch": 1.6600224592925323, "grad_norm": 0.33385327458381653, "learning_rate": 4.9379018665807245e-06, "loss": 0.4473, "step": 1478 }, { "epoch": 1.6611454239191465, "grad_norm": 0.3711581230163574, "learning_rate": 4.931365611995598e-06, "loss": 0.4339, "step": 1479 }, { "epoch": 1.6622683885457608, "grad_norm": 0.3356890380382538, "learning_rate": 4.924829474720165e-06, "loss": 0.4249, "step": 1480 }, { "epoch": 1.6633913531723752, "grad_norm": 0.34012460708618164, "learning_rate": 4.918293465925978e-06, "loss": 0.4088, "step": 1481 }, { "epoch": 1.6645143177989894, "grad_norm": 0.3433002233505249, "learning_rate": 4.911757596784358e-06, "loss": 0.4097, "step": 1482 }, { "epoch": 1.6656372824256036, "grad_norm": 0.34631970524787903, "learning_rate": 4.905221878466395e-06, "loss": 0.4264, "step": 1483 }, { "epoch": 1.6667602470522178, "grad_norm": 0.3502822518348694, "learning_rate": 4.898686322142923e-06, "loss": 0.3787, "step": 1484 }, { "epoch": 1.667883211678832, "grad_norm": 0.3769824504852295, "learning_rate": 4.892150938984491e-06, "loss": 0.4721, "step": 1485 }, { "epoch": 1.6690061763054462, "grad_norm": 0.34992924332618713, "learning_rate": 4.885615740161359e-06, "loss": 0.433, "step": 1486 }, { "epoch": 1.6701291409320607, "grad_norm": 0.3739887773990631, "learning_rate": 4.879080736843471e-06, "loss": 0.4218, "step": 1487 }, { "epoch": 1.6712521055586749, "grad_norm": 0.36361274123191833, "learning_rate": 4.872545940200435e-06, "loss": 0.4467, "step": 1488 }, { "epoch": 1.6723750701852893, "grad_norm": 0.30618375539779663, "learning_rate": 4.866011361401505e-06, "loss": 0.4064, "step": 1489 }, { "epoch": 1.6734980348119035, "grad_norm": 0.3977403938770294, "learning_rate": 4.859477011615564e-06, "loss": 0.4489, "step": 1490 }, { "epoch": 1.6746209994385177, "grad_norm": 0.37429189682006836, "learning_rate": 4.8529429020111035e-06, "loss": 0.4222, "step": 1491 }, { "epoch": 1.675743964065132, "grad_norm": 0.32864612340927124, "learning_rate": 4.846409043756209e-06, "loss": 0.4666, "step": 1492 }, { "epoch": 1.6768669286917461, "grad_norm": 0.335208535194397, "learning_rate": 4.839875448018524e-06, "loss": 0.4295, "step": 1493 }, { "epoch": 1.6779898933183603, "grad_norm": 0.35737794637680054, "learning_rate": 4.833342125965257e-06, "loss": 0.3948, "step": 1494 }, { "epoch": 1.6791128579449748, "grad_norm": 0.34874504804611206, "learning_rate": 4.826809088763143e-06, "loss": 0.4175, "step": 1495 }, { "epoch": 1.680235822571589, "grad_norm": 0.3138450086116791, "learning_rate": 4.820276347578427e-06, "loss": 0.3947, "step": 1496 }, { "epoch": 1.6813587871982034, "grad_norm": 0.32338929176330566, "learning_rate": 4.813743913576852e-06, "loss": 0.4483, "step": 1497 }, { "epoch": 1.6824817518248176, "grad_norm": 0.33529898524284363, "learning_rate": 4.807211797923638e-06, "loss": 0.4218, "step": 1498 }, { "epoch": 1.6836047164514318, "grad_norm": 0.3462008535861969, "learning_rate": 4.800680011783455e-06, "loss": 0.4097, "step": 1499 }, { "epoch": 1.684727681078046, "grad_norm": 0.3542187809944153, "learning_rate": 4.794148566320412e-06, "loss": 0.4341, "step": 1500 }, { "epoch": 1.6858506457046603, "grad_norm": 0.33356180787086487, "learning_rate": 4.78761747269804e-06, "loss": 0.4108, "step": 1501 }, { "epoch": 1.6869736103312745, "grad_norm": 0.33488398790359497, "learning_rate": 4.781086742079262e-06, "loss": 0.4546, "step": 1502 }, { "epoch": 1.6880965749578887, "grad_norm": 0.33070120215415955, "learning_rate": 4.774556385626386e-06, "loss": 0.4425, "step": 1503 }, { "epoch": 1.689219539584503, "grad_norm": 0.3074497878551483, "learning_rate": 4.768026414501078e-06, "loss": 0.4054, "step": 1504 }, { "epoch": 1.6903425042111173, "grad_norm": 0.30489835143089294, "learning_rate": 4.761496839864343e-06, "loss": 0.4154, "step": 1505 }, { "epoch": 1.6914654688377317, "grad_norm": 0.3423299789428711, "learning_rate": 4.754967672876513e-06, "loss": 0.4326, "step": 1506 }, { "epoch": 1.692588433464346, "grad_norm": 0.3305467963218689, "learning_rate": 4.748438924697222e-06, "loss": 0.4204, "step": 1507 }, { "epoch": 1.6937113980909602, "grad_norm": 0.34336474537849426, "learning_rate": 4.741910606485385e-06, "loss": 0.4278, "step": 1508 }, { "epoch": 1.6948343627175744, "grad_norm": 0.3376448154449463, "learning_rate": 4.7353827293991845e-06, "loss": 0.431, "step": 1509 }, { "epoch": 1.6959573273441886, "grad_norm": 0.355744868516922, "learning_rate": 4.728855304596053e-06, "loss": 0.4369, "step": 1510 }, { "epoch": 1.6970802919708028, "grad_norm": 0.3406313359737396, "learning_rate": 4.72232834323264e-06, "loss": 0.429, "step": 1511 }, { "epoch": 1.6982032565974172, "grad_norm": 0.3444386124610901, "learning_rate": 4.715801856464812e-06, "loss": 0.4094, "step": 1512 }, { "epoch": 1.6993262212240314, "grad_norm": 0.34955787658691406, "learning_rate": 4.7092758554476215e-06, "loss": 0.442, "step": 1513 }, { "epoch": 1.7004491858506459, "grad_norm": 0.33432456851005554, "learning_rate": 4.7027503513352905e-06, "loss": 0.432, "step": 1514 }, { "epoch": 1.70157215047726, "grad_norm": 0.3523922562599182, "learning_rate": 4.6962253552811885e-06, "loss": 0.4183, "step": 1515 }, { "epoch": 1.7026951151038743, "grad_norm": 0.3342672288417816, "learning_rate": 4.689700878437823e-06, "loss": 0.3966, "step": 1516 }, { "epoch": 1.7038180797304885, "grad_norm": 0.31708985567092896, "learning_rate": 4.683176931956809e-06, "loss": 0.4307, "step": 1517 }, { "epoch": 1.7049410443571027, "grad_norm": 0.32994332909584045, "learning_rate": 4.676653526988858e-06, "loss": 0.4548, "step": 1518 }, { "epoch": 1.706064008983717, "grad_norm": 0.31459349393844604, "learning_rate": 4.670130674683756e-06, "loss": 0.4022, "step": 1519 }, { "epoch": 1.7071869736103311, "grad_norm": 0.33248060941696167, "learning_rate": 4.663608386190339e-06, "loss": 0.4487, "step": 1520 }, { "epoch": 1.7083099382369455, "grad_norm": 0.3131781816482544, "learning_rate": 4.657086672656486e-06, "loss": 0.3859, "step": 1521 }, { "epoch": 1.7094329028635598, "grad_norm": 0.3524945080280304, "learning_rate": 4.650565545229092e-06, "loss": 0.4259, "step": 1522 }, { "epoch": 1.7105558674901742, "grad_norm": 0.32958537340164185, "learning_rate": 4.644045015054045e-06, "loss": 0.4337, "step": 1523 }, { "epoch": 1.7116788321167884, "grad_norm": 0.34632357954978943, "learning_rate": 4.63752509327622e-06, "loss": 0.4325, "step": 1524 }, { "epoch": 1.7128017967434026, "grad_norm": 0.3545970022678375, "learning_rate": 4.631005791039447e-06, "loss": 0.4436, "step": 1525 }, { "epoch": 1.7139247613700168, "grad_norm": 0.32058975100517273, "learning_rate": 4.624487119486497e-06, "loss": 0.4458, "step": 1526 }, { "epoch": 1.715047725996631, "grad_norm": 0.3882029950618744, "learning_rate": 4.617969089759066e-06, "loss": 0.4204, "step": 1527 }, { "epoch": 1.7161706906232452, "grad_norm": 0.3161708116531372, "learning_rate": 4.61145171299775e-06, "loss": 0.3908, "step": 1528 }, { "epoch": 1.7172936552498597, "grad_norm": 0.3476976156234741, "learning_rate": 4.6049350003420315e-06, "loss": 0.4473, "step": 1529 }, { "epoch": 1.7184166198764739, "grad_norm": 0.3358439803123474, "learning_rate": 4.598418962930258e-06, "loss": 0.4253, "step": 1530 }, { "epoch": 1.7195395845030883, "grad_norm": 0.3441962003707886, "learning_rate": 4.591903611899618e-06, "loss": 0.4772, "step": 1531 }, { "epoch": 1.7206625491297025, "grad_norm": 0.368468314409256, "learning_rate": 4.585388958386133e-06, "loss": 0.4521, "step": 1532 }, { "epoch": 1.7217855137563167, "grad_norm": 0.30635565519332886, "learning_rate": 4.57887501352463e-06, "loss": 0.4098, "step": 1533 }, { "epoch": 1.722908478382931, "grad_norm": 0.30618005990982056, "learning_rate": 4.572361788448724e-06, "loss": 0.3992, "step": 1534 }, { "epoch": 1.7240314430095451, "grad_norm": 0.33370348811149597, "learning_rate": 4.565849294290798e-06, "loss": 0.4539, "step": 1535 }, { "epoch": 1.7251544076361593, "grad_norm": 0.3167838156223297, "learning_rate": 4.559337542181993e-06, "loss": 0.3984, "step": 1536 }, { "epoch": 1.7262773722627736, "grad_norm": 0.3350682556629181, "learning_rate": 4.552826543252171e-06, "loss": 0.431, "step": 1537 }, { "epoch": 1.727400336889388, "grad_norm": 0.3134891390800476, "learning_rate": 4.546316308629916e-06, "loss": 0.4231, "step": 1538 }, { "epoch": 1.7285233015160022, "grad_norm": 0.29327136278152466, "learning_rate": 4.539806849442501e-06, "loss": 0.3744, "step": 1539 }, { "epoch": 1.7296462661426166, "grad_norm": 0.36168551445007324, "learning_rate": 4.5332981768158744e-06, "loss": 0.4723, "step": 1540 }, { "epoch": 1.7307692307692308, "grad_norm": 0.3768256902694702, "learning_rate": 4.526790301874641e-06, "loss": 0.4333, "step": 1541 }, { "epoch": 1.731892195395845, "grad_norm": 0.33159083127975464, "learning_rate": 4.520283235742042e-06, "loss": 0.4183, "step": 1542 }, { "epoch": 1.7330151600224593, "grad_norm": 0.34202951192855835, "learning_rate": 4.5137769895399345e-06, "loss": 0.458, "step": 1543 }, { "epoch": 1.7341381246490735, "grad_norm": 0.33284705877304077, "learning_rate": 4.507271574388775e-06, "loss": 0.4475, "step": 1544 }, { "epoch": 1.7352610892756877, "grad_norm": 0.3398878276348114, "learning_rate": 4.500767001407604e-06, "loss": 0.4425, "step": 1545 }, { "epoch": 1.736384053902302, "grad_norm": 0.3158484399318695, "learning_rate": 4.4942632817140145e-06, "loss": 0.4317, "step": 1546 }, { "epoch": 1.7375070185289163, "grad_norm": 0.4000184237957001, "learning_rate": 4.487760426424146e-06, "loss": 0.4256, "step": 1547 }, { "epoch": 1.7386299831555307, "grad_norm": 0.3756653666496277, "learning_rate": 4.481258446652662e-06, "loss": 0.4083, "step": 1548 }, { "epoch": 1.739752947782145, "grad_norm": 0.33379530906677246, "learning_rate": 4.474757353512724e-06, "loss": 0.455, "step": 1549 }, { "epoch": 1.7408759124087592, "grad_norm": 0.3534655272960663, "learning_rate": 4.468257158115982e-06, "loss": 0.4131, "step": 1550 }, { "epoch": 1.7419988770353734, "grad_norm": 0.42644965648651123, "learning_rate": 4.4617578715725565e-06, "loss": 0.4511, "step": 1551 }, { "epoch": 1.7431218416619876, "grad_norm": 0.28086718916893005, "learning_rate": 4.4552595049910014e-06, "loss": 0.3952, "step": 1552 }, { "epoch": 1.7442448062886018, "grad_norm": 0.3781779408454895, "learning_rate": 4.448762069478309e-06, "loss": 0.4192, "step": 1553 }, { "epoch": 1.7453677709152162, "grad_norm": 0.3361646831035614, "learning_rate": 4.4422655761398785e-06, "loss": 0.4433, "step": 1554 }, { "epoch": 1.7464907355418304, "grad_norm": 0.3070261776447296, "learning_rate": 4.435770036079495e-06, "loss": 0.4513, "step": 1555 }, { "epoch": 1.7476137001684446, "grad_norm": 0.3583263158798218, "learning_rate": 4.429275460399317e-06, "loss": 0.4455, "step": 1556 }, { "epoch": 1.748736664795059, "grad_norm": 0.3153129518032074, "learning_rate": 4.4227818601998575e-06, "loss": 0.4084, "step": 1557 }, { "epoch": 1.7498596294216733, "grad_norm": 0.31814315915107727, "learning_rate": 4.416289246579955e-06, "loss": 0.429, "step": 1558 }, { "epoch": 1.7509825940482875, "grad_norm": 0.4073929786682129, "learning_rate": 4.409797630636766e-06, "loss": 0.4272, "step": 1559 }, { "epoch": 1.7521055586749017, "grad_norm": 0.3751053512096405, "learning_rate": 4.403307023465746e-06, "loss": 0.4034, "step": 1560 }, { "epoch": 1.753228523301516, "grad_norm": 0.3227291703224182, "learning_rate": 4.396817436160616e-06, "loss": 0.4258, "step": 1561 }, { "epoch": 1.7543514879281301, "grad_norm": 0.3397650420665741, "learning_rate": 4.390328879813364e-06, "loss": 0.4389, "step": 1562 }, { "epoch": 1.7554744525547445, "grad_norm": 0.3839937150478363, "learning_rate": 4.383841365514208e-06, "loss": 0.4434, "step": 1563 }, { "epoch": 1.7565974171813588, "grad_norm": 0.33157357573509216, "learning_rate": 4.3773549043515895e-06, "loss": 0.4478, "step": 1564 }, { "epoch": 1.7577203818079732, "grad_norm": 0.3562920093536377, "learning_rate": 4.370869507412151e-06, "loss": 0.4339, "step": 1565 }, { "epoch": 1.7588433464345874, "grad_norm": 0.31482887268066406, "learning_rate": 4.364385185780712e-06, "loss": 0.4309, "step": 1566 }, { "epoch": 1.7599663110612016, "grad_norm": 0.35397154092788696, "learning_rate": 4.357901950540257e-06, "loss": 0.3899, "step": 1567 }, { "epoch": 1.7610892756878158, "grad_norm": 0.39329102635383606, "learning_rate": 4.3514198127719145e-06, "loss": 0.4383, "step": 1568 }, { "epoch": 1.76221224031443, "grad_norm": 0.3355405628681183, "learning_rate": 4.3449387835549305e-06, "loss": 0.4271, "step": 1569 }, { "epoch": 1.7633352049410442, "grad_norm": 0.3458855450153351, "learning_rate": 4.338458873966665e-06, "loss": 0.4261, "step": 1570 }, { "epoch": 1.7644581695676587, "grad_norm": 0.3516028821468353, "learning_rate": 4.331980095082562e-06, "loss": 0.4306, "step": 1571 }, { "epoch": 1.7655811341942729, "grad_norm": 0.3496720790863037, "learning_rate": 4.325502457976126e-06, "loss": 0.4381, "step": 1572 }, { "epoch": 1.7667040988208873, "grad_norm": 0.34043046832084656, "learning_rate": 4.31902597371892e-06, "loss": 0.4022, "step": 1573 }, { "epoch": 1.7678270634475015, "grad_norm": 0.3447478115558624, "learning_rate": 4.312550653380532e-06, "loss": 0.446, "step": 1574 }, { "epoch": 1.7689500280741157, "grad_norm": 0.3259204626083374, "learning_rate": 4.306076508028557e-06, "loss": 0.4013, "step": 1575 }, { "epoch": 1.77007299270073, "grad_norm": 0.34059205651283264, "learning_rate": 4.299603548728587e-06, "loss": 0.4046, "step": 1576 }, { "epoch": 1.7711959573273441, "grad_norm": 0.3550703227519989, "learning_rate": 4.293131786544187e-06, "loss": 0.4232, "step": 1577 }, { "epoch": 1.7723189219539583, "grad_norm": 0.30197733640670776, "learning_rate": 4.286661232536873e-06, "loss": 0.4031, "step": 1578 }, { "epoch": 1.7734418865805726, "grad_norm": 0.3450040817260742, "learning_rate": 4.280191897766095e-06, "loss": 0.4215, "step": 1579 }, { "epoch": 1.774564851207187, "grad_norm": 0.32787370681762695, "learning_rate": 4.273723793289224e-06, "loss": 0.3877, "step": 1580 }, { "epoch": 1.7756878158338012, "grad_norm": 0.3360215723514557, "learning_rate": 4.267256930161523e-06, "loss": 0.4346, "step": 1581 }, { "epoch": 1.7768107804604156, "grad_norm": 0.3175067603588104, "learning_rate": 4.260791319436137e-06, "loss": 0.4128, "step": 1582 }, { "epoch": 1.7779337450870298, "grad_norm": 0.33781278133392334, "learning_rate": 4.25432697216407e-06, "loss": 0.4224, "step": 1583 }, { "epoch": 1.779056709713644, "grad_norm": 0.35193830728530884, "learning_rate": 4.247863899394162e-06, "loss": 0.4301, "step": 1584 }, { "epoch": 1.7801796743402583, "grad_norm": 0.30720055103302, "learning_rate": 4.24140211217308e-06, "loss": 0.4145, "step": 1585 }, { "epoch": 1.7813026389668725, "grad_norm": 0.37376272678375244, "learning_rate": 4.234941621545294e-06, "loss": 0.423, "step": 1586 }, { "epoch": 1.7824256035934867, "grad_norm": 0.32682278752326965, "learning_rate": 4.228482438553052e-06, "loss": 0.4142, "step": 1587 }, { "epoch": 1.783548568220101, "grad_norm": 0.3150333762168884, "learning_rate": 4.222024574236372e-06, "loss": 0.4173, "step": 1588 }, { "epoch": 1.7846715328467153, "grad_norm": 0.3322964310646057, "learning_rate": 4.2155680396330205e-06, "loss": 0.3979, "step": 1589 }, { "epoch": 1.7857944974733297, "grad_norm": 0.3973678648471832, "learning_rate": 4.209112845778481e-06, "loss": 0.4508, "step": 1590 }, { "epoch": 1.786917462099944, "grad_norm": 0.3345934748649597, "learning_rate": 4.2026590037059554e-06, "loss": 0.4307, "step": 1591 }, { "epoch": 1.7880404267265582, "grad_norm": 0.376287579536438, "learning_rate": 4.196206524446332e-06, "loss": 0.4188, "step": 1592 }, { "epoch": 1.7891633913531724, "grad_norm": 0.3131870627403259, "learning_rate": 4.189755419028169e-06, "loss": 0.4286, "step": 1593 }, { "epoch": 1.7902863559797866, "grad_norm": 0.29493191838264465, "learning_rate": 4.183305698477676e-06, "loss": 0.4193, "step": 1594 }, { "epoch": 1.7914093206064008, "grad_norm": 0.36137086153030396, "learning_rate": 4.1768573738187014e-06, "loss": 0.4288, "step": 1595 }, { "epoch": 1.792532285233015, "grad_norm": 0.3741399347782135, "learning_rate": 4.1704104560726955e-06, "loss": 0.422, "step": 1596 }, { "epoch": 1.7936552498596294, "grad_norm": 0.3340109884738922, "learning_rate": 4.1639649562587175e-06, "loss": 0.4128, "step": 1597 }, { "epoch": 1.7947782144862436, "grad_norm": 0.3242846429347992, "learning_rate": 4.157520885393392e-06, "loss": 0.4399, "step": 1598 }, { "epoch": 1.795901179112858, "grad_norm": 0.35978204011917114, "learning_rate": 4.151078254490908e-06, "loss": 0.4362, "step": 1599 }, { "epoch": 1.7970241437394723, "grad_norm": 0.39118096232414246, "learning_rate": 4.144637074562994e-06, "loss": 0.4292, "step": 1600 }, { "epoch": 1.7981471083660865, "grad_norm": 0.33002153038978577, "learning_rate": 4.138197356618891e-06, "loss": 0.4188, "step": 1601 }, { "epoch": 1.7992700729927007, "grad_norm": 0.34627991914749146, "learning_rate": 4.131759111665349e-06, "loss": 0.4563, "step": 1602 }, { "epoch": 1.800393037619315, "grad_norm": 0.3338475823402405, "learning_rate": 4.125322350706598e-06, "loss": 0.4067, "step": 1603 }, { "epoch": 1.8015160022459291, "grad_norm": 0.34258463978767395, "learning_rate": 4.118887084744329e-06, "loss": 0.4299, "step": 1604 }, { "epoch": 1.8026389668725435, "grad_norm": 0.37330177426338196, "learning_rate": 4.112453324777683e-06, "loss": 0.4179, "step": 1605 }, { "epoch": 1.8037619314991578, "grad_norm": 0.36292973160743713, "learning_rate": 4.10602108180322e-06, "loss": 0.466, "step": 1606 }, { "epoch": 1.8048848961257722, "grad_norm": 0.3170583248138428, "learning_rate": 4.099590366814913e-06, "loss": 0.3926, "step": 1607 }, { "epoch": 1.8060078607523864, "grad_norm": 0.3629921078681946, "learning_rate": 4.09316119080412e-06, "loss": 0.4405, "step": 1608 }, { "epoch": 1.8071308253790006, "grad_norm": 0.35283517837524414, "learning_rate": 4.086733564759574e-06, "loss": 0.3923, "step": 1609 }, { "epoch": 1.8082537900056148, "grad_norm": 0.34560343623161316, "learning_rate": 4.080307499667348e-06, "loss": 0.4312, "step": 1610 }, { "epoch": 1.809376754632229, "grad_norm": 0.34348171949386597, "learning_rate": 4.073883006510858e-06, "loss": 0.4367, "step": 1611 }, { "epoch": 1.8104997192588432, "grad_norm": 0.30913522839546204, "learning_rate": 4.06746009627083e-06, "loss": 0.3875, "step": 1612 }, { "epoch": 1.8116226838854577, "grad_norm": 0.308203786611557, "learning_rate": 4.061038779925278e-06, "loss": 0.4672, "step": 1613 }, { "epoch": 1.8127456485120719, "grad_norm": 0.316845178604126, "learning_rate": 4.054619068449502e-06, "loss": 0.4274, "step": 1614 }, { "epoch": 1.813868613138686, "grad_norm": 0.41786956787109375, "learning_rate": 4.048200972816055e-06, "loss": 0.4772, "step": 1615 }, { "epoch": 1.8149915777653005, "grad_norm": 0.33189916610717773, "learning_rate": 4.041784503994723e-06, "loss": 0.4151, "step": 1616 }, { "epoch": 1.8161145423919147, "grad_norm": 0.3261347711086273, "learning_rate": 4.035369672952516e-06, "loss": 0.4202, "step": 1617 }, { "epoch": 1.817237507018529, "grad_norm": 0.3177904188632965, "learning_rate": 4.028956490653649e-06, "loss": 0.4219, "step": 1618 }, { "epoch": 1.8183604716451431, "grad_norm": 0.331063836812973, "learning_rate": 4.02254496805951e-06, "loss": 0.4195, "step": 1619 }, { "epoch": 1.8194834362717573, "grad_norm": 0.335813045501709, "learning_rate": 4.016135116128656e-06, "loss": 0.4231, "step": 1620 }, { "epoch": 1.8206064008983716, "grad_norm": 0.3216002285480499, "learning_rate": 4.00972694581679e-06, "loss": 0.4089, "step": 1621 }, { "epoch": 1.821729365524986, "grad_norm": 0.29934000968933105, "learning_rate": 4.003320468076733e-06, "loss": 0.4066, "step": 1622 }, { "epoch": 1.8228523301516002, "grad_norm": 0.3379705846309662, "learning_rate": 3.996915693858422e-06, "loss": 0.4497, "step": 1623 }, { "epoch": 1.8239752947782146, "grad_norm": 0.3418227732181549, "learning_rate": 3.990512634108878e-06, "loss": 0.4237, "step": 1624 }, { "epoch": 1.8250982594048288, "grad_norm": 0.34770506620407104, "learning_rate": 3.984111299772188e-06, "loss": 0.4137, "step": 1625 }, { "epoch": 1.826221224031443, "grad_norm": 0.2920346260070801, "learning_rate": 3.977711701789499e-06, "loss": 0.4065, "step": 1626 }, { "epoch": 1.8273441886580573, "grad_norm": 0.33900001645088196, "learning_rate": 3.971313851098984e-06, "loss": 0.4552, "step": 1627 }, { "epoch": 1.8284671532846715, "grad_norm": 0.31734499335289, "learning_rate": 3.964917758635828e-06, "loss": 0.4217, "step": 1628 }, { "epoch": 1.8295901179112857, "grad_norm": 0.2978774309158325, "learning_rate": 3.9585234353322155e-06, "loss": 0.3956, "step": 1629 }, { "epoch": 1.8307130825379, "grad_norm": 0.3059838116168976, "learning_rate": 3.952130892117306e-06, "loss": 0.428, "step": 1630 }, { "epoch": 1.8318360471645143, "grad_norm": 0.33793210983276367, "learning_rate": 3.945740139917213e-06, "loss": 0.3926, "step": 1631 }, { "epoch": 1.8329590117911287, "grad_norm": 0.3210016191005707, "learning_rate": 3.939351189654996e-06, "loss": 0.4149, "step": 1632 }, { "epoch": 1.834081976417743, "grad_norm": 0.3257698118686676, "learning_rate": 3.932964052250626e-06, "loss": 0.4372, "step": 1633 }, { "epoch": 1.8352049410443572, "grad_norm": 0.293089896440506, "learning_rate": 3.926578738620981e-06, "loss": 0.3905, "step": 1634 }, { "epoch": 1.8363279056709714, "grad_norm": 0.31215280294418335, "learning_rate": 3.920195259679822e-06, "loss": 0.4238, "step": 1635 }, { "epoch": 1.8374508702975856, "grad_norm": 0.34193360805511475, "learning_rate": 3.9138136263377686e-06, "loss": 0.4527, "step": 1636 }, { "epoch": 1.8385738349241998, "grad_norm": 0.31182852387428284, "learning_rate": 3.907433849502293e-06, "loss": 0.4052, "step": 1637 }, { "epoch": 1.839696799550814, "grad_norm": 0.34479406476020813, "learning_rate": 3.901055940077691e-06, "loss": 0.4563, "step": 1638 }, { "epoch": 1.8408197641774284, "grad_norm": 0.37023064494132996, "learning_rate": 3.894679908965066e-06, "loss": 0.4147, "step": 1639 }, { "epoch": 1.8419427288040426, "grad_norm": 0.344500869512558, "learning_rate": 3.88830576706231e-06, "loss": 0.4305, "step": 1640 }, { "epoch": 1.843065693430657, "grad_norm": 0.31855419278144836, "learning_rate": 3.881933525264092e-06, "loss": 0.4376, "step": 1641 }, { "epoch": 1.8441886580572713, "grad_norm": 0.33842527866363525, "learning_rate": 3.875563194461825e-06, "loss": 0.4201, "step": 1642 }, { "epoch": 1.8453116226838855, "grad_norm": 0.31712186336517334, "learning_rate": 3.869194785543662e-06, "loss": 0.4055, "step": 1643 }, { "epoch": 1.8464345873104997, "grad_norm": 0.4304543137550354, "learning_rate": 3.862828309394469e-06, "loss": 0.4593, "step": 1644 }, { "epoch": 1.847557551937114, "grad_norm": 0.3445642292499542, "learning_rate": 3.856463776895807e-06, "loss": 0.3995, "step": 1645 }, { "epoch": 1.8486805165637281, "grad_norm": 0.28428617119789124, "learning_rate": 3.850101198925917e-06, "loss": 0.4053, "step": 1646 }, { "epoch": 1.8498034811903425, "grad_norm": 0.42483633756637573, "learning_rate": 3.843740586359701e-06, "loss": 0.4415, "step": 1647 }, { "epoch": 1.8509264458169568, "grad_norm": 0.3756960928440094, "learning_rate": 3.837381950068695e-06, "loss": 0.3986, "step": 1648 }, { "epoch": 1.8520494104435712, "grad_norm": 0.3666488230228424, "learning_rate": 3.831025300921064e-06, "loss": 0.4035, "step": 1649 }, { "epoch": 1.8531723750701854, "grad_norm": 0.3830086588859558, "learning_rate": 3.824670649781576e-06, "loss": 0.4258, "step": 1650 }, { "epoch": 1.8542953396967996, "grad_norm": 0.3498404920101166, "learning_rate": 3.8183180075115775e-06, "loss": 0.4689, "step": 1651 }, { "epoch": 1.8554183043234138, "grad_norm": 0.2784803509712219, "learning_rate": 3.81196738496899e-06, "loss": 0.3745, "step": 1652 }, { "epoch": 1.856541268950028, "grad_norm": 0.35804474353790283, "learning_rate": 3.805618793008279e-06, "loss": 0.4352, "step": 1653 }, { "epoch": 1.8576642335766422, "grad_norm": 0.3392621576786041, "learning_rate": 3.7992722424804363e-06, "loss": 0.4135, "step": 1654 }, { "epoch": 1.8587871982032564, "grad_norm": 0.33848804235458374, "learning_rate": 3.792927744232969e-06, "loss": 0.3959, "step": 1655 }, { "epoch": 1.8599101628298709, "grad_norm": 0.33560463786125183, "learning_rate": 3.786585309109877e-06, "loss": 0.4532, "step": 1656 }, { "epoch": 1.861033127456485, "grad_norm": 0.31130582094192505, "learning_rate": 3.780244947951629e-06, "loss": 0.3976, "step": 1657 }, { "epoch": 1.8621560920830995, "grad_norm": 0.33815956115722656, "learning_rate": 3.7739066715951535e-06, "loss": 0.433, "step": 1658 }, { "epoch": 1.8632790567097137, "grad_norm": 0.329492449760437, "learning_rate": 3.7675704908738136e-06, "loss": 0.4173, "step": 1659 }, { "epoch": 1.864402021336328, "grad_norm": 0.27842992544174194, "learning_rate": 3.7612364166173897e-06, "loss": 0.4206, "step": 1660 }, { "epoch": 1.8655249859629421, "grad_norm": 0.31161028146743774, "learning_rate": 3.7549044596520646e-06, "loss": 0.4242, "step": 1661 }, { "epoch": 1.8666479505895563, "grad_norm": 0.3165256381034851, "learning_rate": 3.7485746308004013e-06, "loss": 0.45, "step": 1662 }, { "epoch": 1.8677709152161706, "grad_norm": 0.27046987414360046, "learning_rate": 3.7422469408813216e-06, "loss": 0.4085, "step": 1663 }, { "epoch": 1.868893879842785, "grad_norm": 0.34185266494750977, "learning_rate": 3.7359214007100967e-06, "loss": 0.4399, "step": 1664 }, { "epoch": 1.8700168444693992, "grad_norm": 0.3198379576206207, "learning_rate": 3.7295980210983233e-06, "loss": 0.4415, "step": 1665 }, { "epoch": 1.8711398090960136, "grad_norm": 0.31607043743133545, "learning_rate": 3.7232768128539e-06, "loss": 0.4223, "step": 1666 }, { "epoch": 1.8722627737226278, "grad_norm": 0.3241162896156311, "learning_rate": 3.7169577867810174e-06, "loss": 0.4465, "step": 1667 }, { "epoch": 1.873385738349242, "grad_norm": 0.32213491201400757, "learning_rate": 3.71064095368014e-06, "loss": 0.4266, "step": 1668 }, { "epoch": 1.8745087029758563, "grad_norm": 0.3356422781944275, "learning_rate": 3.7043263243479773e-06, "loss": 0.4051, "step": 1669 }, { "epoch": 1.8756316676024705, "grad_norm": 0.33349502086639404, "learning_rate": 3.698013909577477e-06, "loss": 0.4395, "step": 1670 }, { "epoch": 1.8767546322290847, "grad_norm": 0.31804442405700684, "learning_rate": 3.6917037201577977e-06, "loss": 0.4155, "step": 1671 }, { "epoch": 1.8778775968556989, "grad_norm": 0.2964915633201599, "learning_rate": 3.685395766874298e-06, "loss": 0.4066, "step": 1672 }, { "epoch": 1.8790005614823133, "grad_norm": 0.3166485130786896, "learning_rate": 3.6790900605085162e-06, "loss": 0.4437, "step": 1673 }, { "epoch": 1.8801235261089275, "grad_norm": 0.318016916513443, "learning_rate": 3.672786611838142e-06, "loss": 0.4164, "step": 1674 }, { "epoch": 1.881246490735542, "grad_norm": 0.31725162267684937, "learning_rate": 3.6664854316370147e-06, "loss": 0.4352, "step": 1675 }, { "epoch": 1.8823694553621562, "grad_norm": 0.3102496564388275, "learning_rate": 3.660186530675094e-06, "loss": 0.4227, "step": 1676 }, { "epoch": 1.8834924199887704, "grad_norm": 0.2913844585418701, "learning_rate": 3.653889919718439e-06, "loss": 0.4041, "step": 1677 }, { "epoch": 1.8846153846153846, "grad_norm": 0.3311614692211151, "learning_rate": 3.6475956095292013e-06, "loss": 0.4103, "step": 1678 }, { "epoch": 1.8857383492419988, "grad_norm": 0.2971237003803253, "learning_rate": 3.6413036108656e-06, "loss": 0.4067, "step": 1679 }, { "epoch": 1.886861313868613, "grad_norm": 0.3693336248397827, "learning_rate": 3.635013934481895e-06, "loss": 0.4747, "step": 1680 }, { "epoch": 1.8879842784952274, "grad_norm": 0.3151419162750244, "learning_rate": 3.6287265911283866e-06, "loss": 0.3966, "step": 1681 }, { "epoch": 1.8891072431218416, "grad_norm": 0.33311915397644043, "learning_rate": 3.6224415915513846e-06, "loss": 0.4098, "step": 1682 }, { "epoch": 1.890230207748456, "grad_norm": 0.3535560369491577, "learning_rate": 3.616158946493188e-06, "loss": 0.4242, "step": 1683 }, { "epoch": 1.8913531723750703, "grad_norm": 0.30791327357292175, "learning_rate": 3.6098786666920787e-06, "loss": 0.4262, "step": 1684 }, { "epoch": 1.8924761370016845, "grad_norm": 0.31504347920417786, "learning_rate": 3.6036007628822934e-06, "loss": 0.4193, "step": 1685 }, { "epoch": 1.8935991016282987, "grad_norm": 0.3193170130252838, "learning_rate": 3.5973252457940034e-06, "loss": 0.4259, "step": 1686 }, { "epoch": 1.894722066254913, "grad_norm": 0.325797975063324, "learning_rate": 3.591052126153306e-06, "loss": 0.4067, "step": 1687 }, { "epoch": 1.8958450308815271, "grad_norm": 0.37144115567207336, "learning_rate": 3.584781414682201e-06, "loss": 0.4627, "step": 1688 }, { "epoch": 1.8969679955081415, "grad_norm": 0.3324344754219055, "learning_rate": 3.578513122098566e-06, "loss": 0.413, "step": 1689 }, { "epoch": 1.8980909601347558, "grad_norm": 0.3115624785423279, "learning_rate": 3.5722472591161493e-06, "loss": 0.4104, "step": 1690 }, { "epoch": 1.89921392476137, "grad_norm": 0.3697892129421234, "learning_rate": 3.5659838364445505e-06, "loss": 0.4465, "step": 1691 }, { "epoch": 1.9003368893879844, "grad_norm": 0.31198325753211975, "learning_rate": 3.559722864789187e-06, "loss": 0.4155, "step": 1692 }, { "epoch": 1.9014598540145986, "grad_norm": 0.34678032994270325, "learning_rate": 3.553464354851295e-06, "loss": 0.4021, "step": 1693 }, { "epoch": 1.9025828186412128, "grad_norm": 0.331285297870636, "learning_rate": 3.547208317327904e-06, "loss": 0.4246, "step": 1694 }, { "epoch": 1.903705783267827, "grad_norm": 0.32212376594543457, "learning_rate": 3.5409547629118124e-06, "loss": 0.4166, "step": 1695 }, { "epoch": 1.9048287478944412, "grad_norm": 0.34711652994155884, "learning_rate": 3.5347037022915787e-06, "loss": 0.4172, "step": 1696 }, { "epoch": 1.9059517125210554, "grad_norm": 0.3425479233264923, "learning_rate": 3.5284551461514972e-06, "loss": 0.4689, "step": 1697 }, { "epoch": 1.9070746771476699, "grad_norm": 0.3227981925010681, "learning_rate": 3.5222091051715803e-06, "loss": 0.3764, "step": 1698 }, { "epoch": 1.908197641774284, "grad_norm": 0.37959498167037964, "learning_rate": 3.5159655900275436e-06, "loss": 0.4428, "step": 1699 }, { "epoch": 1.9093206064008985, "grad_norm": 0.35996347665786743, "learning_rate": 3.509724611390788e-06, "loss": 0.4228, "step": 1700 }, { "epoch": 1.9104435710275127, "grad_norm": 0.3202962875366211, "learning_rate": 3.5034861799283713e-06, "loss": 0.4152, "step": 1701 }, { "epoch": 1.911566535654127, "grad_norm": 0.3100218176841736, "learning_rate": 3.4972503063030043e-06, "loss": 0.4227, "step": 1702 }, { "epoch": 1.9126895002807411, "grad_norm": 0.3152351379394531, "learning_rate": 3.4910170011730267e-06, "loss": 0.4064, "step": 1703 }, { "epoch": 1.9138124649073553, "grad_norm": 0.3654741048812866, "learning_rate": 3.48478627519238e-06, "loss": 0.4262, "step": 1704 }, { "epoch": 1.9149354295339696, "grad_norm": 0.31668299436569214, "learning_rate": 3.478558139010606e-06, "loss": 0.3902, "step": 1705 }, { "epoch": 1.916058394160584, "grad_norm": 0.36022108793258667, "learning_rate": 3.4723326032728187e-06, "loss": 0.4761, "step": 1706 }, { "epoch": 1.9171813587871982, "grad_norm": 0.2966397702693939, "learning_rate": 3.466109678619681e-06, "loss": 0.4002, "step": 1707 }, { "epoch": 1.9183043234138126, "grad_norm": 0.3427334129810333, "learning_rate": 3.4598893756874018e-06, "loss": 0.4405, "step": 1708 }, { "epoch": 1.9194272880404268, "grad_norm": 0.3124261200428009, "learning_rate": 3.4536717051077017e-06, "loss": 0.4547, "step": 1709 }, { "epoch": 1.920550252667041, "grad_norm": 0.2958309054374695, "learning_rate": 3.4474566775078055e-06, "loss": 0.395, "step": 1710 }, { "epoch": 1.9216732172936553, "grad_norm": 0.3578159809112549, "learning_rate": 3.441244303510424e-06, "loss": 0.4059, "step": 1711 }, { "epoch": 1.9227961819202695, "grad_norm": 0.35327303409576416, "learning_rate": 3.435034593733724e-06, "loss": 0.4561, "step": 1712 }, { "epoch": 1.9239191465468837, "grad_norm": 0.33744701743125916, "learning_rate": 3.4288275587913235e-06, "loss": 0.4361, "step": 1713 }, { "epoch": 1.9250421111734979, "grad_norm": 0.3428589403629303, "learning_rate": 3.422623209292273e-06, "loss": 0.4256, "step": 1714 }, { "epoch": 1.9261650758001123, "grad_norm": 0.30562588572502136, "learning_rate": 3.416421555841023e-06, "loss": 0.3975, "step": 1715 }, { "epoch": 1.9272880404267265, "grad_norm": 0.32394111156463623, "learning_rate": 3.4102226090374246e-06, "loss": 0.4452, "step": 1716 }, { "epoch": 1.928411005053341, "grad_norm": 0.2993023097515106, "learning_rate": 3.404026379476701e-06, "loss": 0.4262, "step": 1717 }, { "epoch": 1.9295339696799552, "grad_norm": 0.29942411184310913, "learning_rate": 3.397832877749425e-06, "loss": 0.4146, "step": 1718 }, { "epoch": 1.9306569343065694, "grad_norm": 0.319987028837204, "learning_rate": 3.3916421144415146e-06, "loss": 0.4103, "step": 1719 }, { "epoch": 1.9317798989331836, "grad_norm": 0.31507205963134766, "learning_rate": 3.3854541001342056e-06, "loss": 0.4144, "step": 1720 }, { "epoch": 1.9329028635597978, "grad_norm": 0.31855514645576477, "learning_rate": 3.3792688454040313e-06, "loss": 0.4463, "step": 1721 }, { "epoch": 1.934025828186412, "grad_norm": 0.2958168685436249, "learning_rate": 3.3730863608228125e-06, "loss": 0.3988, "step": 1722 }, { "epoch": 1.9351487928130264, "grad_norm": 0.33677220344543457, "learning_rate": 3.3669066569576338e-06, "loss": 0.4691, "step": 1723 }, { "epoch": 1.9362717574396406, "grad_norm": 0.28470519185066223, "learning_rate": 3.3607297443708253e-06, "loss": 0.3848, "step": 1724 }, { "epoch": 1.937394722066255, "grad_norm": 0.33091995120048523, "learning_rate": 3.35455563361995e-06, "loss": 0.4333, "step": 1725 }, { "epoch": 1.9385176866928693, "grad_norm": 0.3323820233345032, "learning_rate": 3.3483843352577805e-06, "loss": 0.4323, "step": 1726 }, { "epoch": 1.9396406513194835, "grad_norm": 0.318198561668396, "learning_rate": 3.3422158598322797e-06, "loss": 0.429, "step": 1727 }, { "epoch": 1.9407636159460977, "grad_norm": 0.34718263149261475, "learning_rate": 3.336050217886588e-06, "loss": 0.428, "step": 1728 }, { "epoch": 1.941886580572712, "grad_norm": 0.30755794048309326, "learning_rate": 3.329887419959006e-06, "loss": 0.4287, "step": 1729 }, { "epoch": 1.9430095451993261, "grad_norm": 0.2980011999607086, "learning_rate": 3.3237274765829643e-06, "loss": 0.4329, "step": 1730 }, { "epoch": 1.9441325098259403, "grad_norm": 0.321283221244812, "learning_rate": 3.3175703982870232e-06, "loss": 0.4121, "step": 1731 }, { "epoch": 1.9452554744525548, "grad_norm": 0.3459334373474121, "learning_rate": 3.3114161955948443e-06, "loss": 0.3948, "step": 1732 }, { "epoch": 1.946378439079169, "grad_norm": 0.3054325580596924, "learning_rate": 3.305264879025172e-06, "loss": 0.421, "step": 1733 }, { "epoch": 1.9475014037057834, "grad_norm": 0.30659976601600647, "learning_rate": 3.2991164590918162e-06, "loss": 0.4211, "step": 1734 }, { "epoch": 1.9486243683323976, "grad_norm": 0.33869999647140503, "learning_rate": 3.2929709463036413e-06, "loss": 0.4095, "step": 1735 }, { "epoch": 1.9497473329590118, "grad_norm": 0.3495405614376068, "learning_rate": 3.2868283511645375e-06, "loss": 0.4434, "step": 1736 }, { "epoch": 1.950870297585626, "grad_norm": 0.3014717400074005, "learning_rate": 3.280688684173412e-06, "loss": 0.4125, "step": 1737 }, { "epoch": 1.9519932622122402, "grad_norm": 0.3205607533454895, "learning_rate": 3.2745519558241667e-06, "loss": 0.4191, "step": 1738 }, { "epoch": 1.9531162268388544, "grad_norm": 0.31699585914611816, "learning_rate": 3.2684181766056766e-06, "loss": 0.4088, "step": 1739 }, { "epoch": 1.9542391914654689, "grad_norm": 0.3490926921367645, "learning_rate": 3.262287357001781e-06, "loss": 0.424, "step": 1740 }, { "epoch": 1.955362156092083, "grad_norm": 0.3102765381336212, "learning_rate": 3.256159507491261e-06, "loss": 0.4343, "step": 1741 }, { "epoch": 1.9564851207186975, "grad_norm": 0.3316532373428345, "learning_rate": 3.250034638547815e-06, "loss": 0.4497, "step": 1742 }, { "epoch": 1.9576080853453117, "grad_norm": 0.37376007437705994, "learning_rate": 3.2439127606400546e-06, "loss": 0.4135, "step": 1743 }, { "epoch": 1.958731049971926, "grad_norm": 0.37317338585853577, "learning_rate": 3.2377938842314725e-06, "loss": 0.4347, "step": 1744 }, { "epoch": 1.9598540145985401, "grad_norm": 0.3320644497871399, "learning_rate": 3.2316780197804353e-06, "loss": 0.4534, "step": 1745 }, { "epoch": 1.9609769792251543, "grad_norm": 0.2786281108856201, "learning_rate": 3.225565177740163e-06, "loss": 0.3728, "step": 1746 }, { "epoch": 1.9620999438517686, "grad_norm": 0.3369101285934448, "learning_rate": 3.2194553685587043e-06, "loss": 0.4229, "step": 1747 }, { "epoch": 1.9632229084783828, "grad_norm": 0.3242078721523285, "learning_rate": 3.2133486026789284e-06, "loss": 0.4467, "step": 1748 }, { "epoch": 1.9643458731049972, "grad_norm": 0.30461475253105164, "learning_rate": 3.2072448905385046e-06, "loss": 0.4411, "step": 1749 }, { "epoch": 1.9654688377316114, "grad_norm": 0.3284435272216797, "learning_rate": 3.201144242569874e-06, "loss": 0.4315, "step": 1750 }, { "epoch": 1.9665918023582258, "grad_norm": 0.30564817786216736, "learning_rate": 3.19504666920025e-06, "loss": 0.4215, "step": 1751 }, { "epoch": 1.96771476698484, "grad_norm": 0.3232194483280182, "learning_rate": 3.1889521808515888e-06, "loss": 0.4272, "step": 1752 }, { "epoch": 1.9688377316114543, "grad_norm": 0.3039325177669525, "learning_rate": 3.1828607879405676e-06, "loss": 0.4103, "step": 1753 }, { "epoch": 1.9699606962380685, "grad_norm": 0.3036545217037201, "learning_rate": 3.17677250087858e-06, "loss": 0.4047, "step": 1754 }, { "epoch": 1.9710836608646827, "grad_norm": 0.3318231999874115, "learning_rate": 3.1706873300717094e-06, "loss": 0.4147, "step": 1755 }, { "epoch": 1.9722066254912969, "grad_norm": 0.2948015332221985, "learning_rate": 3.1646052859207093e-06, "loss": 0.4013, "step": 1756 }, { "epoch": 1.9733295901179113, "grad_norm": 0.32392287254333496, "learning_rate": 3.158526378820993e-06, "loss": 0.4542, "step": 1757 }, { "epoch": 1.9744525547445255, "grad_norm": 0.3127719759941101, "learning_rate": 3.152450619162612e-06, "loss": 0.4023, "step": 1758 }, { "epoch": 1.97557551937114, "grad_norm": 0.3158482611179352, "learning_rate": 3.146378017330236e-06, "loss": 0.4105, "step": 1759 }, { "epoch": 1.9766984839977542, "grad_norm": 0.2842695713043213, "learning_rate": 3.1403085837031366e-06, "loss": 0.4128, "step": 1760 }, { "epoch": 1.9778214486243684, "grad_norm": 0.32925423979759216, "learning_rate": 3.1342423286551756e-06, "loss": 0.4591, "step": 1761 }, { "epoch": 1.9789444132509826, "grad_norm": 0.3206818699836731, "learning_rate": 3.1281792625547747e-06, "loss": 0.4056, "step": 1762 }, { "epoch": 1.9800673778775968, "grad_norm": 0.3246009945869446, "learning_rate": 3.122119395764911e-06, "loss": 0.4476, "step": 1763 }, { "epoch": 1.981190342504211, "grad_norm": 0.32725098729133606, "learning_rate": 3.116062738643092e-06, "loss": 0.3866, "step": 1764 }, { "epoch": 1.9823133071308254, "grad_norm": 0.341824471950531, "learning_rate": 3.110009301541336e-06, "loss": 0.4418, "step": 1765 }, { "epoch": 1.9834362717574396, "grad_norm": 0.3220565617084503, "learning_rate": 3.1039590948061605e-06, "loss": 0.4277, "step": 1766 }, { "epoch": 1.9845592363840538, "grad_norm": 0.32888713479042053, "learning_rate": 3.097912128778563e-06, "loss": 0.4393, "step": 1767 }, { "epoch": 1.9856822010106683, "grad_norm": 0.30448341369628906, "learning_rate": 3.0918684137939973e-06, "loss": 0.4181, "step": 1768 }, { "epoch": 1.9868051656372825, "grad_norm": 0.29806023836135864, "learning_rate": 3.0858279601823653e-06, "loss": 0.3907, "step": 1769 }, { "epoch": 1.9879281302638967, "grad_norm": 0.3095226585865021, "learning_rate": 3.0797907782679944e-06, "loss": 0.4349, "step": 1770 }, { "epoch": 1.989051094890511, "grad_norm": 0.29942139983177185, "learning_rate": 3.0737568783696136e-06, "loss": 0.4191, "step": 1771 }, { "epoch": 1.9901740595171251, "grad_norm": 0.2984754145145416, "learning_rate": 3.06772627080035e-06, "loss": 0.4344, "step": 1772 }, { "epoch": 1.9912970241437393, "grad_norm": 0.3062698543071747, "learning_rate": 3.061698965867701e-06, "loss": 0.4196, "step": 1773 }, { "epoch": 1.9924199887703538, "grad_norm": 0.31597861647605896, "learning_rate": 3.055674973873517e-06, "loss": 0.4375, "step": 1774 }, { "epoch": 1.993542953396968, "grad_norm": 0.28254324197769165, "learning_rate": 3.0496543051139873e-06, "loss": 0.4009, "step": 1775 }, { "epoch": 1.9946659180235824, "grad_norm": 0.35020703077316284, "learning_rate": 3.043636969879625e-06, "loss": 0.463, "step": 1776 }, { "epoch": 1.9957888826501966, "grad_norm": 0.29070615768432617, "learning_rate": 3.0376229784552362e-06, "loss": 0.4093, "step": 1777 }, { "epoch": 1.9969118472768108, "grad_norm": 0.304119348526001, "learning_rate": 3.0316123411199226e-06, "loss": 0.4158, "step": 1778 }, { "epoch": 1.998034811903425, "grad_norm": 0.30985742807388306, "learning_rate": 3.0256050681470446e-06, "loss": 0.4293, "step": 1779 }, { "epoch": 1.9991577765300392, "grad_norm": 0.30128514766693115, "learning_rate": 3.019601169804216e-06, "loss": 0.4275, "step": 1780 }, { "epoch": 2.0005614823133073, "grad_norm": 0.7849389314651489, "learning_rate": 3.0136006563532857e-06, "loss": 0.6957, "step": 1781 }, { "epoch": 2.0016844469399215, "grad_norm": 0.301169753074646, "learning_rate": 3.007603538050309e-06, "loss": 0.3794, "step": 1782 }, { "epoch": 2.0028074115665357, "grad_norm": 0.33448323607444763, "learning_rate": 3.0016098251455446e-06, "loss": 0.3807, "step": 1783 }, { "epoch": 2.00393037619315, "grad_norm": 0.33645161986351013, "learning_rate": 2.995619527883431e-06, "loss": 0.4351, "step": 1784 }, { "epoch": 2.005053340819764, "grad_norm": 0.2887193262577057, "learning_rate": 2.989632656502564e-06, "loss": 0.3704, "step": 1785 }, { "epoch": 2.0061763054463784, "grad_norm": 0.30473557114601135, "learning_rate": 2.9836492212356893e-06, "loss": 0.4208, "step": 1786 }, { "epoch": 2.0072992700729926, "grad_norm": 0.4772564470767975, "learning_rate": 2.9776692323096757e-06, "loss": 0.4182, "step": 1787 }, { "epoch": 2.008422234699607, "grad_norm": 0.334936261177063, "learning_rate": 2.971692699945502e-06, "loss": 0.4184, "step": 1788 }, { "epoch": 2.0095451993262214, "grad_norm": 0.3180846571922302, "learning_rate": 2.9657196343582404e-06, "loss": 0.3849, "step": 1789 }, { "epoch": 2.0106681639528357, "grad_norm": 0.3168073892593384, "learning_rate": 2.9597500457570403e-06, "loss": 0.385, "step": 1790 }, { "epoch": 2.01179112857945, "grad_norm": 0.3875434696674347, "learning_rate": 2.9537839443451e-06, "loss": 0.4506, "step": 1791 }, { "epoch": 2.012914093206064, "grad_norm": 0.28625744581222534, "learning_rate": 2.947821340319664e-06, "loss": 0.3602, "step": 1792 }, { "epoch": 2.0140370578326783, "grad_norm": 0.31105443835258484, "learning_rate": 2.941862243872002e-06, "loss": 0.3864, "step": 1793 }, { "epoch": 2.0151600224592925, "grad_norm": 0.34436553716659546, "learning_rate": 2.935906665187378e-06, "loss": 0.3836, "step": 1794 }, { "epoch": 2.0162829870859067, "grad_norm": 0.31747475266456604, "learning_rate": 2.929954614445052e-06, "loss": 0.4154, "step": 1795 }, { "epoch": 2.017405951712521, "grad_norm": 0.315250962972641, "learning_rate": 2.9240061018182553e-06, "loss": 0.4155, "step": 1796 }, { "epoch": 2.018528916339135, "grad_norm": 0.302217036485672, "learning_rate": 2.9180611374741623e-06, "loss": 0.3776, "step": 1797 }, { "epoch": 2.0196518809657498, "grad_norm": 0.31083396077156067, "learning_rate": 2.912119731573892e-06, "loss": 0.389, "step": 1798 }, { "epoch": 2.020774845592364, "grad_norm": 0.3151269853115082, "learning_rate": 2.9061818942724795e-06, "loss": 0.4291, "step": 1799 }, { "epoch": 2.021897810218978, "grad_norm": 0.32542282342910767, "learning_rate": 2.900247635718856e-06, "loss": 0.4121, "step": 1800 }, { "epoch": 2.0230207748455924, "grad_norm": 0.27192172408103943, "learning_rate": 2.894316966055839e-06, "loss": 0.3649, "step": 1801 }, { "epoch": 2.0241437394722066, "grad_norm": 0.3043549954891205, "learning_rate": 2.8883898954201152e-06, "loss": 0.4045, "step": 1802 }, { "epoch": 2.025266704098821, "grad_norm": 0.27879294753074646, "learning_rate": 2.8824664339422115e-06, "loss": 0.3594, "step": 1803 }, { "epoch": 2.026389668725435, "grad_norm": 0.2773258090019226, "learning_rate": 2.876546591746494e-06, "loss": 0.3626, "step": 1804 }, { "epoch": 2.0275126333520492, "grad_norm": 0.3027551770210266, "learning_rate": 2.8706303789511388e-06, "loss": 0.4419, "step": 1805 }, { "epoch": 2.028635597978664, "grad_norm": 0.3101172149181366, "learning_rate": 2.8647178056681197e-06, "loss": 0.4393, "step": 1806 }, { "epoch": 2.029758562605278, "grad_norm": 0.3119727373123169, "learning_rate": 2.8588088820031902e-06, "loss": 0.3937, "step": 1807 }, { "epoch": 2.0308815272318923, "grad_norm": 0.33654600381851196, "learning_rate": 2.8529036180558665e-06, "loss": 0.4069, "step": 1808 }, { "epoch": 2.0320044918585065, "grad_norm": 0.29681164026260376, "learning_rate": 2.847002023919406e-06, "loss": 0.3873, "step": 1809 }, { "epoch": 2.0331274564851207, "grad_norm": 0.3617763817310333, "learning_rate": 2.841104109680796e-06, "loss": 0.4586, "step": 1810 }, { "epoch": 2.034250421111735, "grad_norm": 0.310320645570755, "learning_rate": 2.8352098854207384e-06, "loss": 0.37, "step": 1811 }, { "epoch": 2.035373385738349, "grad_norm": 0.34363606572151184, "learning_rate": 2.8293193612136183e-06, "loss": 0.4226, "step": 1812 }, { "epoch": 2.0364963503649633, "grad_norm": 0.3433118462562561, "learning_rate": 2.823432547127506e-06, "loss": 0.4247, "step": 1813 }, { "epoch": 2.0376193149915776, "grad_norm": 0.3124364912509918, "learning_rate": 2.8175494532241277e-06, "loss": 0.39, "step": 1814 }, { "epoch": 2.038742279618192, "grad_norm": 0.2864333987236023, "learning_rate": 2.8116700895588473e-06, "loss": 0.3849, "step": 1815 }, { "epoch": 2.0398652442448064, "grad_norm": 0.30110886693000793, "learning_rate": 2.805794466180659e-06, "loss": 0.4253, "step": 1816 }, { "epoch": 2.0409882088714206, "grad_norm": 0.29214343428611755, "learning_rate": 2.7999225931321585e-06, "loss": 0.3795, "step": 1817 }, { "epoch": 2.042111173498035, "grad_norm": 0.28633052110671997, "learning_rate": 2.7940544804495345e-06, "loss": 0.385, "step": 1818 }, { "epoch": 2.043234138124649, "grad_norm": 0.29198339581489563, "learning_rate": 2.788190138162551e-06, "loss": 0.3827, "step": 1819 }, { "epoch": 2.0443571027512633, "grad_norm": 0.3325103521347046, "learning_rate": 2.7823295762945203e-06, "loss": 0.4244, "step": 1820 }, { "epoch": 2.0454800673778775, "grad_norm": 0.3107115924358368, "learning_rate": 2.7764728048623003e-06, "loss": 0.4156, "step": 1821 }, { "epoch": 2.0466030320044917, "grad_norm": 0.28074267506599426, "learning_rate": 2.770619833876269e-06, "loss": 0.417, "step": 1822 }, { "epoch": 2.0477259966311063, "grad_norm": 0.29199671745300293, "learning_rate": 2.7647706733403035e-06, "loss": 0.3831, "step": 1823 }, { "epoch": 2.0488489612577205, "grad_norm": 0.31043294072151184, "learning_rate": 2.7589253332517736e-06, "loss": 0.4035, "step": 1824 }, { "epoch": 2.0499719258843347, "grad_norm": 0.31044068932533264, "learning_rate": 2.75308382360152e-06, "loss": 0.4306, "step": 1825 }, { "epoch": 2.051094890510949, "grad_norm": 0.3092561662197113, "learning_rate": 2.747246154373829e-06, "loss": 0.4003, "step": 1826 }, { "epoch": 2.052217855137563, "grad_norm": 0.3030568063259125, "learning_rate": 2.741412335546431e-06, "loss": 0.405, "step": 1827 }, { "epoch": 2.0533408197641774, "grad_norm": 0.28756189346313477, "learning_rate": 2.7355823770904737e-06, "loss": 0.4004, "step": 1828 }, { "epoch": 2.0544637843907916, "grad_norm": 0.29250991344451904, "learning_rate": 2.729756288970501e-06, "loss": 0.4096, "step": 1829 }, { "epoch": 2.055586749017406, "grad_norm": 0.2975006401538849, "learning_rate": 2.7239340811444476e-06, "loss": 0.3804, "step": 1830 }, { "epoch": 2.05670971364402, "grad_norm": 0.31611162424087524, "learning_rate": 2.718115763563614e-06, "loss": 0.4285, "step": 1831 }, { "epoch": 2.0578326782706347, "grad_norm": 0.29935789108276367, "learning_rate": 2.7123013461726523e-06, "loss": 0.3794, "step": 1832 }, { "epoch": 2.058955642897249, "grad_norm": 0.34458234906196594, "learning_rate": 2.706490838909547e-06, "loss": 0.427, "step": 1833 }, { "epoch": 2.060078607523863, "grad_norm": 0.30631276965141296, "learning_rate": 2.7006842517056013e-06, "loss": 0.3847, "step": 1834 }, { "epoch": 2.0612015721504773, "grad_norm": 0.3232940435409546, "learning_rate": 2.6948815944854153e-06, "loss": 0.3909, "step": 1835 }, { "epoch": 2.0623245367770915, "grad_norm": 0.30737611651420593, "learning_rate": 2.6890828771668742e-06, "loss": 0.4256, "step": 1836 }, { "epoch": 2.0634475014037057, "grad_norm": 0.3234766125679016, "learning_rate": 2.6832881096611306e-06, "loss": 0.4251, "step": 1837 }, { "epoch": 2.06457046603032, "grad_norm": 0.30827248096466064, "learning_rate": 2.677497301872581e-06, "loss": 0.4237, "step": 1838 }, { "epoch": 2.065693430656934, "grad_norm": 0.3025023639202118, "learning_rate": 2.671710463698859e-06, "loss": 0.3653, "step": 1839 }, { "epoch": 2.0668163952835488, "grad_norm": 0.2953476011753082, "learning_rate": 2.6659276050308136e-06, "loss": 0.3723, "step": 1840 }, { "epoch": 2.067939359910163, "grad_norm": 0.3526267409324646, "learning_rate": 2.660148735752486e-06, "loss": 0.4367, "step": 1841 }, { "epoch": 2.069062324536777, "grad_norm": 0.27386510372161865, "learning_rate": 2.6543738657411033e-06, "loss": 0.3861, "step": 1842 }, { "epoch": 2.0701852891633914, "grad_norm": 0.31825241446495056, "learning_rate": 2.6486030048670596e-06, "loss": 0.4088, "step": 1843 }, { "epoch": 2.0713082537900056, "grad_norm": 0.2678431570529938, "learning_rate": 2.6428361629938903e-06, "loss": 0.3496, "step": 1844 }, { "epoch": 2.07243121841662, "grad_norm": 0.28597742319107056, "learning_rate": 2.6370733499782654e-06, "loss": 0.4374, "step": 1845 }, { "epoch": 2.073554183043234, "grad_norm": 0.27102598547935486, "learning_rate": 2.6313145756699698e-06, "loss": 0.3671, "step": 1846 }, { "epoch": 2.0746771476698482, "grad_norm": 0.31563785672187805, "learning_rate": 2.6255598499118808e-06, "loss": 0.4461, "step": 1847 }, { "epoch": 2.075800112296463, "grad_norm": 0.278677374124527, "learning_rate": 2.6198091825399606e-06, "loss": 0.4012, "step": 1848 }, { "epoch": 2.076923076923077, "grad_norm": 0.269656777381897, "learning_rate": 2.6140625833832345e-06, "loss": 0.3725, "step": 1849 }, { "epoch": 2.0780460415496913, "grad_norm": 0.322941392660141, "learning_rate": 2.60832006226377e-06, "loss": 0.4238, "step": 1850 }, { "epoch": 2.0791690061763055, "grad_norm": 0.2759600281715393, "learning_rate": 2.6025816289966703e-06, "loss": 0.3685, "step": 1851 }, { "epoch": 2.0802919708029197, "grad_norm": 0.2690187692642212, "learning_rate": 2.5968472933900457e-06, "loss": 0.3942, "step": 1852 }, { "epoch": 2.081414935429534, "grad_norm": 0.2913070619106293, "learning_rate": 2.591117065245007e-06, "loss": 0.4099, "step": 1853 }, { "epoch": 2.082537900056148, "grad_norm": 0.3127916157245636, "learning_rate": 2.5853909543556444e-06, "loss": 0.3878, "step": 1854 }, { "epoch": 2.0836608646827623, "grad_norm": 0.3044333755970001, "learning_rate": 2.5796689705090104e-06, "loss": 0.4072, "step": 1855 }, { "epoch": 2.0847838293093766, "grad_norm": 0.27441877126693726, "learning_rate": 2.573951123485101e-06, "loss": 0.3691, "step": 1856 }, { "epoch": 2.085906793935991, "grad_norm": 0.28323671221733093, "learning_rate": 2.568237423056844e-06, "loss": 0.3795, "step": 1857 }, { "epoch": 2.0870297585626054, "grad_norm": 0.3188222348690033, "learning_rate": 2.562527878990081e-06, "loss": 0.4172, "step": 1858 }, { "epoch": 2.0881527231892196, "grad_norm": 0.30868929624557495, "learning_rate": 2.5568225010435464e-06, "loss": 0.3871, "step": 1859 }, { "epoch": 2.089275687815834, "grad_norm": 0.30544817447662354, "learning_rate": 2.5511212989688587e-06, "loss": 0.3907, "step": 1860 }, { "epoch": 2.090398652442448, "grad_norm": 0.3140981197357178, "learning_rate": 2.5454242825104915e-06, "loss": 0.4241, "step": 1861 }, { "epoch": 2.0915216170690623, "grad_norm": 0.32329824566841125, "learning_rate": 2.5397314614057704e-06, "loss": 0.3803, "step": 1862 }, { "epoch": 2.0926445816956765, "grad_norm": 0.3075975775718689, "learning_rate": 2.534042845384851e-06, "loss": 0.3868, "step": 1863 }, { "epoch": 2.0937675463222907, "grad_norm": 0.26057812571525574, "learning_rate": 2.5283584441706956e-06, "loss": 0.3592, "step": 1864 }, { "epoch": 2.094890510948905, "grad_norm": 0.3006591200828552, "learning_rate": 2.5226782674790662e-06, "loss": 0.4357, "step": 1865 }, { "epoch": 2.0960134755755195, "grad_norm": 0.3084562420845032, "learning_rate": 2.517002325018508e-06, "loss": 0.385, "step": 1866 }, { "epoch": 2.0971364402021337, "grad_norm": 0.30432891845703125, "learning_rate": 2.5113306264903215e-06, "loss": 0.3699, "step": 1867 }, { "epoch": 2.098259404828748, "grad_norm": 0.30607178807258606, "learning_rate": 2.5056631815885585e-06, "loss": 0.4508, "step": 1868 }, { "epoch": 2.099382369455362, "grad_norm": 0.26587870717048645, "learning_rate": 2.5000000000000015e-06, "loss": 0.3689, "step": 1869 }, { "epoch": 2.1005053340819764, "grad_norm": 0.2922433912754059, "learning_rate": 2.4943410914041394e-06, "loss": 0.395, "step": 1870 }, { "epoch": 2.1016282987085906, "grad_norm": 0.3016206622123718, "learning_rate": 2.488686465473165e-06, "loss": 0.4102, "step": 1871 }, { "epoch": 2.102751263335205, "grad_norm": 0.2882736623287201, "learning_rate": 2.4830361318719493e-06, "loss": 0.3648, "step": 1872 }, { "epoch": 2.103874227961819, "grad_norm": 0.30714181065559387, "learning_rate": 2.4773901002580235e-06, "loss": 0.3971, "step": 1873 }, { "epoch": 2.1049971925884337, "grad_norm": 0.31683698296546936, "learning_rate": 2.4717483802815696e-06, "loss": 0.4464, "step": 1874 }, { "epoch": 2.106120157215048, "grad_norm": 0.29984551668167114, "learning_rate": 2.4661109815854005e-06, "loss": 0.3935, "step": 1875 }, { "epoch": 2.107243121841662, "grad_norm": 0.28986573219299316, "learning_rate": 2.460477913804938e-06, "loss": 0.3967, "step": 1876 }, { "epoch": 2.1083660864682763, "grad_norm": 0.3030136227607727, "learning_rate": 2.454849186568208e-06, "loss": 0.3845, "step": 1877 }, { "epoch": 2.1094890510948905, "grad_norm": 0.31672045588493347, "learning_rate": 2.449224809495815e-06, "loss": 0.4053, "step": 1878 }, { "epoch": 2.1106120157215047, "grad_norm": 0.2836325168609619, "learning_rate": 2.443604792200925e-06, "loss": 0.3915, "step": 1879 }, { "epoch": 2.111734980348119, "grad_norm": 0.2861262261867523, "learning_rate": 2.437989144289256e-06, "loss": 0.4319, "step": 1880 }, { "epoch": 2.112857944974733, "grad_norm": 0.27900275588035583, "learning_rate": 2.4323778753590582e-06, "loss": 0.3891, "step": 1881 }, { "epoch": 2.1139809096013478, "grad_norm": 0.299130380153656, "learning_rate": 2.4267709950010975e-06, "loss": 0.4157, "step": 1882 }, { "epoch": 2.115103874227962, "grad_norm": 0.2968760132789612, "learning_rate": 2.421168512798634e-06, "loss": 0.4091, "step": 1883 }, { "epoch": 2.116226838854576, "grad_norm": 0.2967083156108856, "learning_rate": 2.4155704383274154e-06, "loss": 0.4259, "step": 1884 }, { "epoch": 2.1173498034811904, "grad_norm": 0.30164745450019836, "learning_rate": 2.409976781155654e-06, "loss": 0.404, "step": 1885 }, { "epoch": 2.1184727681078046, "grad_norm": 0.28130990266799927, "learning_rate": 2.404387550844013e-06, "loss": 0.3935, "step": 1886 }, { "epoch": 2.119595732734419, "grad_norm": 0.31634601950645447, "learning_rate": 2.3988027569455895e-06, "loss": 0.3961, "step": 1887 }, { "epoch": 2.120718697361033, "grad_norm": 0.3025117516517639, "learning_rate": 2.3932224090058938e-06, "loss": 0.4142, "step": 1888 }, { "epoch": 2.1218416619876472, "grad_norm": 0.2771505117416382, "learning_rate": 2.3876465165628436e-06, "loss": 0.4047, "step": 1889 }, { "epoch": 2.1229646266142614, "grad_norm": 0.2922213077545166, "learning_rate": 2.3820750891467355e-06, "loss": 0.3994, "step": 1890 }, { "epoch": 2.124087591240876, "grad_norm": 0.3175619840621948, "learning_rate": 2.3765081362802374e-06, "loss": 0.4144, "step": 1891 }, { "epoch": 2.1252105558674903, "grad_norm": 0.307605117559433, "learning_rate": 2.370945667478371e-06, "loss": 0.426, "step": 1892 }, { "epoch": 2.1263335204941045, "grad_norm": 0.3419835865497589, "learning_rate": 2.365387692248488e-06, "loss": 0.4226, "step": 1893 }, { "epoch": 2.1274564851207187, "grad_norm": 0.29923638701438904, "learning_rate": 2.3598342200902665e-06, "loss": 0.3714, "step": 1894 }, { "epoch": 2.128579449747333, "grad_norm": 0.30123764276504517, "learning_rate": 2.354285260495685e-06, "loss": 0.3951, "step": 1895 }, { "epoch": 2.129702414373947, "grad_norm": 0.28632184863090515, "learning_rate": 2.348740822949006e-06, "loss": 0.3633, "step": 1896 }, { "epoch": 2.1308253790005613, "grad_norm": 0.3121209144592285, "learning_rate": 2.343200916926768e-06, "loss": 0.4447, "step": 1897 }, { "epoch": 2.1319483436271756, "grad_norm": 0.2967601418495178, "learning_rate": 2.337665551897764e-06, "loss": 0.4107, "step": 1898 }, { "epoch": 2.13307130825379, "grad_norm": 0.27809852361679077, "learning_rate": 2.33213473732302e-06, "loss": 0.388, "step": 1899 }, { "epoch": 2.1341942728804044, "grad_norm": 0.30501827597618103, "learning_rate": 2.3266084826557906e-06, "loss": 0.3822, "step": 1900 }, { "epoch": 2.1353172375070186, "grad_norm": 0.28480109572410583, "learning_rate": 2.3210867973415347e-06, "loss": 0.4006, "step": 1901 }, { "epoch": 2.136440202133633, "grad_norm": 0.31813332438468933, "learning_rate": 2.3155696908178974e-06, "loss": 0.4079, "step": 1902 }, { "epoch": 2.137563166760247, "grad_norm": 0.3131076395511627, "learning_rate": 2.310057172514703e-06, "loss": 0.4051, "step": 1903 }, { "epoch": 2.1386861313868613, "grad_norm": 0.3432920277118683, "learning_rate": 2.3045492518539343e-06, "loss": 0.4078, "step": 1904 }, { "epoch": 2.1398090960134755, "grad_norm": 0.28056827187538147, "learning_rate": 2.2990459382497086e-06, "loss": 0.3865, "step": 1905 }, { "epoch": 2.1409320606400897, "grad_norm": 0.32078495621681213, "learning_rate": 2.2935472411082753e-06, "loss": 0.4096, "step": 1906 }, { "epoch": 2.1420550252667043, "grad_norm": 0.2931683659553528, "learning_rate": 2.2880531698279925e-06, "loss": 0.3741, "step": 1907 }, { "epoch": 2.1431779898933185, "grad_norm": 0.319014310836792, "learning_rate": 2.2825637337993094e-06, "loss": 0.4204, "step": 1908 }, { "epoch": 2.1443009545199327, "grad_norm": 0.33892372250556946, "learning_rate": 2.2770789424047566e-06, "loss": 0.4222, "step": 1909 }, { "epoch": 2.145423919146547, "grad_norm": 0.2994024157524109, "learning_rate": 2.2715988050189195e-06, "loss": 0.3787, "step": 1910 }, { "epoch": 2.146546883773161, "grad_norm": 0.3465620279312134, "learning_rate": 2.266123331008436e-06, "loss": 0.4549, "step": 1911 }, { "epoch": 2.1476698483997754, "grad_norm": 0.34397295117378235, "learning_rate": 2.260652529731968e-06, "loss": 0.3997, "step": 1912 }, { "epoch": 2.1487928130263896, "grad_norm": 0.297284334897995, "learning_rate": 2.255186410540197e-06, "loss": 0.3846, "step": 1913 }, { "epoch": 2.149915777653004, "grad_norm": 0.3011060059070587, "learning_rate": 2.2497249827757933e-06, "loss": 0.3837, "step": 1914 }, { "epoch": 2.151038742279618, "grad_norm": 0.2841435968875885, "learning_rate": 2.244268255773415e-06, "loss": 0.3695, "step": 1915 }, { "epoch": 2.1521617069062327, "grad_norm": 0.37009215354919434, "learning_rate": 2.2388162388596867e-06, "loss": 0.4376, "step": 1916 }, { "epoch": 2.153284671532847, "grad_norm": 0.32911205291748047, "learning_rate": 2.233368941353175e-06, "loss": 0.4042, "step": 1917 }, { "epoch": 2.154407636159461, "grad_norm": 0.2850897014141083, "learning_rate": 2.227926372564387e-06, "loss": 0.3776, "step": 1918 }, { "epoch": 2.1555306007860753, "grad_norm": 0.2865746021270752, "learning_rate": 2.2224885417957482e-06, "loss": 0.416, "step": 1919 }, { "epoch": 2.1566535654126895, "grad_norm": 0.3166450262069702, "learning_rate": 2.2170554583415782e-06, "loss": 0.3592, "step": 1920 }, { "epoch": 2.1577765300393037, "grad_norm": 0.3234749436378479, "learning_rate": 2.2116271314880896e-06, "loss": 0.4189, "step": 1921 }, { "epoch": 2.158899494665918, "grad_norm": 0.28501206636428833, "learning_rate": 2.2062035705133644e-06, "loss": 0.3919, "step": 1922 }, { "epoch": 2.160022459292532, "grad_norm": 0.30249863862991333, "learning_rate": 2.2007847846873342e-06, "loss": 0.4098, "step": 1923 }, { "epoch": 2.1611454239191463, "grad_norm": 0.3150129020214081, "learning_rate": 2.1953707832717745e-06, "loss": 0.4093, "step": 1924 }, { "epoch": 2.162268388545761, "grad_norm": 0.3020494878292084, "learning_rate": 2.1899615755202784e-06, "loss": 0.4345, "step": 1925 }, { "epoch": 2.163391353172375, "grad_norm": 0.28092849254608154, "learning_rate": 2.1845571706782486e-06, "loss": 0.3757, "step": 1926 }, { "epoch": 2.1645143177989894, "grad_norm": 0.3101217448711395, "learning_rate": 2.179157577982881e-06, "loss": 0.4157, "step": 1927 }, { "epoch": 2.1656372824256036, "grad_norm": 0.27302488684654236, "learning_rate": 2.173762806663139e-06, "loss": 0.3617, "step": 1928 }, { "epoch": 2.166760247052218, "grad_norm": 0.2816198766231537, "learning_rate": 2.1683728659397517e-06, "loss": 0.3844, "step": 1929 }, { "epoch": 2.167883211678832, "grad_norm": 0.2930549681186676, "learning_rate": 2.1629877650251936e-06, "loss": 0.39, "step": 1930 }, { "epoch": 2.1690061763054462, "grad_norm": 0.31996238231658936, "learning_rate": 2.1576075131236574e-06, "loss": 0.4035, "step": 1931 }, { "epoch": 2.1701291409320604, "grad_norm": 0.27931591868400574, "learning_rate": 2.1522321194310577e-06, "loss": 0.4206, "step": 1932 }, { "epoch": 2.171252105558675, "grad_norm": 0.3025978207588196, "learning_rate": 2.146861593135e-06, "loss": 0.4436, "step": 1933 }, { "epoch": 2.1723750701852893, "grad_norm": 0.2788255512714386, "learning_rate": 2.141495943414774e-06, "loss": 0.3927, "step": 1934 }, { "epoch": 2.1734980348119035, "grad_norm": 0.280332088470459, "learning_rate": 2.1361351794413334e-06, "loss": 0.3877, "step": 1935 }, { "epoch": 2.1746209994385177, "grad_norm": 0.30251309275627136, "learning_rate": 2.1307793103772762e-06, "loss": 0.4151, "step": 1936 }, { "epoch": 2.175743964065132, "grad_norm": 0.2937428057193756, "learning_rate": 2.125428345376841e-06, "loss": 0.413, "step": 1937 }, { "epoch": 2.176866928691746, "grad_norm": 0.27605023980140686, "learning_rate": 2.1200822935858807e-06, "loss": 0.397, "step": 1938 }, { "epoch": 2.1779898933183603, "grad_norm": 0.29239359498023987, "learning_rate": 2.1147411641418535e-06, "loss": 0.4524, "step": 1939 }, { "epoch": 2.1791128579449746, "grad_norm": 0.2887464165687561, "learning_rate": 2.1094049661737986e-06, "loss": 0.3851, "step": 1940 }, { "epoch": 2.180235822571589, "grad_norm": 0.2915821671485901, "learning_rate": 2.1040737088023323e-06, "loss": 0.4201, "step": 1941 }, { "epoch": 2.1813587871982034, "grad_norm": 0.28487229347229004, "learning_rate": 2.098747401139625e-06, "loss": 0.3701, "step": 1942 }, { "epoch": 2.1824817518248176, "grad_norm": 0.2724740505218506, "learning_rate": 2.093426052289384e-06, "loss": 0.3909, "step": 1943 }, { "epoch": 2.183604716451432, "grad_norm": 0.2859509289264679, "learning_rate": 2.0881096713468435e-06, "loss": 0.4145, "step": 1944 }, { "epoch": 2.184727681078046, "grad_norm": 0.3178553879261017, "learning_rate": 2.0827982673987483e-06, "loss": 0.4344, "step": 1945 }, { "epoch": 2.1858506457046603, "grad_norm": 0.30566826462745667, "learning_rate": 2.077491849523332e-06, "loss": 0.3859, "step": 1946 }, { "epoch": 2.1869736103312745, "grad_norm": 0.2980233132839203, "learning_rate": 2.0721904267903097e-06, "loss": 0.3995, "step": 1947 }, { "epoch": 2.1880965749578887, "grad_norm": 0.3464009165763855, "learning_rate": 2.066894008260859e-06, "loss": 0.3738, "step": 1948 }, { "epoch": 2.189219539584503, "grad_norm": 0.2979845404624939, "learning_rate": 2.0616026029875995e-06, "loss": 0.4245, "step": 1949 }, { "epoch": 2.1903425042111175, "grad_norm": 0.2993336319923401, "learning_rate": 2.056316220014588e-06, "loss": 0.3892, "step": 1950 }, { "epoch": 2.1914654688377317, "grad_norm": 0.27937158942222595, "learning_rate": 2.0510348683772966e-06, "loss": 0.3877, "step": 1951 }, { "epoch": 2.192588433464346, "grad_norm": 0.30241861939430237, "learning_rate": 2.0457585571025925e-06, "loss": 0.4269, "step": 1952 }, { "epoch": 2.19371139809096, "grad_norm": 0.3117314279079437, "learning_rate": 2.040487295208732e-06, "loss": 0.3909, "step": 1953 }, { "epoch": 2.1948343627175744, "grad_norm": 0.300813764333725, "learning_rate": 2.0352210917053438e-06, "loss": 0.4362, "step": 1954 }, { "epoch": 2.1959573273441886, "grad_norm": 0.27120837569236755, "learning_rate": 2.029959955593404e-06, "loss": 0.3941, "step": 1955 }, { "epoch": 2.197080291970803, "grad_norm": 0.2901502549648285, "learning_rate": 2.024703895865232e-06, "loss": 0.3929, "step": 1956 }, { "epoch": 2.198203256597417, "grad_norm": 0.3000068962574005, "learning_rate": 2.0194529215044718e-06, "loss": 0.4104, "step": 1957 }, { "epoch": 2.199326221224031, "grad_norm": 0.2672725021839142, "learning_rate": 2.0142070414860704e-06, "loss": 0.3763, "step": 1958 }, { "epoch": 2.200449185850646, "grad_norm": 0.3342515528202057, "learning_rate": 2.0089662647762716e-06, "loss": 0.4273, "step": 1959 }, { "epoch": 2.20157215047726, "grad_norm": 0.30191823840141296, "learning_rate": 2.0037306003325964e-06, "loss": 0.3742, "step": 1960 }, { "epoch": 2.2026951151038743, "grad_norm": 0.30050787329673767, "learning_rate": 1.998500057103826e-06, "loss": 0.4144, "step": 1961 }, { "epoch": 2.2038180797304885, "grad_norm": 0.29189521074295044, "learning_rate": 1.9932746440299926e-06, "loss": 0.371, "step": 1962 }, { "epoch": 2.2049410443571027, "grad_norm": 0.27885472774505615, "learning_rate": 1.9880543700423533e-06, "loss": 0.3858, "step": 1963 }, { "epoch": 2.206064008983717, "grad_norm": 0.30348601937294006, "learning_rate": 1.9828392440633864e-06, "loss": 0.4141, "step": 1964 }, { "epoch": 2.207186973610331, "grad_norm": 0.2691362500190735, "learning_rate": 1.977629275006772e-06, "loss": 0.3756, "step": 1965 }, { "epoch": 2.2083099382369458, "grad_norm": 0.29358971118927, "learning_rate": 1.9724244717773703e-06, "loss": 0.402, "step": 1966 }, { "epoch": 2.20943290286356, "grad_norm": 0.28840798139572144, "learning_rate": 1.967224843271218e-06, "loss": 0.3777, "step": 1967 }, { "epoch": 2.210555867490174, "grad_norm": 0.30779701471328735, "learning_rate": 1.962030398375506e-06, "loss": 0.4326, "step": 1968 }, { "epoch": 2.2116788321167884, "grad_norm": 0.2954394817352295, "learning_rate": 1.9568411459685615e-06, "loss": 0.4257, "step": 1969 }, { "epoch": 2.2128017967434026, "grad_norm": 0.2642882466316223, "learning_rate": 1.951657094919841e-06, "loss": 0.3551, "step": 1970 }, { "epoch": 2.213924761370017, "grad_norm": 0.3375574052333832, "learning_rate": 1.946478254089911e-06, "loss": 0.4255, "step": 1971 }, { "epoch": 2.215047725996631, "grad_norm": 0.28090715408325195, "learning_rate": 1.9413046323304278e-06, "loss": 0.3846, "step": 1972 }, { "epoch": 2.2161706906232452, "grad_norm": 0.3022766709327698, "learning_rate": 1.9361362384841326e-06, "loss": 0.421, "step": 1973 }, { "epoch": 2.2172936552498594, "grad_norm": 0.25339406728744507, "learning_rate": 1.9309730813848302e-06, "loss": 0.3878, "step": 1974 }, { "epoch": 2.218416619876474, "grad_norm": 0.2971627712249756, "learning_rate": 1.9258151698573707e-06, "loss": 0.4157, "step": 1975 }, { "epoch": 2.2195395845030883, "grad_norm": 0.31483760476112366, "learning_rate": 1.920662512717643e-06, "loss": 0.3695, "step": 1976 }, { "epoch": 2.2206625491297025, "grad_norm": 0.29008275270462036, "learning_rate": 1.915515118772555e-06, "loss": 0.3691, "step": 1977 }, { "epoch": 2.2217855137563167, "grad_norm": 0.34538841247558594, "learning_rate": 1.9103729968200145e-06, "loss": 0.4527, "step": 1978 }, { "epoch": 2.222908478382931, "grad_norm": 0.28085678815841675, "learning_rate": 1.905236155648923e-06, "loss": 0.381, "step": 1979 }, { "epoch": 2.224031443009545, "grad_norm": 0.2612300217151642, "learning_rate": 1.9001046040391558e-06, "loss": 0.3673, "step": 1980 }, { "epoch": 2.2251544076361593, "grad_norm": 0.30402663350105286, "learning_rate": 1.8949783507615426e-06, "loss": 0.4218, "step": 1981 }, { "epoch": 2.2262773722627736, "grad_norm": 0.2923140525817871, "learning_rate": 1.8898574045778624e-06, "loss": 0.3961, "step": 1982 }, { "epoch": 2.2274003368893878, "grad_norm": 0.2793225646018982, "learning_rate": 1.884741774240823e-06, "loss": 0.364, "step": 1983 }, { "epoch": 2.2285233015160024, "grad_norm": 0.28225046396255493, "learning_rate": 1.8796314684940415e-06, "loss": 0.417, "step": 1984 }, { "epoch": 2.2296462661426166, "grad_norm": 0.284463107585907, "learning_rate": 1.8745264960720389e-06, "loss": 0.3992, "step": 1985 }, { "epoch": 2.230769230769231, "grad_norm": 0.27850160002708435, "learning_rate": 1.8694268657002197e-06, "loss": 0.3536, "step": 1986 }, { "epoch": 2.231892195395845, "grad_norm": 0.3027052581310272, "learning_rate": 1.8643325860948568e-06, "loss": 0.4033, "step": 1987 }, { "epoch": 2.2330151600224593, "grad_norm": 0.27328038215637207, "learning_rate": 1.8592436659630786e-06, "loss": 0.3714, "step": 1988 }, { "epoch": 2.2341381246490735, "grad_norm": 0.2749581038951874, "learning_rate": 1.8541601140028542e-06, "loss": 0.3864, "step": 1989 }, { "epoch": 2.2352610892756877, "grad_norm": 0.2909168601036072, "learning_rate": 1.8490819389029713e-06, "loss": 0.4311, "step": 1990 }, { "epoch": 2.236384053902302, "grad_norm": 0.2728903889656067, "learning_rate": 1.8440091493430345e-06, "loss": 0.3784, "step": 1991 }, { "epoch": 2.2375070185289165, "grad_norm": 0.3319598138332367, "learning_rate": 1.8389417539934428e-06, "loss": 0.4465, "step": 1992 }, { "epoch": 2.2386299831555307, "grad_norm": 0.2700807750225067, "learning_rate": 1.8338797615153697e-06, "loss": 0.3848, "step": 1993 }, { "epoch": 2.239752947782145, "grad_norm": 0.28644469380378723, "learning_rate": 1.8288231805607593e-06, "loss": 0.3943, "step": 1994 }, { "epoch": 2.240875912408759, "grad_norm": 0.31425344944000244, "learning_rate": 1.8237720197723075e-06, "loss": 0.4137, "step": 1995 }, { "epoch": 2.2419988770353734, "grad_norm": 0.28912168741226196, "learning_rate": 1.81872628778344e-06, "loss": 0.4061, "step": 1996 }, { "epoch": 2.2431218416619876, "grad_norm": 0.27424824237823486, "learning_rate": 1.8136859932183105e-06, "loss": 0.3751, "step": 1997 }, { "epoch": 2.244244806288602, "grad_norm": 0.29128405451774597, "learning_rate": 1.8086511446917715e-06, "loss": 0.3756, "step": 1998 }, { "epoch": 2.245367770915216, "grad_norm": 0.2677030563354492, "learning_rate": 1.8036217508093746e-06, "loss": 0.3955, "step": 1999 }, { "epoch": 2.2464907355418307, "grad_norm": 0.2874849736690521, "learning_rate": 1.7985978201673455e-06, "loss": 0.4011, "step": 2000 }, { "epoch": 2.247613700168445, "grad_norm": 0.2831129729747772, "learning_rate": 1.7935793613525693e-06, "loss": 0.4018, "step": 2001 }, { "epoch": 2.248736664795059, "grad_norm": 0.2607268691062927, "learning_rate": 1.788566382942582e-06, "loss": 0.3913, "step": 2002 }, { "epoch": 2.2498596294216733, "grad_norm": 0.27383288741111755, "learning_rate": 1.7835588935055542e-06, "loss": 0.3877, "step": 2003 }, { "epoch": 2.2509825940482875, "grad_norm": 0.29349103569984436, "learning_rate": 1.7785569016002686e-06, "loss": 0.4255, "step": 2004 }, { "epoch": 2.2521055586749017, "grad_norm": 0.29879453778266907, "learning_rate": 1.7735604157761165e-06, "loss": 0.3903, "step": 2005 }, { "epoch": 2.253228523301516, "grad_norm": 0.2790232300758362, "learning_rate": 1.7685694445730788e-06, "loss": 0.3944, "step": 2006 }, { "epoch": 2.25435148792813, "grad_norm": 0.2784440517425537, "learning_rate": 1.7635839965217055e-06, "loss": 0.4049, "step": 2007 }, { "epoch": 2.2554744525547443, "grad_norm": 0.2787493169307709, "learning_rate": 1.7586040801431115e-06, "loss": 0.3825, "step": 2008 }, { "epoch": 2.256597417181359, "grad_norm": 0.30028894543647766, "learning_rate": 1.7536297039489559e-06, "loss": 0.4503, "step": 2009 }, { "epoch": 2.257720381807973, "grad_norm": 0.270557165145874, "learning_rate": 1.748660876441428e-06, "loss": 0.4173, "step": 2010 }, { "epoch": 2.2588433464345874, "grad_norm": 0.2905243933200836, "learning_rate": 1.7436976061132321e-06, "loss": 0.3957, "step": 2011 }, { "epoch": 2.2599663110612016, "grad_norm": 0.2664340138435364, "learning_rate": 1.7387399014475754e-06, "loss": 0.3823, "step": 2012 }, { "epoch": 2.261089275687816, "grad_norm": 0.2964526116847992, "learning_rate": 1.7337877709181527e-06, "loss": 0.3995, "step": 2013 }, { "epoch": 2.26221224031443, "grad_norm": 0.32127654552459717, "learning_rate": 1.7288412229891315e-06, "loss": 0.3935, "step": 2014 }, { "epoch": 2.2633352049410442, "grad_norm": 0.30939817428588867, "learning_rate": 1.7239002661151383e-06, "loss": 0.4136, "step": 2015 }, { "epoch": 2.2644581695676584, "grad_norm": 0.31615445017814636, "learning_rate": 1.7189649087412385e-06, "loss": 0.4085, "step": 2016 }, { "epoch": 2.2655811341942727, "grad_norm": 0.28828561305999756, "learning_rate": 1.7140351593029324e-06, "loss": 0.4227, "step": 2017 }, { "epoch": 2.2667040988208873, "grad_norm": 0.3275562524795532, "learning_rate": 1.7091110262261356e-06, "loss": 0.4436, "step": 2018 }, { "epoch": 2.2678270634475015, "grad_norm": 0.2741810977458954, "learning_rate": 1.7041925179271584e-06, "loss": 0.3841, "step": 2019 }, { "epoch": 2.2689500280741157, "grad_norm": 0.2728815972805023, "learning_rate": 1.6992796428127017e-06, "loss": 0.3979, "step": 2020 }, { "epoch": 2.27007299270073, "grad_norm": 0.2973443865776062, "learning_rate": 1.6943724092798398e-06, "loss": 0.4195, "step": 2021 }, { "epoch": 2.271195957327344, "grad_norm": 0.27824628353118896, "learning_rate": 1.689470825715998e-06, "loss": 0.3764, "step": 2022 }, { "epoch": 2.2723189219539583, "grad_norm": 0.29291802644729614, "learning_rate": 1.6845749004989508e-06, "loss": 0.3893, "step": 2023 }, { "epoch": 2.2734418865805726, "grad_norm": 0.27821117639541626, "learning_rate": 1.679684641996801e-06, "loss": 0.3785, "step": 2024 }, { "epoch": 2.274564851207187, "grad_norm": 0.3182985186576843, "learning_rate": 1.6748000585679602e-06, "loss": 0.4427, "step": 2025 }, { "epoch": 2.2756878158338014, "grad_norm": 0.29601478576660156, "learning_rate": 1.6699211585611464e-06, "loss": 0.3932, "step": 2026 }, { "epoch": 2.2768107804604156, "grad_norm": 0.2905028462409973, "learning_rate": 1.6650479503153627e-06, "loss": 0.3877, "step": 2027 }, { "epoch": 2.27793374508703, "grad_norm": 0.30522292852401733, "learning_rate": 1.6601804421598787e-06, "loss": 0.395, "step": 2028 }, { "epoch": 2.279056709713644, "grad_norm": 0.31536349654197693, "learning_rate": 1.655318642414227e-06, "loss": 0.4417, "step": 2029 }, { "epoch": 2.2801796743402583, "grad_norm": 0.2804311513900757, "learning_rate": 1.650462559388184e-06, "loss": 0.3812, "step": 2030 }, { "epoch": 2.2813026389668725, "grad_norm": 0.28646227717399597, "learning_rate": 1.6456122013817477e-06, "loss": 0.4281, "step": 2031 }, { "epoch": 2.2824256035934867, "grad_norm": 0.26667070388793945, "learning_rate": 1.6407675766851388e-06, "loss": 0.4187, "step": 2032 }, { "epoch": 2.283548568220101, "grad_norm": 0.2688080966472626, "learning_rate": 1.635928693578777e-06, "loss": 0.4014, "step": 2033 }, { "epoch": 2.2846715328467155, "grad_norm": 0.28266045451164246, "learning_rate": 1.631095560333264e-06, "loss": 0.4008, "step": 2034 }, { "epoch": 2.2857944974733297, "grad_norm": 0.29633837938308716, "learning_rate": 1.6262681852093782e-06, "loss": 0.4292, "step": 2035 }, { "epoch": 2.286917462099944, "grad_norm": 0.3028467893600464, "learning_rate": 1.6214465764580566e-06, "loss": 0.4275, "step": 2036 }, { "epoch": 2.288040426726558, "grad_norm": 0.3047069013118744, "learning_rate": 1.6166307423203765e-06, "loss": 0.3949, "step": 2037 }, { "epoch": 2.2891633913531724, "grad_norm": 0.2905154824256897, "learning_rate": 1.611820691027548e-06, "loss": 0.3888, "step": 2038 }, { "epoch": 2.2902863559797866, "grad_norm": 0.28541865944862366, "learning_rate": 1.607016430800898e-06, "loss": 0.4209, "step": 2039 }, { "epoch": 2.291409320606401, "grad_norm": 0.30770328640937805, "learning_rate": 1.6022179698518525e-06, "loss": 0.4057, "step": 2040 }, { "epoch": 2.292532285233015, "grad_norm": 0.28820711374282837, "learning_rate": 1.5974253163819298e-06, "loss": 0.4052, "step": 2041 }, { "epoch": 2.293655249859629, "grad_norm": 0.31473737955093384, "learning_rate": 1.592638478582716e-06, "loss": 0.4176, "step": 2042 }, { "epoch": 2.294778214486244, "grad_norm": 0.28557637333869934, "learning_rate": 1.5878574646358608e-06, "loss": 0.4421, "step": 2043 }, { "epoch": 2.295901179112858, "grad_norm": 0.2726806402206421, "learning_rate": 1.5830822827130616e-06, "loss": 0.3769, "step": 2044 }, { "epoch": 2.2970241437394723, "grad_norm": 0.2668299973011017, "learning_rate": 1.5783129409760423e-06, "loss": 0.4001, "step": 2045 }, { "epoch": 2.2981471083660865, "grad_norm": 0.276551753282547, "learning_rate": 1.573549447576549e-06, "loss": 0.4024, "step": 2046 }, { "epoch": 2.2992700729927007, "grad_norm": 0.2840927243232727, "learning_rate": 1.5687918106563326e-06, "loss": 0.3896, "step": 2047 }, { "epoch": 2.300393037619315, "grad_norm": 0.2950480878353119, "learning_rate": 1.5640400383471293e-06, "loss": 0.4383, "step": 2048 }, { "epoch": 2.301516002245929, "grad_norm": 0.2729065418243408, "learning_rate": 1.5592941387706562e-06, "loss": 0.3934, "step": 2049 }, { "epoch": 2.3026389668725433, "grad_norm": 0.27623453736305237, "learning_rate": 1.5545541200385916e-06, "loss": 0.4038, "step": 2050 }, { "epoch": 2.3037619314991575, "grad_norm": 0.29540038108825684, "learning_rate": 1.549819990252559e-06, "loss": 0.4426, "step": 2051 }, { "epoch": 2.304884896125772, "grad_norm": 0.2680819630622864, "learning_rate": 1.5450917575041209e-06, "loss": 0.3861, "step": 2052 }, { "epoch": 2.3060078607523864, "grad_norm": 0.2827078700065613, "learning_rate": 1.5403694298747602e-06, "loss": 0.3782, "step": 2053 }, { "epoch": 2.3071308253790006, "grad_norm": 0.298056960105896, "learning_rate": 1.535653015435863e-06, "loss": 0.4128, "step": 2054 }, { "epoch": 2.308253790005615, "grad_norm": 0.30466610193252563, "learning_rate": 1.5309425222487119e-06, "loss": 0.4201, "step": 2055 }, { "epoch": 2.309376754632229, "grad_norm": 0.2935538589954376, "learning_rate": 1.5262379583644704e-06, "loss": 0.3982, "step": 2056 }, { "epoch": 2.3104997192588432, "grad_norm": 0.31471729278564453, "learning_rate": 1.5215393318241612e-06, "loss": 0.3779, "step": 2057 }, { "epoch": 2.3116226838854574, "grad_norm": 0.2725154757499695, "learning_rate": 1.5168466506586654e-06, "loss": 0.3783, "step": 2058 }, { "epoch": 2.312745648512072, "grad_norm": 0.3134515583515167, "learning_rate": 1.5121599228887012e-06, "loss": 0.4423, "step": 2059 }, { "epoch": 2.3138686131386863, "grad_norm": 0.30474787950515747, "learning_rate": 1.5074791565248076e-06, "loss": 0.4197, "step": 2060 }, { "epoch": 2.3149915777653005, "grad_norm": 0.2862532436847687, "learning_rate": 1.502804359567337e-06, "loss": 0.3606, "step": 2061 }, { "epoch": 2.3161145423919147, "grad_norm": 0.2956801950931549, "learning_rate": 1.49813554000644e-06, "loss": 0.4094, "step": 2062 }, { "epoch": 2.317237507018529, "grad_norm": 0.27539893984794617, "learning_rate": 1.4934727058220499e-06, "loss": 0.3695, "step": 2063 }, { "epoch": 2.318360471645143, "grad_norm": 0.30448004603385925, "learning_rate": 1.4888158649838675e-06, "loss": 0.4235, "step": 2064 }, { "epoch": 2.3194834362717573, "grad_norm": 0.271424800157547, "learning_rate": 1.4841650254513512e-06, "loss": 0.3721, "step": 2065 }, { "epoch": 2.3206064008983716, "grad_norm": 0.2611352205276489, "learning_rate": 1.4795201951737037e-06, "loss": 0.3821, "step": 2066 }, { "epoch": 2.3217293655249858, "grad_norm": 0.2895631492137909, "learning_rate": 1.4748813820898554e-06, "loss": 0.4169, "step": 2067 }, { "epoch": 2.3228523301516004, "grad_norm": 0.29484856128692627, "learning_rate": 1.4702485941284534e-06, "loss": 0.4238, "step": 2068 }, { "epoch": 2.3239752947782146, "grad_norm": 0.26578307151794434, "learning_rate": 1.4656218392078415e-06, "loss": 0.3917, "step": 2069 }, { "epoch": 2.325098259404829, "grad_norm": 0.2779478430747986, "learning_rate": 1.4610011252360594e-06, "loss": 0.3929, "step": 2070 }, { "epoch": 2.326221224031443, "grad_norm": 0.29051750898361206, "learning_rate": 1.4563864601108152e-06, "loss": 0.4192, "step": 2071 }, { "epoch": 2.3273441886580573, "grad_norm": 0.27829909324645996, "learning_rate": 1.4517778517194819e-06, "loss": 0.4256, "step": 2072 }, { "epoch": 2.3284671532846715, "grad_norm": 0.27342063188552856, "learning_rate": 1.4471753079390815e-06, "loss": 0.3787, "step": 2073 }, { "epoch": 2.3295901179112857, "grad_norm": 0.2927097976207733, "learning_rate": 1.4425788366362654e-06, "loss": 0.4046, "step": 2074 }, { "epoch": 2.3307130825379, "grad_norm": 0.28893619775772095, "learning_rate": 1.4379884456673105e-06, "loss": 0.4126, "step": 2075 }, { "epoch": 2.331836047164514, "grad_norm": 0.264371782541275, "learning_rate": 1.4334041428781003e-06, "loss": 0.3797, "step": 2076 }, { "epoch": 2.3329590117911287, "grad_norm": 0.26451098918914795, "learning_rate": 1.428825936104109e-06, "loss": 0.3806, "step": 2077 }, { "epoch": 2.334081976417743, "grad_norm": 0.27199819684028625, "learning_rate": 1.424253833170397e-06, "loss": 0.382, "step": 2078 }, { "epoch": 2.335204941044357, "grad_norm": 0.29680174589157104, "learning_rate": 1.4196878418915894e-06, "loss": 0.4289, "step": 2079 }, { "epoch": 2.3363279056709714, "grad_norm": 0.30170002579689026, "learning_rate": 1.4151279700718623e-06, "loss": 0.3995, "step": 2080 }, { "epoch": 2.3374508702975856, "grad_norm": 0.3056568503379822, "learning_rate": 1.410574225504937e-06, "loss": 0.433, "step": 2081 }, { "epoch": 2.3385738349242, "grad_norm": 0.2628425359725952, "learning_rate": 1.4060266159740627e-06, "loss": 0.3402, "step": 2082 }, { "epoch": 2.339696799550814, "grad_norm": 0.3146252930164337, "learning_rate": 1.401485149251996e-06, "loss": 0.4201, "step": 2083 }, { "epoch": 2.3408197641774287, "grad_norm": 0.284625381231308, "learning_rate": 1.3969498331010012e-06, "loss": 0.4228, "step": 2084 }, { "epoch": 2.341942728804043, "grad_norm": 0.2892228662967682, "learning_rate": 1.3924206752728282e-06, "loss": 0.4164, "step": 2085 }, { "epoch": 2.343065693430657, "grad_norm": 0.3271387815475464, "learning_rate": 1.3878976835086971e-06, "loss": 0.4568, "step": 2086 }, { "epoch": 2.3441886580572713, "grad_norm": 0.24272572994232178, "learning_rate": 1.3833808655392943e-06, "loss": 0.3328, "step": 2087 }, { "epoch": 2.3453116226838855, "grad_norm": 0.295335054397583, "learning_rate": 1.3788702290847517e-06, "loss": 0.4099, "step": 2088 }, { "epoch": 2.3464345873104997, "grad_norm": 0.308559387922287, "learning_rate": 1.3743657818546363e-06, "loss": 0.4546, "step": 2089 }, { "epoch": 2.347557551937114, "grad_norm": 0.2718144655227661, "learning_rate": 1.369867531547937e-06, "loss": 0.3605, "step": 2090 }, { "epoch": 2.348680516563728, "grad_norm": 0.2784050703048706, "learning_rate": 1.3653754858530477e-06, "loss": 0.3898, "step": 2091 }, { "epoch": 2.3498034811903423, "grad_norm": 0.27956458926200867, "learning_rate": 1.3608896524477606e-06, "loss": 0.411, "step": 2092 }, { "epoch": 2.350926445816957, "grad_norm": 0.283742219209671, "learning_rate": 1.3564100389992497e-06, "loss": 0.4172, "step": 2093 }, { "epoch": 2.352049410443571, "grad_norm": 0.2687929570674896, "learning_rate": 1.3519366531640589e-06, "loss": 0.4004, "step": 2094 }, { "epoch": 2.3531723750701854, "grad_norm": 0.27838534116744995, "learning_rate": 1.3474695025880818e-06, "loss": 0.3935, "step": 2095 }, { "epoch": 2.3542953396967996, "grad_norm": 0.295901358127594, "learning_rate": 1.343008594906562e-06, "loss": 0.4253, "step": 2096 }, { "epoch": 2.355418304323414, "grad_norm": 0.30143290758132935, "learning_rate": 1.3385539377440709e-06, "loss": 0.3884, "step": 2097 }, { "epoch": 2.356541268950028, "grad_norm": 0.29125338792800903, "learning_rate": 1.3341055387144924e-06, "loss": 0.403, "step": 2098 }, { "epoch": 2.3576642335766422, "grad_norm": 0.2842191457748413, "learning_rate": 1.3296634054210195e-06, "loss": 0.3872, "step": 2099 }, { "epoch": 2.3587871982032564, "grad_norm": 0.2766658365726471, "learning_rate": 1.3252275454561337e-06, "loss": 0.4111, "step": 2100 }, { "epoch": 2.3599101628298707, "grad_norm": 0.27259284257888794, "learning_rate": 1.3207979664015914e-06, "loss": 0.4071, "step": 2101 }, { "epoch": 2.3610331274564853, "grad_norm": 0.2752622961997986, "learning_rate": 1.3163746758284174e-06, "loss": 0.4019, "step": 2102 }, { "epoch": 2.3621560920830995, "grad_norm": 0.28553199768066406, "learning_rate": 1.3119576812968893e-06, "loss": 0.4003, "step": 2103 }, { "epoch": 2.3632790567097137, "grad_norm": 0.29870447516441345, "learning_rate": 1.307546990356518e-06, "loss": 0.3891, "step": 2104 }, { "epoch": 2.364402021336328, "grad_norm": 0.2713184356689453, "learning_rate": 1.3031426105460443e-06, "loss": 0.3937, "step": 2105 }, { "epoch": 2.365524985962942, "grad_norm": 0.2883835434913635, "learning_rate": 1.2987445493934236e-06, "loss": 0.4195, "step": 2106 }, { "epoch": 2.3666479505895563, "grad_norm": 0.2746773958206177, "learning_rate": 1.2943528144158063e-06, "loss": 0.4013, "step": 2107 }, { "epoch": 2.3677709152161706, "grad_norm": 0.27987396717071533, "learning_rate": 1.289967413119535e-06, "loss": 0.4249, "step": 2108 }, { "epoch": 2.3688938798427848, "grad_norm": 0.2663009464740753, "learning_rate": 1.2855883530001228e-06, "loss": 0.3444, "step": 2109 }, { "epoch": 2.370016844469399, "grad_norm": 0.33868250250816345, "learning_rate": 1.2812156415422472e-06, "loss": 0.3986, "step": 2110 }, { "epoch": 2.3711398090960136, "grad_norm": 0.28142571449279785, "learning_rate": 1.2768492862197363e-06, "loss": 0.4252, "step": 2111 }, { "epoch": 2.372262773722628, "grad_norm": 0.27644357085227966, "learning_rate": 1.272489294495548e-06, "loss": 0.4176, "step": 2112 }, { "epoch": 2.373385738349242, "grad_norm": 0.29277801513671875, "learning_rate": 1.2681356738217692e-06, "loss": 0.3934, "step": 2113 }, { "epoch": 2.3745087029758563, "grad_norm": 0.2771528959274292, "learning_rate": 1.263788431639596e-06, "loss": 0.3867, "step": 2114 }, { "epoch": 2.3756316676024705, "grad_norm": 0.2782800793647766, "learning_rate": 1.2594475753793211e-06, "loss": 0.4182, "step": 2115 }, { "epoch": 2.3767546322290847, "grad_norm": 0.2624921500682831, "learning_rate": 1.2551131124603245e-06, "loss": 0.3793, "step": 2116 }, { "epoch": 2.377877596855699, "grad_norm": 0.28729015588760376, "learning_rate": 1.2507850502910578e-06, "loss": 0.413, "step": 2117 }, { "epoch": 2.3790005614823135, "grad_norm": 0.2906534969806671, "learning_rate": 1.2464633962690304e-06, "loss": 0.4062, "step": 2118 }, { "epoch": 2.3801235261089277, "grad_norm": 0.30596888065338135, "learning_rate": 1.242148157780802e-06, "loss": 0.4387, "step": 2119 }, { "epoch": 2.381246490735542, "grad_norm": 0.27870121598243713, "learning_rate": 1.2378393422019663e-06, "loss": 0.3999, "step": 2120 }, { "epoch": 2.382369455362156, "grad_norm": 0.29754382371902466, "learning_rate": 1.2335369568971362e-06, "loss": 0.4342, "step": 2121 }, { "epoch": 2.3834924199887704, "grad_norm": 0.2821967601776123, "learning_rate": 1.229241009219937e-06, "loss": 0.4274, "step": 2122 }, { "epoch": 2.3846153846153846, "grad_norm": 0.27671217918395996, "learning_rate": 1.224951506512992e-06, "loss": 0.3793, "step": 2123 }, { "epoch": 2.385738349241999, "grad_norm": 0.28699377179145813, "learning_rate": 1.2206684561079035e-06, "loss": 0.3872, "step": 2124 }, { "epoch": 2.386861313868613, "grad_norm": 0.29643598198890686, "learning_rate": 1.2163918653252498e-06, "loss": 0.3764, "step": 2125 }, { "epoch": 2.387984278495227, "grad_norm": 0.2696648836135864, "learning_rate": 1.212121741474569e-06, "loss": 0.4069, "step": 2126 }, { "epoch": 2.389107243121842, "grad_norm": 0.2767255902290344, "learning_rate": 1.207858091854342e-06, "loss": 0.4281, "step": 2127 }, { "epoch": 2.390230207748456, "grad_norm": 0.2740270495414734, "learning_rate": 1.2036009237519868e-06, "loss": 0.3797, "step": 2128 }, { "epoch": 2.3913531723750703, "grad_norm": 0.2883926033973694, "learning_rate": 1.1993502444438449e-06, "loss": 0.3904, "step": 2129 }, { "epoch": 2.3924761370016845, "grad_norm": 0.2962181568145752, "learning_rate": 1.1951060611951615e-06, "loss": 0.4407, "step": 2130 }, { "epoch": 2.3935991016282987, "grad_norm": 0.29067903757095337, "learning_rate": 1.190868381260084e-06, "loss": 0.3965, "step": 2131 }, { "epoch": 2.394722066254913, "grad_norm": 0.2850963771343231, "learning_rate": 1.1866372118816444e-06, "loss": 0.4075, "step": 2132 }, { "epoch": 2.395845030881527, "grad_norm": 0.29430723190307617, "learning_rate": 1.1824125602917414e-06, "loss": 0.4372, "step": 2133 }, { "epoch": 2.3969679955081413, "grad_norm": 0.27829715609550476, "learning_rate": 1.178194433711139e-06, "loss": 0.3767, "step": 2134 }, { "epoch": 2.3980909601347555, "grad_norm": 0.27983254194259644, "learning_rate": 1.173982839349448e-06, "loss": 0.3828, "step": 2135 }, { "epoch": 2.39921392476137, "grad_norm": 0.3094474673271179, "learning_rate": 1.1697777844051105e-06, "loss": 0.4226, "step": 2136 }, { "epoch": 2.4003368893879844, "grad_norm": 0.2957216501235962, "learning_rate": 1.1655792760653955e-06, "loss": 0.3895, "step": 2137 }, { "epoch": 2.4014598540145986, "grad_norm": 0.282475084066391, "learning_rate": 1.161387321506383e-06, "loss": 0.4061, "step": 2138 }, { "epoch": 2.402582818641213, "grad_norm": 0.29467281699180603, "learning_rate": 1.1572019278929457e-06, "loss": 0.3964, "step": 2139 }, { "epoch": 2.403705783267827, "grad_norm": 0.2815331220626831, "learning_rate": 1.1530231023787486e-06, "loss": 0.401, "step": 2140 }, { "epoch": 2.4048287478944412, "grad_norm": 0.28201285004615784, "learning_rate": 1.1488508521062274e-06, "loss": 0.4179, "step": 2141 }, { "epoch": 2.4059517125210554, "grad_norm": 0.2753874659538269, "learning_rate": 1.1446851842065804e-06, "loss": 0.4161, "step": 2142 }, { "epoch": 2.40707467714767, "grad_norm": 0.2634162902832031, "learning_rate": 1.1405261057997563e-06, "loss": 0.3695, "step": 2143 }, { "epoch": 2.4081976417742843, "grad_norm": 0.2846906781196594, "learning_rate": 1.1363736239944374e-06, "loss": 0.4056, "step": 2144 }, { "epoch": 2.4093206064008985, "grad_norm": 0.2740996778011322, "learning_rate": 1.1322277458880337e-06, "loss": 0.369, "step": 2145 }, { "epoch": 2.4104435710275127, "grad_norm": 0.3008590340614319, "learning_rate": 1.12808847856667e-06, "loss": 0.4398, "step": 2146 }, { "epoch": 2.411566535654127, "grad_norm": 0.2711624503135681, "learning_rate": 1.1239558291051677e-06, "loss": 0.4237, "step": 2147 }, { "epoch": 2.412689500280741, "grad_norm": 0.2627991735935211, "learning_rate": 1.1198298045670402e-06, "loss": 0.3819, "step": 2148 }, { "epoch": 2.4138124649073553, "grad_norm": 0.26782920956611633, "learning_rate": 1.1157104120044777e-06, "loss": 0.383, "step": 2149 }, { "epoch": 2.4149354295339696, "grad_norm": 0.26780667901039124, "learning_rate": 1.111597658458331e-06, "loss": 0.4096, "step": 2150 }, { "epoch": 2.4160583941605838, "grad_norm": 0.2841688096523285, "learning_rate": 1.1074915509581086e-06, "loss": 0.3837, "step": 2151 }, { "epoch": 2.4171813587871984, "grad_norm": 0.3015299141407013, "learning_rate": 1.103392096521958e-06, "loss": 0.4187, "step": 2152 }, { "epoch": 2.4183043234138126, "grad_norm": 0.2752377390861511, "learning_rate": 1.0992993021566528e-06, "loss": 0.4216, "step": 2153 }, { "epoch": 2.419427288040427, "grad_norm": 0.2619141638278961, "learning_rate": 1.0952131748575855e-06, "loss": 0.3917, "step": 2154 }, { "epoch": 2.420550252667041, "grad_norm": 0.2852632403373718, "learning_rate": 1.0911337216087552e-06, "loss": 0.4095, "step": 2155 }, { "epoch": 2.4216732172936553, "grad_norm": 0.29505640268325806, "learning_rate": 1.0870609493827488e-06, "loss": 0.4072, "step": 2156 }, { "epoch": 2.4227961819202695, "grad_norm": 0.26892712712287903, "learning_rate": 1.0829948651407374e-06, "loss": 0.3696, "step": 2157 }, { "epoch": 2.4239191465468837, "grad_norm": 0.26333877444267273, "learning_rate": 1.078935475832462e-06, "loss": 0.3798, "step": 2158 }, { "epoch": 2.425042111173498, "grad_norm": 0.2586692273616791, "learning_rate": 1.0748827883962165e-06, "loss": 0.391, "step": 2159 }, { "epoch": 2.426165075800112, "grad_norm": 0.2738136053085327, "learning_rate": 1.0708368097588435e-06, "loss": 0.4283, "step": 2160 }, { "epoch": 2.4272880404267267, "grad_norm": 0.27270036935806274, "learning_rate": 1.0667975468357194e-06, "loss": 0.4211, "step": 2161 }, { "epoch": 2.428411005053341, "grad_norm": 0.2785050570964813, "learning_rate": 1.0627650065307372e-06, "loss": 0.3882, "step": 2162 }, { "epoch": 2.429533969679955, "grad_norm": 0.25352659821510315, "learning_rate": 1.0587391957363053e-06, "loss": 0.3793, "step": 2163 }, { "epoch": 2.4306569343065694, "grad_norm": 0.33297908306121826, "learning_rate": 1.0547201213333285e-06, "loss": 0.4279, "step": 2164 }, { "epoch": 2.4317798989331836, "grad_norm": 0.29282787442207336, "learning_rate": 1.0507077901911944e-06, "loss": 0.3853, "step": 2165 }, { "epoch": 2.432902863559798, "grad_norm": 0.27549415826797485, "learning_rate": 1.0467022091677692e-06, "loss": 0.4107, "step": 2166 }, { "epoch": 2.434025828186412, "grad_norm": 0.28508269786834717, "learning_rate": 1.0427033851093804e-06, "loss": 0.3686, "step": 2167 }, { "epoch": 2.435148792813026, "grad_norm": 0.28339481353759766, "learning_rate": 1.0387113248508064e-06, "loss": 0.3922, "step": 2168 }, { "epoch": 2.4362717574396404, "grad_norm": 0.2783678472042084, "learning_rate": 1.0347260352152644e-06, "loss": 0.4133, "step": 2169 }, { "epoch": 2.437394722066255, "grad_norm": 0.26951515674591064, "learning_rate": 1.0307475230144015e-06, "loss": 0.3988, "step": 2170 }, { "epoch": 2.4385176866928693, "grad_norm": 0.2843879461288452, "learning_rate": 1.0267757950482765e-06, "loss": 0.4291, "step": 2171 }, { "epoch": 2.4396406513194835, "grad_norm": 0.2647595703601837, "learning_rate": 1.0228108581053565e-06, "loss": 0.3906, "step": 2172 }, { "epoch": 2.4407636159460977, "grad_norm": 0.28184375166893005, "learning_rate": 1.0188527189625014e-06, "loss": 0.4015, "step": 2173 }, { "epoch": 2.441886580572712, "grad_norm": 0.27224647998809814, "learning_rate": 1.0149013843849487e-06, "loss": 0.3733, "step": 2174 }, { "epoch": 2.443009545199326, "grad_norm": 0.28232502937316895, "learning_rate": 1.0109568611263094e-06, "loss": 0.3992, "step": 2175 }, { "epoch": 2.4441325098259403, "grad_norm": 0.28138768672943115, "learning_rate": 1.0070191559285514e-06, "loss": 0.3744, "step": 2176 }, { "epoch": 2.445255474452555, "grad_norm": 0.29477304220199585, "learning_rate": 1.0030882755219873e-06, "loss": 0.4036, "step": 2177 }, { "epoch": 2.446378439079169, "grad_norm": 0.2767258584499359, "learning_rate": 9.991642266252672e-07, "loss": 0.4081, "step": 2178 }, { "epoch": 2.4475014037057834, "grad_norm": 0.2989879250526428, "learning_rate": 9.952470159453658e-07, "loss": 0.3913, "step": 2179 }, { "epoch": 2.4486243683323976, "grad_norm": 0.27166691422462463, "learning_rate": 9.913366501775651e-07, "loss": 0.4037, "step": 2180 }, { "epoch": 2.449747332959012, "grad_norm": 0.2596884071826935, "learning_rate": 9.87433136005454e-07, "loss": 0.3699, "step": 2181 }, { "epoch": 2.450870297585626, "grad_norm": 0.29273349046707153, "learning_rate": 9.83536480100904e-07, "loss": 0.4514, "step": 2182 }, { "epoch": 2.4519932622122402, "grad_norm": 0.283987432718277, "learning_rate": 9.796466891240702e-07, "loss": 0.3775, "step": 2183 }, { "epoch": 2.4531162268388544, "grad_norm": 0.278410941362381, "learning_rate": 9.757637697233723e-07, "loss": 0.3895, "step": 2184 }, { "epoch": 2.4542391914654687, "grad_norm": 0.3013528883457184, "learning_rate": 9.718877285354838e-07, "loss": 0.3858, "step": 2185 }, { "epoch": 2.4553621560920833, "grad_norm": 0.30021604895591736, "learning_rate": 9.680185721853225e-07, "loss": 0.386, "step": 2186 }, { "epoch": 2.4564851207186975, "grad_norm": 0.3061163127422333, "learning_rate": 9.641563072860416e-07, "loss": 0.4527, "step": 2187 }, { "epoch": 2.4576080853453117, "grad_norm": 0.2706446349620819, "learning_rate": 9.603009404390095e-07, "loss": 0.3772, "step": 2188 }, { "epoch": 2.458731049971926, "grad_norm": 0.2647278606891632, "learning_rate": 9.564524782338102e-07, "loss": 0.355, "step": 2189 }, { "epoch": 2.45985401459854, "grad_norm": 0.3005942702293396, "learning_rate": 9.526109272482237e-07, "loss": 0.4457, "step": 2190 }, { "epoch": 2.4609769792251543, "grad_norm": 0.28408634662628174, "learning_rate": 9.487762940482187e-07, "loss": 0.4082, "step": 2191 }, { "epoch": 2.4620999438517686, "grad_norm": 0.2828903794288635, "learning_rate": 9.449485851879369e-07, "loss": 0.3982, "step": 2192 }, { "epoch": 2.4632229084783828, "grad_norm": 0.2872546911239624, "learning_rate": 9.41127807209688e-07, "loss": 0.4405, "step": 2193 }, { "epoch": 2.464345873104997, "grad_norm": 0.2579364776611328, "learning_rate": 9.373139666439346e-07, "loss": 0.3585, "step": 2194 }, { "epoch": 2.4654688377316116, "grad_norm": 0.2757807970046997, "learning_rate": 9.335070700092824e-07, "loss": 0.4161, "step": 2195 }, { "epoch": 2.466591802358226, "grad_norm": 0.2784770727157593, "learning_rate": 9.297071238124683e-07, "loss": 0.3835, "step": 2196 }, { "epoch": 2.46771476698484, "grad_norm": 0.2885216474533081, "learning_rate": 9.25914134548348e-07, "loss": 0.4034, "step": 2197 }, { "epoch": 2.4688377316114543, "grad_norm": 0.29273226857185364, "learning_rate": 9.221281086998879e-07, "loss": 0.4035, "step": 2198 }, { "epoch": 2.4699606962380685, "grad_norm": 0.29566600918769836, "learning_rate": 9.183490527381539e-07, "loss": 0.4458, "step": 2199 }, { "epoch": 2.4710836608646827, "grad_norm": 0.2590445578098297, "learning_rate": 9.145769731222947e-07, "loss": 0.372, "step": 2200 }, { "epoch": 2.472206625491297, "grad_norm": 0.2821544408798218, "learning_rate": 9.108118762995393e-07, "loss": 0.3908, "step": 2201 }, { "epoch": 2.473329590117911, "grad_norm": 0.28365612030029297, "learning_rate": 9.070537687051817e-07, "loss": 0.4501, "step": 2202 }, { "epoch": 2.4744525547445253, "grad_norm": 0.3225814402103424, "learning_rate": 9.033026567625652e-07, "loss": 0.4111, "step": 2203 }, { "epoch": 2.47557551937114, "grad_norm": 0.3063740134239197, "learning_rate": 8.995585468830814e-07, "loss": 0.4173, "step": 2204 }, { "epoch": 2.476698483997754, "grad_norm": 0.25814756751060486, "learning_rate": 8.958214454661529e-07, "loss": 0.373, "step": 2205 }, { "epoch": 2.4778214486243684, "grad_norm": 0.28323036432266235, "learning_rate": 8.920913588992197e-07, "loss": 0.433, "step": 2206 }, { "epoch": 2.4789444132509826, "grad_norm": 0.27133798599243164, "learning_rate": 8.883682935577359e-07, "loss": 0.3939, "step": 2207 }, { "epoch": 2.480067377877597, "grad_norm": 0.2708415687084198, "learning_rate": 8.846522558051563e-07, "loss": 0.4095, "step": 2208 }, { "epoch": 2.481190342504211, "grad_norm": 0.28072917461395264, "learning_rate": 8.809432519929184e-07, "loss": 0.4173, "step": 2209 }, { "epoch": 2.482313307130825, "grad_norm": 0.3032577633857727, "learning_rate": 8.77241288460442e-07, "loss": 0.3803, "step": 2210 }, { "epoch": 2.48343627175744, "grad_norm": 0.28691479563713074, "learning_rate": 8.735463715351139e-07, "loss": 0.4061, "step": 2211 }, { "epoch": 2.484559236384054, "grad_norm": 0.29063329100608826, "learning_rate": 8.698585075322724e-07, "loss": 0.4041, "step": 2212 }, { "epoch": 2.4856822010106683, "grad_norm": 0.27524280548095703, "learning_rate": 8.66177702755206e-07, "loss": 0.3963, "step": 2213 }, { "epoch": 2.4868051656372825, "grad_norm": 0.2957579493522644, "learning_rate": 8.625039634951354e-07, "loss": 0.4309, "step": 2214 }, { "epoch": 2.4879281302638967, "grad_norm": 0.29772523045539856, "learning_rate": 8.588372960312035e-07, "loss": 0.3642, "step": 2215 }, { "epoch": 2.489051094890511, "grad_norm": 0.27264365553855896, "learning_rate": 8.551777066304684e-07, "loss": 0.3845, "step": 2216 }, { "epoch": 2.490174059517125, "grad_norm": 0.2700076699256897, "learning_rate": 8.515252015478915e-07, "loss": 0.4054, "step": 2217 }, { "epoch": 2.4912970241437393, "grad_norm": 0.28505125641822815, "learning_rate": 8.478797870263206e-07, "loss": 0.4087, "step": 2218 }, { "epoch": 2.4924199887703535, "grad_norm": 0.2622195780277252, "learning_rate": 8.442414692964889e-07, "loss": 0.3914, "step": 2219 }, { "epoch": 2.493542953396968, "grad_norm": 0.2653864026069641, "learning_rate": 8.406102545769989e-07, "loss": 0.3973, "step": 2220 }, { "epoch": 2.4946659180235824, "grad_norm": 0.2620917856693268, "learning_rate": 8.369861490743119e-07, "loss": 0.3978, "step": 2221 }, { "epoch": 2.4957888826501966, "grad_norm": 0.2835758924484253, "learning_rate": 8.333691589827391e-07, "loss": 0.4146, "step": 2222 }, { "epoch": 2.496911847276811, "grad_norm": 0.2834499478340149, "learning_rate": 8.297592904844282e-07, "loss": 0.3805, "step": 2223 }, { "epoch": 2.498034811903425, "grad_norm": 0.27600857615470886, "learning_rate": 8.261565497493562e-07, "loss": 0.4011, "step": 2224 }, { "epoch": 2.4991577765300392, "grad_norm": 0.28048592805862427, "learning_rate": 8.225609429353187e-07, "loss": 0.3947, "step": 2225 }, { "epoch": 2.5002807411566534, "grad_norm": 0.272950679063797, "learning_rate": 8.189724761879131e-07, "loss": 0.3815, "step": 2226 }, { "epoch": 2.501403705783268, "grad_norm": 0.29167261719703674, "learning_rate": 8.153911556405387e-07, "loss": 0.4266, "step": 2227 }, { "epoch": 2.502526670409882, "grad_norm": 0.2746194899082184, "learning_rate": 8.118169874143783e-07, "loss": 0.4192, "step": 2228 }, { "epoch": 2.5036496350364965, "grad_norm": 0.24404732882976532, "learning_rate": 8.082499776183883e-07, "loss": 0.3564, "step": 2229 }, { "epoch": 2.5047725996631107, "grad_norm": 0.2939373552799225, "learning_rate": 8.046901323492917e-07, "loss": 0.4324, "step": 2230 }, { "epoch": 2.505895564289725, "grad_norm": 0.2826438546180725, "learning_rate": 8.011374576915675e-07, "loss": 0.3795, "step": 2231 }, { "epoch": 2.507018528916339, "grad_norm": 0.2788127362728119, "learning_rate": 7.975919597174342e-07, "loss": 0.3926, "step": 2232 }, { "epoch": 2.5081414935429533, "grad_norm": 0.2581125497817993, "learning_rate": 7.94053644486848e-07, "loss": 0.3682, "step": 2233 }, { "epoch": 2.5092644581695676, "grad_norm": 0.25598153471946716, "learning_rate": 7.905225180474879e-07, "loss": 0.3926, "step": 2234 }, { "epoch": 2.5103874227961818, "grad_norm": 0.2913089394569397, "learning_rate": 7.869985864347424e-07, "loss": 0.4146, "step": 2235 }, { "epoch": 2.5115103874227964, "grad_norm": 0.26694032549858093, "learning_rate": 7.834818556717067e-07, "loss": 0.4187, "step": 2236 }, { "epoch": 2.51263335204941, "grad_norm": 0.26305514574050903, "learning_rate": 7.799723317691671e-07, "loss": 0.3825, "step": 2237 }, { "epoch": 2.513756316676025, "grad_norm": 0.2679397165775299, "learning_rate": 7.764700207255904e-07, "loss": 0.3763, "step": 2238 }, { "epoch": 2.514879281302639, "grad_norm": 0.27117231488227844, "learning_rate": 7.729749285271171e-07, "loss": 0.4121, "step": 2239 }, { "epoch": 2.5160022459292533, "grad_norm": 0.255202978849411, "learning_rate": 7.694870611475497e-07, "loss": 0.3923, "step": 2240 }, { "epoch": 2.5171252105558675, "grad_norm": 0.2684265375137329, "learning_rate": 7.660064245483384e-07, "loss": 0.3948, "step": 2241 }, { "epoch": 2.5182481751824817, "grad_norm": 0.28416672348976135, "learning_rate": 7.625330246785784e-07, "loss": 0.408, "step": 2242 }, { "epoch": 2.519371139809096, "grad_norm": 0.2659955620765686, "learning_rate": 7.590668674749946e-07, "loss": 0.4085, "step": 2243 }, { "epoch": 2.52049410443571, "grad_norm": 0.2649502158164978, "learning_rate": 7.556079588619341e-07, "loss": 0.3463, "step": 2244 }, { "epoch": 2.5216170690623247, "grad_norm": 0.2743344306945801, "learning_rate": 7.521563047513508e-07, "loss": 0.4168, "step": 2245 }, { "epoch": 2.522740033688939, "grad_norm": 0.30284109711647034, "learning_rate": 7.487119110428037e-07, "loss": 0.412, "step": 2246 }, { "epoch": 2.523862998315553, "grad_norm": 0.2675303816795349, "learning_rate": 7.452747836234392e-07, "loss": 0.391, "step": 2247 }, { "epoch": 2.5249859629421674, "grad_norm": 0.2682286202907562, "learning_rate": 7.418449283679869e-07, "loss": 0.3689, "step": 2248 }, { "epoch": 2.5261089275687816, "grad_norm": 0.2663945257663727, "learning_rate": 7.384223511387457e-07, "loss": 0.3971, "step": 2249 }, { "epoch": 2.527231892195396, "grad_norm": 0.30254924297332764, "learning_rate": 7.350070577855716e-07, "loss": 0.4475, "step": 2250 }, { "epoch": 2.52835485682201, "grad_norm": 0.24738402664661407, "learning_rate": 7.315990541458767e-07, "loss": 0.3627, "step": 2251 }, { "epoch": 2.529477821448624, "grad_norm": 0.26276442408561707, "learning_rate": 7.281983460446112e-07, "loss": 0.4036, "step": 2252 }, { "epoch": 2.5306007860752384, "grad_norm": 0.2732093334197998, "learning_rate": 7.24804939294253e-07, "loss": 0.3792, "step": 2253 }, { "epoch": 2.531723750701853, "grad_norm": 0.2581872344017029, "learning_rate": 7.214188396948057e-07, "loss": 0.3839, "step": 2254 }, { "epoch": 2.5328467153284673, "grad_norm": 0.2725401222705841, "learning_rate": 7.180400530337789e-07, "loss": 0.4176, "step": 2255 }, { "epoch": 2.5339696799550815, "grad_norm": 0.28798454999923706, "learning_rate": 7.146685850861851e-07, "loss": 0.4111, "step": 2256 }, { "epoch": 2.5350926445816957, "grad_norm": 0.2945457398891449, "learning_rate": 7.113044416145287e-07, "loss": 0.4187, "step": 2257 }, { "epoch": 2.53621560920831, "grad_norm": 0.27824872732162476, "learning_rate": 7.079476283687925e-07, "loss": 0.411, "step": 2258 }, { "epoch": 2.537338573834924, "grad_norm": 0.27058613300323486, "learning_rate": 7.045981510864319e-07, "loss": 0.4007, "step": 2259 }, { "epoch": 2.5384615384615383, "grad_norm": 0.28095743060112, "learning_rate": 7.012560154923659e-07, "loss": 0.4198, "step": 2260 }, { "epoch": 2.539584503088153, "grad_norm": 0.289198637008667, "learning_rate": 6.979212272989599e-07, "loss": 0.4252, "step": 2261 }, { "epoch": 2.5407074677147667, "grad_norm": 0.2838200330734253, "learning_rate": 6.945937922060259e-07, "loss": 0.4205, "step": 2262 }, { "epoch": 2.5418304323413814, "grad_norm": 0.27621331810951233, "learning_rate": 6.91273715900807e-07, "loss": 0.3868, "step": 2263 }, { "epoch": 2.5429533969679956, "grad_norm": 0.28154653310775757, "learning_rate": 6.879610040579654e-07, "loss": 0.402, "step": 2264 }, { "epoch": 2.54407636159461, "grad_norm": 0.26039618253707886, "learning_rate": 6.846556623395795e-07, "loss": 0.4068, "step": 2265 }, { "epoch": 2.545199326221224, "grad_norm": 0.2556605935096741, "learning_rate": 6.813576963951318e-07, "loss": 0.3946, "step": 2266 }, { "epoch": 2.5463222908478382, "grad_norm": 0.2635415494441986, "learning_rate": 6.780671118614929e-07, "loss": 0.3983, "step": 2267 }, { "epoch": 2.5474452554744524, "grad_norm": 0.25149965286254883, "learning_rate": 6.74783914362922e-07, "loss": 0.372, "step": 2268 }, { "epoch": 2.5485682201010667, "grad_norm": 0.27746084332466125, "learning_rate": 6.715081095110504e-07, "loss": 0.3844, "step": 2269 }, { "epoch": 2.5496911847276813, "grad_norm": 0.2820571959018707, "learning_rate": 6.682397029048737e-07, "loss": 0.412, "step": 2270 }, { "epoch": 2.550814149354295, "grad_norm": 0.25880831480026245, "learning_rate": 6.649787001307451e-07, "loss": 0.386, "step": 2271 }, { "epoch": 2.5519371139809097, "grad_norm": 0.3088916838169098, "learning_rate": 6.617251067623581e-07, "loss": 0.4324, "step": 2272 }, { "epoch": 2.553060078607524, "grad_norm": 0.26791393756866455, "learning_rate": 6.584789283607456e-07, "loss": 0.3942, "step": 2273 }, { "epoch": 2.554183043234138, "grad_norm": 0.2639863193035126, "learning_rate": 6.552401704742678e-07, "loss": 0.3629, "step": 2274 }, { "epoch": 2.5553060078607523, "grad_norm": 0.298188179731369, "learning_rate": 6.520088386385998e-07, "loss": 0.4599, "step": 2275 }, { "epoch": 2.5564289724873666, "grad_norm": 0.2586508095264435, "learning_rate": 6.48784938376723e-07, "loss": 0.4012, "step": 2276 }, { "epoch": 2.5575519371139808, "grad_norm": 0.2880944311618805, "learning_rate": 6.455684751989194e-07, "loss": 0.4235, "step": 2277 }, { "epoch": 2.558674901740595, "grad_norm": 0.29616716504096985, "learning_rate": 6.42359454602759e-07, "loss": 0.4374, "step": 2278 }, { "epoch": 2.5597978663672096, "grad_norm": 0.2766810655593872, "learning_rate": 6.391578820730893e-07, "loss": 0.394, "step": 2279 }, { "epoch": 2.560920830993824, "grad_norm": 0.2852585017681122, "learning_rate": 6.359637630820292e-07, "loss": 0.4101, "step": 2280 }, { "epoch": 2.562043795620438, "grad_norm": 0.2585551142692566, "learning_rate": 6.327771030889584e-07, "loss": 0.3799, "step": 2281 }, { "epoch": 2.5631667602470523, "grad_norm": 0.2517673671245575, "learning_rate": 6.295979075405051e-07, "loss": 0.4074, "step": 2282 }, { "epoch": 2.5642897248736665, "grad_norm": 0.25984862446784973, "learning_rate": 6.26426181870542e-07, "loss": 0.4052, "step": 2283 }, { "epoch": 2.5654126895002807, "grad_norm": 0.2820814251899719, "learning_rate": 6.232619315001742e-07, "loss": 0.3821, "step": 2284 }, { "epoch": 2.566535654126895, "grad_norm": 0.29138845205307007, "learning_rate": 6.201051618377269e-07, "loss": 0.4476, "step": 2285 }, { "epoch": 2.5676586187535095, "grad_norm": 0.27896472811698914, "learning_rate": 6.169558782787438e-07, "loss": 0.3911, "step": 2286 }, { "epoch": 2.5687815833801233, "grad_norm": 0.2841276526451111, "learning_rate": 6.13814086205971e-07, "loss": 0.3915, "step": 2287 }, { "epoch": 2.569904548006738, "grad_norm": 0.2627476155757904, "learning_rate": 6.106797909893486e-07, "loss": 0.4208, "step": 2288 }, { "epoch": 2.571027512633352, "grad_norm": 0.24601136147975922, "learning_rate": 6.075529979860068e-07, "loss": 0.368, "step": 2289 }, { "epoch": 2.5721504772599664, "grad_norm": 0.29552391171455383, "learning_rate": 6.044337125402495e-07, "loss": 0.4022, "step": 2290 }, { "epoch": 2.5732734418865806, "grad_norm": 0.27324122190475464, "learning_rate": 6.013219399835507e-07, "loss": 0.4106, "step": 2291 }, { "epoch": 2.574396406513195, "grad_norm": 0.2447424679994583, "learning_rate": 5.982176856345445e-07, "loss": 0.3605, "step": 2292 }, { "epoch": 2.575519371139809, "grad_norm": 0.28657570481300354, "learning_rate": 5.951209547990111e-07, "loss": 0.4286, "step": 2293 }, { "epoch": 2.576642335766423, "grad_norm": 0.2824890911579132, "learning_rate": 5.920317527698744e-07, "loss": 0.3852, "step": 2294 }, { "epoch": 2.577765300393038, "grad_norm": 0.2776879072189331, "learning_rate": 5.889500848271901e-07, "loss": 0.3947, "step": 2295 }, { "epoch": 2.5788882650196516, "grad_norm": 0.26272904872894287, "learning_rate": 5.858759562381361e-07, "loss": 0.4045, "step": 2296 }, { "epoch": 2.5800112296462663, "grad_norm": 0.2835824489593506, "learning_rate": 5.828093722570033e-07, "loss": 0.4573, "step": 2297 }, { "epoch": 2.5811341942728805, "grad_norm": 0.2575940787792206, "learning_rate": 5.797503381251896e-07, "loss": 0.4098, "step": 2298 }, { "epoch": 2.5822571588994947, "grad_norm": 0.24793332815170288, "learning_rate": 5.766988590711853e-07, "loss": 0.3693, "step": 2299 }, { "epoch": 2.583380123526109, "grad_norm": 0.26999279856681824, "learning_rate": 5.736549403105702e-07, "loss": 0.4086, "step": 2300 }, { "epoch": 2.584503088152723, "grad_norm": 0.28280097246170044, "learning_rate": 5.706185870460018e-07, "loss": 0.389, "step": 2301 }, { "epoch": 2.5856260527793373, "grad_norm": 0.26028871536254883, "learning_rate": 5.675898044672057e-07, "loss": 0.3756, "step": 2302 }, { "epoch": 2.5867490174059515, "grad_norm": 0.2728234529495239, "learning_rate": 5.64568597750968e-07, "loss": 0.4074, "step": 2303 }, { "epoch": 2.587871982032566, "grad_norm": 0.25926151871681213, "learning_rate": 5.61554972061128e-07, "loss": 0.3622, "step": 2304 }, { "epoch": 2.5889949466591804, "grad_norm": 0.28393974900245667, "learning_rate": 5.585489325485638e-07, "loss": 0.4156, "step": 2305 }, { "epoch": 2.5901179112857946, "grad_norm": 0.25734743475914, "learning_rate": 5.555504843511905e-07, "loss": 0.4113, "step": 2306 }, { "epoch": 2.591240875912409, "grad_norm": 0.2414899319410324, "learning_rate": 5.525596325939469e-07, "loss": 0.3732, "step": 2307 }, { "epoch": 2.592363840539023, "grad_norm": 0.2817022204399109, "learning_rate": 5.495763823887879e-07, "loss": 0.4126, "step": 2308 }, { "epoch": 2.5934868051656372, "grad_norm": 0.27282148599624634, "learning_rate": 5.466007388346756e-07, "loss": 0.4027, "step": 2309 }, { "epoch": 2.5946097697922514, "grad_norm": 0.26683929562568665, "learning_rate": 5.436327070175729e-07, "loss": 0.4131, "step": 2310 }, { "epoch": 2.5957327344188657, "grad_norm": 0.28948211669921875, "learning_rate": 5.40672292010429e-07, "loss": 0.4031, "step": 2311 }, { "epoch": 2.59685569904548, "grad_norm": 0.2749268412590027, "learning_rate": 5.377194988731776e-07, "loss": 0.3626, "step": 2312 }, { "epoch": 2.5979786636720945, "grad_norm": 0.2863961458206177, "learning_rate": 5.347743326527255e-07, "loss": 0.4226, "step": 2313 }, { "epoch": 2.5991016282987087, "grad_norm": 0.27654561400413513, "learning_rate": 5.318367983829393e-07, "loss": 0.4051, "step": 2314 }, { "epoch": 2.600224592925323, "grad_norm": 0.26900389790534973, "learning_rate": 5.289069010846453e-07, "loss": 0.4104, "step": 2315 }, { "epoch": 2.601347557551937, "grad_norm": 0.24614867568016052, "learning_rate": 5.25984645765617e-07, "loss": 0.3725, "step": 2316 }, { "epoch": 2.6024705221785513, "grad_norm": 0.2729549705982208, "learning_rate": 5.230700374205622e-07, "loss": 0.4167, "step": 2317 }, { "epoch": 2.6035934868051656, "grad_norm": 0.2611064314842224, "learning_rate": 5.201630810311215e-07, "loss": 0.4026, "step": 2318 }, { "epoch": 2.6047164514317798, "grad_norm": 0.28740566968917847, "learning_rate": 5.172637815658583e-07, "loss": 0.4321, "step": 2319 }, { "epoch": 2.6058394160583944, "grad_norm": 0.2559981346130371, "learning_rate": 5.143721439802441e-07, "loss": 0.3656, "step": 2320 }, { "epoch": 2.606962380685008, "grad_norm": 0.27910658717155457, "learning_rate": 5.114881732166588e-07, "loss": 0.4455, "step": 2321 }, { "epoch": 2.608085345311623, "grad_norm": 0.2585473954677582, "learning_rate": 5.086118742043761e-07, "loss": 0.3735, "step": 2322 }, { "epoch": 2.609208309938237, "grad_norm": 0.26908713579177856, "learning_rate": 5.057432518595584e-07, "loss": 0.4048, "step": 2323 }, { "epoch": 2.6103312745648513, "grad_norm": 0.275795578956604, "learning_rate": 5.028823110852465e-07, "loss": 0.4397, "step": 2324 }, { "epoch": 2.6114542391914655, "grad_norm": 0.28397154808044434, "learning_rate": 5.000290567713533e-07, "loss": 0.4398, "step": 2325 }, { "epoch": 2.6125772038180797, "grad_norm": 0.25184062123298645, "learning_rate": 4.971834937946507e-07, "loss": 0.3521, "step": 2326 }, { "epoch": 2.613700168444694, "grad_norm": 0.26265236735343933, "learning_rate": 4.943456270187686e-07, "loss": 0.3607, "step": 2327 }, { "epoch": 2.614823133071308, "grad_norm": 0.2869437634944916, "learning_rate": 4.915154612941781e-07, "loss": 0.4221, "step": 2328 }, { "epoch": 2.6159460976979227, "grad_norm": 0.26948246359825134, "learning_rate": 4.886930014581915e-07, "loss": 0.3928, "step": 2329 }, { "epoch": 2.6170690623245365, "grad_norm": 0.2752903401851654, "learning_rate": 4.858782523349498e-07, "loss": 0.4289, "step": 2330 }, { "epoch": 2.618192026951151, "grad_norm": 0.2821439206600189, "learning_rate": 4.830712187354125e-07, "loss": 0.3909, "step": 2331 }, { "epoch": 2.6193149915777654, "grad_norm": 0.2800488770008087, "learning_rate": 4.802719054573535e-07, "loss": 0.4123, "step": 2332 }, { "epoch": 2.6204379562043796, "grad_norm": 0.2607535421848297, "learning_rate": 4.774803172853526e-07, "loss": 0.3741, "step": 2333 }, { "epoch": 2.621560920830994, "grad_norm": 0.27758172154426575, "learning_rate": 4.7469645899078153e-07, "loss": 0.3708, "step": 2334 }, { "epoch": 2.622683885457608, "grad_norm": 0.28636711835861206, "learning_rate": 4.7192033533180414e-07, "loss": 0.4293, "step": 2335 }, { "epoch": 2.623806850084222, "grad_norm": 0.26201388239860535, "learning_rate": 4.6915195105336374e-07, "loss": 0.4064, "step": 2336 }, { "epoch": 2.6249298147108364, "grad_norm": 0.2742285132408142, "learning_rate": 4.663913108871726e-07, "loss": 0.4114, "step": 2337 }, { "epoch": 2.626052779337451, "grad_norm": 0.2713795006275177, "learning_rate": 4.6363841955171017e-07, "loss": 0.4006, "step": 2338 }, { "epoch": 2.6271757439640653, "grad_norm": 0.27641481161117554, "learning_rate": 4.608932817522105e-07, "loss": 0.4043, "step": 2339 }, { "epoch": 2.6282987085906795, "grad_norm": 0.25992894172668457, "learning_rate": 4.581559021806542e-07, "loss": 0.3841, "step": 2340 }, { "epoch": 2.6294216732172937, "grad_norm": 0.2703031897544861, "learning_rate": 4.554262855157626e-07, "loss": 0.4056, "step": 2341 }, { "epoch": 2.630544637843908, "grad_norm": 0.282470703125, "learning_rate": 4.527044364229893e-07, "loss": 0.4056, "step": 2342 }, { "epoch": 2.631667602470522, "grad_norm": 0.2765389382839203, "learning_rate": 4.4999035955450964e-07, "loss": 0.414, "step": 2343 }, { "epoch": 2.6327905670971363, "grad_norm": 0.25286978483200073, "learning_rate": 4.472840595492167e-07, "loss": 0.3932, "step": 2344 }, { "epoch": 2.633913531723751, "grad_norm": 0.26249992847442627, "learning_rate": 4.44585541032711e-07, "loss": 0.4089, "step": 2345 }, { "epoch": 2.6350364963503647, "grad_norm": 0.29022693634033203, "learning_rate": 4.4189480861729137e-07, "loss": 0.4177, "step": 2346 }, { "epoch": 2.6361594609769794, "grad_norm": 0.26114919781684875, "learning_rate": 4.392118669019502e-07, "loss": 0.3953, "step": 2347 }, { "epoch": 2.6372824256035936, "grad_norm": 0.2698346674442291, "learning_rate": 4.365367204723636e-07, "loss": 0.4186, "step": 2348 }, { "epoch": 2.638405390230208, "grad_norm": 0.261444091796875, "learning_rate": 4.3386937390088366e-07, "loss": 0.3969, "step": 2349 }, { "epoch": 2.639528354856822, "grad_norm": 0.2657788395881653, "learning_rate": 4.312098317465324e-07, "loss": 0.4117, "step": 2350 }, { "epoch": 2.6406513194834362, "grad_norm": 0.2630191445350647, "learning_rate": 4.28558098554992e-07, "loss": 0.3863, "step": 2351 }, { "epoch": 2.6417742841100504, "grad_norm": 0.26042190194129944, "learning_rate": 4.259141788585947e-07, "loss": 0.4053, "step": 2352 }, { "epoch": 2.6428972487366647, "grad_norm": 0.2794458866119385, "learning_rate": 4.2327807717632174e-07, "loss": 0.4289, "step": 2353 }, { "epoch": 2.6440202133632793, "grad_norm": 0.26805150508880615, "learning_rate": 4.2064979801379134e-07, "loss": 0.4, "step": 2354 }, { "epoch": 2.645143177989893, "grad_norm": 0.2685701549053192, "learning_rate": 4.1802934586324897e-07, "loss": 0.3958, "step": 2355 }, { "epoch": 2.6462661426165077, "grad_norm": 0.27697989344596863, "learning_rate": 4.15416725203564e-07, "loss": 0.3988, "step": 2356 }, { "epoch": 2.647389107243122, "grad_norm": 0.2846849262714386, "learning_rate": 4.1281194050022123e-07, "loss": 0.3983, "step": 2357 }, { "epoch": 2.648512071869736, "grad_norm": 0.26208069920539856, "learning_rate": 4.102149962053098e-07, "loss": 0.3817, "step": 2358 }, { "epoch": 2.6496350364963503, "grad_norm": 0.26250022649765015, "learning_rate": 4.076258967575192e-07, "loss": 0.4129, "step": 2359 }, { "epoch": 2.6507580011229646, "grad_norm": 0.27228668332099915, "learning_rate": 4.050446465821323e-07, "loss": 0.4175, "step": 2360 }, { "epoch": 2.6518809657495788, "grad_norm": 0.24344605207443237, "learning_rate": 4.0247125009101275e-07, "loss": 0.3794, "step": 2361 }, { "epoch": 2.653003930376193, "grad_norm": 0.28151506185531616, "learning_rate": 3.999057116826033e-07, "loss": 0.4381, "step": 2362 }, { "epoch": 2.6541268950028076, "grad_norm": 0.2435680776834488, "learning_rate": 3.9734803574191347e-07, "loss": 0.3861, "step": 2363 }, { "epoch": 2.655249859629422, "grad_norm": 0.2785083055496216, "learning_rate": 3.947982266405159e-07, "loss": 0.4278, "step": 2364 }, { "epoch": 2.656372824256036, "grad_norm": 0.28046101331710815, "learning_rate": 3.9225628873653885e-07, "loss": 0.4089, "step": 2365 }, { "epoch": 2.6574957888826503, "grad_norm": 0.24580086767673492, "learning_rate": 3.8972222637465194e-07, "loss": 0.3693, "step": 2366 }, { "epoch": 2.6586187535092645, "grad_norm": 0.2879805862903595, "learning_rate": 3.871960438860689e-07, "loss": 0.4147, "step": 2367 }, { "epoch": 2.6597417181358787, "grad_norm": 0.2782611846923828, "learning_rate": 3.8467774558853474e-07, "loss": 0.4085, "step": 2368 }, { "epoch": 2.660864682762493, "grad_norm": 0.26115089654922485, "learning_rate": 3.8216733578631483e-07, "loss": 0.3881, "step": 2369 }, { "epoch": 2.661987647389107, "grad_norm": 0.2551243305206299, "learning_rate": 3.796648187701957e-07, "loss": 0.3825, "step": 2370 }, { "epoch": 2.6631106120157213, "grad_norm": 0.27538129687309265, "learning_rate": 3.77170198817472e-07, "loss": 0.4322, "step": 2371 }, { "epoch": 2.664233576642336, "grad_norm": 0.2630211114883423, "learning_rate": 3.7468348019194136e-07, "loss": 0.3819, "step": 2372 }, { "epoch": 2.66535654126895, "grad_norm": 0.25980308651924133, "learning_rate": 3.72204667143895e-07, "loss": 0.3724, "step": 2373 }, { "epoch": 2.6664795058955644, "grad_norm": 0.27009427547454834, "learning_rate": 3.697337639101134e-07, "loss": 0.4098, "step": 2374 }, { "epoch": 2.6676024705221786, "grad_norm": 0.2724756896495819, "learning_rate": 3.6727077471385706e-07, "loss": 0.4232, "step": 2375 }, { "epoch": 2.668725435148793, "grad_norm": 0.2818812131881714, "learning_rate": 3.648157037648598e-07, "loss": 0.4151, "step": 2376 }, { "epoch": 2.669848399775407, "grad_norm": 0.2632773518562317, "learning_rate": 3.6236855525932314e-07, "loss": 0.4066, "step": 2377 }, { "epoch": 2.670971364402021, "grad_norm": 0.25788047909736633, "learning_rate": 3.599293333799042e-07, "loss": 0.3722, "step": 2378 }, { "epoch": 2.672094329028636, "grad_norm": 0.2740200459957123, "learning_rate": 3.574980422957147e-07, "loss": 0.4048, "step": 2379 }, { "epoch": 2.6732172936552496, "grad_norm": 0.2767869830131531, "learning_rate": 3.5507468616231054e-07, "loss": 0.4121, "step": 2380 }, { "epoch": 2.6743402582818643, "grad_norm": 0.2626223564147949, "learning_rate": 3.5265926912168457e-07, "loss": 0.393, "step": 2381 }, { "epoch": 2.6754632229084785, "grad_norm": 0.27322664856910706, "learning_rate": 3.5025179530225995e-07, "loss": 0.4122, "step": 2382 }, { "epoch": 2.6765861875350927, "grad_norm": 0.2685525715351105, "learning_rate": 3.478522688188857e-07, "loss": 0.3845, "step": 2383 }, { "epoch": 2.677709152161707, "grad_norm": 0.25356122851371765, "learning_rate": 3.4546069377282333e-07, "loss": 0.3706, "step": 2384 }, { "epoch": 2.678832116788321, "grad_norm": 0.2772447466850281, "learning_rate": 3.43077074251747e-07, "loss": 0.4415, "step": 2385 }, { "epoch": 2.6799550814149353, "grad_norm": 0.2886089086532593, "learning_rate": 3.407014143297327e-07, "loss": 0.4037, "step": 2386 }, { "epoch": 2.6810780460415495, "grad_norm": 0.27276915311813354, "learning_rate": 3.383337180672508e-07, "loss": 0.4041, "step": 2387 }, { "epoch": 2.682201010668164, "grad_norm": 0.2634676694869995, "learning_rate": 3.359739895111602e-07, "loss": 0.4012, "step": 2388 }, { "epoch": 2.683323975294778, "grad_norm": 0.26172882318496704, "learning_rate": 3.3362223269470364e-07, "loss": 0.3861, "step": 2389 }, { "epoch": 2.6844469399213926, "grad_norm": 0.2740027904510498, "learning_rate": 3.312784516374956e-07, "loss": 0.415, "step": 2390 }, { "epoch": 2.685569904548007, "grad_norm": 0.2787792384624481, "learning_rate": 3.289426503455201e-07, "loss": 0.3947, "step": 2391 }, { "epoch": 2.686692869174621, "grad_norm": 0.25864192843437195, "learning_rate": 3.266148328111229e-07, "loss": 0.3911, "step": 2392 }, { "epoch": 2.6878158338012352, "grad_norm": 0.2683207392692566, "learning_rate": 3.242950030130021e-07, "loss": 0.4036, "step": 2393 }, { "epoch": 2.6889387984278494, "grad_norm": 0.2761908769607544, "learning_rate": 3.2198316491620305e-07, "loss": 0.399, "step": 2394 }, { "epoch": 2.6900617630544637, "grad_norm": 0.2698078751564026, "learning_rate": 3.196793224721151e-07, "loss": 0.4124, "step": 2395 }, { "epoch": 2.691184727681078, "grad_norm": 0.2534320056438446, "learning_rate": 3.1738347961845663e-07, "loss": 0.3742, "step": 2396 }, { "epoch": 2.6923076923076925, "grad_norm": 0.24717842042446136, "learning_rate": 3.150956402792765e-07, "loss": 0.3856, "step": 2397 }, { "epoch": 2.6934306569343067, "grad_norm": 0.2638513147830963, "learning_rate": 3.128158083649435e-07, "loss": 0.4237, "step": 2398 }, { "epoch": 2.694553621560921, "grad_norm": 0.25547027587890625, "learning_rate": 3.1054398777213944e-07, "loss": 0.382, "step": 2399 }, { "epoch": 2.695676586187535, "grad_norm": 0.2742813229560852, "learning_rate": 3.082801823838527e-07, "loss": 0.3822, "step": 2400 }, { "epoch": 2.6967995508141493, "grad_norm": 0.2803206443786621, "learning_rate": 3.0602439606937316e-07, "loss": 0.4617, "step": 2401 }, { "epoch": 2.6979225154407636, "grad_norm": 0.2603592574596405, "learning_rate": 3.0377663268428504e-07, "loss": 0.3734, "step": 2402 }, { "epoch": 2.6990454800673778, "grad_norm": 0.2567569315433502, "learning_rate": 3.015368960704584e-07, "loss": 0.3951, "step": 2403 }, { "epoch": 2.700168444693992, "grad_norm": 0.24992452561855316, "learning_rate": 2.9930519005604395e-07, "loss": 0.4086, "step": 2404 }, { "epoch": 2.701291409320606, "grad_norm": 0.2431694120168686, "learning_rate": 2.9708151845546763e-07, "loss": 0.3698, "step": 2405 }, { "epoch": 2.702414373947221, "grad_norm": 0.2681894302368164, "learning_rate": 2.9486588506942303e-07, "loss": 0.4073, "step": 2406 }, { "epoch": 2.703537338573835, "grad_norm": 0.27330055832862854, "learning_rate": 2.9265829368486264e-07, "loss": 0.4343, "step": 2407 }, { "epoch": 2.7046603032004493, "grad_norm": 0.2520916759967804, "learning_rate": 2.9045874807499654e-07, "loss": 0.3538, "step": 2408 }, { "epoch": 2.7057832678270635, "grad_norm": 0.2689089775085449, "learning_rate": 2.882672519992824e-07, "loss": 0.4123, "step": 2409 }, { "epoch": 2.7069062324536777, "grad_norm": 0.27180227637290955, "learning_rate": 2.8608380920341685e-07, "loss": 0.4145, "step": 2410 }, { "epoch": 2.708029197080292, "grad_norm": 0.2573905289173126, "learning_rate": 2.8390842341933457e-07, "loss": 0.3861, "step": 2411 }, { "epoch": 2.709152161706906, "grad_norm": 0.2816315293312073, "learning_rate": 2.817410983651997e-07, "loss": 0.432, "step": 2412 }, { "epoch": 2.7102751263335207, "grad_norm": 0.24748654663562775, "learning_rate": 2.7958183774539684e-07, "loss": 0.3818, "step": 2413 }, { "epoch": 2.7113980909601345, "grad_norm": 0.253329873085022, "learning_rate": 2.7743064525052765e-07, "loss": 0.3758, "step": 2414 }, { "epoch": 2.712521055586749, "grad_norm": 0.28041866421699524, "learning_rate": 2.7528752455740606e-07, "loss": 0.4296, "step": 2415 }, { "epoch": 2.7136440202133634, "grad_norm": 0.27838680148124695, "learning_rate": 2.7315247932904464e-07, "loss": 0.419, "step": 2416 }, { "epoch": 2.7147669848399776, "grad_norm": 0.2591458559036255, "learning_rate": 2.710255132146589e-07, "loss": 0.3905, "step": 2417 }, { "epoch": 2.715889949466592, "grad_norm": 0.27304983139038086, "learning_rate": 2.6890662984965234e-07, "loss": 0.3718, "step": 2418 }, { "epoch": 2.717012914093206, "grad_norm": 0.3039124608039856, "learning_rate": 2.667958328556142e-07, "loss": 0.4217, "step": 2419 }, { "epoch": 2.71813587871982, "grad_norm": 0.27633747458457947, "learning_rate": 2.646931258403118e-07, "loss": 0.3793, "step": 2420 }, { "epoch": 2.7192588433464344, "grad_norm": 0.2736916244029999, "learning_rate": 2.625985123976876e-07, "loss": 0.4127, "step": 2421 }, { "epoch": 2.720381807973049, "grad_norm": 0.2862861156463623, "learning_rate": 2.605119961078473e-07, "loss": 0.4305, "step": 2422 }, { "epoch": 2.721504772599663, "grad_norm": 0.2432491034269333, "learning_rate": 2.584335805370586e-07, "loss": 0.3851, "step": 2423 }, { "epoch": 2.7226277372262775, "grad_norm": 0.28086429834365845, "learning_rate": 2.5636326923774325e-07, "loss": 0.4258, "step": 2424 }, { "epoch": 2.7237507018528917, "grad_norm": 0.2649998068809509, "learning_rate": 2.5430106574847235e-07, "loss": 0.355, "step": 2425 }, { "epoch": 2.724873666479506, "grad_norm": 0.270733118057251, "learning_rate": 2.52246973593957e-07, "loss": 0.3784, "step": 2426 }, { "epoch": 2.72599663110612, "grad_norm": 0.27305787801742554, "learning_rate": 2.5020099628504603e-07, "loss": 0.4205, "step": 2427 }, { "epoch": 2.7271195957327343, "grad_norm": 0.2506488263607025, "learning_rate": 2.4816313731871713e-07, "loss": 0.3635, "step": 2428 }, { "epoch": 2.7282425603593485, "grad_norm": 0.2791696786880493, "learning_rate": 2.4613340017807406e-07, "loss": 0.4247, "step": 2429 }, { "epoch": 2.7293655249859627, "grad_norm": 0.26240578293800354, "learning_rate": 2.441117883323374e-07, "loss": 0.3978, "step": 2430 }, { "epoch": 2.7304884896125774, "grad_norm": 0.2544209957122803, "learning_rate": 2.4209830523683864e-07, "loss": 0.3954, "step": 2431 }, { "epoch": 2.7316114542391916, "grad_norm": 0.26131442189216614, "learning_rate": 2.4009295433301936e-07, "loss": 0.4149, "step": 2432 }, { "epoch": 2.732734418865806, "grad_norm": 0.2547316253185272, "learning_rate": 2.3809573904841844e-07, "loss": 0.3694, "step": 2433 }, { "epoch": 2.73385738349242, "grad_norm": 0.2540450692176819, "learning_rate": 2.3610666279667017e-07, "loss": 0.386, "step": 2434 }, { "epoch": 2.7349803481190342, "grad_norm": 0.266061931848526, "learning_rate": 2.3412572897749842e-07, "loss": 0.4127, "step": 2435 }, { "epoch": 2.7361033127456484, "grad_norm": 0.2613297402858734, "learning_rate": 2.3215294097670927e-07, "loss": 0.3692, "step": 2436 }, { "epoch": 2.7372262773722627, "grad_norm": 0.26784709095954895, "learning_rate": 2.301883021661855e-07, "loss": 0.4182, "step": 2437 }, { "epoch": 2.7383492419988773, "grad_norm": 0.2648299038410187, "learning_rate": 2.2823181590388378e-07, "loss": 0.4128, "step": 2438 }, { "epoch": 2.739472206625491, "grad_norm": 0.26793184876441956, "learning_rate": 2.262834855338225e-07, "loss": 0.3737, "step": 2439 }, { "epoch": 2.7405951712521057, "grad_norm": 0.2642846703529358, "learning_rate": 2.2434331438608404e-07, "loss": 0.3939, "step": 2440 }, { "epoch": 2.74171813587872, "grad_norm": 0.2663211524486542, "learning_rate": 2.2241130577680292e-07, "loss": 0.4121, "step": 2441 }, { "epoch": 2.742841100505334, "grad_norm": 0.27002906799316406, "learning_rate": 2.204874630081616e-07, "loss": 0.4107, "step": 2442 }, { "epoch": 2.7439640651319483, "grad_norm": 0.25698450207710266, "learning_rate": 2.1857178936838697e-07, "loss": 0.4132, "step": 2443 }, { "epoch": 2.7450870297585626, "grad_norm": 0.26420673727989197, "learning_rate": 2.1666428813174433e-07, "loss": 0.3663, "step": 2444 }, { "epoch": 2.7462099943851768, "grad_norm": 0.2709164023399353, "learning_rate": 2.1476496255852685e-07, "loss": 0.3916, "step": 2445 }, { "epoch": 2.747332959011791, "grad_norm": 0.26703858375549316, "learning_rate": 2.1287381589505717e-07, "loss": 0.4187, "step": 2446 }, { "epoch": 2.7484559236384056, "grad_norm": 0.28506118059158325, "learning_rate": 2.109908513736786e-07, "loss": 0.4008, "step": 2447 }, { "epoch": 2.7495788882650194, "grad_norm": 0.2460588961839676, "learning_rate": 2.091160722127472e-07, "loss": 0.4033, "step": 2448 }, { "epoch": 2.750701852891634, "grad_norm": 0.2528029680252075, "learning_rate": 2.0724948161663094e-07, "loss": 0.3757, "step": 2449 }, { "epoch": 2.7518248175182483, "grad_norm": 0.2475043088197708, "learning_rate": 2.0539108277570051e-07, "loss": 0.377, "step": 2450 }, { "epoch": 2.7529477821448625, "grad_norm": 0.26086580753326416, "learning_rate": 2.0354087886632623e-07, "loss": 0.3851, "step": 2451 }, { "epoch": 2.7540707467714767, "grad_norm": 0.26077860593795776, "learning_rate": 2.016988730508712e-07, "loss": 0.4208, "step": 2452 }, { "epoch": 2.755193711398091, "grad_norm": 0.26848262548446655, "learning_rate": 1.9986506847768593e-07, "loss": 0.41, "step": 2453 }, { "epoch": 2.756316676024705, "grad_norm": 0.24983298778533936, "learning_rate": 1.9803946828110376e-07, "loss": 0.3844, "step": 2454 }, { "epoch": 2.7574396406513193, "grad_norm": 0.2680094838142395, "learning_rate": 1.9622207558143537e-07, "loss": 0.43, "step": 2455 }, { "epoch": 2.758562605277934, "grad_norm": 0.27007555961608887, "learning_rate": 1.9441289348496428e-07, "loss": 0.3998, "step": 2456 }, { "epoch": 2.759685569904548, "grad_norm": 0.25160422921180725, "learning_rate": 1.9261192508393755e-07, "loss": 0.3773, "step": 2457 }, { "epoch": 2.7608085345311624, "grad_norm": 0.27239957451820374, "learning_rate": 1.9081917345656508e-07, "loss": 0.4046, "step": 2458 }, { "epoch": 2.7619314991577766, "grad_norm": 0.26542919874191284, "learning_rate": 1.890346416670147e-07, "loss": 0.403, "step": 2459 }, { "epoch": 2.763054463784391, "grad_norm": 0.257852703332901, "learning_rate": 1.8725833276540095e-07, "loss": 0.3948, "step": 2460 }, { "epoch": 2.764177428411005, "grad_norm": 0.27156031131744385, "learning_rate": 1.854902497877864e-07, "loss": 0.425, "step": 2461 }, { "epoch": 2.765300393037619, "grad_norm": 0.262203186750412, "learning_rate": 1.8373039575617368e-07, "loss": 0.3834, "step": 2462 }, { "epoch": 2.7664233576642334, "grad_norm": 0.27496686577796936, "learning_rate": 1.8197877367849948e-07, "loss": 0.3936, "step": 2463 }, { "epoch": 2.7675463222908476, "grad_norm": 0.27588874101638794, "learning_rate": 1.8023538654863115e-07, "loss": 0.3779, "step": 2464 }, { "epoch": 2.7686692869174623, "grad_norm": 0.2892899513244629, "learning_rate": 1.7850023734636234e-07, "loss": 0.4232, "step": 2465 }, { "epoch": 2.7697922515440765, "grad_norm": 0.2844388782978058, "learning_rate": 1.7677332903740296e-07, "loss": 0.3729, "step": 2466 }, { "epoch": 2.7709152161706907, "grad_norm": 0.2701784372329712, "learning_rate": 1.7505466457338082e-07, "loss": 0.4061, "step": 2467 }, { "epoch": 2.772038180797305, "grad_norm": 0.27529677748680115, "learning_rate": 1.7334424689183282e-07, "loss": 0.4163, "step": 2468 }, { "epoch": 2.773161145423919, "grad_norm": 0.24317710101604462, "learning_rate": 1.7164207891619823e-07, "loss": 0.3953, "step": 2469 }, { "epoch": 2.7742841100505333, "grad_norm": 0.26829344034194946, "learning_rate": 1.6994816355581867e-07, "loss": 0.4347, "step": 2470 }, { "epoch": 2.7754070746771475, "grad_norm": 0.24958421289920807, "learning_rate": 1.6826250370592877e-07, "loss": 0.4022, "step": 2471 }, { "epoch": 2.776530039303762, "grad_norm": 0.2416200488805771, "learning_rate": 1.6658510224765333e-07, "loss": 0.3952, "step": 2472 }, { "epoch": 2.777653003930376, "grad_norm": 0.2508910298347473, "learning_rate": 1.649159620480034e-07, "loss": 0.3878, "step": 2473 }, { "epoch": 2.7787759685569906, "grad_norm": 0.2721861004829407, "learning_rate": 1.632550859598664e-07, "loss": 0.4134, "step": 2474 }, { "epoch": 2.779898933183605, "grad_norm": 0.2739666998386383, "learning_rate": 1.6160247682200813e-07, "loss": 0.4076, "step": 2475 }, { "epoch": 2.781021897810219, "grad_norm": 0.2684274911880493, "learning_rate": 1.5995813745906364e-07, "loss": 0.3836, "step": 2476 }, { "epoch": 2.7821448624368332, "grad_norm": 0.257438600063324, "learning_rate": 1.5832207068153194e-07, "loss": 0.3815, "step": 2477 }, { "epoch": 2.7832678270634474, "grad_norm": 0.2831555902957916, "learning_rate": 1.566942792857745e-07, "loss": 0.4362, "step": 2478 }, { "epoch": 2.7843907916900617, "grad_norm": 0.3052050471305847, "learning_rate": 1.550747660540075e-07, "loss": 0.4009, "step": 2479 }, { "epoch": 2.785513756316676, "grad_norm": 0.2511120140552521, "learning_rate": 1.534635337542978e-07, "loss": 0.3608, "step": 2480 }, { "epoch": 2.7866367209432905, "grad_norm": 0.2730899155139923, "learning_rate": 1.5186058514055912e-07, "loss": 0.4193, "step": 2481 }, { "epoch": 2.7877596855699043, "grad_norm": 0.26551350951194763, "learning_rate": 1.502659229525466e-07, "loss": 0.3603, "step": 2482 }, { "epoch": 2.788882650196519, "grad_norm": 0.2710522711277008, "learning_rate": 1.4867954991585221e-07, "loss": 0.3981, "step": 2483 }, { "epoch": 2.790005614823133, "grad_norm": 0.25707995891571045, "learning_rate": 1.471014687418998e-07, "loss": 0.3902, "step": 2484 }, { "epoch": 2.7911285794497473, "grad_norm": 0.26466354727745056, "learning_rate": 1.4553168212794178e-07, "loss": 0.4298, "step": 2485 }, { "epoch": 2.7922515440763616, "grad_norm": 0.2586175799369812, "learning_rate": 1.4397019275705194e-07, "loss": 0.3868, "step": 2486 }, { "epoch": 2.7933745087029758, "grad_norm": 0.27499744296073914, "learning_rate": 1.4241700329812368e-07, "loss": 0.4183, "step": 2487 }, { "epoch": 2.79449747332959, "grad_norm": 0.2562949061393738, "learning_rate": 1.4087211640586461e-07, "loss": 0.3895, "step": 2488 }, { "epoch": 2.795620437956204, "grad_norm": 0.2752149999141693, "learning_rate": 1.3933553472078976e-07, "loss": 0.3916, "step": 2489 }, { "epoch": 2.796743402582819, "grad_norm": 0.2701072096824646, "learning_rate": 1.3780726086922103e-07, "loss": 0.4209, "step": 2490 }, { "epoch": 2.797866367209433, "grad_norm": 0.2653520107269287, "learning_rate": 1.3628729746327895e-07, "loss": 0.3979, "step": 2491 }, { "epoch": 2.7989893318360473, "grad_norm": 0.27480193972587585, "learning_rate": 1.3477564710088097e-07, "loss": 0.3975, "step": 2492 }, { "epoch": 2.8001122964626615, "grad_norm": 0.27834147214889526, "learning_rate": 1.332723123657348e-07, "loss": 0.3969, "step": 2493 }, { "epoch": 2.8012352610892757, "grad_norm": 0.260728120803833, "learning_rate": 1.3177729582733722e-07, "loss": 0.4374, "step": 2494 }, { "epoch": 2.80235822571589, "grad_norm": 0.23796997964382172, "learning_rate": 1.3029060004096428e-07, "loss": 0.3737, "step": 2495 }, { "epoch": 2.803481190342504, "grad_norm": 0.27897775173187256, "learning_rate": 1.288122275476733e-07, "loss": 0.405, "step": 2496 }, { "epoch": 2.8046041549691187, "grad_norm": 0.26597675681114197, "learning_rate": 1.2734218087429417e-07, "loss": 0.398, "step": 2497 }, { "epoch": 2.8057271195957325, "grad_norm": 0.26896989345550537, "learning_rate": 1.258804625334259e-07, "loss": 0.4242, "step": 2498 }, { "epoch": 2.806850084222347, "grad_norm": 0.26305922865867615, "learning_rate": 1.244270750234333e-07, "loss": 0.4036, "step": 2499 }, { "epoch": 2.8079730488489614, "grad_norm": 0.25404611229896545, "learning_rate": 1.229820208284427e-07, "loss": 0.3754, "step": 2500 }, { "epoch": 2.8090960134755756, "grad_norm": 0.25417548418045044, "learning_rate": 1.2154530241833497e-07, "loss": 0.4057, "step": 2501 }, { "epoch": 2.81021897810219, "grad_norm": 0.25784555077552795, "learning_rate": 1.201169222487464e-07, "loss": 0.4071, "step": 2502 }, { "epoch": 2.811341942728804, "grad_norm": 0.25659769773483276, "learning_rate": 1.1869688276106018e-07, "loss": 0.4207, "step": 2503 }, { "epoch": 2.812464907355418, "grad_norm": 0.2710270285606384, "learning_rate": 1.172851863824026e-07, "loss": 0.4122, "step": 2504 }, { "epoch": 2.8135878719820324, "grad_norm": 0.24498076736927032, "learning_rate": 1.1588183552564247e-07, "loss": 0.3531, "step": 2505 }, { "epoch": 2.814710836608647, "grad_norm": 0.2714432179927826, "learning_rate": 1.144868325893822e-07, "loss": 0.4269, "step": 2506 }, { "epoch": 2.815833801235261, "grad_norm": 0.2624036371707916, "learning_rate": 1.1310017995795619e-07, "loss": 0.4279, "step": 2507 }, { "epoch": 2.8169567658618755, "grad_norm": 0.26456499099731445, "learning_rate": 1.1172188000142803e-07, "loss": 0.379, "step": 2508 }, { "epoch": 2.8180797304884897, "grad_norm": 0.27564048767089844, "learning_rate": 1.1035193507558329e-07, "loss": 0.4043, "step": 2509 }, { "epoch": 2.819202695115104, "grad_norm": 0.26350364089012146, "learning_rate": 1.0899034752192839e-07, "loss": 0.371, "step": 2510 }, { "epoch": 2.820325659741718, "grad_norm": 0.2775675654411316, "learning_rate": 1.0763711966768453e-07, "loss": 0.4138, "step": 2511 }, { "epoch": 2.8214486243683323, "grad_norm": 0.2574467957019806, "learning_rate": 1.0629225382578435e-07, "loss": 0.3785, "step": 2512 }, { "epoch": 2.8225715889949465, "grad_norm": 0.26955074071884155, "learning_rate": 1.049557522948691e-07, "loss": 0.4192, "step": 2513 }, { "epoch": 2.8236945536215607, "grad_norm": 0.27838996052742004, "learning_rate": 1.0362761735928372e-07, "loss": 0.4232, "step": 2514 }, { "epoch": 2.8248175182481754, "grad_norm": 0.25297296047210693, "learning_rate": 1.0230785128907184e-07, "loss": 0.3665, "step": 2515 }, { "epoch": 2.8259404828747896, "grad_norm": 0.27924495935440063, "learning_rate": 1.0099645633997401e-07, "loss": 0.4174, "step": 2516 }, { "epoch": 2.827063447501404, "grad_norm": 0.2539713680744171, "learning_rate": 9.969343475342285e-08, "loss": 0.3837, "step": 2517 }, { "epoch": 2.828186412128018, "grad_norm": 0.2587847411632538, "learning_rate": 9.839878875653852e-08, "loss": 0.399, "step": 2518 }, { "epoch": 2.8293093767546322, "grad_norm": 0.2547721862792969, "learning_rate": 9.71125205621265e-08, "loss": 0.373, "step": 2519 }, { "epoch": 2.8304323413812464, "grad_norm": 0.2769564390182495, "learning_rate": 9.583463236867318e-08, "loss": 0.4246, "step": 2520 }, { "epoch": 2.8315553060078607, "grad_norm": 0.259678453207016, "learning_rate": 9.456512636034032e-08, "loss": 0.369, "step": 2521 }, { "epoch": 2.832678270634475, "grad_norm": 0.2633889615535736, "learning_rate": 9.330400470696387e-08, "loss": 0.3933, "step": 2522 }, { "epoch": 2.833801235261089, "grad_norm": 0.24940979480743408, "learning_rate": 9.205126956405075e-08, "loss": 0.4256, "step": 2523 }, { "epoch": 2.8349241998877037, "grad_norm": 0.26046472787857056, "learning_rate": 9.080692307277094e-08, "loss": 0.3949, "step": 2524 }, { "epoch": 2.836047164514318, "grad_norm": 0.28680044412612915, "learning_rate": 8.957096735995762e-08, "loss": 0.432, "step": 2525 }, { "epoch": 2.837170129140932, "grad_norm": 0.2793481945991516, "learning_rate": 8.834340453810375e-08, "loss": 0.4029, "step": 2526 }, { "epoch": 2.8382930937675463, "grad_norm": 0.263904869556427, "learning_rate": 8.712423670535541e-08, "loss": 0.3956, "step": 2527 }, { "epoch": 2.8394160583941606, "grad_norm": 0.27581560611724854, "learning_rate": 8.59134659455102e-08, "loss": 0.393, "step": 2528 }, { "epoch": 2.8405390230207748, "grad_norm": 0.25220590829849243, "learning_rate": 8.471109432801494e-08, "loss": 0.3885, "step": 2529 }, { "epoch": 2.841661987647389, "grad_norm": 0.2509588599205017, "learning_rate": 8.351712390795963e-08, "loss": 0.397, "step": 2530 }, { "epoch": 2.8427849522740036, "grad_norm": 0.27233538031578064, "learning_rate": 8.233155672607406e-08, "loss": 0.4426, "step": 2531 }, { "epoch": 2.8439079169006174, "grad_norm": 0.27243179082870483, "learning_rate": 8.11543948087279e-08, "loss": 0.3968, "step": 2532 }, { "epoch": 2.845030881527232, "grad_norm": 0.2778673470020294, "learning_rate": 7.99856401679211e-08, "loss": 0.3828, "step": 2533 }, { "epoch": 2.8461538461538463, "grad_norm": 0.2749861180782318, "learning_rate": 7.882529480128687e-08, "loss": 0.4125, "step": 2534 }, { "epoch": 2.8472768107804605, "grad_norm": 0.2613597512245178, "learning_rate": 7.76733606920832e-08, "loss": 0.4277, "step": 2535 }, { "epoch": 2.8483997754070747, "grad_norm": 0.27153998613357544, "learning_rate": 7.652983980919348e-08, "loss": 0.4124, "step": 2536 }, { "epoch": 2.849522740033689, "grad_norm": 0.26524579524993896, "learning_rate": 7.539473410711928e-08, "loss": 0.3971, "step": 2537 }, { "epoch": 2.850645704660303, "grad_norm": 0.27763038873672485, "learning_rate": 7.426804552598088e-08, "loss": 0.4161, "step": 2538 }, { "epoch": 2.8517686692869173, "grad_norm": 0.2811379134654999, "learning_rate": 7.314977599151008e-08, "loss": 0.421, "step": 2539 }, { "epoch": 2.852891633913532, "grad_norm": 0.23815974593162537, "learning_rate": 7.203992741505073e-08, "loss": 0.3724, "step": 2540 }, { "epoch": 2.8540145985401457, "grad_norm": 0.24651320278644562, "learning_rate": 7.093850169355266e-08, "loss": 0.3908, "step": 2541 }, { "epoch": 2.8551375631667604, "grad_norm": 0.2514438331127167, "learning_rate": 6.984550070956886e-08, "loss": 0.4166, "step": 2542 }, { "epoch": 2.8562605277933746, "grad_norm": 0.27320507168769836, "learning_rate": 6.876092633125441e-08, "loss": 0.4097, "step": 2543 }, { "epoch": 2.857383492419989, "grad_norm": 0.25708264112472534, "learning_rate": 6.768478041236037e-08, "loss": 0.3636, "step": 2544 }, { "epoch": 2.858506457046603, "grad_norm": 0.2634989321231842, "learning_rate": 6.661706479223152e-08, "loss": 0.4215, "step": 2545 }, { "epoch": 2.859629421673217, "grad_norm": 0.25893738865852356, "learning_rate": 6.555778129580471e-08, "loss": 0.4054, "step": 2546 }, { "epoch": 2.8607523862998314, "grad_norm": 0.24729645252227783, "learning_rate": 6.450693173360445e-08, "loss": 0.3524, "step": 2547 }, { "epoch": 2.8618753509264456, "grad_norm": 0.27089250087738037, "learning_rate": 6.346451790173958e-08, "loss": 0.4281, "step": 2548 }, { "epoch": 2.8629983155530603, "grad_norm": 0.2662295401096344, "learning_rate": 6.243054158190043e-08, "loss": 0.4189, "step": 2549 }, { "epoch": 2.8641212801796745, "grad_norm": 0.26587989926338196, "learning_rate": 6.140500454135668e-08, "loss": 0.3928, "step": 2550 }, { "epoch": 2.8652442448062887, "grad_norm": 0.2734512388706207, "learning_rate": 6.03879085329534e-08, "loss": 0.4228, "step": 2551 }, { "epoch": 2.866367209432903, "grad_norm": 0.24846868216991425, "learning_rate": 5.93792552951078e-08, "loss": 0.3912, "step": 2552 }, { "epoch": 2.867490174059517, "grad_norm": 0.2736835181713104, "learning_rate": 5.8379046551807486e-08, "loss": 0.3661, "step": 2553 }, { "epoch": 2.8686131386861313, "grad_norm": 0.2701999545097351, "learning_rate": 5.738728401260551e-08, "loss": 0.4072, "step": 2554 }, { "epoch": 2.8697361033127455, "grad_norm": 0.2630336284637451, "learning_rate": 5.6403969372619826e-08, "loss": 0.3988, "step": 2555 }, { "epoch": 2.87085906793936, "grad_norm": 0.27641788125038147, "learning_rate": 5.542910431252935e-08, "loss": 0.3832, "step": 2556 }, { "epoch": 2.871982032565974, "grad_norm": 0.27471765875816345, "learning_rate": 5.4462690498570114e-08, "loss": 0.4026, "step": 2557 }, { "epoch": 2.8731049971925886, "grad_norm": 0.2534099221229553, "learning_rate": 5.350472958253416e-08, "loss": 0.4207, "step": 2558 }, { "epoch": 2.874227961819203, "grad_norm": 0.24805252254009247, "learning_rate": 5.255522320176565e-08, "loss": 0.3606, "step": 2559 }, { "epoch": 2.875350926445817, "grad_norm": 0.2579372823238373, "learning_rate": 5.16141729791575e-08, "loss": 0.4018, "step": 2560 }, { "epoch": 2.8764738910724312, "grad_norm": 0.25109636783599854, "learning_rate": 5.068158052315031e-08, "loss": 0.3898, "step": 2561 }, { "epoch": 2.8775968556990454, "grad_norm": 0.26681214570999146, "learning_rate": 4.975744742772848e-08, "loss": 0.4304, "step": 2562 }, { "epoch": 2.8787198203256597, "grad_norm": 0.26372575759887695, "learning_rate": 4.8841775272416846e-08, "loss": 0.3801, "step": 2563 }, { "epoch": 2.879842784952274, "grad_norm": 0.2671085000038147, "learning_rate": 4.7934565622281274e-08, "loss": 0.4203, "step": 2564 }, { "epoch": 2.8809657495788885, "grad_norm": 0.28826555609703064, "learning_rate": 4.7035820027920284e-08, "loss": 0.3758, "step": 2565 }, { "epoch": 2.8820887142055023, "grad_norm": 0.27976885437965393, "learning_rate": 4.6145540025467337e-08, "loss": 0.4084, "step": 2566 }, { "epoch": 2.883211678832117, "grad_norm": 0.25248798727989197, "learning_rate": 4.526372713658744e-08, "loss": 0.3963, "step": 2567 }, { "epoch": 2.884334643458731, "grad_norm": 0.24470974504947662, "learning_rate": 4.439038286847164e-08, "loss": 0.3932, "step": 2568 }, { "epoch": 2.8854576080853453, "grad_norm": 0.24869517982006073, "learning_rate": 4.352550871383809e-08, "loss": 0.3975, "step": 2569 }, { "epoch": 2.8865805727119596, "grad_norm": 0.2667178809642792, "learning_rate": 4.2669106150926564e-08, "loss": 0.4347, "step": 2570 }, { "epoch": 2.8877035373385738, "grad_norm": 0.2538471519947052, "learning_rate": 4.182117664349783e-08, "loss": 0.3657, "step": 2571 }, { "epoch": 2.888826501965188, "grad_norm": 0.2860618531703949, "learning_rate": 4.098172164083092e-08, "loss": 0.4221, "step": 2572 }, { "epoch": 2.889949466591802, "grad_norm": 0.24827823042869568, "learning_rate": 4.0150742577720334e-08, "loss": 0.3872, "step": 2573 }, { "epoch": 2.891072431218417, "grad_norm": 0.26983192563056946, "learning_rate": 3.9328240874471624e-08, "loss": 0.4275, "step": 2574 }, { "epoch": 2.892195395845031, "grad_norm": 0.24773791432380676, "learning_rate": 3.8514217936903576e-08, "loss": 0.3745, "step": 2575 }, { "epoch": 2.8933183604716453, "grad_norm": 0.2646888792514801, "learning_rate": 3.770867515634158e-08, "loss": 0.3919, "step": 2576 }, { "epoch": 2.8944413250982595, "grad_norm": 0.2690393626689911, "learning_rate": 3.6911613909616505e-08, "loss": 0.399, "step": 2577 }, { "epoch": 2.8955642897248737, "grad_norm": 0.27760377526283264, "learning_rate": 3.6123035559063047e-08, "loss": 0.4418, "step": 2578 }, { "epoch": 2.896687254351488, "grad_norm": 0.25624901056289673, "learning_rate": 3.534294145251749e-08, "loss": 0.4044, "step": 2579 }, { "epoch": 2.897810218978102, "grad_norm": 0.2626415491104126, "learning_rate": 3.457133292331494e-08, "loss": 0.4052, "step": 2580 }, { "epoch": 2.8989331836047163, "grad_norm": 0.2557370662689209, "learning_rate": 3.3808211290284886e-08, "loss": 0.4038, "step": 2581 }, { "epoch": 2.9000561482313305, "grad_norm": 0.2741665244102478, "learning_rate": 3.305357785775398e-08, "loss": 0.4175, "step": 2582 }, { "epoch": 2.901179112857945, "grad_norm": 0.24624177813529968, "learning_rate": 3.230743391553881e-08, "loss": 0.3731, "step": 2583 }, { "epoch": 2.9023020774845594, "grad_norm": 0.24726594984531403, "learning_rate": 3.156978073894701e-08, "loss": 0.3872, "step": 2584 }, { "epoch": 2.9034250421111736, "grad_norm": 0.26437288522720337, "learning_rate": 3.084061958877227e-08, "loss": 0.422, "step": 2585 }, { "epoch": 2.904548006737788, "grad_norm": 0.268906831741333, "learning_rate": 3.011995171129545e-08, "loss": 0.4134, "step": 2586 }, { "epoch": 2.905670971364402, "grad_norm": 0.2560522258281708, "learning_rate": 2.9407778338280124e-08, "loss": 0.4022, "step": 2587 }, { "epoch": 2.906793935991016, "grad_norm": 0.24872729182243347, "learning_rate": 2.870410068697038e-08, "loss": 0.3751, "step": 2588 }, { "epoch": 2.9079169006176304, "grad_norm": 0.278974324464798, "learning_rate": 2.8008919960090253e-08, "loss": 0.3879, "step": 2589 }, { "epoch": 2.909039865244245, "grad_norm": 0.25535717606544495, "learning_rate": 2.7322237345840387e-08, "loss": 0.4032, "step": 2590 }, { "epoch": 2.910162829870859, "grad_norm": 0.2563422918319702, "learning_rate": 2.6644054017896937e-08, "loss": 0.4012, "step": 2591 }, { "epoch": 2.9112857944974735, "grad_norm": 0.2548699975013733, "learning_rate": 2.5974371135408792e-08, "loss": 0.3888, "step": 2592 }, { "epoch": 2.9124087591240877, "grad_norm": 0.24391932785511017, "learning_rate": 2.5313189842996465e-08, "loss": 0.3865, "step": 2593 }, { "epoch": 2.913531723750702, "grad_norm": 0.24572822451591492, "learning_rate": 2.46605112707482e-08, "loss": 0.3941, "step": 2594 }, { "epoch": 2.914654688377316, "grad_norm": 0.25445792078971863, "learning_rate": 2.401633653422053e-08, "loss": 0.3928, "step": 2595 }, { "epoch": 2.9157776530039303, "grad_norm": 0.2709309160709381, "learning_rate": 2.3380666734436062e-08, "loss": 0.4045, "step": 2596 }, { "epoch": 2.9169006176305445, "grad_norm": 0.28139108419418335, "learning_rate": 2.2753502957877925e-08, "loss": 0.4319, "step": 2597 }, { "epoch": 2.9180235822571587, "grad_norm": 0.2427232265472412, "learning_rate": 2.2134846276494205e-08, "loss": 0.402, "step": 2598 }, { "epoch": 2.9191465468837734, "grad_norm": 0.2615796625614166, "learning_rate": 2.1524697747690725e-08, "loss": 0.3879, "step": 2599 }, { "epoch": 2.920269511510387, "grad_norm": 0.2605128288269043, "learning_rate": 2.0923058414331066e-08, "loss": 0.4419, "step": 2600 }, { "epoch": 2.921392476137002, "grad_norm": 0.260249525308609, "learning_rate": 2.032992930473543e-08, "loss": 0.3871, "step": 2601 }, { "epoch": 2.922515440763616, "grad_norm": 0.2633686065673828, "learning_rate": 1.9745311432678993e-08, "loss": 0.3818, "step": 2602 }, { "epoch": 2.9236384053902302, "grad_norm": 0.2813769280910492, "learning_rate": 1.9169205797388568e-08, "loss": 0.4409, "step": 2603 }, { "epoch": 2.9247613700168444, "grad_norm": 0.26654165983200073, "learning_rate": 1.860161338354205e-08, "loss": 0.4319, "step": 2604 }, { "epoch": 2.9258843346434587, "grad_norm": 0.27008771896362305, "learning_rate": 1.8042535161267306e-08, "loss": 0.3839, "step": 2605 }, { "epoch": 2.927007299270073, "grad_norm": 0.2590080499649048, "learning_rate": 1.7491972086138287e-08, "loss": 0.3887, "step": 2606 }, { "epoch": 2.928130263896687, "grad_norm": 0.24742592871189117, "learning_rate": 1.69499250991767e-08, "loss": 0.4038, "step": 2607 }, { "epoch": 2.9292532285233017, "grad_norm": 0.2446199655532837, "learning_rate": 1.6416395126847005e-08, "loss": 0.3451, "step": 2608 }, { "epoch": 2.930376193149916, "grad_norm": 0.2781931459903717, "learning_rate": 1.5891383081057532e-08, "loss": 0.4255, "step": 2609 }, { "epoch": 2.93149915777653, "grad_norm": 0.2844160199165344, "learning_rate": 1.5374889859157137e-08, "loss": 0.427, "step": 2610 }, { "epoch": 2.9326221224031443, "grad_norm": 0.2443605661392212, "learning_rate": 1.4866916343934667e-08, "loss": 0.3587, "step": 2611 }, { "epoch": 2.9337450870297586, "grad_norm": 0.2551402151584625, "learning_rate": 1.4367463403616721e-08, "loss": 0.42, "step": 2612 }, { "epoch": 2.9348680516563728, "grad_norm": 0.26562076807022095, "learning_rate": 1.3876531891867106e-08, "loss": 0.4348, "step": 2613 }, { "epoch": 2.935991016282987, "grad_norm": 0.2476344257593155, "learning_rate": 1.3394122647784058e-08, "loss": 0.3406, "step": 2614 }, { "epoch": 2.937113980909601, "grad_norm": 0.2618557810783386, "learning_rate": 1.292023649590024e-08, "loss": 0.387, "step": 2615 }, { "epoch": 2.9382369455362154, "grad_norm": 0.2689482867717743, "learning_rate": 1.2454874246181081e-08, "loss": 0.3952, "step": 2616 }, { "epoch": 2.93935991016283, "grad_norm": 0.26815280318260193, "learning_rate": 1.1998036694021442e-08, "loss": 0.4008, "step": 2617 }, { "epoch": 2.9404828747894443, "grad_norm": 0.27435627579689026, "learning_rate": 1.1549724620247283e-08, "loss": 0.4046, "step": 2618 }, { "epoch": 2.9416058394160585, "grad_norm": 0.24282604455947876, "learning_rate": 1.1109938791112328e-08, "loss": 0.3772, "step": 2619 }, { "epoch": 2.9427288040426727, "grad_norm": 0.27076494693756104, "learning_rate": 1.0678679958296966e-08, "loss": 0.4192, "step": 2620 }, { "epoch": 2.943851768669287, "grad_norm": 0.2787020206451416, "learning_rate": 1.0255948858907683e-08, "loss": 0.3706, "step": 2621 }, { "epoch": 2.944974733295901, "grad_norm": 0.24679070711135864, "learning_rate": 9.841746215474845e-09, "loss": 0.3548, "step": 2622 }, { "epoch": 2.9460976979225153, "grad_norm": 0.26585137844085693, "learning_rate": 9.43607273595326e-09, "loss": 0.4139, "step": 2623 }, { "epoch": 2.94722066254913, "grad_norm": 0.2649494409561157, "learning_rate": 9.038929113718287e-09, "loss": 0.4121, "step": 2624 }, { "epoch": 2.9483436271757437, "grad_norm": 0.26359909772872925, "learning_rate": 8.650316027566386e-09, "loss": 0.4036, "step": 2625 }, { "epoch": 2.9494665918023584, "grad_norm": 0.26836904883384705, "learning_rate": 8.270234141714572e-09, "loss": 0.4436, "step": 2626 }, { "epoch": 2.9505895564289726, "grad_norm": 0.24827072024345398, "learning_rate": 7.898684105797084e-09, "loss": 0.3933, "step": 2627 }, { "epoch": 2.951712521055587, "grad_norm": 0.2514251470565796, "learning_rate": 7.535666554866483e-09, "loss": 0.394, "step": 2628 }, { "epoch": 2.952835485682201, "grad_norm": 0.24649810791015625, "learning_rate": 7.181182109391449e-09, "loss": 0.4013, "step": 2629 }, { "epoch": 2.953958450308815, "grad_norm": 0.24731650948524475, "learning_rate": 6.835231375255658e-09, "loss": 0.3926, "step": 2630 }, { "epoch": 2.9550814149354294, "grad_norm": 0.27158939838409424, "learning_rate": 6.497814943756675e-09, "loss": 0.391, "step": 2631 }, { "epoch": 2.9562043795620436, "grad_norm": 0.268110066652298, "learning_rate": 6.168933391605958e-09, "loss": 0.4294, "step": 2632 }, { "epoch": 2.9573273441886583, "grad_norm": 0.25730767846107483, "learning_rate": 5.848587280927187e-09, "loss": 0.3865, "step": 2633 }, { "epoch": 2.958450308815272, "grad_norm": 0.2740709185600281, "learning_rate": 5.536777159254603e-09, "loss": 0.4188, "step": 2634 }, { "epoch": 2.9595732734418867, "grad_norm": 0.2662360966205597, "learning_rate": 5.233503559533559e-09, "loss": 0.433, "step": 2635 }, { "epoch": 2.960696238068501, "grad_norm": 0.24334050714969635, "learning_rate": 4.93876700011775e-09, "loss": 0.3478, "step": 2636 }, { "epoch": 2.961819202695115, "grad_norm": 0.2780497968196869, "learning_rate": 4.652567984770873e-09, "loss": 0.4832, "step": 2637 }, { "epoch": 2.9629421673217293, "grad_norm": 0.26721400022506714, "learning_rate": 4.374907002662743e-09, "loss": 0.358, "step": 2638 }, { "epoch": 2.9640651319483435, "grad_norm": 0.26856672763824463, "learning_rate": 4.1057845283709596e-09, "loss": 0.3998, "step": 2639 }, { "epoch": 2.9651880965749577, "grad_norm": 0.2754051387310028, "learning_rate": 3.845201021879241e-09, "loss": 0.4245, "step": 2640 }, { "epoch": 2.966311061201572, "grad_norm": 0.24751798808574677, "learning_rate": 3.5931569285757586e-09, "loss": 0.3729, "step": 2641 }, { "epoch": 2.9674340258281866, "grad_norm": 0.261075496673584, "learning_rate": 3.3496526792531348e-09, "loss": 0.3914, "step": 2642 }, { "epoch": 2.968556990454801, "grad_norm": 0.271716445684433, "learning_rate": 3.1146886901090024e-09, "loss": 0.4071, "step": 2643 }, { "epoch": 2.969679955081415, "grad_norm": 0.25174158811569214, "learning_rate": 2.8882653627421153e-09, "loss": 0.3812, "step": 2644 }, { "epoch": 2.9708029197080292, "grad_norm": 0.2628293037414551, "learning_rate": 2.670383084155681e-09, "loss": 0.3972, "step": 2645 }, { "epoch": 2.9719258843346434, "grad_norm": 0.26284271478652954, "learning_rate": 2.461042226752919e-09, "loss": 0.4054, "step": 2646 }, { "epoch": 2.9730488489612577, "grad_norm": 0.27374470233917236, "learning_rate": 2.2602431483387254e-09, "loss": 0.4285, "step": 2647 }, { "epoch": 2.974171813587872, "grad_norm": 0.2803319990634918, "learning_rate": 2.0679861921174548e-09, "loss": 0.4034, "step": 2648 }, { "epoch": 2.9752947782144865, "grad_norm": 0.25088220834732056, "learning_rate": 1.8842716866956935e-09, "loss": 0.3598, "step": 2649 }, { "epoch": 2.9764177428411003, "grad_norm": 0.26998060941696167, "learning_rate": 1.7090999460767088e-09, "loss": 0.4258, "step": 2650 }, { "epoch": 2.977540707467715, "grad_norm": 0.24931347370147705, "learning_rate": 1.542471269663226e-09, "loss": 0.3623, "step": 2651 }, { "epoch": 2.978663672094329, "grad_norm": 0.2807588577270508, "learning_rate": 1.3843859422574269e-09, "loss": 0.4072, "step": 2652 }, { "epoch": 2.9797866367209433, "grad_norm": 0.27697110176086426, "learning_rate": 1.2348442340576194e-09, "loss": 0.4349, "step": 2653 }, { "epoch": 2.9809096013475576, "grad_norm": 0.2451767772436142, "learning_rate": 1.0938464006604588e-09, "loss": 0.3737, "step": 2654 }, { "epoch": 2.9820325659741718, "grad_norm": 0.2732963263988495, "learning_rate": 9.613926830587262e-10, "loss": 0.3961, "step": 2655 }, { "epoch": 2.983155530600786, "grad_norm": 0.262090802192688, "learning_rate": 8.374833076424394e-10, "loss": 0.3992, "step": 2656 }, { "epoch": 2.9842784952274, "grad_norm": 0.2540877163410187, "learning_rate": 7.221184861966324e-10, "loss": 0.4004, "step": 2657 }, { "epoch": 2.985401459854015, "grad_norm": 0.25300681591033936, "learning_rate": 6.152984159024655e-10, "loss": 0.3628, "step": 2658 }, { "epoch": 2.9865244244806286, "grad_norm": 0.2663578689098358, "learning_rate": 5.170232793366703e-10, "loss": 0.421, "step": 2659 }, { "epoch": 2.9876473891072433, "grad_norm": 0.26131415367126465, "learning_rate": 4.272932444709943e-10, "loss": 0.3842, "step": 2660 }, { "epoch": 2.9887703537338575, "grad_norm": 0.28280434012413025, "learning_rate": 3.4610846467109106e-10, "loss": 0.4142, "step": 2661 }, { "epoch": 2.9898933183604717, "grad_norm": 0.267385333776474, "learning_rate": 2.734690786987404e-10, "loss": 0.4052, "step": 2662 }, { "epoch": 2.991016282987086, "grad_norm": 0.27853038907051086, "learning_rate": 2.0937521070851785e-10, "loss": 0.4012, "step": 2663 }, { "epoch": 2.9921392476137, "grad_norm": 0.2572419345378876, "learning_rate": 1.538269702494599e-10, "loss": 0.3856, "step": 2664 }, { "epoch": 2.9932622122403143, "grad_norm": 0.263293594121933, "learning_rate": 1.0682445226395389e-10, "loss": 0.4061, "step": 2665 }, { "epoch": 2.9943851768669285, "grad_norm": 0.2548677921295166, "learning_rate": 6.836773708940314e-11, "loss": 0.392, "step": 2666 }, { "epoch": 2.995508141493543, "grad_norm": 0.2504030764102936, "learning_rate": 3.8456890455451646e-11, "loss": 0.3736, "step": 2667 }, { "epoch": 2.9966311061201574, "grad_norm": 0.2898169159889221, "learning_rate": 1.7091963485649232e-11, "loss": 0.4162, "step": 2668 }, { "epoch": 2.9977540707467716, "grad_norm": 0.25964173674583435, "learning_rate": 4.272992697451628e-12, "loss": 0.4111, "step": 2669 }, { "epoch": 2.998877035373386, "grad_norm": 0.27346548438072205, "learning_rate": 0.0, "loss": 0.3948, "step": 2670 }, { "epoch": 2.998877035373386, "step": 2670, "total_flos": 4617526237986816.0, "train_loss": 0.4445273021968563, "train_runtime": 119673.128, "train_samples_per_second": 2.857, "train_steps_per_second": 0.022 } ], "logging_steps": 1.0, "max_steps": 2670, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4617526237986816.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }