{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004576659038901602, "grad_norm": 1.0062716007232666, "learning_rate": 9.09090909090909e-08, "loss": 1.9361354112625122, "step": 2 }, { "epoch": 0.009153318077803204, "grad_norm": 1.1005034446716309, "learning_rate": 2.727272727272727e-07, "loss": 1.9705393314361572, "step": 4 }, { "epoch": 0.013729977116704805, "grad_norm": 2.435994863510132, "learning_rate": 4.545454545454545e-07, "loss": 1.9127593040466309, "step": 6 }, { "epoch": 0.018306636155606407, "grad_norm": 4.2885661125183105, "learning_rate": 6.363636363636363e-07, "loss": 1.974473237991333, "step": 8 }, { "epoch": 0.02288329519450801, "grad_norm": 1.8748376369476318, "learning_rate": 8.181818181818182e-07, "loss": 2.226160764694214, "step": 10 }, { "epoch": 0.02745995423340961, "grad_norm": 2.666358232498169, "learning_rate": 1e-06, "loss": 2.0884294509887695, "step": 12 }, { "epoch": 0.032036613272311214, "grad_norm": 0.6817159056663513, "learning_rate": 1.1818181818181818e-06, "loss": 1.8083596229553223, "step": 14 }, { "epoch": 0.036613272311212815, "grad_norm": 0.9749477505683899, "learning_rate": 1.3636363636363634e-06, "loss": 1.7538858652114868, "step": 16 }, { "epoch": 0.041189931350114416, "grad_norm": 3.5282654762268066, "learning_rate": 1.5454545454545454e-06, "loss": 1.3263461589813232, "step": 18 }, { "epoch": 0.04576659038901602, "grad_norm": 1.0763177871704102, "learning_rate": 1.7272727272727273e-06, "loss": 1.3052548170089722, "step": 20 }, { "epoch": 0.05034324942791762, "grad_norm": 4.2185378074646, "learning_rate": 1.909090909090909e-06, "loss": 0.8882780075073242, "step": 22 }, { "epoch": 0.05491990846681922, "grad_norm": 4.18178129196167, "learning_rate": 2.0909090909090907e-06, "loss": 1.695899248123169, "step": 24 }, { "epoch": 0.059496567505720827, "grad_norm": 1.5718276500701904, "learning_rate": 2.2727272727272728e-06, "loss": 1.406914472579956, "step": 26 }, { "epoch": 0.06407322654462243, "grad_norm": 4.563444137573242, "learning_rate": 2.4545454545454544e-06, "loss": 0.9809740781784058, "step": 28 }, { "epoch": 0.06864988558352403, "grad_norm": 0.6998274922370911, "learning_rate": 2.636363636363636e-06, "loss": 1.5562105178833008, "step": 30 }, { "epoch": 0.07322654462242563, "grad_norm": 0.8538845777511597, "learning_rate": 2.818181818181818e-06, "loss": 1.6105421781539917, "step": 32 }, { "epoch": 0.07780320366132723, "grad_norm": 0.6970071196556091, "learning_rate": 3e-06, "loss": 1.4942922592163086, "step": 34 }, { "epoch": 0.08237986270022883, "grad_norm": 0.6283139586448669, "learning_rate": 3.1818181818181817e-06, "loss": 1.4711008071899414, "step": 36 }, { "epoch": 0.08695652173913043, "grad_norm": 3.1694211959838867, "learning_rate": 3.3636363636363637e-06, "loss": 1.3710073232650757, "step": 38 }, { "epoch": 0.09153318077803203, "grad_norm": 0.9061569571495056, "learning_rate": 3.5454545454545454e-06, "loss": 1.3197435140609741, "step": 40 }, { "epoch": 0.09610983981693363, "grad_norm": 1.3489203453063965, "learning_rate": 3.727272727272727e-06, "loss": 1.3319520950317383, "step": 42 }, { "epoch": 0.10068649885583524, "grad_norm": 0.7797649502754211, "learning_rate": 3.909090909090909e-06, "loss": 1.093435287475586, "step": 44 }, { "epoch": 0.10526315789473684, "grad_norm": 0.7038461565971375, "learning_rate": 4.090909090909091e-06, "loss": 1.39493989944458, "step": 46 }, { "epoch": 0.10983981693363844, "grad_norm": 4.629183769226074, "learning_rate": 4.272727272727272e-06, "loss": 1.6254442930221558, "step": 48 }, { "epoch": 0.11441647597254005, "grad_norm": 0.7052221894264221, "learning_rate": 4.454545454545454e-06, "loss": 1.4149081707000732, "step": 50 }, { "epoch": 0.11899313501144165, "grad_norm": 0.6668927669525146, "learning_rate": 4.636363636363636e-06, "loss": 1.1638028621673584, "step": 52 }, { "epoch": 0.12356979405034325, "grad_norm": 1.8050854206085205, "learning_rate": 4.818181818181818e-06, "loss": 0.893460750579834, "step": 54 }, { "epoch": 0.12814645308924486, "grad_norm": 0.6511980295181274, "learning_rate": 4.9999999999999996e-06, "loss": 1.3653751611709595, "step": 56 }, { "epoch": 0.13272311212814644, "grad_norm": 1.0147000551223755, "learning_rate": 5.181818181818181e-06, "loss": 0.8794707655906677, "step": 58 }, { "epoch": 0.13729977116704806, "grad_norm": 0.7929437756538391, "learning_rate": 5.363636363636363e-06, "loss": 1.4038774967193604, "step": 60 }, { "epoch": 0.14187643020594964, "grad_norm": 1.6879537105560303, "learning_rate": 5.545454545454545e-06, "loss": 1.3440090417861938, "step": 62 }, { "epoch": 0.14645308924485126, "grad_norm": 2.4623799324035645, "learning_rate": 5.727272727272727e-06, "loss": 1.2824642658233643, "step": 64 }, { "epoch": 0.15102974828375287, "grad_norm": 0.8197077512741089, "learning_rate": 5.9090909090909085e-06, "loss": 1.3210649490356445, "step": 66 }, { "epoch": 0.15560640732265446, "grad_norm": 0.5857513546943665, "learning_rate": 6.090909090909091e-06, "loss": 1.3227065801620483, "step": 68 }, { "epoch": 0.16018306636155608, "grad_norm": 1.8220713138580322, "learning_rate": 6.272727272727273e-06, "loss": 1.1438708305358887, "step": 70 }, { "epoch": 0.16475972540045766, "grad_norm": 1.1689298152923584, "learning_rate": 6.454545454545454e-06, "loss": 1.2297066450119019, "step": 72 }, { "epoch": 0.16933638443935928, "grad_norm": 0.6110695004463196, "learning_rate": 6.636363636363637e-06, "loss": 1.2812331914901733, "step": 74 }, { "epoch": 0.17391304347826086, "grad_norm": 0.8178548216819763, "learning_rate": 6.818181818181818e-06, "loss": 1.1592620611190796, "step": 76 }, { "epoch": 0.17848970251716248, "grad_norm": 0.551541268825531, "learning_rate": 7e-06, "loss": 1.3165788650512695, "step": 78 }, { "epoch": 0.18306636155606407, "grad_norm": 0.6924233436584473, "learning_rate": 7.1818181818181815e-06, "loss": 0.9857466220855713, "step": 80 }, { "epoch": 0.18764302059496568, "grad_norm": 0.8627240657806396, "learning_rate": 7.363636363636363e-06, "loss": 1.5733084678649902, "step": 82 }, { "epoch": 0.19221967963386727, "grad_norm": 1.4400972127914429, "learning_rate": 7.545454545454546e-06, "loss": 1.2073043584823608, "step": 84 }, { "epoch": 0.19679633867276888, "grad_norm": 0.751667320728302, "learning_rate": 7.727272727272727e-06, "loss": 1.3525782823562622, "step": 86 }, { "epoch": 0.20137299771167047, "grad_norm": 0.982463002204895, "learning_rate": 7.909090909090909e-06, "loss": 1.0466015338897705, "step": 88 }, { "epoch": 0.20594965675057209, "grad_norm": 1.4768906831741333, "learning_rate": 7.999993553025613e-06, "loss": 1.5740259885787964, "step": 90 }, { "epoch": 0.21052631578947367, "grad_norm": 0.8806318640708923, "learning_rate": 7.999941977369066e-06, "loss": 1.0948330163955688, "step": 92 }, { "epoch": 0.2151029748283753, "grad_norm": 0.8388352394104004, "learning_rate": 7.999838826794875e-06, "loss": 1.3860080242156982, "step": 94 }, { "epoch": 0.21967963386727687, "grad_norm": 1.9620765447616577, "learning_rate": 7.999684102780836e-06, "loss": 0.6549928784370422, "step": 96 }, { "epoch": 0.2242562929061785, "grad_norm": 1.8647589683532715, "learning_rate": 7.99947780754362e-06, "loss": 1.1090216636657715, "step": 98 }, { "epoch": 0.2288329519450801, "grad_norm": 0.47244909405708313, "learning_rate": 7.999219944038733e-06, "loss": 1.3478974103927612, "step": 100 }, { "epoch": 0.2334096109839817, "grad_norm": 1.5291101932525635, "learning_rate": 7.998910515960482e-06, "loss": 1.2799420356750488, "step": 102 }, { "epoch": 0.2379862700228833, "grad_norm": 0.7091822028160095, "learning_rate": 7.99854952774192e-06, "loss": 1.0800800323486328, "step": 104 }, { "epoch": 0.2425629290617849, "grad_norm": 0.5217757225036621, "learning_rate": 7.99813698455478e-06, "loss": 1.2569258213043213, "step": 106 }, { "epoch": 0.2471395881006865, "grad_norm": 0.8455982208251953, "learning_rate": 7.997672892309399e-06, "loss": 1.2004809379577637, "step": 108 }, { "epoch": 0.2517162471395881, "grad_norm": 1.3651947975158691, "learning_rate": 7.997157257654642e-06, "loss": 0.9266209006309509, "step": 110 }, { "epoch": 0.2562929061784897, "grad_norm": 2.029172658920288, "learning_rate": 7.996590087977799e-06, "loss": 0.6148236989974976, "step": 112 }, { "epoch": 0.2608695652173913, "grad_norm": 1.3701057434082031, "learning_rate": 7.995971391404479e-06, "loss": 1.3206911087036133, "step": 114 }, { "epoch": 0.2654462242562929, "grad_norm": 1.0008249282836914, "learning_rate": 7.9953011767985e-06, "loss": 1.5282856225967407, "step": 116 }, { "epoch": 0.2700228832951945, "grad_norm": 1.6718090772628784, "learning_rate": 7.994579453761756e-06, "loss": 1.2926162481307983, "step": 118 }, { "epoch": 0.2745995423340961, "grad_norm": 1.2445013523101807, "learning_rate": 7.993806232634083e-06, "loss": 1.14131498336792, "step": 120 }, { "epoch": 0.2791762013729977, "grad_norm": 0.8166568279266357, "learning_rate": 7.992981524493107e-06, "loss": 0.9494431614875793, "step": 122 }, { "epoch": 0.2837528604118993, "grad_norm": 1.60244882106781, "learning_rate": 7.992105341154091e-06, "loss": 1.0895594358444214, "step": 124 }, { "epoch": 0.28832951945080093, "grad_norm": 0.7963911890983582, "learning_rate": 7.99117769516976e-06, "loss": 1.2929226160049438, "step": 126 }, { "epoch": 0.2929061784897025, "grad_norm": 1.1980226039886475, "learning_rate": 7.990198599830122e-06, "loss": 1.0238358974456787, "step": 128 }, { "epoch": 0.2974828375286041, "grad_norm": 0.8094168901443481, "learning_rate": 7.989168069162285e-06, "loss": 1.2957402467727661, "step": 130 }, { "epoch": 0.30205949656750575, "grad_norm": 0.6273795366287231, "learning_rate": 7.988086117930241e-06, "loss": 1.316016435623169, "step": 132 }, { "epoch": 0.30663615560640733, "grad_norm": 1.0645545721054077, "learning_rate": 7.986952761634676e-06, "loss": 0.9111831784248352, "step": 134 }, { "epoch": 0.3112128146453089, "grad_norm": 0.4852690100669861, "learning_rate": 7.985768016512724e-06, "loss": 1.2701897621154785, "step": 136 }, { "epoch": 0.3157894736842105, "grad_norm": 0.6316735744476318, "learning_rate": 7.984531899537751e-06, "loss": 1.3380374908447266, "step": 138 }, { "epoch": 0.32036613272311215, "grad_norm": 0.6724331378936768, "learning_rate": 7.98324442841911e-06, "loss": 1.0774213075637817, "step": 140 }, { "epoch": 0.32494279176201374, "grad_norm": 0.5823185443878174, "learning_rate": 7.981905621601877e-06, "loss": 1.2555238008499146, "step": 142 }, { "epoch": 0.3295194508009153, "grad_norm": 0.6299719214439392, "learning_rate": 7.9805154982666e-06, "loss": 0.9718642830848694, "step": 144 }, { "epoch": 0.3340961098398169, "grad_norm": 0.7177326679229736, "learning_rate": 7.979074078329013e-06, "loss": 1.0789949893951416, "step": 146 }, { "epoch": 0.33867276887871856, "grad_norm": 1.0502492189407349, "learning_rate": 7.977581382439763e-06, "loss": 0.8612786531448364, "step": 148 }, { "epoch": 0.34324942791762014, "grad_norm": 0.8035002946853638, "learning_rate": 7.976037431984097e-06, "loss": 1.327546238899231, "step": 150 }, { "epoch": 0.34782608695652173, "grad_norm": 0.8929418325424194, "learning_rate": 7.974442249081574e-06, "loss": 0.7809689044952393, "step": 152 }, { "epoch": 0.3524027459954233, "grad_norm": 0.9010236859321594, "learning_rate": 7.972795856585738e-06, "loss": 1.2843378782272339, "step": 154 }, { "epoch": 0.35697940503432496, "grad_norm": 0.834439218044281, "learning_rate": 7.971098278083786e-06, "loss": 1.1299086809158325, "step": 156 }, { "epoch": 0.36155606407322655, "grad_norm": 0.6221197247505188, "learning_rate": 7.969349537896246e-06, "loss": 1.282414436340332, "step": 158 }, { "epoch": 0.36613272311212813, "grad_norm": 0.8410677313804626, "learning_rate": 7.96754966107661e-06, "loss": 1.2675057649612427, "step": 160 }, { "epoch": 0.3707093821510298, "grad_norm": 0.9409856796264648, "learning_rate": 7.965698673410988e-06, "loss": 0.917771577835083, "step": 162 }, { "epoch": 0.37528604118993136, "grad_norm": 0.4894721210002899, "learning_rate": 7.963796601417737e-06, "loss": 1.2448134422302246, "step": 164 }, { "epoch": 0.37986270022883295, "grad_norm": 0.4557670056819916, "learning_rate": 7.961843472347074e-06, "loss": 1.253254771232605, "step": 166 }, { "epoch": 0.38443935926773454, "grad_norm": 0.455538272857666, "learning_rate": 7.959839314180691e-06, "loss": 1.2726975679397583, "step": 168 }, { "epoch": 0.3890160183066362, "grad_norm": 0.6479983329772949, "learning_rate": 7.957784155631355e-06, "loss": 0.9780842065811157, "step": 170 }, { "epoch": 0.39359267734553777, "grad_norm": 0.7144932150840759, "learning_rate": 7.955678026142495e-06, "loss": 1.0534026622772217, "step": 172 }, { "epoch": 0.39816933638443935, "grad_norm": 0.6702520251274109, "learning_rate": 7.95352095588778e-06, "loss": 0.9748213887214661, "step": 174 }, { "epoch": 0.40274599542334094, "grad_norm": 0.6281000971794128, "learning_rate": 7.951312975770682e-06, "loss": 1.2368974685668945, "step": 176 }, { "epoch": 0.4073226544622426, "grad_norm": 1.0416147708892822, "learning_rate": 7.949054117424044e-06, "loss": 0.9652445316314697, "step": 178 }, { "epoch": 0.41189931350114417, "grad_norm": 0.36721283197402954, "learning_rate": 7.946744413209623e-06, "loss": 1.143812656402588, "step": 180 }, { "epoch": 0.41647597254004576, "grad_norm": 0.7831065654754639, "learning_rate": 7.944383896217614e-06, "loss": 1.2468208074569702, "step": 182 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5314235687255859, "learning_rate": 7.941972600266196e-06, "loss": 1.23768150806427, "step": 184 }, { "epoch": 0.425629290617849, "grad_norm": 1.5851482152938843, "learning_rate": 7.939510559901035e-06, "loss": 1.0880942344665527, "step": 186 }, { "epoch": 0.4302059496567506, "grad_norm": 0.5965549349784851, "learning_rate": 7.936997810394788e-06, "loss": 1.32969069480896, "step": 188 }, { "epoch": 0.43478260869565216, "grad_norm": 0.8173778653144836, "learning_rate": 7.934434387746609e-06, "loss": 1.1866960525512695, "step": 190 }, { "epoch": 0.43935926773455375, "grad_norm": 1.3977761268615723, "learning_rate": 7.931820328681615e-06, "loss": 0.6947106719017029, "step": 192 }, { "epoch": 0.4439359267734554, "grad_norm": 0.6947522759437561, "learning_rate": 7.92915567065038e-06, "loss": 1.0972782373428345, "step": 194 }, { "epoch": 0.448512585812357, "grad_norm": 0.48104986548423767, "learning_rate": 7.926440451828384e-06, "loss": 1.2536075115203857, "step": 196 }, { "epoch": 0.45308924485125857, "grad_norm": 0.4325115382671356, "learning_rate": 7.923674711115476e-06, "loss": 1.0892629623413086, "step": 198 }, { "epoch": 0.4576659038901602, "grad_norm": 1.5888208150863647, "learning_rate": 7.920858488135305e-06, "loss": 1.1664975881576538, "step": 200 }, { "epoch": 0.4622425629290618, "grad_norm": 1.1934003829956055, "learning_rate": 7.917991823234762e-06, "loss": 1.325711965560913, "step": 202 }, { "epoch": 0.4668192219679634, "grad_norm": 0.9322173595428467, "learning_rate": 7.915074757483403e-06, "loss": 1.133060097694397, "step": 204 }, { "epoch": 0.47139588100686497, "grad_norm": 1.1125494241714478, "learning_rate": 7.91210733267285e-06, "loss": 1.2609753608703613, "step": 206 }, { "epoch": 0.4759725400457666, "grad_norm": 0.402997225522995, "learning_rate": 7.909089591316204e-06, "loss": 1.1940068006515503, "step": 208 }, { "epoch": 0.4805491990846682, "grad_norm": 2.592806339263916, "learning_rate": 7.906021576647428e-06, "loss": 0.990831732749939, "step": 210 }, { "epoch": 0.4851258581235698, "grad_norm": 2.9936859607696533, "learning_rate": 7.902903332620733e-06, "loss": 1.0334012508392334, "step": 212 }, { "epoch": 0.4897025171624714, "grad_norm": 0.9770790338516235, "learning_rate": 7.89973490390994e-06, "loss": 0.8017320036888123, "step": 214 }, { "epoch": 0.494279176201373, "grad_norm": 0.7555065751075745, "learning_rate": 7.896516335907856e-06, "loss": 0.8924547433853149, "step": 216 }, { "epoch": 0.4988558352402746, "grad_norm": 0.6128323674201965, "learning_rate": 7.893247674725605e-06, "loss": 1.4787169694900513, "step": 218 }, { "epoch": 0.5034324942791762, "grad_norm": 1.4939192533493042, "learning_rate": 7.889928967191976e-06, "loss": 0.9237180948257446, "step": 220 }, { "epoch": 0.5080091533180778, "grad_norm": 1.1495096683502197, "learning_rate": 7.886560260852757e-06, "loss": 1.079113483428955, "step": 222 }, { "epoch": 0.5125858123569794, "grad_norm": 0.7052268981933594, "learning_rate": 7.883141603970044e-06, "loss": 1.2029013633728027, "step": 224 }, { "epoch": 0.517162471395881, "grad_norm": 0.6179990768432617, "learning_rate": 7.879673045521558e-06, "loss": 1.2771333456039429, "step": 226 }, { "epoch": 0.5217391304347826, "grad_norm": 1.7824820280075073, "learning_rate": 7.876154635199936e-06, "loss": 1.0359081029891968, "step": 228 }, { "epoch": 0.5263157894736842, "grad_norm": 1.3128830194473267, "learning_rate": 7.872586423412026e-06, "loss": 0.9820423722267151, "step": 230 }, { "epoch": 0.5308924485125858, "grad_norm": 1.6290223598480225, "learning_rate": 7.868968461278157e-06, "loss": 1.2495921850204468, "step": 232 }, { "epoch": 0.5354691075514875, "grad_norm": 0.8125019669532776, "learning_rate": 7.865300800631418e-06, "loss": 1.2157059907913208, "step": 234 }, { "epoch": 0.540045766590389, "grad_norm": 0.6958226561546326, "learning_rate": 7.861583494016904e-06, "loss": 1.1479324102401733, "step": 236 }, { "epoch": 0.5446224256292906, "grad_norm": 0.6316367387771606, "learning_rate": 7.857816594690967e-06, "loss": 1.2824212312698364, "step": 238 }, { "epoch": 0.5491990846681922, "grad_norm": 0.7441359162330627, "learning_rate": 7.854000156620456e-06, "loss": 1.2517260313034058, "step": 240 }, { "epoch": 0.5537757437070938, "grad_norm": 0.4228470027446747, "learning_rate": 7.85013423448194e-06, "loss": 1.5795619487762451, "step": 242 }, { "epoch": 0.5583524027459954, "grad_norm": 0.5715126991271973, "learning_rate": 7.846218883660927e-06, "loss": 1.2237060070037842, "step": 244 }, { "epoch": 0.562929061784897, "grad_norm": 1.9045436382293701, "learning_rate": 7.842254160251073e-06, "loss": 1.079658031463623, "step": 246 }, { "epoch": 0.5675057208237986, "grad_norm": 1.443070411682129, "learning_rate": 7.838240121053368e-06, "loss": 0.7990419268608093, "step": 248 }, { "epoch": 0.5720823798627003, "grad_norm": 0.6427567005157471, "learning_rate": 7.834176823575338e-06, "loss": 1.2759908437728882, "step": 250 }, { "epoch": 0.5766590389016019, "grad_norm": 0.8063530325889587, "learning_rate": 7.830064326030206e-06, "loss": 1.292599081993103, "step": 252 }, { "epoch": 0.5812356979405034, "grad_norm": 0.7218191623687744, "learning_rate": 7.825902687336065e-06, "loss": 1.2685648202896118, "step": 254 }, { "epoch": 0.585812356979405, "grad_norm": 0.25028905272483826, "learning_rate": 7.821691967115038e-06, "loss": 1.4054419994354248, "step": 256 }, { "epoch": 0.5903890160183066, "grad_norm": 0.6193069815635681, "learning_rate": 7.817432225692415e-06, "loss": 1.2499405145645142, "step": 258 }, { "epoch": 0.5949656750572082, "grad_norm": 0.5317597985267639, "learning_rate": 7.813123524095793e-06, "loss": 0.9288328289985657, "step": 260 }, { "epoch": 0.5995423340961098, "grad_norm": 0.4047868549823761, "learning_rate": 7.808765924054205e-06, "loss": 0.9483444690704346, "step": 262 }, { "epoch": 0.6041189931350115, "grad_norm": 0.5687839388847351, "learning_rate": 7.80435948799723e-06, "loss": 1.1598079204559326, "step": 264 }, { "epoch": 0.6086956521739131, "grad_norm": 0.48621973395347595, "learning_rate": 7.799904279054102e-06, "loss": 1.3577367067337036, "step": 266 }, { "epoch": 0.6132723112128147, "grad_norm": 0.47008511424064636, "learning_rate": 7.795400361052801e-06, "loss": 0.8724009394645691, "step": 268 }, { "epoch": 0.6178489702517163, "grad_norm": 0.4799520969390869, "learning_rate": 7.790847798519149e-06, "loss": 1.2594244480133057, "step": 270 }, { "epoch": 0.6224256292906178, "grad_norm": 2.5323190689086914, "learning_rate": 7.78624665667587e-06, "loss": 1.0330383777618408, "step": 272 }, { "epoch": 0.6270022883295194, "grad_norm": 0.4832422137260437, "learning_rate": 7.781597001441669e-06, "loss": 0.9544627666473389, "step": 274 }, { "epoch": 0.631578947368421, "grad_norm": 0.6891815066337585, "learning_rate": 7.776898899430286e-06, "loss": 0.8545240759849548, "step": 276 }, { "epoch": 0.6361556064073226, "grad_norm": 0.9485855102539062, "learning_rate": 7.772152417949531e-06, "loss": 0.829641580581665, "step": 278 }, { "epoch": 0.6407322654462243, "grad_norm": 1.344787359237671, "learning_rate": 7.767357625000333e-06, "loss": 0.8731590509414673, "step": 280 }, { "epoch": 0.6453089244851259, "grad_norm": 1.829545497894287, "learning_rate": 7.762514589275758e-06, "loss": 0.6393548846244812, "step": 282 }, { "epoch": 0.6498855835240275, "grad_norm": 0.5783978700637817, "learning_rate": 7.757623380160026e-06, "loss": 1.0918872356414795, "step": 284 }, { "epoch": 0.6544622425629291, "grad_norm": 0.5951263904571533, "learning_rate": 7.752684067727519e-06, "loss": 1.551460862159729, "step": 286 }, { "epoch": 0.6590389016018307, "grad_norm": 0.9634541273117065, "learning_rate": 7.747696722741773e-06, "loss": 0.8825801014900208, "step": 288 }, { "epoch": 0.6636155606407322, "grad_norm": 0.8059589862823486, "learning_rate": 7.742661416654473e-06, "loss": 0.9882611036300659, "step": 290 }, { "epoch": 0.6681922196796338, "grad_norm": 0.5588476657867432, "learning_rate": 7.737578221604416e-06, "loss": 1.2011680603027344, "step": 292 }, { "epoch": 0.6727688787185355, "grad_norm": 0.9811854362487793, "learning_rate": 7.732447210416492e-06, "loss": 1.2716686725616455, "step": 294 }, { "epoch": 0.6773455377574371, "grad_norm": 1.7746957540512085, "learning_rate": 7.727268456600627e-06, "loss": 0.8755344748497009, "step": 296 }, { "epoch": 0.6819221967963387, "grad_norm": 0.6389427185058594, "learning_rate": 7.722042034350742e-06, "loss": 1.26163649559021, "step": 298 }, { "epoch": 0.6864988558352403, "grad_norm": 0.6335075497627258, "learning_rate": 7.71676801854368e-06, "loss": 1.2194799184799194, "step": 300 }, { "epoch": 0.6910755148741419, "grad_norm": 0.6459981799125671, "learning_rate": 7.711446484738143e-06, "loss": 1.0250097513198853, "step": 302 }, { "epoch": 0.6956521739130435, "grad_norm": 1.382996916770935, "learning_rate": 7.706077509173595e-06, "loss": 0.839065432548523, "step": 304 }, { "epoch": 0.700228832951945, "grad_norm": 1.7975640296936035, "learning_rate": 7.70066116876919e-06, "loss": 1.1089591979980469, "step": 306 }, { "epoch": 0.7048054919908466, "grad_norm": 1.6443461179733276, "learning_rate": 7.69519754112265e-06, "loss": 1.1687304973602295, "step": 308 }, { "epoch": 0.7093821510297483, "grad_norm": 3.3438291549682617, "learning_rate": 7.68968670450917e-06, "loss": 0.9455866813659668, "step": 310 }, { "epoch": 0.7139588100686499, "grad_norm": 1.3685333728790283, "learning_rate": 7.68412873788028e-06, "loss": 1.100264310836792, "step": 312 }, { "epoch": 0.7185354691075515, "grad_norm": 0.9479398131370544, "learning_rate": 7.678523720862733e-06, "loss": 1.0691652297973633, "step": 314 }, { "epoch": 0.7231121281464531, "grad_norm": 1.0067410469055176, "learning_rate": 7.672871733757345e-06, "loss": 1.057770013809204, "step": 316 }, { "epoch": 0.7276887871853547, "grad_norm": 1.1182042360305786, "learning_rate": 7.667172857537857e-06, "loss": 0.6116782426834106, "step": 318 }, { "epoch": 0.7322654462242563, "grad_norm": 0.6759845614433289, "learning_rate": 7.661427173849773e-06, "loss": 0.9427492022514343, "step": 320 }, { "epoch": 0.7368421052631579, "grad_norm": 1.3793621063232422, "learning_rate": 7.655634765009187e-06, "loss": 0.8351959586143494, "step": 322 }, { "epoch": 0.7414187643020596, "grad_norm": 0.438629686832428, "learning_rate": 7.649795714001604e-06, "loss": 1.2085388898849487, "step": 324 }, { "epoch": 0.7459954233409611, "grad_norm": 0.7775664329528809, "learning_rate": 7.643910104480756e-06, "loss": 1.1938120126724243, "step": 326 }, { "epoch": 0.7505720823798627, "grad_norm": 0.49513670802116394, "learning_rate": 7.637978020767396e-06, "loss": 1.266683578491211, "step": 328 }, { "epoch": 0.7551487414187643, "grad_norm": 0.5126597881317139, "learning_rate": 7.631999547848101e-06, "loss": 1.2423049211502075, "step": 330 }, { "epoch": 0.7597254004576659, "grad_norm": 0.5846846103668213, "learning_rate": 7.6259747713740375e-06, "loss": 1.1801856756210327, "step": 332 }, { "epoch": 0.7643020594965675, "grad_norm": 0.812533974647522, "learning_rate": 7.619903777659752e-06, "loss": 1.1939353942871094, "step": 334 }, { "epoch": 0.7688787185354691, "grad_norm": 0.6008883118629456, "learning_rate": 7.613786653681925e-06, "loss": 1.2463386058807373, "step": 336 }, { "epoch": 0.7734553775743707, "grad_norm": 0.888378381729126, "learning_rate": 7.6076234870781235e-06, "loss": 1.2093459367752075, "step": 338 }, { "epoch": 0.7780320366132724, "grad_norm": 0.39801281690597534, "learning_rate": 7.601414366145554e-06, "loss": 1.2171316146850586, "step": 340 }, { "epoch": 0.782608695652174, "grad_norm": 0.5358569622039795, "learning_rate": 7.5951593798397864e-06, "loss": 1.2029497623443604, "step": 342 }, { "epoch": 0.7871853546910755, "grad_norm": 0.7264346480369568, "learning_rate": 7.588858617773492e-06, "loss": 1.1039307117462158, "step": 344 }, { "epoch": 0.7917620137299771, "grad_norm": 0.641931414604187, "learning_rate": 7.582512170215146e-06, "loss": 1.2453477382659912, "step": 346 }, { "epoch": 0.7963386727688787, "grad_norm": 0.9022382497787476, "learning_rate": 7.5761201280877445e-06, "loss": 1.0837339162826538, "step": 348 }, { "epoch": 0.8009153318077803, "grad_norm": 0.6649481654167175, "learning_rate": 7.569682582967502e-06, "loss": 1.307905673980713, "step": 350 }, { "epoch": 0.8054919908466819, "grad_norm": 0.45646166801452637, "learning_rate": 7.563199627082528e-06, "loss": 1.236507534980774, "step": 352 }, { "epoch": 0.8100686498855835, "grad_norm": 1.7149626016616821, "learning_rate": 7.5566713533115215e-06, "loss": 1.1184428930282593, "step": 354 }, { "epoch": 0.8146453089244852, "grad_norm": 2.212782144546509, "learning_rate": 7.550097855182428e-06, "loss": 0.9626376628875732, "step": 356 }, { "epoch": 0.8192219679633868, "grad_norm": 0.4924994111061096, "learning_rate": 7.543479226871106e-06, "loss": 1.239965796470642, "step": 358 }, { "epoch": 0.8237986270022883, "grad_norm": 0.8329795002937317, "learning_rate": 7.536815563199976e-06, "loss": 1.0326392650604248, "step": 360 }, { "epoch": 0.8283752860411899, "grad_norm": 0.9352472424507141, "learning_rate": 7.530106959636661e-06, "loss": 1.404736876487732, "step": 362 }, { "epoch": 0.8329519450800915, "grad_norm": 0.575380265712738, "learning_rate": 7.523353512292619e-06, "loss": 1.152444839477539, "step": 364 }, { "epoch": 0.8375286041189931, "grad_norm": 0.49745839834213257, "learning_rate": 7.51655531792177e-06, "loss": 1.269639015197754, "step": 366 }, { "epoch": 0.8421052631578947, "grad_norm": 0.8237262964248657, "learning_rate": 7.509712473919102e-06, "loss": 1.484926462173462, "step": 368 }, { "epoch": 0.8466819221967964, "grad_norm": 0.4979184865951538, "learning_rate": 7.502825078319286e-06, "loss": 1.1458359956741333, "step": 370 }, { "epoch": 0.851258581235698, "grad_norm": 0.8595190644264221, "learning_rate": 7.495893229795259e-06, "loss": 1.0554736852645874, "step": 372 }, { "epoch": 0.8558352402745996, "grad_norm": 0.35459527373313904, "learning_rate": 7.488917027656824e-06, "loss": 1.0308196544647217, "step": 374 }, { "epoch": 0.8604118993135011, "grad_norm": 0.5745582580566406, "learning_rate": 7.481896571849214e-06, "loss": 0.8475030660629272, "step": 376 }, { "epoch": 0.8649885583524027, "grad_norm": 0.4970310628414154, "learning_rate": 7.4748319629516725e-06, "loss": 1.235321283340454, "step": 378 }, { "epoch": 0.8695652173913043, "grad_norm": 0.52949059009552, "learning_rate": 7.467723302176002e-06, "loss": 1.2889394760131836, "step": 380 }, { "epoch": 0.8741418764302059, "grad_norm": 0.4958958625793457, "learning_rate": 7.46057069136512e-06, "loss": 1.2626429796218872, "step": 382 }, { "epoch": 0.8787185354691075, "grad_norm": 0.49720507860183716, "learning_rate": 7.453374232991599e-06, "loss": 1.2046830654144287, "step": 384 }, { "epoch": 0.8832951945080092, "grad_norm": 0.36593207716941833, "learning_rate": 7.446134030156197e-06, "loss": 1.0390335321426392, "step": 386 }, { "epoch": 0.8878718535469108, "grad_norm": 0.49038416147232056, "learning_rate": 7.438850186586382e-06, "loss": 1.2193632125854492, "step": 388 }, { "epoch": 0.8924485125858124, "grad_norm": 0.5042470097541809, "learning_rate": 7.431522806634845e-06, "loss": 1.2251640558242798, "step": 390 }, { "epoch": 0.897025171624714, "grad_norm": 0.4519674777984619, "learning_rate": 7.424151995278005e-06, "loss": 0.8979975581169128, "step": 392 }, { "epoch": 0.9016018306636155, "grad_norm": 0.8786867260932922, "learning_rate": 7.416737858114503e-06, "loss": 1.0744414329528809, "step": 394 }, { "epoch": 0.9061784897025171, "grad_norm": 0.5781663656234741, "learning_rate": 7.409280501363697e-06, "loss": 0.9060631990432739, "step": 396 }, { "epoch": 0.9107551487414187, "grad_norm": 0.6782101392745972, "learning_rate": 7.4017800318641296e-06, "loss": 1.0410590171813965, "step": 398 }, { "epoch": 0.9153318077803204, "grad_norm": 0.7507611513137817, "learning_rate": 7.394236557072005e-06, "loss": 0.9724853038787842, "step": 400 }, { "epoch": 0.919908466819222, "grad_norm": 0.8260324001312256, "learning_rate": 7.386650185059644e-06, "loss": 1.1944291591644287, "step": 402 }, { "epoch": 0.9244851258581236, "grad_norm": 0.37671464681625366, "learning_rate": 7.379021024513942e-06, "loss": 1.2070642709732056, "step": 404 }, { "epoch": 0.9290617848970252, "grad_norm": 0.39617058634757996, "learning_rate": 7.371349184734808e-06, "loss": 1.1787455081939697, "step": 406 }, { "epoch": 0.9336384439359268, "grad_norm": 5.384557723999023, "learning_rate": 7.3636347756335965e-06, "loss": 0.9677321910858154, "step": 408 }, { "epoch": 0.9382151029748284, "grad_norm": 0.4886989891529083, "learning_rate": 7.355877907731536e-06, "loss": 0.9373984932899475, "step": 410 }, { "epoch": 0.9427917620137299, "grad_norm": 0.4791018068790436, "learning_rate": 7.34807869215815e-06, "loss": 1.203087329864502, "step": 412 }, { "epoch": 0.9473684210526315, "grad_norm": 0.605128288269043, "learning_rate": 7.340237240649653e-06, "loss": 0.9440658688545227, "step": 414 }, { "epoch": 0.9519450800915332, "grad_norm": 0.6311787962913513, "learning_rate": 7.3323536655473606e-06, "loss": 1.196823000907898, "step": 416 }, { "epoch": 0.9565217391304348, "grad_norm": 1.343970775604248, "learning_rate": 7.324428079796077e-06, "loss": 0.7435826063156128, "step": 418 }, { "epoch": 0.9610983981693364, "grad_norm": 0.4175608158111572, "learning_rate": 7.316460596942473e-06, "loss": 1.2795171737670898, "step": 420 }, { "epoch": 0.965675057208238, "grad_norm": 20.577402114868164, "learning_rate": 7.308451331133465e-06, "loss": 0.8990521430969238, "step": 422 }, { "epoch": 0.9702517162471396, "grad_norm": 0.568890392780304, "learning_rate": 7.3004003971145765e-06, "loss": 1.1218894720077515, "step": 424 }, { "epoch": 0.9748283752860412, "grad_norm": 0.489161878824234, "learning_rate": 7.292307910228291e-06, "loss": 0.8210854530334473, "step": 426 }, { "epoch": 0.9794050343249427, "grad_norm": 0.6115430593490601, "learning_rate": 7.28417398641241e-06, "loss": 0.9877936244010925, "step": 428 }, { "epoch": 0.9839816933638444, "grad_norm": 0.7786864638328552, "learning_rate": 7.275998742198379e-06, "loss": 1.2866984605789185, "step": 430 }, { "epoch": 0.988558352402746, "grad_norm": 1.2606936693191528, "learning_rate": 7.267782294709628e-06, "loss": 1.2489262819290161, "step": 432 }, { "epoch": 0.9931350114416476, "grad_norm": 0.37724900245666504, "learning_rate": 7.259524761659886e-06, "loss": 1.177677869796753, "step": 434 }, { "epoch": 0.9977116704805492, "grad_norm": 0.5038094520568848, "learning_rate": 7.251226261351502e-06, "loss": 0.8965859413146973, "step": 436 }, { "epoch": 1.002288329519451, "grad_norm": 0.5005406141281128, "learning_rate": 7.242886912673746e-06, "loss": 1.2278937101364136, "step": 438 }, { "epoch": 1.0068649885583525, "grad_norm": 0.7988545298576355, "learning_rate": 7.234506835101103e-06, "loss": 0.7848602533340454, "step": 440 }, { "epoch": 1.011441647597254, "grad_norm": 0.7789489030838013, "learning_rate": 7.22608614869157e-06, "loss": 0.7383415699005127, "step": 442 }, { "epoch": 1.0160183066361557, "grad_norm": 0.8829706311225891, "learning_rate": 7.217624974084921e-06, "loss": 1.0122401714324951, "step": 444 }, { "epoch": 1.0205949656750573, "grad_norm": 0.8735617399215698, "learning_rate": 7.209123432501e-06, "loss": 0.7811689376831055, "step": 446 }, { "epoch": 1.0251716247139588, "grad_norm": 2.609708309173584, "learning_rate": 7.2005816457379634e-06, "loss": 0.4612530767917633, "step": 448 }, { "epoch": 1.0297482837528604, "grad_norm": 0.750656008720398, "learning_rate": 7.191999736170548e-06, "loss": 0.7795161008834839, "step": 450 }, { "epoch": 1.034324942791762, "grad_norm": 2.268953800201416, "learning_rate": 7.183377826748313e-06, "loss": 0.902981698513031, "step": 452 }, { "epoch": 1.0389016018306636, "grad_norm": 0.6813761591911316, "learning_rate": 7.174716040993879e-06, "loss": 1.0745810270309448, "step": 454 }, { "epoch": 1.0434782608695652, "grad_norm": 0.7869388461112976, "learning_rate": 7.166014503001159e-06, "loss": 0.9484665393829346, "step": 456 }, { "epoch": 1.0480549199084668, "grad_norm": 1.6086623668670654, "learning_rate": 7.15727333743358e-06, "loss": 1.0312416553497314, "step": 458 }, { "epoch": 1.0526315789473684, "grad_norm": 1.016251564025879, "learning_rate": 7.148492669522301e-06, "loss": 0.8899783492088318, "step": 460 }, { "epoch": 1.05720823798627, "grad_norm": 1.0114802122116089, "learning_rate": 7.139672625064407e-06, "loss": 0.6136757731437683, "step": 462 }, { "epoch": 1.0617848970251715, "grad_norm": 1.2711036205291748, "learning_rate": 7.130813330421122e-06, "loss": 0.650338888168335, "step": 464 }, { "epoch": 1.0663615560640731, "grad_norm": 0.5720494985580444, "learning_rate": 7.12191491251599e-06, "loss": 1.0441478490829468, "step": 466 }, { "epoch": 1.070938215102975, "grad_norm": 1.6055749654769897, "learning_rate": 7.112977498833056e-06, "loss": 0.6316787004470825, "step": 468 }, { "epoch": 1.0755148741418765, "grad_norm": 1.1760274171829224, "learning_rate": 7.104001217415046e-06, "loss": 1.0118086338043213, "step": 470 }, { "epoch": 1.080091533180778, "grad_norm": 1.1817903518676758, "learning_rate": 7.094986196861522e-06, "loss": 0.8072628378868103, "step": 472 }, { "epoch": 1.0846681922196797, "grad_norm": 0.6759648323059082, "learning_rate": 7.085932566327053e-06, "loss": 1.1159520149230957, "step": 474 }, { "epoch": 1.0892448512585813, "grad_norm": 0.4874918758869171, "learning_rate": 7.076840455519351e-06, "loss": 0.8690568804740906, "step": 476 }, { "epoch": 1.0938215102974829, "grad_norm": 0.3136481046676636, "learning_rate": 7.067709994697427e-06, "loss": 0.6553730964660645, "step": 478 }, { "epoch": 1.0983981693363845, "grad_norm": 2.1855392456054688, "learning_rate": 7.058541314669709e-06, "loss": 0.8631330728530884, "step": 480 }, { "epoch": 1.102974828375286, "grad_norm": 0.570698082447052, "learning_rate": 7.049334546792182e-06, "loss": 0.9237059950828552, "step": 482 }, { "epoch": 1.1075514874141876, "grad_norm": 0.5239076614379883, "learning_rate": 7.040089822966498e-06, "loss": 1.1490368843078613, "step": 484 }, { "epoch": 1.1121281464530892, "grad_norm": 0.6927589774131775, "learning_rate": 7.030807275638089e-06, "loss": 0.9833446741104126, "step": 486 }, { "epoch": 1.1167048054919908, "grad_norm": 0.7562556266784668, "learning_rate": 7.0214870377942695e-06, "loss": 0.9206014275550842, "step": 488 }, { "epoch": 1.1212814645308924, "grad_norm": 0.7045158743858337, "learning_rate": 7.012129242962328e-06, "loss": 0.9196767807006836, "step": 490 }, { "epoch": 1.125858123569794, "grad_norm": 0.2471354752779007, "learning_rate": 7.0027340252076204e-06, "loss": 0.8276454210281372, "step": 492 }, { "epoch": 1.1304347826086956, "grad_norm": 1.010713815689087, "learning_rate": 6.9933015191316456e-06, "loss": 0.7970508337020874, "step": 494 }, { "epoch": 1.1350114416475972, "grad_norm": 0.7042145729064941, "learning_rate": 6.983831859870115e-06, "loss": 0.5978801250457764, "step": 496 }, { "epoch": 1.139588100686499, "grad_norm": 1.2138888835906982, "learning_rate": 6.9743251830910195e-06, "loss": 0.8952941298484802, "step": 498 }, { "epoch": 1.1441647597254005, "grad_norm": 6.797738552093506, "learning_rate": 6.964781624992687e-06, "loss": 0.862623393535614, "step": 500 }, { "epoch": 1.1487414187643021, "grad_norm": 0.5526463985443115, "learning_rate": 6.955201322301825e-06, "loss": 0.7486683130264282, "step": 502 }, { "epoch": 1.1533180778032037, "grad_norm": 1.4951832294464111, "learning_rate": 6.9455844122715704e-06, "loss": 1.175217628479004, "step": 504 }, { "epoch": 1.1578947368421053, "grad_norm": 1.2167983055114746, "learning_rate": 6.935931032679517e-06, "loss": 0.7940334677696228, "step": 506 }, { "epoch": 1.162471395881007, "grad_norm": 0.5917116403579712, "learning_rate": 6.926241321825741e-06, "loss": 0.5206097960472107, "step": 508 }, { "epoch": 1.1670480549199085, "grad_norm": 0.9334290027618408, "learning_rate": 6.916515418530827e-06, "loss": 0.7569844126701355, "step": 510 }, { "epoch": 1.17162471395881, "grad_norm": 0.45942240953445435, "learning_rate": 6.906753462133869e-06, "loss": 0.8078737258911133, "step": 512 }, { "epoch": 1.1762013729977117, "grad_norm": 1.3607593774795532, "learning_rate": 6.896955592490482e-06, "loss": 0.9944840669631958, "step": 514 }, { "epoch": 1.1807780320366132, "grad_norm": 0.7573413848876953, "learning_rate": 6.887121949970796e-06, "loss": 0.9973494410514832, "step": 516 }, { "epoch": 1.1853546910755148, "grad_norm": 0.7948639988899231, "learning_rate": 6.8772526754574424e-06, "loss": 0.8695286512374878, "step": 518 }, { "epoch": 1.1899313501144164, "grad_norm": 0.877565324306488, "learning_rate": 6.867347910343539e-06, "loss": 1.0234124660491943, "step": 520 }, { "epoch": 1.194508009153318, "grad_norm": 0.9130783677101135, "learning_rate": 6.857407796530663e-06, "loss": 1.0572234392166138, "step": 522 }, { "epoch": 1.1990846681922196, "grad_norm": 0.5426770448684692, "learning_rate": 6.847432476426821e-06, "loss": 0.7567615509033203, "step": 524 }, { "epoch": 1.2036613272311212, "grad_norm": 3.1823055744171143, "learning_rate": 6.8374220929443994e-06, "loss": 1.0477484464645386, "step": 526 }, { "epoch": 1.208237986270023, "grad_norm": 0.5345934629440308, "learning_rate": 6.82737678949813e-06, "loss": 0.9875526428222656, "step": 528 }, { "epoch": 1.2128146453089246, "grad_norm": 1.0108869075775146, "learning_rate": 6.817296710003026e-06, "loss": 0.9593431353569031, "step": 530 }, { "epoch": 1.2173913043478262, "grad_norm": 0.7925944328308105, "learning_rate": 6.807181998872322e-06, "loss": 1.1987873315811157, "step": 532 }, { "epoch": 1.2219679633867278, "grad_norm": 0.9541200399398804, "learning_rate": 6.797032801015407e-06, "loss": 0.6915098428726196, "step": 534 }, { "epoch": 1.2265446224256293, "grad_norm": 0.5937853455543518, "learning_rate": 6.78684926183575e-06, "loss": 1.0340373516082764, "step": 536 }, { "epoch": 1.231121281464531, "grad_norm": 0.5773180723190308, "learning_rate": 6.776631527228811e-06, "loss": 1.004191279411316, "step": 538 }, { "epoch": 1.2356979405034325, "grad_norm": 0.6959558129310608, "learning_rate": 6.766379743579954e-06, "loss": 1.0310280323028564, "step": 540 }, { "epoch": 1.240274599542334, "grad_norm": 0.5717634558677673, "learning_rate": 6.756094057762353e-06, "loss": 0.8756218552589417, "step": 542 }, { "epoch": 1.2448512585812357, "grad_norm": 0.6597534418106079, "learning_rate": 6.74577461713488e-06, "loss": 0.9393812417984009, "step": 544 }, { "epoch": 1.2494279176201373, "grad_norm": 0.7435896992683411, "learning_rate": 6.735421569540004e-06, "loss": 0.5626208782196045, "step": 546 }, { "epoch": 1.2540045766590389, "grad_norm": 0.7489412426948547, "learning_rate": 6.7250350633016655e-06, "loss": 0.7460745573043823, "step": 548 }, { "epoch": 1.2585812356979404, "grad_norm": 1.607967495918274, "learning_rate": 6.714615247223148e-06, "loss": 0.7691885828971863, "step": 550 }, { "epoch": 1.263157894736842, "grad_norm": 0.45488154888153076, "learning_rate": 6.7041622705849625e-06, "loss": 1.0543756484985352, "step": 552 }, { "epoch": 1.2677345537757438, "grad_norm": 0.4644421935081482, "learning_rate": 6.693676283142687e-06, "loss": 1.0251156091690063, "step": 554 }, { "epoch": 1.2723112128146452, "grad_norm": 0.6865373253822327, "learning_rate": 6.68315743512484e-06, "loss": 0.9979171752929688, "step": 556 }, { "epoch": 1.276887871853547, "grad_norm": 1.0765372514724731, "learning_rate": 6.672605877230714e-06, "loss": 0.42125648260116577, "step": 558 }, { "epoch": 1.2814645308924484, "grad_norm": 0.5296036005020142, "learning_rate": 6.662021760628231e-06, "loss": 0.6392301917076111, "step": 560 }, { "epoch": 1.2860411899313502, "grad_norm": 0.6849478483200073, "learning_rate": 6.651405236951756e-06, "loss": 1.038710355758667, "step": 562 }, { "epoch": 1.2906178489702518, "grad_norm": 1.4734174013137817, "learning_rate": 6.640756458299951e-06, "loss": 0.9006748199462891, "step": 564 }, { "epoch": 1.2951945080091534, "grad_norm": 4.2481369972229, "learning_rate": 6.630075577233572e-06, "loss": 0.9259494543075562, "step": 566 }, { "epoch": 1.299771167048055, "grad_norm": 0.4776517450809479, "learning_rate": 6.619362746773299e-06, "loss": 1.0371928215026855, "step": 568 }, { "epoch": 1.3043478260869565, "grad_norm": 1.0805084705352783, "learning_rate": 6.608618120397533e-06, "loss": 0.5052875280380249, "step": 570 }, { "epoch": 1.3089244851258581, "grad_norm": 1.1686513423919678, "learning_rate": 6.597841852040207e-06, "loss": 0.756683349609375, "step": 572 }, { "epoch": 1.3135011441647597, "grad_norm": 2.118833065032959, "learning_rate": 6.587034096088575e-06, "loss": 0.8216329216957092, "step": 574 }, { "epoch": 1.3180778032036613, "grad_norm": 1.1685985326766968, "learning_rate": 6.576195007380998e-06, "loss": 1.0449680089950562, "step": 576 }, { "epoch": 1.322654462242563, "grad_norm": 1.8055627346038818, "learning_rate": 6.5653247412047324e-06, "loss": 0.5522174835205078, "step": 578 }, { "epoch": 1.3272311212814645, "grad_norm": 0.8082292079925537, "learning_rate": 6.554423453293698e-06, "loss": 0.9655364155769348, "step": 580 }, { "epoch": 1.331807780320366, "grad_norm": 0.6602555513381958, "learning_rate": 6.543491299826255e-06, "loss": 1.1645584106445312, "step": 582 }, { "epoch": 1.3363844393592679, "grad_norm": 0.5118871927261353, "learning_rate": 6.532528437422959e-06, "loss": 1.0177983045578003, "step": 584 }, { "epoch": 1.3409610983981692, "grad_norm": 0.8692869544029236, "learning_rate": 6.521535023144319e-06, "loss": 0.7149632573127747, "step": 586 }, { "epoch": 1.345537757437071, "grad_norm": 0.8488191962242126, "learning_rate": 6.510511214488554e-06, "loss": 1.0412425994873047, "step": 588 }, { "epoch": 1.3501144164759724, "grad_norm": 0.5424400568008423, "learning_rate": 6.499457169389324e-06, "loss": 1.0742121934890747, "step": 590 }, { "epoch": 1.3546910755148742, "grad_norm": 0.7974820733070374, "learning_rate": 6.4883730462134754e-06, "loss": 1.0228968858718872, "step": 592 }, { "epoch": 1.3592677345537758, "grad_norm": 0.5494404435157776, "learning_rate": 6.477259003758778e-06, "loss": 0.8311363458633423, "step": 594 }, { "epoch": 1.3638443935926774, "grad_norm": 1.1344202756881714, "learning_rate": 6.466115201251637e-06, "loss": 0.9677754640579224, "step": 596 }, { "epoch": 1.368421052631579, "grad_norm": 0.9898701310157776, "learning_rate": 6.454941798344816e-06, "loss": 0.8938733339309692, "step": 598 }, { "epoch": 1.3729977116704806, "grad_norm": 0.843387246131897, "learning_rate": 6.443738955115158e-06, "loss": 0.7526968121528625, "step": 600 }, { "epoch": 1.3775743707093822, "grad_norm": 0.6977670788764954, "learning_rate": 6.432506832061283e-06, "loss": 0.9662184119224548, "step": 602 }, { "epoch": 1.3821510297482837, "grad_norm": 0.7610148191452026, "learning_rate": 6.421245590101285e-06, "loss": 1.0838618278503418, "step": 604 }, { "epoch": 1.3867276887871853, "grad_norm": 1.5654877424240112, "learning_rate": 6.409955390570444e-06, "loss": 1.034435510635376, "step": 606 }, { "epoch": 1.391304347826087, "grad_norm": 0.5249960422515869, "learning_rate": 6.398636395218895e-06, "loss": 1.0667188167572021, "step": 608 }, { "epoch": 1.3958810068649885, "grad_norm": 0.8519649505615234, "learning_rate": 6.387288766209325e-06, "loss": 1.0217965841293335, "step": 610 }, { "epoch": 1.40045766590389, "grad_norm": 1.1158952713012695, "learning_rate": 6.375912666114637e-06, "loss": 0.834468424320221, "step": 612 }, { "epoch": 1.4050343249427917, "grad_norm": 4.450192928314209, "learning_rate": 6.364508257915633e-06, "loss": 0.6208648085594177, "step": 614 }, { "epoch": 1.4096109839816933, "grad_norm": 0.5419265031814575, "learning_rate": 6.353075704998674e-06, "loss": 1.0283397436141968, "step": 616 }, { "epoch": 1.414187643020595, "grad_norm": 0.71892249584198, "learning_rate": 6.341615171153334e-06, "loss": 1.074945092201233, "step": 618 }, { "epoch": 1.4187643020594964, "grad_norm": 0.46902936697006226, "learning_rate": 6.330126820570066e-06, "loss": 1.1522700786590576, "step": 620 }, { "epoch": 1.4233409610983982, "grad_norm": 0.6041531562805176, "learning_rate": 6.318610817837834e-06, "loss": 0.9732744097709656, "step": 622 }, { "epoch": 1.4279176201372998, "grad_norm": 1.0350428819656372, "learning_rate": 6.307067327941769e-06, "loss": 0.9294142127037048, "step": 624 }, { "epoch": 1.4324942791762014, "grad_norm": 0.8469648361206055, "learning_rate": 6.2954965162607995e-06, "loss": 0.9963588714599609, "step": 626 }, { "epoch": 1.437070938215103, "grad_norm": 0.471488356590271, "learning_rate": 6.283898548565278e-06, "loss": 1.0501569509506226, "step": 628 }, { "epoch": 1.4416475972540046, "grad_norm": 0.2829485237598419, "learning_rate": 6.272273591014614e-06, "loss": 0.918632447719574, "step": 630 }, { "epoch": 1.4462242562929062, "grad_norm": 0.7685482501983643, "learning_rate": 6.260621810154889e-06, "loss": 1.004805326461792, "step": 632 }, { "epoch": 1.4508009153318078, "grad_norm": 0.5823658108711243, "learning_rate": 6.24894337291647e-06, "loss": 1.0062049627304077, "step": 634 }, { "epoch": 1.4553775743707094, "grad_norm": 0.4799053966999054, "learning_rate": 6.23723844661162e-06, "loss": 0.9444741010665894, "step": 636 }, { "epoch": 1.459954233409611, "grad_norm": 1.4403252601623535, "learning_rate": 6.2255071989321e-06, "loss": 0.3563728332519531, "step": 638 }, { "epoch": 1.4645308924485125, "grad_norm": 0.5606778860092163, "learning_rate": 6.2137497979467664e-06, "loss": 1.084834098815918, "step": 640 }, { "epoch": 1.4691075514874141, "grad_norm": 1.5342822074890137, "learning_rate": 6.201966412099164e-06, "loss": 0.88880455493927, "step": 642 }, { "epoch": 1.4736842105263157, "grad_norm": 0.9060531258583069, "learning_rate": 6.190157210205114e-06, "loss": 0.7019488215446472, "step": 644 }, { "epoch": 1.4782608695652173, "grad_norm": 0.6863506436347961, "learning_rate": 6.17832236145029e-06, "loss": 1.0984705686569214, "step": 646 }, { "epoch": 1.482837528604119, "grad_norm": 0.5668753385543823, "learning_rate": 6.1664620353878e-06, "loss": 0.5110697150230408, "step": 648 }, { "epoch": 1.4874141876430205, "grad_norm": 0.5630995631217957, "learning_rate": 6.154576401935756e-06, "loss": 1.0951957702636719, "step": 650 }, { "epoch": 1.4919908466819223, "grad_norm": 0.49918022751808167, "learning_rate": 6.1426656313748375e-06, "loss": 0.6987397074699402, "step": 652 }, { "epoch": 1.4965675057208239, "grad_norm": 1.3646057844161987, "learning_rate": 6.130729894345851e-06, "loss": 1.1859971284866333, "step": 654 }, { "epoch": 1.5011441647597255, "grad_norm": 1.018013596534729, "learning_rate": 6.118769361847293e-06, "loss": 0.9196734428405762, "step": 656 }, { "epoch": 1.505720823798627, "grad_norm": 0.35407713055610657, "learning_rate": 6.106784205232888e-06, "loss": 1.0442997217178345, "step": 658 }, { "epoch": 1.5102974828375286, "grad_norm": 1.8714250326156616, "learning_rate": 6.094774596209148e-06, "loss": 0.6379448771476746, "step": 660 }, { "epoch": 1.5148741418764302, "grad_norm": 0.4186966121196747, "learning_rate": 6.082740706832897e-06, "loss": 1.0916244983673096, "step": 662 }, { "epoch": 1.5194508009153318, "grad_norm": 1.4749921560287476, "learning_rate": 6.07068270950882e-06, "loss": 1.0407416820526123, "step": 664 }, { "epoch": 1.5240274599542334, "grad_norm": 0.8104400634765625, "learning_rate": 6.0586007769869824e-06, "loss": 0.4534456133842468, "step": 666 }, { "epoch": 1.528604118993135, "grad_norm": 0.7797476053237915, "learning_rate": 6.046495082360364e-06, "loss": 1.105326771736145, "step": 668 }, { "epoch": 1.5331807780320366, "grad_norm": 0.4904558062553406, "learning_rate": 6.034365799062368e-06, "loss": 0.7495214939117432, "step": 670 }, { "epoch": 1.5377574370709381, "grad_norm": 0.4737433195114136, "learning_rate": 6.022213100864351e-06, "loss": 1.113190770149231, "step": 672 }, { "epoch": 1.54233409610984, "grad_norm": 0.6883158087730408, "learning_rate": 6.01003716187312e-06, "loss": 0.7975556254386902, "step": 674 }, { "epoch": 1.5469107551487413, "grad_norm": 7.956705093383789, "learning_rate": 5.9978381565284456e-06, "loss": 0.86496502161026, "step": 676 }, { "epoch": 1.5514874141876431, "grad_norm": 0.6836668848991394, "learning_rate": 5.985616259600559e-06, "loss": 1.0243340730667114, "step": 678 }, { "epoch": 1.5560640732265445, "grad_norm": 1.809205174446106, "learning_rate": 5.973371646187653e-06, "loss": 0.7693554759025574, "step": 680 }, { "epoch": 1.5606407322654463, "grad_norm": 0.5701059699058533, "learning_rate": 5.961104491713367e-06, "loss": 1.0237873792648315, "step": 682 }, { "epoch": 1.5652173913043477, "grad_norm": 0.6132348775863647, "learning_rate": 5.948814971924277e-06, "loss": 1.0140217542648315, "step": 684 }, { "epoch": 1.5697940503432495, "grad_norm": 1.0674152374267578, "learning_rate": 5.936503262887384e-06, "loss": 0.8194214701652527, "step": 686 }, { "epoch": 1.574370709382151, "grad_norm": 0.5556658506393433, "learning_rate": 5.924169540987577e-06, "loss": 1.0535566806793213, "step": 688 }, { "epoch": 1.5789473684210527, "grad_norm": 0.9293789267539978, "learning_rate": 5.911813982925118e-06, "loss": 0.8254351615905762, "step": 690 }, { "epoch": 1.5835240274599542, "grad_norm": 0.554201602935791, "learning_rate": 5.8994367657131095e-06, "loss": 0.769871711730957, "step": 692 }, { "epoch": 1.5881006864988558, "grad_norm": 0.642005205154419, "learning_rate": 5.887038066674952e-06, "loss": 1.2380577325820923, "step": 694 }, { "epoch": 1.5926773455377574, "grad_norm": 1.3365410566329956, "learning_rate": 5.874618063441807e-06, "loss": 0.9225341081619263, "step": 696 }, { "epoch": 1.597254004576659, "grad_norm": 0.7327346205711365, "learning_rate": 5.862176933950059e-06, "loss": 0.8071609139442444, "step": 698 }, { "epoch": 1.6018306636155606, "grad_norm": 0.5678279399871826, "learning_rate": 5.849714856438752e-06, "loss": 1.0254077911376953, "step": 700 }, { "epoch": 1.6064073226544622, "grad_norm": 0.5600216388702393, "learning_rate": 5.837232009447051e-06, "loss": 0.8201320171356201, "step": 702 }, { "epoch": 1.610983981693364, "grad_norm": 1.1897834539413452, "learning_rate": 5.824728571811667e-06, "loss": 0.7250789999961853, "step": 704 }, { "epoch": 1.6155606407322654, "grad_norm": 1.4982539415359497, "learning_rate": 5.812204722664317e-06, "loss": 0.5860614776611328, "step": 706 }, { "epoch": 1.6201372997711672, "grad_norm": 0.4147469103336334, "learning_rate": 5.799660641429135e-06, "loss": 1.0965193510055542, "step": 708 }, { "epoch": 1.6247139588100685, "grad_norm": 0.8616934418678284, "learning_rate": 5.787096507820122e-06, "loss": 0.6443649530410767, "step": 710 }, { "epoch": 1.6292906178489703, "grad_norm": 0.39017513394355774, "learning_rate": 5.774512501838552e-06, "loss": 1.081406593322754, "step": 712 }, { "epoch": 1.6338672768878717, "grad_norm": 0.526287317276001, "learning_rate": 5.761908803770406e-06, "loss": 0.8772663474082947, "step": 714 }, { "epoch": 1.6384439359267735, "grad_norm": 8.515963554382324, "learning_rate": 5.7492855941837886e-06, "loss": 0.7267716526985168, "step": 716 }, { "epoch": 1.643020594965675, "grad_norm": 1.0429171323776245, "learning_rate": 5.7366430539263335e-06, "loss": 0.962812066078186, "step": 718 }, { "epoch": 1.6475972540045767, "grad_norm": 0.571498453617096, "learning_rate": 5.7239813641226185e-06, "loss": 0.7786587476730347, "step": 720 }, { "epoch": 1.6521739130434783, "grad_norm": 0.4392048418521881, "learning_rate": 5.711300706171571e-06, "loss": 0.6211113333702087, "step": 722 }, { "epoch": 1.6567505720823799, "grad_norm": 61.38982391357422, "learning_rate": 5.698601261743866e-06, "loss": 0.8990558385848999, "step": 724 }, { "epoch": 1.6613272311212814, "grad_norm": 3.1536953449249268, "learning_rate": 5.685883212779324e-06, "loss": 0.7748126983642578, "step": 726 }, { "epoch": 1.665903890160183, "grad_norm": 1.6510205268859863, "learning_rate": 5.673146741484308e-06, "loss": 0.5790435671806335, "step": 728 }, { "epoch": 1.6704805491990846, "grad_norm": 0.7334662675857544, "learning_rate": 5.660392030329107e-06, "loss": 0.8529476523399353, "step": 730 }, { "epoch": 1.6750572082379862, "grad_norm": 0.5321954488754272, "learning_rate": 5.647619262045326e-06, "loss": 1.0975439548492432, "step": 732 }, { "epoch": 1.679633867276888, "grad_norm": 0.49299484491348267, "learning_rate": 5.634828619623269e-06, "loss": 1.08144211769104, "step": 734 }, { "epoch": 1.6842105263157894, "grad_norm": 0.47397059202194214, "learning_rate": 5.622020286309315e-06, "loss": 0.7880240678787231, "step": 736 }, { "epoch": 1.6887871853546912, "grad_norm": 0.4795650839805603, "learning_rate": 5.6091944456032896e-06, "loss": 1.0663710832595825, "step": 738 }, { "epoch": 1.6933638443935926, "grad_norm": 1.204559087753296, "learning_rate": 5.5963512812558456e-06, "loss": 0.6782804727554321, "step": 740 }, { "epoch": 1.6979405034324944, "grad_norm": 0.5787196159362793, "learning_rate": 5.583490977265819e-06, "loss": 0.5317763686180115, "step": 742 }, { "epoch": 1.7025171624713957, "grad_norm": 0.7164469361305237, "learning_rate": 5.570613717877605e-06, "loss": 1.0468082427978516, "step": 744 }, { "epoch": 1.7070938215102975, "grad_norm": 0.5695748329162598, "learning_rate": 5.557719687578507e-06, "loss": 1.039925217628479, "step": 746 }, { "epoch": 1.7116704805491991, "grad_norm": 0.6732376217842102, "learning_rate": 5.544809071096098e-06, "loss": 0.7266217470169067, "step": 748 }, { "epoch": 1.7162471395881007, "grad_norm": 0.5064290761947632, "learning_rate": 5.531882053395577e-06, "loss": 0.7927247285842896, "step": 750 }, { "epoch": 1.7208237986270023, "grad_norm": 0.4806613326072693, "learning_rate": 5.5189388196771166e-06, "loss": 1.0538541078567505, "step": 752 }, { "epoch": 1.7254004576659039, "grad_norm": 0.46450909972190857, "learning_rate": 5.5059795553732094e-06, "loss": 0.7507013082504272, "step": 754 }, { "epoch": 1.7299771167048055, "grad_norm": 0.627607524394989, "learning_rate": 5.49300444614601e-06, "loss": 1.0437747240066528, "step": 756 }, { "epoch": 1.734553775743707, "grad_norm": 0.679521381855011, "learning_rate": 5.4800136778846814e-06, "loss": 1.0802175998687744, "step": 758 }, { "epoch": 1.7391304347826086, "grad_norm": 0.5956997275352478, "learning_rate": 5.467007436702721e-06, "loss": 0.8692449331283569, "step": 760 }, { "epoch": 1.7437070938215102, "grad_norm": 1.8609957695007324, "learning_rate": 5.453985908935304e-06, "loss": 0.7606918215751648, "step": 762 }, { "epoch": 1.748283752860412, "grad_norm": 0.900421679019928, "learning_rate": 5.440949281136612e-06, "loss": 1.09746515750885, "step": 764 }, { "epoch": 1.7528604118993134, "grad_norm": 0.48335880041122437, "learning_rate": 5.4278977400771545e-06, "loss": 0.8808884024620056, "step": 766 }, { "epoch": 1.7574370709382152, "grad_norm": 0.8775651454925537, "learning_rate": 5.4148314727411e-06, "loss": 1.0945841073989868, "step": 768 }, { "epoch": 1.7620137299771166, "grad_norm": 0.4176448881626129, "learning_rate": 5.401750666323595e-06, "loss": 0.6265594959259033, "step": 770 }, { "epoch": 1.7665903890160184, "grad_norm": 1.7150315046310425, "learning_rate": 5.3886555082280794e-06, "loss": 0.5311535000801086, "step": 772 }, { "epoch": 1.7711670480549198, "grad_norm": 0.482889860868454, "learning_rate": 5.375546186063606e-06, "loss": 0.9832878112792969, "step": 774 }, { "epoch": 1.7757437070938216, "grad_norm": 0.6067187786102295, "learning_rate": 5.362422887642148e-06, "loss": 0.5880881547927856, "step": 776 }, { "epoch": 1.7803203661327232, "grad_norm": 0.8287796974182129, "learning_rate": 5.3492858009759115e-06, "loss": 0.987903356552124, "step": 778 }, { "epoch": 1.7848970251716247, "grad_norm": 3.733571767807007, "learning_rate": 5.3361351142746425e-06, "loss": 0.8192511796951294, "step": 780 }, { "epoch": 1.7894736842105263, "grad_norm": 0.6592600345611572, "learning_rate": 5.32297101594293e-06, "loss": 1.0811833143234253, "step": 782 }, { "epoch": 1.794050343249428, "grad_norm": 1.0604369640350342, "learning_rate": 5.3097936945775034e-06, "loss": 0.7256264686584473, "step": 784 }, { "epoch": 1.7986270022883295, "grad_norm": 4.27371883392334, "learning_rate": 5.2966033389645345e-06, "loss": 0.6703461408615112, "step": 786 }, { "epoch": 1.803203661327231, "grad_norm": 0.9975327849388123, "learning_rate": 5.283400138076932e-06, "loss": 0.6835663914680481, "step": 788 }, { "epoch": 1.8077803203661327, "grad_norm": 0.7741729617118835, "learning_rate": 5.270184281071633e-06, "loss": 0.5445237159729004, "step": 790 }, { "epoch": 1.8123569794050343, "grad_norm": 0.5315194129943848, "learning_rate": 5.256955957286892e-06, "loss": 0.9225422143936157, "step": 792 }, { "epoch": 1.816933638443936, "grad_norm": 0.5030812621116638, "learning_rate": 5.243715356239573e-06, "loss": 1.0199389457702637, "step": 794 }, { "epoch": 1.8215102974828374, "grad_norm": 0.5460532903671265, "learning_rate": 5.230462667622431e-06, "loss": 0.9883009195327759, "step": 796 }, { "epoch": 1.8260869565217392, "grad_norm": 0.6946282386779785, "learning_rate": 5.217198081301393e-06, "loss": 1.1155096292495728, "step": 798 }, { "epoch": 1.8306636155606406, "grad_norm": 0.5267988443374634, "learning_rate": 5.20392178731284e-06, "loss": 0.8357391357421875, "step": 800 }, { "epoch": 1.8352402745995424, "grad_norm": 0.9400213956832886, "learning_rate": 5.190633975860886e-06, "loss": 1.2634918689727783, "step": 802 }, { "epoch": 1.8398169336384438, "grad_norm": 0.4523729085922241, "learning_rate": 5.1773348373146495e-06, "loss": 1.0519038438796997, "step": 804 }, { "epoch": 1.8443935926773456, "grad_norm": 0.5702998042106628, "learning_rate": 5.164024562205527e-06, "loss": 1.0271356105804443, "step": 806 }, { "epoch": 1.8489702517162472, "grad_norm": 0.3808506429195404, "learning_rate": 5.150703341224464e-06, "loss": 1.0296131372451782, "step": 808 }, { "epoch": 1.8535469107551488, "grad_norm": 0.49956274032592773, "learning_rate": 5.137371365219225e-06, "loss": 0.9183504581451416, "step": 810 }, { "epoch": 1.8581235697940504, "grad_norm": 0.42203259468078613, "learning_rate": 5.1240288251916576e-06, "loss": 1.1768728494644165, "step": 812 }, { "epoch": 1.862700228832952, "grad_norm": 0.39127910137176514, "learning_rate": 5.110675912294954e-06, "loss": 0.7790493965148926, "step": 814 }, { "epoch": 1.8672768878718535, "grad_norm": 0.422817587852478, "learning_rate": 5.097312817830913e-06, "loss": 1.0952423810958862, "step": 816 }, { "epoch": 1.8718535469107551, "grad_norm": 0.6157692670822144, "learning_rate": 5.083939733247205e-06, "loss": 0.9987185001373291, "step": 818 }, { "epoch": 1.8764302059496567, "grad_norm": 0.43606990575790405, "learning_rate": 5.07055685013462e-06, "loss": 0.6182645559310913, "step": 820 }, { "epoch": 1.8810068649885583, "grad_norm": 1.6980317831039429, "learning_rate": 5.057164360224333e-06, "loss": 0.8162880539894104, "step": 822 }, { "epoch": 1.88558352402746, "grad_norm": 1.3754355907440186, "learning_rate": 5.0437624553851465e-06, "loss": 0.9700958728790283, "step": 824 }, { "epoch": 1.8901601830663615, "grad_norm": 0.695946991443634, "learning_rate": 5.03035132762075e-06, "loss": 0.7681318521499634, "step": 826 }, { "epoch": 1.8947368421052633, "grad_norm": 1.2942097187042236, "learning_rate": 5.016931169066964e-06, "loss": 0.7290353178977966, "step": 828 }, { "epoch": 1.8993135011441646, "grad_norm": 0.5696609616279602, "learning_rate": 5.003502171988991e-06, "loss": 1.1246237754821777, "step": 830 }, { "epoch": 1.9038901601830664, "grad_norm": 0.6707322597503662, "learning_rate": 4.990064528778662e-06, "loss": 1.0869580507278442, "step": 832 }, { "epoch": 1.9084668192219678, "grad_norm": 1.6657345294952393, "learning_rate": 4.976618431951673e-06, "loss": 1.004643201828003, "step": 834 }, { "epoch": 1.9130434782608696, "grad_norm": 0.7031993269920349, "learning_rate": 4.963164074144831e-06, "loss": 0.9117480516433716, "step": 836 }, { "epoch": 1.9176201372997712, "grad_norm": 0.6919447779655457, "learning_rate": 4.949701648113299e-06, "loss": 1.0158711671829224, "step": 838 }, { "epoch": 1.9221967963386728, "grad_norm": 1.0875664949417114, "learning_rate": 4.93623134672783e-06, "loss": 0.877010703086853, "step": 840 }, { "epoch": 1.9267734553775744, "grad_norm": 0.5526543259620667, "learning_rate": 4.922753362972e-06, "loss": 0.7300304770469666, "step": 842 }, { "epoch": 1.931350114416476, "grad_norm": 0.9483214020729065, "learning_rate": 4.90926788993945e-06, "loss": 0.9140143394470215, "step": 844 }, { "epoch": 1.9359267734553776, "grad_norm": 0.37856608629226685, "learning_rate": 4.895775120831117e-06, "loss": 0.9656968116760254, "step": 846 }, { "epoch": 1.9405034324942791, "grad_norm": 0.519496738910675, "learning_rate": 4.8822752489524655e-06, "loss": 0.7197529673576355, "step": 848 }, { "epoch": 1.9450800915331807, "grad_norm": 1.0532692670822144, "learning_rate": 4.868768467710718e-06, "loss": 0.6470821499824524, "step": 850 }, { "epoch": 1.9496567505720823, "grad_norm": 0.7968090176582336, "learning_rate": 4.855254970612085e-06, "loss": 0.8287680149078369, "step": 852 }, { "epoch": 1.9542334096109841, "grad_norm": 0.3205246925354004, "learning_rate": 4.841734951258991e-06, "loss": 1.038404941558838, "step": 854 }, { "epoch": 1.9588100686498855, "grad_norm": 0.5507828593254089, "learning_rate": 4.828208603347306e-06, "loss": 1.0363742113113403, "step": 856 }, { "epoch": 1.9633867276887873, "grad_norm": 1.1318949460983276, "learning_rate": 4.8146761206635635e-06, "loss": 0.49166426062583923, "step": 858 }, { "epoch": 1.9679633867276887, "grad_norm": 0.5373101234436035, "learning_rate": 4.801137697082188e-06, "loss": 0.9425434470176697, "step": 860 }, { "epoch": 1.9725400457665905, "grad_norm": 0.41498398780822754, "learning_rate": 4.787593526562718e-06, "loss": 1.0520573854446411, "step": 862 }, { "epoch": 1.9771167048054918, "grad_norm": 2.031755208969116, "learning_rate": 4.774043803147023e-06, "loss": 0.8243655562400818, "step": 864 }, { "epoch": 1.9816933638443937, "grad_norm": 0.6944281458854675, "learning_rate": 4.76048872095653e-06, "loss": 1.229433536529541, "step": 866 }, { "epoch": 1.9862700228832952, "grad_norm": 1.356571078300476, "learning_rate": 4.746928474189438e-06, "loss": 0.4065757691860199, "step": 868 }, { "epoch": 1.9908466819221968, "grad_norm": 0.8086536526679993, "learning_rate": 4.733363257117937e-06, "loss": 1.044547200202942, "step": 870 }, { "epoch": 1.9954233409610984, "grad_norm": 1.1679444313049316, "learning_rate": 4.719793264085423e-06, "loss": 0.7908722758293152, "step": 872 }, { "epoch": 2.0, "grad_norm": 0.5113776922225952, "learning_rate": 4.7062186895037155e-06, "loss": 1.0065279006958008, "step": 874 }, { "epoch": 2.004576659038902, "grad_norm": 1.2484664916992188, "learning_rate": 4.692639727850277e-06, "loss": 0.8650211095809937, "step": 876 }, { "epoch": 2.009153318077803, "grad_norm": 0.19410383701324463, "learning_rate": 4.679056573665413e-06, "loss": 0.5260782837867737, "step": 878 }, { "epoch": 2.013729977116705, "grad_norm": 0.6674540638923645, "learning_rate": 4.6654694215495e-06, "loss": 0.8760992884635925, "step": 880 }, { "epoch": 2.0183066361556063, "grad_norm": 0.5311161875724792, "learning_rate": 4.651878466160191e-06, "loss": 0.5279438495635986, "step": 882 }, { "epoch": 2.022883295194508, "grad_norm": 0.8211866617202759, "learning_rate": 4.638283902209623e-06, "loss": 0.7287083864212036, "step": 884 }, { "epoch": 2.0274599542334095, "grad_norm": 1.3398568630218506, "learning_rate": 4.624685924461638e-06, "loss": 0.7964086532592773, "step": 886 }, { "epoch": 2.0320366132723113, "grad_norm": 1.409980297088623, "learning_rate": 4.611084727728979e-06, "loss": 0.7654436230659485, "step": 888 }, { "epoch": 2.0366132723112127, "grad_norm": 1.149587631225586, "learning_rate": 4.59748050687051e-06, "loss": 0.7754305601119995, "step": 890 }, { "epoch": 2.0411899313501145, "grad_norm": 0.6738428473472595, "learning_rate": 4.583873456788419e-06, "loss": 0.6593428254127502, "step": 892 }, { "epoch": 2.045766590389016, "grad_norm": 0.582455039024353, "learning_rate": 4.570263772425429e-06, "loss": 0.5679339170455933, "step": 894 }, { "epoch": 2.0503432494279177, "grad_norm": 1.0730149745941162, "learning_rate": 4.556651648762e-06, "loss": 0.47743597626686096, "step": 896 }, { "epoch": 2.054919908466819, "grad_norm": 0.6254774928092957, "learning_rate": 4.543037280813544e-06, "loss": 0.7976337671279907, "step": 898 }, { "epoch": 2.059496567505721, "grad_norm": 0.6231757998466492, "learning_rate": 4.52942086362762e-06, "loss": 0.9103917479515076, "step": 900 }, { "epoch": 2.064073226544622, "grad_norm": 0.7223190665245056, "learning_rate": 4.515802592281151e-06, "loss": 0.6968510746955872, "step": 902 }, { "epoch": 2.068649885583524, "grad_norm": 1.1463390588760376, "learning_rate": 4.50218266187762e-06, "loss": 0.5448936820030212, "step": 904 }, { "epoch": 2.073226544622426, "grad_norm": 1.5348836183547974, "learning_rate": 4.4885612675442795e-06, "loss": 0.1512419879436493, "step": 906 }, { "epoch": 2.077803203661327, "grad_norm": 1.0921180248260498, "learning_rate": 4.474938604429356e-06, "loss": 0.3723929226398468, "step": 908 }, { "epoch": 2.082379862700229, "grad_norm": 2.096736431121826, "learning_rate": 4.4613148676992534e-06, "loss": 0.5486660599708557, "step": 910 }, { "epoch": 2.0869565217391304, "grad_norm": 0.9730957746505737, "learning_rate": 4.447690252535757e-06, "loss": 0.48607346415519714, "step": 912 }, { "epoch": 2.091533180778032, "grad_norm": 0.9440922737121582, "learning_rate": 4.434064954133233e-06, "loss": 0.8132773041725159, "step": 914 }, { "epoch": 2.0961098398169336, "grad_norm": 0.5760765671730042, "learning_rate": 4.4204391676958456e-06, "loss": 0.8012776374816895, "step": 916 }, { "epoch": 2.1006864988558354, "grad_norm": 1.3130919933319092, "learning_rate": 4.406813088434739e-06, "loss": 0.5709822177886963, "step": 918 }, { "epoch": 2.1052631578947367, "grad_norm": 0.4790267050266266, "learning_rate": 4.393186911565262e-06, "loss": 0.793640673160553, "step": 920 }, { "epoch": 2.1098398169336385, "grad_norm": 1.7611640691757202, "learning_rate": 4.379560832304155e-06, "loss": 0.8374230861663818, "step": 922 }, { "epoch": 2.11441647597254, "grad_norm": 0.5854694247245789, "learning_rate": 4.365935045866765e-06, "loss": 0.7813926935195923, "step": 924 }, { "epoch": 2.1189931350114417, "grad_norm": 0.768767237663269, "learning_rate": 4.352309747464244e-06, "loss": 0.547953188419342, "step": 926 }, { "epoch": 2.123569794050343, "grad_norm": 0.6132713556289673, "learning_rate": 4.338685132300746e-06, "loss": 0.805243968963623, "step": 928 }, { "epoch": 2.128146453089245, "grad_norm": 0.4178018569946289, "learning_rate": 4.325061395570644e-06, "loss": 0.44079485535621643, "step": 930 }, { "epoch": 2.1327231121281462, "grad_norm": 1.141923189163208, "learning_rate": 4.311438732455722e-06, "loss": 0.5015432238578796, "step": 932 }, { "epoch": 2.137299771167048, "grad_norm": 0.764822781085968, "learning_rate": 4.297817338122382e-06, "loss": 0.4405544102191925, "step": 934 }, { "epoch": 2.14187643020595, "grad_norm": 0.8727527260780334, "learning_rate": 4.28419740771885e-06, "loss": 0.7797104120254517, "step": 936 }, { "epoch": 2.1464530892448512, "grad_norm": 0.47269201278686523, "learning_rate": 4.27057913637238e-06, "loss": 0.14955393970012665, "step": 938 }, { "epoch": 2.151029748283753, "grad_norm": 0.574876606464386, "learning_rate": 4.2569627191864566e-06, "loss": 0.6506639719009399, "step": 940 }, { "epoch": 2.1556064073226544, "grad_norm": 0.43111807107925415, "learning_rate": 4.243348351238e-06, "loss": 0.8360904455184937, "step": 942 }, { "epoch": 2.160183066361556, "grad_norm": 0.6124604940414429, "learning_rate": 4.229736227574573e-06, "loss": 0.4769759178161621, "step": 944 }, { "epoch": 2.1647597254004576, "grad_norm": 0.5514540672302246, "learning_rate": 4.216126543211582e-06, "loss": 0.3582886755466461, "step": 946 }, { "epoch": 2.1693363844393594, "grad_norm": 0.6413321495056152, "learning_rate": 4.2025194931294905e-06, "loss": 0.6755134463310242, "step": 948 }, { "epoch": 2.1739130434782608, "grad_norm": 1.2605396509170532, "learning_rate": 4.188915272271021e-06, "loss": 0.26411038637161255, "step": 950 }, { "epoch": 2.1784897025171626, "grad_norm": 2.322275161743164, "learning_rate": 4.175314075538362e-06, "loss": 0.5067375302314758, "step": 952 }, { "epoch": 2.183066361556064, "grad_norm": 0.8927626609802246, "learning_rate": 4.1617160977903755e-06, "loss": 0.763658344745636, "step": 954 }, { "epoch": 2.1876430205949657, "grad_norm": 0.5768288969993591, "learning_rate": 4.148121533839809e-06, "loss": 0.5286126732826233, "step": 956 }, { "epoch": 2.192219679633867, "grad_norm": 0.7886779308319092, "learning_rate": 4.134530578450499e-06, "loss": 0.5433465838432312, "step": 958 }, { "epoch": 2.196796338672769, "grad_norm": 0.5103306770324707, "learning_rate": 4.120943426334587e-06, "loss": 0.781455397605896, "step": 960 }, { "epoch": 2.2013729977116703, "grad_norm": 1.2081961631774902, "learning_rate": 4.107360272149724e-06, "loss": 0.44611290097236633, "step": 962 }, { "epoch": 2.205949656750572, "grad_norm": 0.5938241481781006, "learning_rate": 4.093781310496284e-06, "loss": 0.5686028599739075, "step": 964 }, { "epoch": 2.2105263157894735, "grad_norm": 1.259899377822876, "learning_rate": 4.080206735914578e-06, "loss": 0.5111551880836487, "step": 966 }, { "epoch": 2.2151029748283753, "grad_norm": 0.6723149418830872, "learning_rate": 4.066636742882064e-06, "loss": 0.7907751798629761, "step": 968 }, { "epoch": 2.219679633867277, "grad_norm": 0.7715116739273071, "learning_rate": 4.053071525810562e-06, "loss": 0.8536049127578735, "step": 970 }, { "epoch": 2.2242562929061784, "grad_norm": 0.6745052933692932, "learning_rate": 4.039511279043469e-06, "loss": 0.49250656366348267, "step": 972 }, { "epoch": 2.2288329519450802, "grad_norm": 5.653090476989746, "learning_rate": 4.025956196852978e-06, "loss": 0.9080660939216614, "step": 974 }, { "epoch": 2.2334096109839816, "grad_norm": 0.39930716156959534, "learning_rate": 4.0124064734372824e-06, "loss": 0.5726509094238281, "step": 976 }, { "epoch": 2.2379862700228834, "grad_norm": 0.7205491662025452, "learning_rate": 3.998862302917812e-06, "loss": 0.5259721279144287, "step": 978 }, { "epoch": 2.242562929061785, "grad_norm": 0.5892427563667297, "learning_rate": 3.985323879336437e-06, "loss": 0.70969158411026, "step": 980 }, { "epoch": 2.2471395881006866, "grad_norm": 0.8280232548713684, "learning_rate": 3.9717913966526935e-06, "loss": 0.7658937573432922, "step": 982 }, { "epoch": 2.251716247139588, "grad_norm": 0.46589598059654236, "learning_rate": 3.958265048741008e-06, "loss": 0.6634811758995056, "step": 984 }, { "epoch": 2.2562929061784898, "grad_norm": 1.1013447046279907, "learning_rate": 3.944745029387916e-06, "loss": 0.6400603652000427, "step": 986 }, { "epoch": 2.260869565217391, "grad_norm": 0.5064674019813538, "learning_rate": 3.931231532289282e-06, "loss": 0.7838316559791565, "step": 988 }, { "epoch": 2.265446224256293, "grad_norm": 0.6013701558113098, "learning_rate": 3.917724751047534e-06, "loss": 0.5404883623123169, "step": 990 }, { "epoch": 2.2700228832951943, "grad_norm": 1.0321763753890991, "learning_rate": 3.904224879168882e-06, "loss": 0.6090070605278015, "step": 992 }, { "epoch": 2.274599542334096, "grad_norm": 0.4366181492805481, "learning_rate": 3.89073211006055e-06, "loss": 0.8297733068466187, "step": 994 }, { "epoch": 2.279176201372998, "grad_norm": 0.6068903207778931, "learning_rate": 3.877246637027999e-06, "loss": 0.816207766532898, "step": 996 }, { "epoch": 2.2837528604118993, "grad_norm": 0.9020872712135315, "learning_rate": 3.863768653272171e-06, "loss": 0.851694643497467, "step": 998 }, { "epoch": 2.288329519450801, "grad_norm": 0.699506938457489, "learning_rate": 3.850298351886699e-06, "loss": 0.5200238823890686, "step": 1000 }, { "epoch": 2.2929061784897025, "grad_norm": 1.1063220500946045, "learning_rate": 3.836835925855168e-06, "loss": 0.46105656027793884, "step": 1002 }, { "epoch": 2.2974828375286043, "grad_norm": 0.7333200573921204, "learning_rate": 3.823381568048329e-06, "loss": 0.4054844081401825, "step": 1004 }, { "epoch": 2.3020594965675056, "grad_norm": 1.0534933805465698, "learning_rate": 3.8099354712213375e-06, "loss": 0.786138653755188, "step": 1006 }, { "epoch": 2.3066361556064074, "grad_norm": 0.7661805152893066, "learning_rate": 3.7964978280110078e-06, "loss": 0.5901062488555908, "step": 1008 }, { "epoch": 2.311212814645309, "grad_norm": 0.5444358587265015, "learning_rate": 3.783068830933037e-06, "loss": 0.3989161550998688, "step": 1010 }, { "epoch": 2.3157894736842106, "grad_norm": 0.8000374436378479, "learning_rate": 3.7696486723792508e-06, "loss": 0.46749627590179443, "step": 1012 }, { "epoch": 2.320366132723112, "grad_norm": 0.7630489468574524, "learning_rate": 3.756237544614853e-06, "loss": 0.5220370292663574, "step": 1014 }, { "epoch": 2.324942791762014, "grad_norm": 0.7297222018241882, "learning_rate": 3.7428356397756672e-06, "loss": 0.9120384454727173, "step": 1016 }, { "epoch": 2.329519450800915, "grad_norm": 1.5217046737670898, "learning_rate": 3.7294431498653792e-06, "loss": 0.28965604305267334, "step": 1018 }, { "epoch": 2.334096109839817, "grad_norm": 1.606251835823059, "learning_rate": 3.7160602667527954e-06, "loss": 0.46628880500793457, "step": 1020 }, { "epoch": 2.3386727688787188, "grad_norm": 0.49629804491996765, "learning_rate": 3.7026871821690877e-06, "loss": 0.8182316422462463, "step": 1022 }, { "epoch": 2.34324942791762, "grad_norm": 0.2531331479549408, "learning_rate": 3.6893240877050467e-06, "loss": 0.7435541152954102, "step": 1024 }, { "epoch": 2.3478260869565215, "grad_norm": 0.48934581875801086, "learning_rate": 3.6759711748083416e-06, "loss": 0.8625474572181702, "step": 1026 }, { "epoch": 2.3524027459954233, "grad_norm": 1.5544626712799072, "learning_rate": 3.6626286347807753e-06, "loss": 0.5781938433647156, "step": 1028 }, { "epoch": 2.356979405034325, "grad_norm": 0.654556930065155, "learning_rate": 3.6492966587755356e-06, "loss": 0.6204836368560791, "step": 1030 }, { "epoch": 2.3615560640732265, "grad_norm": 1.397822618484497, "learning_rate": 3.6359754377944726e-06, "loss": 0.4885711967945099, "step": 1032 }, { "epoch": 2.3661327231121283, "grad_norm": 1.5980985164642334, "learning_rate": 3.622665162685351e-06, "loss": 0.6035375595092773, "step": 1034 }, { "epoch": 2.3707093821510297, "grad_norm": 0.5939676761627197, "learning_rate": 3.6093660241391134e-06, "loss": 0.6923315525054932, "step": 1036 }, { "epoch": 2.3752860411899315, "grad_norm": 2.990952730178833, "learning_rate": 3.5960782126871588e-06, "loss": 0.3957272171974182, "step": 1038 }, { "epoch": 2.379862700228833, "grad_norm": 1.2808301448822021, "learning_rate": 3.5828019186986076e-06, "loss": 0.35686612129211426, "step": 1040 }, { "epoch": 2.3844393592677346, "grad_norm": 0.537975013256073, "learning_rate": 3.5695373323775694e-06, "loss": 0.7028881311416626, "step": 1042 }, { "epoch": 2.389016018306636, "grad_norm": 0.5620653033256531, "learning_rate": 3.556284643760426e-06, "loss": 0.7551029920578003, "step": 1044 }, { "epoch": 2.393592677345538, "grad_norm": 0.9883461594581604, "learning_rate": 3.5430440427131087e-06, "loss": 0.4496672749519348, "step": 1046 }, { "epoch": 2.398169336384439, "grad_norm": 0.68025803565979, "learning_rate": 3.5298157189283673e-06, "loss": 0.7668349742889404, "step": 1048 }, { "epoch": 2.402745995423341, "grad_norm": 0.7667739987373352, "learning_rate": 3.5165998619230683e-06, "loss": 0.7979832291603088, "step": 1050 }, { "epoch": 2.4073226544622424, "grad_norm": 0.6154050827026367, "learning_rate": 3.5033966610354655e-06, "loss": 0.7762300968170166, "step": 1052 }, { "epoch": 2.411899313501144, "grad_norm": 1.798595666885376, "learning_rate": 3.4902063054224966e-06, "loss": 0.46241626143455505, "step": 1054 }, { "epoch": 2.416475972540046, "grad_norm": 0.9119539260864258, "learning_rate": 3.4770289840570693e-06, "loss": 0.18137064576148987, "step": 1056 }, { "epoch": 2.4210526315789473, "grad_norm": 0.567173957824707, "learning_rate": 3.463864885725358e-06, "loss": 0.5903480648994446, "step": 1058 }, { "epoch": 2.425629290617849, "grad_norm": 0.6367594599723816, "learning_rate": 3.450714199024089e-06, "loss": 0.4741784930229187, "step": 1060 }, { "epoch": 2.4302059496567505, "grad_norm": 0.7184843420982361, "learning_rate": 3.4375771123578527e-06, "loss": 0.8506764769554138, "step": 1062 }, { "epoch": 2.4347826086956523, "grad_norm": 0.7258248925209045, "learning_rate": 3.424453813936394e-06, "loss": 0.6282183527946472, "step": 1064 }, { "epoch": 2.4393592677345537, "grad_norm": 0.9743756651878357, "learning_rate": 3.4113444917719206e-06, "loss": 0.5036060810089111, "step": 1066 }, { "epoch": 2.4439359267734555, "grad_norm": 1.7455062866210938, "learning_rate": 3.3982493336764046e-06, "loss": 0.7877315282821655, "step": 1068 }, { "epoch": 2.448512585812357, "grad_norm": 0.5519979000091553, "learning_rate": 3.3851685272588995e-06, "loss": 0.7620252966880798, "step": 1070 }, { "epoch": 2.4530892448512587, "grad_norm": 1.4960353374481201, "learning_rate": 3.3721022599228455e-06, "loss": 0.6300139427185059, "step": 1072 }, { "epoch": 2.45766590389016, "grad_norm": 0.8801581859588623, "learning_rate": 3.359050718863388e-06, "loss": 0.7080258131027222, "step": 1074 }, { "epoch": 2.462242562929062, "grad_norm": 1.3520958423614502, "learning_rate": 3.3460140910646953e-06, "loss": 0.5704418420791626, "step": 1076 }, { "epoch": 2.466819221967963, "grad_norm": 0.9597600102424622, "learning_rate": 3.332992563297279e-06, "loss": 0.8418775200843811, "step": 1078 }, { "epoch": 2.471395881006865, "grad_norm": 0.7216306328773499, "learning_rate": 3.3199863221153194e-06, "loss": 0.8978160619735718, "step": 1080 }, { "epoch": 2.475972540045767, "grad_norm": 0.3640708029270172, "learning_rate": 3.3069955538539898e-06, "loss": 0.5792869329452515, "step": 1082 }, { "epoch": 2.480549199084668, "grad_norm": 0.4946868121623993, "learning_rate": 3.294020444626791e-06, "loss": 0.7915354371070862, "step": 1084 }, { "epoch": 2.4851258581235696, "grad_norm": 0.6146786212921143, "learning_rate": 3.281061180322883e-06, "loss": 0.6907005310058594, "step": 1086 }, { "epoch": 2.4897025171624714, "grad_norm": 0.7421013116836548, "learning_rate": 3.2681179466044234e-06, "loss": 0.487212598323822, "step": 1088 }, { "epoch": 2.494279176201373, "grad_norm": 0.41838371753692627, "learning_rate": 3.2551909289039026e-06, "loss": 0.5698995590209961, "step": 1090 }, { "epoch": 2.4988558352402745, "grad_norm": 0.3784742057323456, "learning_rate": 3.2422803124214938e-06, "loss": 0.5521360039710999, "step": 1092 }, { "epoch": 2.5034324942791764, "grad_norm": 3.2495124340057373, "learning_rate": 3.2293862821223954e-06, "loss": 0.25563108921051025, "step": 1094 }, { "epoch": 2.5080091533180777, "grad_norm": 0.7314022183418274, "learning_rate": 3.216509022734181e-06, "loss": 0.7299931049346924, "step": 1096 }, { "epoch": 2.5125858123569795, "grad_norm": 0.7065015435218811, "learning_rate": 3.203648718744155e-06, "loss": 0.697057843208313, "step": 1098 }, { "epoch": 2.517162471395881, "grad_norm": 0.490582674741745, "learning_rate": 3.1908055543967117e-06, "loss": 0.5592939257621765, "step": 1100 }, { "epoch": 2.5217391304347827, "grad_norm": 0.41221341490745544, "learning_rate": 3.177979713690686e-06, "loss": 0.8416492342948914, "step": 1102 }, { "epoch": 2.526315789473684, "grad_norm": 1.0584267377853394, "learning_rate": 3.1651713803767308e-06, "loss": 0.7285148501396179, "step": 1104 }, { "epoch": 2.530892448512586, "grad_norm": 0.5549395680427551, "learning_rate": 3.152380737954674e-06, "loss": 0.7810051441192627, "step": 1106 }, { "epoch": 2.5354691075514877, "grad_norm": 0.6194555759429932, "learning_rate": 3.1396079696708933e-06, "loss": 0.8377099633216858, "step": 1108 }, { "epoch": 2.540045766590389, "grad_norm": 0.48931312561035156, "learning_rate": 3.126853258515692e-06, "loss": 0.8033105731010437, "step": 1110 }, { "epoch": 2.5446224256292904, "grad_norm": 3.8660051822662354, "learning_rate": 3.114116787220676e-06, "loss": 0.8483383059501648, "step": 1112 }, { "epoch": 2.5491990846681922, "grad_norm": 0.508552074432373, "learning_rate": 3.101398738256134e-06, "loss": 0.8187400698661804, "step": 1114 }, { "epoch": 2.553775743707094, "grad_norm": 0.7558878064155579, "learning_rate": 3.0886992938284283e-06, "loss": 0.5747739672660828, "step": 1116 }, { "epoch": 2.5583524027459954, "grad_norm": 3.0800881385803223, "learning_rate": 3.076018635877382e-06, "loss": 0.6367532014846802, "step": 1118 }, { "epoch": 2.5629290617848968, "grad_norm": 1.0385645627975464, "learning_rate": 3.063356946073667e-06, "loss": 0.6223986148834229, "step": 1120 }, { "epoch": 2.5675057208237986, "grad_norm": 0.7576572895050049, "learning_rate": 3.050714405816212e-06, "loss": 0.716172456741333, "step": 1122 }, { "epoch": 2.5720823798627004, "grad_norm": 1.1599595546722412, "learning_rate": 3.038091196229594e-06, "loss": 0.7232475280761719, "step": 1124 }, { "epoch": 2.5766590389016018, "grad_norm": 0.6561689972877502, "learning_rate": 3.025487498161449e-06, "loss": 0.4857536554336548, "step": 1126 }, { "epoch": 2.5812356979405036, "grad_norm": 0.6233944892883301, "learning_rate": 3.0129034921798784e-06, "loss": 0.8831377029418945, "step": 1128 }, { "epoch": 2.585812356979405, "grad_norm": 0.6065633296966553, "learning_rate": 3.000339358570864e-06, "loss": 0.801806628704071, "step": 1130 }, { "epoch": 2.5903890160183067, "grad_norm": 0.5193330645561218, "learning_rate": 2.9877952773356835e-06, "loss": 0.7977845668792725, "step": 1132 }, { "epoch": 2.594965675057208, "grad_norm": 0.6371508240699768, "learning_rate": 2.9752714281883338e-06, "loss": 0.7237481474876404, "step": 1134 }, { "epoch": 2.59954233409611, "grad_norm": 4.00014591217041, "learning_rate": 2.9627679905529503e-06, "loss": 0.6999197006225586, "step": 1136 }, { "epoch": 2.6041189931350113, "grad_norm": 0.9605089426040649, "learning_rate": 2.9502851435612474e-06, "loss": 0.5667267441749573, "step": 1138 }, { "epoch": 2.608695652173913, "grad_norm": 0.8212308883666992, "learning_rate": 2.937823066049941e-06, "loss": 0.84372878074646, "step": 1140 }, { "epoch": 2.613272311212815, "grad_norm": 0.6663596034049988, "learning_rate": 2.9253819365581923e-06, "loss": 0.7569270133972168, "step": 1142 }, { "epoch": 2.6178489702517163, "grad_norm": 1.229351282119751, "learning_rate": 2.9129619333250482e-06, "loss": 0.4845171868801117, "step": 1144 }, { "epoch": 2.6224256292906176, "grad_norm": 1.0321894884109497, "learning_rate": 2.900563234286891e-06, "loss": 0.47281792759895325, "step": 1146 }, { "epoch": 2.6270022883295194, "grad_norm": 2.871659994125366, "learning_rate": 2.888186017074882e-06, "loss": 0.6280223727226257, "step": 1148 }, { "epoch": 2.6315789473684212, "grad_norm": 0.3605870306491852, "learning_rate": 2.875830459012424e-06, "loss": 0.07345299422740936, "step": 1150 }, { "epoch": 2.6361556064073226, "grad_norm": 0.5999372005462646, "learning_rate": 2.8634967371126165e-06, "loss": 0.7574805617332458, "step": 1152 }, { "epoch": 2.6407322654462244, "grad_norm": 0.6183683276176453, "learning_rate": 2.851185028075723e-06, "loss": 0.4930267333984375, "step": 1154 }, { "epoch": 2.645308924485126, "grad_norm": 0.5491237640380859, "learning_rate": 2.8388955082866333e-06, "loss": 0.6228328943252563, "step": 1156 }, { "epoch": 2.6498855835240276, "grad_norm": 0.5937864184379578, "learning_rate": 2.826628353812348e-06, "loss": 0.8555384278297424, "step": 1158 }, { "epoch": 2.654462242562929, "grad_norm": 1.6247820854187012, "learning_rate": 2.8143837403994396e-06, "loss": 0.5580495595932007, "step": 1160 }, { "epoch": 2.6590389016018308, "grad_norm": 7.112198352813721, "learning_rate": 2.8021618434715545e-06, "loss": 0.478378564119339, "step": 1162 }, { "epoch": 2.663615560640732, "grad_norm": 0.6588612794876099, "learning_rate": 2.7899628381268805e-06, "loss": 0.46248742938041687, "step": 1164 }, { "epoch": 2.668192219679634, "grad_norm": 0.46749043464660645, "learning_rate": 2.777786899135649e-06, "loss": 0.5418421030044556, "step": 1166 }, { "epoch": 2.6727688787185357, "grad_norm": 0.7305618524551392, "learning_rate": 2.765634200937632e-06, "loss": 0.4692259728908539, "step": 1168 }, { "epoch": 2.677345537757437, "grad_norm": 1.3373067378997803, "learning_rate": 2.753504917639637e-06, "loss": 0.6015383005142212, "step": 1170 }, { "epoch": 2.6819221967963385, "grad_norm": 1.2775124311447144, "learning_rate": 2.741399223013018e-06, "loss": 0.6598130464553833, "step": 1172 }, { "epoch": 2.6864988558352403, "grad_norm": 0.6947876214981079, "learning_rate": 2.72931729049118e-06, "loss": 0.5946341156959534, "step": 1174 }, { "epoch": 2.691075514874142, "grad_norm": 0.5818822383880615, "learning_rate": 2.7172592931671033e-06, "loss": 0.8561844229698181, "step": 1176 }, { "epoch": 2.6956521739130435, "grad_norm": 0.40154317021369934, "learning_rate": 2.705225403790853e-06, "loss": 0.5496221780776978, "step": 1178 }, { "epoch": 2.700228832951945, "grad_norm": 0.6244839429855347, "learning_rate": 2.693215794767111e-06, "loss": 0.7859920859336853, "step": 1180 }, { "epoch": 2.7048054919908466, "grad_norm": 1.3127132654190063, "learning_rate": 2.6812306381527084e-06, "loss": 0.7597426176071167, "step": 1182 }, { "epoch": 2.7093821510297484, "grad_norm": 5.444507598876953, "learning_rate": 2.6692701056541486e-06, "loss": 0.199199840426445, "step": 1184 }, { "epoch": 2.71395881006865, "grad_norm": 0.6784061789512634, "learning_rate": 2.657334368625163e-06, "loss": 0.6887036561965942, "step": 1186 }, { "epoch": 2.7185354691075516, "grad_norm": 0.5293588638305664, "learning_rate": 2.6454235980642436e-06, "loss": 0.8016695380210876, "step": 1188 }, { "epoch": 2.723112128146453, "grad_norm": 0.8115622997283936, "learning_rate": 2.6335379646121993e-06, "loss": 0.6355752944946289, "step": 1190 }, { "epoch": 2.727688787185355, "grad_norm": 0.6038773059844971, "learning_rate": 2.6216776385497098e-06, "loss": 0.6571699380874634, "step": 1192 }, { "epoch": 2.732265446224256, "grad_norm": 0.559846043586731, "learning_rate": 2.6098427897948867e-06, "loss": 0.7960324883460999, "step": 1194 }, { "epoch": 2.736842105263158, "grad_norm": 0.40631529688835144, "learning_rate": 2.5980335879008364e-06, "loss": 0.7562568783760071, "step": 1196 }, { "epoch": 2.7414187643020593, "grad_norm": 1.7513792514801025, "learning_rate": 2.586250202053233e-06, "loss": 0.48247501254081726, "step": 1198 }, { "epoch": 2.745995423340961, "grad_norm": 0.474061518907547, "learning_rate": 2.574492801067902e-06, "loss": 0.8583283424377441, "step": 1200 }, { "epoch": 2.750572082379863, "grad_norm": 0.46135658025741577, "learning_rate": 2.5627615533883803e-06, "loss": 0.774861216545105, "step": 1202 }, { "epoch": 2.7551487414187643, "grad_norm": 1.1955369710922241, "learning_rate": 2.55105662708353e-06, "loss": 0.5863184332847595, "step": 1204 }, { "epoch": 2.7597254004576657, "grad_norm": 5.386921405792236, "learning_rate": 2.539378189845112e-06, "loss": 0.7277770638465881, "step": 1206 }, { "epoch": 2.7643020594965675, "grad_norm": 0.7777206301689148, "learning_rate": 2.5277264089853852e-06, "loss": 0.32458746433258057, "step": 1208 }, { "epoch": 2.7688787185354693, "grad_norm": 3.4855971336364746, "learning_rate": 2.5161014514347212e-06, "loss": 0.48918047547340393, "step": 1210 }, { "epoch": 2.7734553775743707, "grad_norm": 0.9984601736068726, "learning_rate": 2.5045034837392e-06, "loss": 0.5557206869125366, "step": 1212 }, { "epoch": 2.7780320366132725, "grad_norm": 0.3992338180541992, "learning_rate": 2.492932672058231e-06, "loss": 0.5206654071807861, "step": 1214 }, { "epoch": 2.782608695652174, "grad_norm": 1.431427240371704, "learning_rate": 2.4813891821621653e-06, "loss": 0.28042441606521606, "step": 1216 }, { "epoch": 2.7871853546910756, "grad_norm": 2.218466281890869, "learning_rate": 2.4698731794299354e-06, "loss": 0.3877430856227875, "step": 1218 }, { "epoch": 2.791762013729977, "grad_norm": 0.46863940358161926, "learning_rate": 2.4583848288466662e-06, "loss": 0.493195503950119, "step": 1220 }, { "epoch": 2.796338672768879, "grad_norm": 0.6733099222183228, "learning_rate": 2.446924295001326e-06, "loss": 0.7823275923728943, "step": 1222 }, { "epoch": 2.80091533180778, "grad_norm": 1.4674835205078125, "learning_rate": 2.435491742084368e-06, "loss": 0.6491841077804565, "step": 1224 }, { "epoch": 2.805491990846682, "grad_norm": 0.47652876377105713, "learning_rate": 2.4240873338853628e-06, "loss": 0.7960876226425171, "step": 1226 }, { "epoch": 2.8100686498855834, "grad_norm": 0.4711756110191345, "learning_rate": 2.4127112337906754e-06, "loss": 0.536849319934845, "step": 1228 }, { "epoch": 2.814645308924485, "grad_norm": 0.7543594241142273, "learning_rate": 2.401363604781104e-06, "loss": 0.6167381405830383, "step": 1230 }, { "epoch": 2.8192219679633865, "grad_norm": 0.5257206559181213, "learning_rate": 2.390044609429556e-06, "loss": 0.8745390176773071, "step": 1232 }, { "epoch": 2.8237986270022883, "grad_norm": 1.4748421907424927, "learning_rate": 2.3787544098987148e-06, "loss": 0.5339797139167786, "step": 1234 }, { "epoch": 2.82837528604119, "grad_norm": 0.7052022218704224, "learning_rate": 2.3674931679387184e-06, "loss": 0.797744631767273, "step": 1236 }, { "epoch": 2.8329519450800915, "grad_norm": 0.27761203050613403, "learning_rate": 2.3562610448848415e-06, "loss": 0.49625930190086365, "step": 1238 }, { "epoch": 2.837528604118993, "grad_norm": 0.5554156303405762, "learning_rate": 2.3450582016551826e-06, "loss": 0.7721536159515381, "step": 1240 }, { "epoch": 2.8421052631578947, "grad_norm": 0.4830935597419739, "learning_rate": 2.3338847987483645e-06, "loss": 0.810258686542511, "step": 1242 }, { "epoch": 2.8466819221967965, "grad_norm": 0.7139240503311157, "learning_rate": 2.3227409962412204e-06, "loss": 0.8331779837608337, "step": 1244 }, { "epoch": 2.851258581235698, "grad_norm": 1.6733156442642212, "learning_rate": 2.3116269537865233e-06, "loss": 0.6354835629463196, "step": 1246 }, { "epoch": 2.8558352402745997, "grad_norm": 0.8812215328216553, "learning_rate": 2.3005428306106773e-06, "loss": 0.7057042121887207, "step": 1248 }, { "epoch": 2.860411899313501, "grad_norm": 0.784724235534668, "learning_rate": 2.2894887855114463e-06, "loss": 0.843634843826294, "step": 1250 }, { "epoch": 2.864988558352403, "grad_norm": 0.49218860268592834, "learning_rate": 2.27846497685568e-06, "loss": 0.7869199514389038, "step": 1252 }, { "epoch": 2.869565217391304, "grad_norm": 2.8198912143707275, "learning_rate": 2.2674715625770415e-06, "loss": 0.549948513507843, "step": 1254 }, { "epoch": 2.874141876430206, "grad_norm": 0.4654172956943512, "learning_rate": 2.256508700173745e-06, "loss": 0.5740110874176025, "step": 1256 }, { "epoch": 2.8787185354691074, "grad_norm": 0.37810707092285156, "learning_rate": 2.245576546706301e-06, "loss": 0.5503138303756714, "step": 1258 }, { "epoch": 2.883295194508009, "grad_norm": 0.8275235891342163, "learning_rate": 2.234675258795269e-06, "loss": 0.5749093890190125, "step": 1260 }, { "epoch": 2.887871853546911, "grad_norm": 0.5841154456138611, "learning_rate": 2.2238049926190025e-06, "loss": 0.714622437953949, "step": 1262 }, { "epoch": 2.8924485125858124, "grad_norm": 0.5689151287078857, "learning_rate": 2.2129659039114243e-06, "loss": 0.46536874771118164, "step": 1264 }, { "epoch": 2.8970251716247137, "grad_norm": 0.5501039028167725, "learning_rate": 2.2021581479597927e-06, "loss": 0.7893953919410706, "step": 1266 }, { "epoch": 2.9016018306636155, "grad_norm": 0.8494502902030945, "learning_rate": 2.191381879602466e-06, "loss": 0.6205670237541199, "step": 1268 }, { "epoch": 2.9061784897025174, "grad_norm": 0.6330673098564148, "learning_rate": 2.1806372532267006e-06, "loss": 0.5450186133384705, "step": 1270 }, { "epoch": 2.9107551487414187, "grad_norm": 0.5317128896713257, "learning_rate": 2.1699244227664272e-06, "loss": 0.4947490096092224, "step": 1272 }, { "epoch": 2.9153318077803205, "grad_norm": 0.42626556754112244, "learning_rate": 2.1592435417000485e-06, "loss": 0.5672276020050049, "step": 1274 }, { "epoch": 2.919908466819222, "grad_norm": 0.8525965213775635, "learning_rate": 2.1485947630482434e-06, "loss": 0.36214679479599, "step": 1276 }, { "epoch": 2.9244851258581237, "grad_norm": 0.5449546575546265, "learning_rate": 2.137978239371771e-06, "loss": 0.8090663552284241, "step": 1278 }, { "epoch": 2.929061784897025, "grad_norm": 1.8602927923202515, "learning_rate": 2.127394122769286e-06, "loss": 0.5202322006225586, "step": 1280 }, { "epoch": 2.933638443935927, "grad_norm": 1.372131109237671, "learning_rate": 2.11684256487516e-06, "loss": 0.7397865056991577, "step": 1282 }, { "epoch": 2.9382151029748282, "grad_norm": 0.4317033886909485, "learning_rate": 2.1063237168573135e-06, "loss": 0.8207894563674927, "step": 1284 }, { "epoch": 2.94279176201373, "grad_norm": 1.2322509288787842, "learning_rate": 2.0958377294150375e-06, "loss": 0.6003591418266296, "step": 1286 }, { "epoch": 2.9473684210526314, "grad_norm": 0.511931300163269, "learning_rate": 2.085384752776851e-06, "loss": 0.5409091114997864, "step": 1288 }, { "epoch": 2.9519450800915332, "grad_norm": 0.4843280017375946, "learning_rate": 2.074964936698335e-06, "loss": 0.5519256591796875, "step": 1290 }, { "epoch": 2.9565217391304346, "grad_norm": 0.8314401507377625, "learning_rate": 2.0645784304599952e-06, "loss": 0.7691973447799683, "step": 1292 }, { "epoch": 2.9610983981693364, "grad_norm": 1.611412763595581, "learning_rate": 2.0542253828651193e-06, "loss": 0.46346092224121094, "step": 1294 }, { "epoch": 2.965675057208238, "grad_norm": 0.45376840233802795, "learning_rate": 2.0439059422376476e-06, "loss": 0.7660473585128784, "step": 1296 }, { "epoch": 2.9702517162471396, "grad_norm": 0.5917079448699951, "learning_rate": 2.033620256420046e-06, "loss": 0.7450892329216003, "step": 1298 }, { "epoch": 2.974828375286041, "grad_norm": 0.5337526798248291, "learning_rate": 2.0233684727711883e-06, "loss": 0.7948495149612427, "step": 1300 }, { "epoch": 2.9794050343249427, "grad_norm": 2.367062568664551, "learning_rate": 2.0131507381642506e-06, "loss": 0.47295913100242615, "step": 1302 }, { "epoch": 2.9839816933638446, "grad_norm": 1.1392394304275513, "learning_rate": 2.0029671989845923e-06, "loss": 0.750167965888977, "step": 1304 }, { "epoch": 2.988558352402746, "grad_norm": 0.531143307685852, "learning_rate": 1.992818001127678e-06, "loss": 0.39104270935058594, "step": 1306 }, { "epoch": 2.9931350114416477, "grad_norm": 0.7250871062278748, "learning_rate": 1.9827032899969756e-06, "loss": 0.22440141439437866, "step": 1308 }, { "epoch": 2.997711670480549, "grad_norm": 0.703926682472229, "learning_rate": 1.9726232105018697e-06, "loss": 0.7429046630859375, "step": 1310 }, { "epoch": 3.002288329519451, "grad_norm": 0.43497398495674133, "learning_rate": 1.9625779070556e-06, "loss": 0.7082179188728333, "step": 1312 }, { "epoch": 3.0068649885583523, "grad_norm": 0.5466197729110718, "learning_rate": 1.9525675235731793e-06, "loss": 0.3564700484275818, "step": 1314 }, { "epoch": 3.011441647597254, "grad_norm": 0.5676448345184326, "learning_rate": 1.9425922034693363e-06, "loss": 0.6031548380851746, "step": 1316 }, { "epoch": 3.0160183066361554, "grad_norm": 0.6110860109329224, "learning_rate": 1.9326520896564614e-06, "loss": 0.5565021634101868, "step": 1318 }, { "epoch": 3.0205949656750573, "grad_norm": 0.7073854207992554, "learning_rate": 1.9227473245425584e-06, "loss": 0.6478375196456909, "step": 1320 }, { "epoch": 3.0251716247139586, "grad_norm": 0.7119851112365723, "learning_rate": 1.912878050029205e-06, "loss": 0.3045092821121216, "step": 1322 }, { "epoch": 3.0297482837528604, "grad_norm": 0.668566882610321, "learning_rate": 1.9030444075095169e-06, "loss": 0.5307950973510742, "step": 1324 }, { "epoch": 3.034324942791762, "grad_norm": 0.6608180403709412, "learning_rate": 1.8932465378661315e-06, "loss": 0.28795385360717773, "step": 1326 }, { "epoch": 3.0389016018306636, "grad_norm": 1.2194944620132446, "learning_rate": 1.8834845814691727e-06, "loss": 0.6476283073425293, "step": 1328 }, { "epoch": 3.0434782608695654, "grad_norm": 0.6446576118469238, "learning_rate": 1.873758678174258e-06, "loss": 0.6247165203094482, "step": 1330 }, { "epoch": 3.0480549199084668, "grad_norm": 0.10556092858314514, "learning_rate": 1.864068967320483e-06, "loss": 0.07942181080579758, "step": 1332 }, { "epoch": 3.0526315789473686, "grad_norm": 1.9154139757156372, "learning_rate": 1.8544155877284292e-06, "loss": 0.2606959939002991, "step": 1334 }, { "epoch": 3.05720823798627, "grad_norm": 0.7502045035362244, "learning_rate": 1.8447986776981746e-06, "loss": 0.6145456433296204, "step": 1336 }, { "epoch": 3.0617848970251718, "grad_norm": 0.7055021524429321, "learning_rate": 1.8352183750073134e-06, "loss": 0.43598175048828125, "step": 1338 }, { "epoch": 3.066361556064073, "grad_norm": 0.5926067233085632, "learning_rate": 1.8256748169089803e-06, "loss": 0.2945636212825775, "step": 1340 }, { "epoch": 3.070938215102975, "grad_norm": 1.8550186157226562, "learning_rate": 1.8161681401298842e-06, "loss": 0.43354955315589905, "step": 1342 }, { "epoch": 3.0755148741418763, "grad_norm": 2.016162395477295, "learning_rate": 1.8066984808683547e-06, "loss": 0.35298022627830505, "step": 1344 }, { "epoch": 3.080091533180778, "grad_norm": 0.677463948726654, "learning_rate": 1.7972659747923785e-06, "loss": 0.358053982257843, "step": 1346 }, { "epoch": 3.0846681922196795, "grad_norm": 1.3669031858444214, "learning_rate": 1.787870757037672e-06, "loss": 0.30859601497650146, "step": 1348 }, { "epoch": 3.0892448512585813, "grad_norm": 1.2701737880706787, "learning_rate": 1.7785129622057312e-06, "loss": 0.5989816188812256, "step": 1350 }, { "epoch": 3.0938215102974826, "grad_norm": 0.4244588613510132, "learning_rate": 1.7691927243619105e-06, "loss": 0.3882506191730499, "step": 1352 }, { "epoch": 3.0983981693363845, "grad_norm": 4.394486427307129, "learning_rate": 1.7599101770335015e-06, "loss": 0.13407155871391296, "step": 1354 }, { "epoch": 3.1029748283752863, "grad_norm": 0.7348482012748718, "learning_rate": 1.7506654532078176e-06, "loss": 0.1811959147453308, "step": 1356 }, { "epoch": 3.1075514874141876, "grad_norm": 0.8721897602081299, "learning_rate": 1.7414586853302909e-06, "loss": 0.42143383622169495, "step": 1358 }, { "epoch": 3.1121281464530894, "grad_norm": 5.011177062988281, "learning_rate": 1.732290005302572e-06, "loss": 0.31767553091049194, "step": 1360 }, { "epoch": 3.116704805491991, "grad_norm": 0.8464891910552979, "learning_rate": 1.7231595444806483e-06, "loss": 0.6350334882736206, "step": 1362 }, { "epoch": 3.1212814645308926, "grad_norm": 0.7363600134849548, "learning_rate": 1.7140674336729477e-06, "loss": 0.34634676575660706, "step": 1364 }, { "epoch": 3.125858123569794, "grad_norm": 0.6495455503463745, "learning_rate": 1.7050138031384776e-06, "loss": 0.5073356032371521, "step": 1366 }, { "epoch": 3.130434782608696, "grad_norm": 0.8677927851676941, "learning_rate": 1.6959987825849548e-06, "loss": 0.41521307826042175, "step": 1368 }, { "epoch": 3.135011441647597, "grad_norm": 0.6031743288040161, "learning_rate": 1.6870225011669433e-06, "loss": 0.5051814913749695, "step": 1370 }, { "epoch": 3.139588100686499, "grad_norm": 0.5029802918434143, "learning_rate": 1.67808508748401e-06, "loss": 0.060064904391765594, "step": 1372 }, { "epoch": 3.1441647597254003, "grad_norm": 0.49608922004699707, "learning_rate": 1.6691866695788778e-06, "loss": 0.5580463409423828, "step": 1374 }, { "epoch": 3.148741418764302, "grad_norm": 0.5855568051338196, "learning_rate": 1.6603273749355932e-06, "loss": 0.5613203644752502, "step": 1376 }, { "epoch": 3.1533180778032035, "grad_norm": 0.6027638912200928, "learning_rate": 1.6515073304776996e-06, "loss": 0.3589206635951996, "step": 1378 }, { "epoch": 3.1578947368421053, "grad_norm": 0.7994204759597778, "learning_rate": 1.642726662566419e-06, "loss": 0.4340493083000183, "step": 1380 }, { "epoch": 3.1624713958810067, "grad_norm": 0.8440326452255249, "learning_rate": 1.6339854969988412e-06, "loss": 0.5287958383560181, "step": 1382 }, { "epoch": 3.1670480549199085, "grad_norm": 1.337528109550476, "learning_rate": 1.6252839590061203e-06, "loss": 0.48380246758461, "step": 1384 }, { "epoch": 3.17162471395881, "grad_norm": 0.18262973427772522, "learning_rate": 1.6166221732516876e-06, "loss": 0.029057124629616737, "step": 1386 }, { "epoch": 3.1762013729977117, "grad_norm": 1.918377161026001, "learning_rate": 1.6080002638294516e-06, "loss": 0.09718252718448639, "step": 1388 }, { "epoch": 3.1807780320366135, "grad_norm": 0.17276065051555634, "learning_rate": 1.599418354262036e-06, "loss": 0.17694096267223358, "step": 1390 }, { "epoch": 3.185354691075515, "grad_norm": 0.8832252621650696, "learning_rate": 1.5908765674989995e-06, "loss": 0.4334821403026581, "step": 1392 }, { "epoch": 3.1899313501144166, "grad_norm": 6.330790996551514, "learning_rate": 1.582375025915078e-06, "loss": 0.2820512652397156, "step": 1394 }, { "epoch": 3.194508009153318, "grad_norm": 0.6005669832229614, "learning_rate": 1.573913851308431e-06, "loss": 0.22479744255542755, "step": 1396 }, { "epoch": 3.19908466819222, "grad_norm": 1.287660002708435, "learning_rate": 1.5654931648988962e-06, "loss": 0.4539932906627655, "step": 1398 }, { "epoch": 3.203661327231121, "grad_norm": 0.5233741402626038, "learning_rate": 1.5571130873262542e-06, "loss": 0.17698650062084198, "step": 1400 }, { "epoch": 3.208237986270023, "grad_norm": 0.9421582818031311, "learning_rate": 1.5487737386484966e-06, "loss": 0.4471212327480316, "step": 1402 }, { "epoch": 3.2128146453089244, "grad_norm": 1.1425485610961914, "learning_rate": 1.5404752383401145e-06, "loss": 0.49938952922821045, "step": 1404 }, { "epoch": 3.217391304347826, "grad_norm": 1.4222649335861206, "learning_rate": 1.5322177052903725e-06, "loss": 0.2908313572406769, "step": 1406 }, { "epoch": 3.2219679633867275, "grad_norm": 0.6182151436805725, "learning_rate": 1.5240012578016205e-06, "loss": 0.45964670181274414, "step": 1408 }, { "epoch": 3.2265446224256293, "grad_norm": 0.7475869059562683, "learning_rate": 1.5158260135875908e-06, "loss": 0.4192732572555542, "step": 1410 }, { "epoch": 3.2311212814645307, "grad_norm": 0.8350936770439148, "learning_rate": 1.507692089771708e-06, "loss": 0.6540165543556213, "step": 1412 }, { "epoch": 3.2356979405034325, "grad_norm": 1.6320070028305054, "learning_rate": 1.4995996028854237e-06, "loss": 0.42058131098747253, "step": 1414 }, { "epoch": 3.2402745995423343, "grad_norm": 0.9828153252601624, "learning_rate": 1.4915486688665344e-06, "loss": 0.4954679012298584, "step": 1416 }, { "epoch": 3.2448512585812357, "grad_norm": 0.4938318431377411, "learning_rate": 1.4835394030575266e-06, "loss": 0.5675240159034729, "step": 1418 }, { "epoch": 3.2494279176201375, "grad_norm": 0.8090194463729858, "learning_rate": 1.475571920203923e-06, "loss": 0.5262541174888611, "step": 1420 }, { "epoch": 3.254004576659039, "grad_norm": 0.9066356420516968, "learning_rate": 1.4676463344526395e-06, "loss": 0.5531529188156128, "step": 1422 }, { "epoch": 3.2585812356979407, "grad_norm": 0.8314560651779175, "learning_rate": 1.4597627593503473e-06, "loss": 0.4790411591529846, "step": 1424 }, { "epoch": 3.263157894736842, "grad_norm": 2.9613871574401855, "learning_rate": 1.4519213078418494e-06, "loss": 0.13854338228702545, "step": 1426 }, { "epoch": 3.267734553775744, "grad_norm": 1.013364315032959, "learning_rate": 1.4441220922684637e-06, "loss": 0.49466753005981445, "step": 1428 }, { "epoch": 3.272311212814645, "grad_norm": 0.32285788655281067, "learning_rate": 1.4363652243664036e-06, "loss": 0.045904774218797684, "step": 1430 }, { "epoch": 3.276887871853547, "grad_norm": 1.0177583694458008, "learning_rate": 1.4286508152651916e-06, "loss": 0.43011364340782166, "step": 1432 }, { "epoch": 3.2814645308924484, "grad_norm": 0.6156055927276611, "learning_rate": 1.4209789754860566e-06, "loss": 0.26439258456230164, "step": 1434 }, { "epoch": 3.28604118993135, "grad_norm": 0.535756528377533, "learning_rate": 1.4133498149403554e-06, "loss": 0.3485221862792969, "step": 1436 }, { "epoch": 3.2906178489702516, "grad_norm": 0.6638374924659729, "learning_rate": 1.405763442927995e-06, "loss": 0.43791279196739197, "step": 1438 }, { "epoch": 3.2951945080091534, "grad_norm": 0.6146953701972961, "learning_rate": 1.3982199681358703e-06, "loss": 0.43667203187942505, "step": 1440 }, { "epoch": 3.2997711670480547, "grad_norm": 0.8798004388809204, "learning_rate": 1.3907194986363029e-06, "loss": 0.679145872592926, "step": 1442 }, { "epoch": 3.3043478260869565, "grad_norm": 2.4789793491363525, "learning_rate": 1.383262141885496e-06, "loss": 0.46688565611839294, "step": 1444 }, { "epoch": 3.308924485125858, "grad_norm": 0.7254453897476196, "learning_rate": 1.3758480047219964e-06, "loss": 0.4225999414920807, "step": 1446 }, { "epoch": 3.3135011441647597, "grad_norm": 1.1638004779815674, "learning_rate": 1.3684771933651547e-06, "loss": 0.5533671975135803, "step": 1448 }, { "epoch": 3.3180778032036615, "grad_norm": 1.8972636461257935, "learning_rate": 1.3611498134136171e-06, "loss": 0.49239760637283325, "step": 1450 }, { "epoch": 3.322654462242563, "grad_norm": 0.7231550216674805, "learning_rate": 1.353865969843803e-06, "loss": 0.6097209453582764, "step": 1452 }, { "epoch": 3.3272311212814647, "grad_norm": 0.6900179982185364, "learning_rate": 1.3466257670084006e-06, "loss": 0.5563924312591553, "step": 1454 }, { "epoch": 3.331807780320366, "grad_norm": 0.657533586025238, "learning_rate": 1.3394293086348796e-06, "loss": 0.5384810566902161, "step": 1456 }, { "epoch": 3.336384439359268, "grad_norm": 0.20614972710609436, "learning_rate": 1.3322766978239977e-06, "loss": 0.21752725541591644, "step": 1458 }, { "epoch": 3.3409610983981692, "grad_norm": 0.43330979347229004, "learning_rate": 1.325168037048327e-06, "loss": 0.1562982201576233, "step": 1460 }, { "epoch": 3.345537757437071, "grad_norm": 0.8868097066879272, "learning_rate": 1.3181034281507846e-06, "loss": 0.039067141711711884, "step": 1462 }, { "epoch": 3.3501144164759724, "grad_norm": 0.39520424604415894, "learning_rate": 1.3110829723431763e-06, "loss": 0.31014665961265564, "step": 1464 }, { "epoch": 3.354691075514874, "grad_norm": 1.6159244775772095, "learning_rate": 1.3041067702047407e-06, "loss": 0.07728109508752823, "step": 1466 }, { "epoch": 3.3592677345537756, "grad_norm": 0.7340835928916931, "learning_rate": 1.297174921680714e-06, "loss": 0.30891698598861694, "step": 1468 }, { "epoch": 3.3638443935926774, "grad_norm": 1.7658299207687378, "learning_rate": 1.2902875260808978e-06, "loss": 0.07568443566560745, "step": 1470 }, { "epoch": 3.3684210526315788, "grad_norm": 1.2565313577651978, "learning_rate": 1.28344468207823e-06, "loss": 0.2796335220336914, "step": 1472 }, { "epoch": 3.3729977116704806, "grad_norm": 0.7257245182991028, "learning_rate": 1.2766464877073805e-06, "loss": 0.46368178725242615, "step": 1474 }, { "epoch": 3.3775743707093824, "grad_norm": 1.0831571817398071, "learning_rate": 1.2698930403633389e-06, "loss": 0.39964616298675537, "step": 1476 }, { "epoch": 3.3821510297482837, "grad_norm": 0.8762648105621338, "learning_rate": 1.2631844368000236e-06, "loss": 0.022859321907162666, "step": 1478 }, { "epoch": 3.386727688787185, "grad_norm": 0.7149327993392944, "learning_rate": 1.256520773128893e-06, "loss": 0.29580366611480713, "step": 1480 }, { "epoch": 3.391304347826087, "grad_norm": 5.417656898498535, "learning_rate": 1.2499021448175713e-06, "loss": 0.3861583173274994, "step": 1482 }, { "epoch": 3.3958810068649887, "grad_norm": 0.5899186730384827, "learning_rate": 1.2433286466884783e-06, "loss": 0.5882078409194946, "step": 1484 }, { "epoch": 3.40045766590389, "grad_norm": 0.6683065295219421, "learning_rate": 1.2368003729174708e-06, "loss": 0.5233013033866882, "step": 1486 }, { "epoch": 3.405034324942792, "grad_norm": 0.7072110772132874, "learning_rate": 1.2303174170324984e-06, "loss": 0.5888646245002747, "step": 1488 }, { "epoch": 3.4096109839816933, "grad_norm": 1.6215583086013794, "learning_rate": 1.223879871912254e-06, "loss": 0.04785463213920593, "step": 1490 }, { "epoch": 3.414187643020595, "grad_norm": 0.6127355098724365, "learning_rate": 1.2174878297848537e-06, "loss": 0.0625072568655014, "step": 1492 }, { "epoch": 3.4187643020594964, "grad_norm": 0.48454123735427856, "learning_rate": 1.2111413822265077e-06, "loss": 0.37776947021484375, "step": 1494 }, { "epoch": 3.4233409610983982, "grad_norm": 0.6533837914466858, "learning_rate": 1.2048406201602123e-06, "loss": 0.17243488132953644, "step": 1496 }, { "epoch": 3.4279176201372996, "grad_norm": 1.2736353874206543, "learning_rate": 1.1985856338544457e-06, "loss": 0.3051704466342926, "step": 1498 }, { "epoch": 3.4324942791762014, "grad_norm": 0.9836899042129517, "learning_rate": 1.1923765129218759e-06, "loss": 0.34713953733444214, "step": 1500 }, { "epoch": 3.437070938215103, "grad_norm": 1.0902957916259766, "learning_rate": 1.1862133463180752e-06, "loss": 0.4851178228855133, "step": 1502 }, { "epoch": 3.4416475972540046, "grad_norm": 0.7768452167510986, "learning_rate": 1.1800962223402466e-06, "loss": 0.23431611061096191, "step": 1504 }, { "epoch": 3.446224256292906, "grad_norm": 0.789863646030426, "learning_rate": 1.174025228625962e-06, "loss": 0.4964331090450287, "step": 1506 }, { "epoch": 3.4508009153318078, "grad_norm": 0.9043194055557251, "learning_rate": 1.168000452151899e-06, "loss": 0.5745983719825745, "step": 1508 }, { "epoch": 3.4553775743707096, "grad_norm": 0.6853423714637756, "learning_rate": 1.1620219792326019e-06, "loss": 0.3494628071784973, "step": 1510 }, { "epoch": 3.459954233409611, "grad_norm": 0.7719877362251282, "learning_rate": 1.1560898955192442e-06, "loss": 0.07691881060600281, "step": 1512 }, { "epoch": 3.4645308924485128, "grad_norm": 0.5891416072845459, "learning_rate": 1.1502042859983956e-06, "loss": 0.3369552493095398, "step": 1514 }, { "epoch": 3.469107551487414, "grad_norm": 0.7656334638595581, "learning_rate": 1.144365234990813e-06, "loss": 0.5648132562637329, "step": 1516 }, { "epoch": 3.473684210526316, "grad_norm": 0.8338443636894226, "learning_rate": 1.1385728261502265e-06, "loss": 0.5203874707221985, "step": 1518 }, { "epoch": 3.4782608695652173, "grad_norm": 1.7429405450820923, "learning_rate": 1.1328271424621426e-06, "loss": 0.4457243084907532, "step": 1520 }, { "epoch": 3.482837528604119, "grad_norm": 0.6419166922569275, "learning_rate": 1.127128266242655e-06, "loss": 0.5154409408569336, "step": 1522 }, { "epoch": 3.4874141876430205, "grad_norm": 0.7125447988510132, "learning_rate": 1.1214762791372668e-06, "loss": 0.38710278272628784, "step": 1524 }, { "epoch": 3.4919908466819223, "grad_norm": 0.5910897850990295, "learning_rate": 1.1158712621197187e-06, "loss": 0.5922369956970215, "step": 1526 }, { "epoch": 3.4965675057208236, "grad_norm": 0.42722317576408386, "learning_rate": 1.1103132954908296e-06, "loss": 0.03717589005827904, "step": 1528 }, { "epoch": 3.5011441647597255, "grad_norm": 0.7920047044754028, "learning_rate": 1.1048024588773493e-06, "loss": 0.5138665437698364, "step": 1530 }, { "epoch": 3.505720823798627, "grad_norm": 0.6985714435577393, "learning_rate": 1.09933883123081e-06, "loss": 0.5611461997032166, "step": 1532 }, { "epoch": 3.5102974828375286, "grad_norm": 0.5378302931785583, "learning_rate": 1.0939224908264042e-06, "loss": 0.5791317820549011, "step": 1534 }, { "epoch": 3.5148741418764304, "grad_norm": 0.10158717632293701, "learning_rate": 1.0885535152618574e-06, "loss": 0.2312551885843277, "step": 1536 }, { "epoch": 3.519450800915332, "grad_norm": 0.9983445405960083, "learning_rate": 1.0832319814563188e-06, "loss": 0.32253575325012207, "step": 1538 }, { "epoch": 3.524027459954233, "grad_norm": 1.0612983703613281, "learning_rate": 1.0779579656492575e-06, "loss": 0.3236549496650696, "step": 1540 }, { "epoch": 3.528604118993135, "grad_norm": 0.7103164792060852, "learning_rate": 1.072731543399372e-06, "loss": 0.3252106308937073, "step": 1542 }, { "epoch": 3.533180778032037, "grad_norm": 0.2725476622581482, "learning_rate": 1.067552789583508e-06, "loss": 0.23804107308387756, "step": 1544 }, { "epoch": 3.537757437070938, "grad_norm": 1.5089448690414429, "learning_rate": 1.0624217783955839e-06, "loss": 0.38195565342903137, "step": 1546 }, { "epoch": 3.54233409610984, "grad_norm": 0.6526052355766296, "learning_rate": 1.0573385833455275e-06, "loss": 0.380737841129303, "step": 1548 }, { "epoch": 3.5469107551487413, "grad_norm": 1.1080514192581177, "learning_rate": 1.0523032772582262e-06, "loss": 0.38047873973846436, "step": 1550 }, { "epoch": 3.551487414187643, "grad_norm": 0.7343178391456604, "learning_rate": 1.047315932272482e-06, "loss": 0.4824707508087158, "step": 1552 }, { "epoch": 3.5560640732265445, "grad_norm": 0.633358359336853, "learning_rate": 1.0423766198399744e-06, "loss": 0.6635564565658569, "step": 1554 }, { "epoch": 3.5606407322654463, "grad_norm": 0.5067686438560486, "learning_rate": 1.0374854107242416e-06, "loss": 0.3814176023006439, "step": 1556 }, { "epoch": 3.5652173913043477, "grad_norm": 0.9163781404495239, "learning_rate": 1.032642374999667e-06, "loss": 0.37396469712257385, "step": 1558 }, { "epoch": 3.5697940503432495, "grad_norm": 0.7029092311859131, "learning_rate": 1.0278475820504685e-06, "loss": 0.35979732871055603, "step": 1560 }, { "epoch": 3.5743707093821513, "grad_norm": 0.5607945322990417, "learning_rate": 1.0231011005697145e-06, "loss": 0.5650622248649597, "step": 1562 }, { "epoch": 3.5789473684210527, "grad_norm": 1.1728614568710327, "learning_rate": 1.0184029985583304e-06, "loss": 0.5536704063415527, "step": 1564 }, { "epoch": 3.583524027459954, "grad_norm": 0.7152750492095947, "learning_rate": 1.013753343324131e-06, "loss": 0.594580888748169, "step": 1566 }, { "epoch": 3.588100686498856, "grad_norm": 1.2149173021316528, "learning_rate": 1.009152201480852e-06, "loss": 0.2382848709821701, "step": 1568 }, { "epoch": 3.5926773455377576, "grad_norm": 0.7747706770896912, "learning_rate": 1.0045996389471982e-06, "loss": 0.26458844542503357, "step": 1570 }, { "epoch": 3.597254004576659, "grad_norm": 0.8040603995323181, "learning_rate": 1.000095720945898e-06, "loss": 0.5273915529251099, "step": 1572 }, { "epoch": 3.6018306636155604, "grad_norm": 0.7427345514297485, "learning_rate": 9.956405120027684e-07, "loss": 0.630144476890564, "step": 1574 }, { "epoch": 3.606407322654462, "grad_norm": 0.7674996852874756, "learning_rate": 9.912340759457942e-07, "loss": 0.3695995509624481, "step": 1576 }, { "epoch": 3.610983981693364, "grad_norm": 0.6453800797462463, "learning_rate": 9.868764759042061e-07, "loss": 0.3349326550960541, "step": 1578 }, { "epoch": 3.6155606407322654, "grad_norm": 0.8265721201896667, "learning_rate": 9.82567774307585e-07, "loss": 0.22504082322120667, "step": 1580 }, { "epoch": 3.620137299771167, "grad_norm": 0.905486524105072, "learning_rate": 9.783080328849617e-07, "loss": 0.3136463165283203, "step": 1582 }, { "epoch": 3.6247139588100685, "grad_norm": 0.6223527789115906, "learning_rate": 9.740973126639342e-07, "loss": 0.5206019282341003, "step": 1584 }, { "epoch": 3.6292906178489703, "grad_norm": 0.7490825057029724, "learning_rate": 9.699356739697942e-07, "loss": 0.2996978163719177, "step": 1586 }, { "epoch": 3.6338672768878717, "grad_norm": 1.3377209901809692, "learning_rate": 9.658231764246612e-07, "loss": 0.49734920263290405, "step": 1588 }, { "epoch": 3.6384439359267735, "grad_norm": 0.6160526275634766, "learning_rate": 9.617598789466309e-07, "loss": 0.4488961696624756, "step": 1590 }, { "epoch": 3.643020594965675, "grad_norm": 0.6402332782745361, "learning_rate": 9.577458397489267e-07, "loss": 0.34510815143585205, "step": 1592 }, { "epoch": 3.6475972540045767, "grad_norm": 2.2822258472442627, "learning_rate": 9.537811163390726e-07, "loss": 0.29307034611701965, "step": 1594 }, { "epoch": 3.6521739130434785, "grad_norm": 0.6280086636543274, "learning_rate": 9.498657655180603e-07, "loss": 0.7618148326873779, "step": 1596 }, { "epoch": 3.65675057208238, "grad_norm": 1.1775965690612793, "learning_rate": 9.459998433795451e-07, "loss": 0.5977869033813477, "step": 1598 }, { "epoch": 3.6613272311212812, "grad_norm": 1.0905108451843262, "learning_rate": 9.421834053090337e-07, "loss": 0.19339358806610107, "step": 1600 }, { "epoch": 3.665903890160183, "grad_norm": 0.7036289572715759, "learning_rate": 9.384165059830962e-07, "loss": 0.3755697011947632, "step": 1602 }, { "epoch": 3.670480549199085, "grad_norm": 1.6435350179672241, "learning_rate": 9.346991993685812e-07, "loss": 0.23423030972480774, "step": 1604 }, { "epoch": 3.675057208237986, "grad_norm": 0.6723319888114929, "learning_rate": 9.310315387218422e-07, "loss": 0.3673589825630188, "step": 1606 }, { "epoch": 3.679633867276888, "grad_norm": 0.6365793943405151, "learning_rate": 9.274135765879747e-07, "loss": 0.5359373092651367, "step": 1608 }, { "epoch": 3.6842105263157894, "grad_norm": 0.7959802150726318, "learning_rate": 9.238453648000641e-07, "loss": 0.1777590662240982, "step": 1610 }, { "epoch": 3.688787185354691, "grad_norm": 0.6853817105293274, "learning_rate": 9.203269544784425e-07, "loss": 0.38532423973083496, "step": 1612 }, { "epoch": 3.6933638443935926, "grad_norm": 1.2843016386032104, "learning_rate": 9.168583960299554e-07, "loss": 0.08693390339612961, "step": 1614 }, { "epoch": 3.6979405034324944, "grad_norm": 0.7138749361038208, "learning_rate": 9.134397391472428e-07, "loss": 0.39513278007507324, "step": 1616 }, { "epoch": 3.7025171624713957, "grad_norm": 0.6505438685417175, "learning_rate": 9.100710328080235e-07, "loss": 0.6121611595153809, "step": 1618 }, { "epoch": 3.7070938215102975, "grad_norm": 0.9069646596908569, "learning_rate": 9.06752325274395e-07, "loss": 0.5526926517486572, "step": 1620 }, { "epoch": 3.7116704805491993, "grad_norm": 1.1165289878845215, "learning_rate": 9.034836640921429e-07, "loss": 0.07703058421611786, "step": 1622 }, { "epoch": 3.7162471395881007, "grad_norm": 0.7630195021629333, "learning_rate": 9.00265096090058e-07, "loss": 0.46056246757507324, "step": 1624 }, { "epoch": 3.720823798627002, "grad_norm": 0.6535618305206299, "learning_rate": 8.970966673792673e-07, "loss": 0.42857468128204346, "step": 1626 }, { "epoch": 3.725400457665904, "grad_norm": 0.8734167218208313, "learning_rate": 8.939784233525715e-07, "loss": 0.256624311208725, "step": 1628 }, { "epoch": 3.7299771167048057, "grad_norm": 0.7594066858291626, "learning_rate": 8.909104086837956e-07, "loss": 0.2708790898323059, "step": 1630 }, { "epoch": 3.734553775743707, "grad_norm": 0.7039399743080139, "learning_rate": 8.878926673271494e-07, "loss": 0.24956341087818146, "step": 1632 }, { "epoch": 3.7391304347826084, "grad_norm": 0.5838392376899719, "learning_rate": 8.849252425165964e-07, "loss": 0.15410839021205902, "step": 1634 }, { "epoch": 3.7437070938215102, "grad_norm": 0.9392322301864624, "learning_rate": 8.82008176765237e-07, "loss": 0.28980621695518494, "step": 1636 }, { "epoch": 3.748283752860412, "grad_norm": 0.7467678785324097, "learning_rate": 8.791415118646951e-07, "loss": 0.43078869581222534, "step": 1638 }, { "epoch": 3.7528604118993134, "grad_norm": 0.9660971760749817, "learning_rate": 8.763252888845239e-07, "loss": 0.047181982547044754, "step": 1640 }, { "epoch": 3.757437070938215, "grad_norm": 0.8727622032165527, "learning_rate": 8.735595481716144e-07, "loss": 0.696696937084198, "step": 1642 }, { "epoch": 3.7620137299771166, "grad_norm": 0.6368957757949829, "learning_rate": 8.708443293496197e-07, "loss": 0.5460187196731567, "step": 1644 }, { "epoch": 3.7665903890160184, "grad_norm": 0.7762473821640015, "learning_rate": 8.681796713183851e-07, "loss": 0.5858830809593201, "step": 1646 }, { "epoch": 3.7711670480549198, "grad_norm": 0.8503757119178772, "learning_rate": 8.655656122533918e-07, "loss": 0.33725491166114807, "step": 1648 }, { "epoch": 3.7757437070938216, "grad_norm": 0.9335038661956787, "learning_rate": 8.630021896052107e-07, "loss": 0.3240436017513275, "step": 1650 }, { "epoch": 3.780320366132723, "grad_norm": 0.6535862684249878, "learning_rate": 8.604894400989643e-07, "loss": 0.5961631536483765, "step": 1652 }, { "epoch": 3.7848970251716247, "grad_norm": 0.6312614679336548, "learning_rate": 8.580273997338029e-07, "loss": 0.5986257791519165, "step": 1654 }, { "epoch": 3.7894736842105265, "grad_norm": 1.371013879776001, "learning_rate": 8.556161037823857e-07, "loss": 0.1755972057580948, "step": 1656 }, { "epoch": 3.794050343249428, "grad_norm": 0.6041772961616516, "learning_rate": 8.532555867903774e-07, "loss": 0.4229702055454254, "step": 1658 }, { "epoch": 3.7986270022883293, "grad_norm": 0.7439691424369812, "learning_rate": 8.509458825759552e-07, "loss": 0.42115482687950134, "step": 1660 }, { "epoch": 3.803203661327231, "grad_norm": 0.712613046169281, "learning_rate": 8.486870242293181e-07, "loss": 0.27668675780296326, "step": 1662 }, { "epoch": 3.807780320366133, "grad_norm": 0.6914879679679871, "learning_rate": 8.46479044112221e-07, "loss": 0.48982763290405273, "step": 1664 }, { "epoch": 3.8123569794050343, "grad_norm": 0.22564047574996948, "learning_rate": 8.443219738575045e-07, "loss": 0.03790595009922981, "step": 1666 }, { "epoch": 3.816933638443936, "grad_norm": 1.0601603984832764, "learning_rate": 8.422158443686438e-07, "loss": 0.29513728618621826, "step": 1668 }, { "epoch": 3.8215102974828374, "grad_norm": 0.8904151320457458, "learning_rate": 8.401606858193082e-07, "loss": 0.5875513553619385, "step": 1670 }, { "epoch": 3.8260869565217392, "grad_norm": 2.976503610610962, "learning_rate": 8.381565276529259e-07, "loss": 0.6087798476219177, "step": 1672 }, { "epoch": 3.8306636155606406, "grad_norm": 0.632571280002594, "learning_rate": 8.362033985822622e-07, "loss": 0.32114294171333313, "step": 1674 }, { "epoch": 3.8352402745995424, "grad_norm": 0.601119339466095, "learning_rate": 8.343013265890103e-07, "loss": 0.5873348712921143, "step": 1676 }, { "epoch": 3.839816933638444, "grad_norm": 0.9582832455635071, "learning_rate": 8.324503389233897e-07, "loss": 0.3884204924106598, "step": 1678 }, { "epoch": 3.8443935926773456, "grad_norm": 0.6112123727798462, "learning_rate": 8.306504621037538e-07, "loss": 0.5339500308036804, "step": 1680 }, { "epoch": 3.8489702517162474, "grad_norm": 1.0593942403793335, "learning_rate": 8.289017219162127e-07, "loss": 0.5693700909614563, "step": 1682 }, { "epoch": 3.8535469107551488, "grad_norm": 0.12832017242908478, "learning_rate": 8.27204143414262e-07, "loss": 0.35359063744544983, "step": 1684 }, { "epoch": 3.85812356979405, "grad_norm": 0.6210229992866516, "learning_rate": 8.25557750918425e-07, "loss": 0.5230538845062256, "step": 1686 }, { "epoch": 3.862700228832952, "grad_norm": 0.16382178664207458, "learning_rate": 8.239625680159025e-07, "loss": 0.26382216811180115, "step": 1688 }, { "epoch": 3.8672768878718538, "grad_norm": 2.582536220550537, "learning_rate": 8.224186175602379e-07, "loss": 0.4692237079143524, "step": 1690 }, { "epoch": 3.871853546910755, "grad_norm": 0.19737273454666138, "learning_rate": 8.209259216709867e-07, "loss": 0.20311836898326874, "step": 1692 }, { "epoch": 3.8764302059496565, "grad_norm": 0.6413448452949524, "learning_rate": 8.19484501733401e-07, "loss": 0.3281576633453369, "step": 1694 }, { "epoch": 3.8810068649885583, "grad_norm": 0.5512455701828003, "learning_rate": 8.180943783981235e-07, "loss": 0.43348217010498047, "step": 1696 }, { "epoch": 3.88558352402746, "grad_norm": 0.6660462021827698, "learning_rate": 8.167555715808909e-07, "loss": 0.2919246256351471, "step": 1698 }, { "epoch": 3.8901601830663615, "grad_norm": 0.6395224928855896, "learning_rate": 8.154681004622488e-07, "loss": 0.6133857369422913, "step": 1700 }, { "epoch": 3.8947368421052633, "grad_norm": 0.9475614428520203, "learning_rate": 8.142319834872765e-07, "loss": 0.11332155019044876, "step": 1702 }, { "epoch": 3.8993135011441646, "grad_norm": 0.9015949368476868, "learning_rate": 8.130472383653242e-07, "loss": 0.46062496304512024, "step": 1704 }, { "epoch": 3.9038901601830664, "grad_norm": 1.1249760389328003, "learning_rate": 8.119138820697578e-07, "loss": 0.30429723858833313, "step": 1706 }, { "epoch": 3.908466819221968, "grad_norm": 0.6844801306724548, "learning_rate": 8.108319308377159e-07, "loss": 0.2837408483028412, "step": 1708 }, { "epoch": 3.9130434782608696, "grad_norm": 0.6176382303237915, "learning_rate": 8.098014001698775e-07, "loss": 0.3056495487689972, "step": 1710 }, { "epoch": 3.917620137299771, "grad_norm": 0.6619205474853516, "learning_rate": 8.088223048302401e-07, "loss": 0.3151998519897461, "step": 1712 }, { "epoch": 3.922196796338673, "grad_norm": 0.6511373519897461, "learning_rate": 8.078946588459083e-07, "loss": 0.31710049510002136, "step": 1714 }, { "epoch": 3.9267734553775746, "grad_norm": 0.45258191227912903, "learning_rate": 8.070184755068918e-07, "loss": 0.49992504715919495, "step": 1716 }, { "epoch": 3.931350114416476, "grad_norm": 1.0866183042526245, "learning_rate": 8.061937673659166e-07, "loss": 0.40558746457099915, "step": 1718 }, { "epoch": 3.9359267734553773, "grad_norm": 0.6440381407737732, "learning_rate": 8.054205462382437e-07, "loss": 0.3771609663963318, "step": 1720 }, { "epoch": 3.940503432494279, "grad_norm": 0.5777904987335205, "learning_rate": 8.046988232015002e-07, "loss": 0.5135948061943054, "step": 1722 }, { "epoch": 3.945080091533181, "grad_norm": 0.6155217885971069, "learning_rate": 8.040286085955212e-07, "loss": 0.5833529233932495, "step": 1724 }, { "epoch": 3.9496567505720823, "grad_norm": 0.9023070335388184, "learning_rate": 8.034099120222018e-07, "loss": 0.33015888929367065, "step": 1726 }, { "epoch": 3.954233409610984, "grad_norm": 0.558512806892395, "learning_rate": 8.028427423453575e-07, "loss": 0.3327302932739258, "step": 1728 }, { "epoch": 3.9588100686498855, "grad_norm": 1.3595865964889526, "learning_rate": 8.023271076906006e-07, "loss": 0.33213916420936584, "step": 1730 }, { "epoch": 3.9633867276887873, "grad_norm": 0.9910792708396912, "learning_rate": 8.018630154452202e-07, "loss": 0.303692489862442, "step": 1732 }, { "epoch": 3.9679633867276887, "grad_norm": 0.8309425711631775, "learning_rate": 8.01450472258079e-07, "loss": 0.5128837823867798, "step": 1734 }, { "epoch": 3.9725400457665905, "grad_norm": 0.7355870604515076, "learning_rate": 8.010894840395169e-07, "loss": 0.3522130846977234, "step": 1736 }, { "epoch": 3.977116704805492, "grad_norm": 0.762816846370697, "learning_rate": 8.007800559612672e-07, "loss": 0.6588464379310608, "step": 1738 }, { "epoch": 3.9816933638443937, "grad_norm": 0.8466113209724426, "learning_rate": 8.005221924563803e-07, "loss": 0.7762544751167297, "step": 1740 }, { "epoch": 3.9862700228832955, "grad_norm": 0.7509016990661621, "learning_rate": 8.003158972191635e-07, "loss": 0.39617398381233215, "step": 1742 }, { "epoch": 3.990846681922197, "grad_norm": 0.9751285910606384, "learning_rate": 8.001611732051253e-07, "loss": 0.3054667115211487, "step": 1744 }, { "epoch": 3.995423340961098, "grad_norm": 1.1020561456680298, "learning_rate": 8.000580226309339e-07, "loss": 0.30580854415893555, "step": 1746 }, { "epoch": 4.0, "grad_norm": 0.5312338471412659, "learning_rate": 8.000064469743863e-07, "loss": 0.5320942401885986, "step": 1748 }, { "epoch": 4.0, "step": 1748, "total_flos": 3.454020596210336e+18, "train_loss": 0.7742519322697508, "train_runtime": 25098.6052, "train_samples_per_second": 2.089, "train_steps_per_second": 0.07 } ], "logging_steps": 2, "max_steps": 1748, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.454020596210336e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }