{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999501329787234, "eval_steps": 500, "global_step": 6015, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004986702127659574, "grad_norm": 5.041937828063965, "learning_rate": 1.6611295681063125e-08, "loss": 0.845, "step": 1 }, { "epoch": 0.0009973404255319148, "grad_norm": 5.029145240783691, "learning_rate": 3.322259136212625e-08, "loss": 0.7972, "step": 2 }, { "epoch": 0.0014960106382978723, "grad_norm": 5.162477970123291, "learning_rate": 4.983388704318937e-08, "loss": 0.8068, "step": 3 }, { "epoch": 0.0019946808510638296, "grad_norm": 5.3116135597229, "learning_rate": 6.64451827242525e-08, "loss": 0.8248, "step": 4 }, { "epoch": 0.0024933510638297874, "grad_norm": 5.366575241088867, "learning_rate": 8.305647840531563e-08, "loss": 0.8418, "step": 5 }, { "epoch": 0.0029920212765957447, "grad_norm": 4.911136627197266, "learning_rate": 9.966777408637874e-08, "loss": 0.8154, "step": 6 }, { "epoch": 0.003490691489361702, "grad_norm": 5.3164873123168945, "learning_rate": 1.1627906976744187e-07, "loss": 0.8309, "step": 7 }, { "epoch": 0.003989361702127659, "grad_norm": 4.8563551902771, "learning_rate": 1.32890365448505e-07, "loss": 0.7658, "step": 8 }, { "epoch": 0.004488031914893617, "grad_norm": 5.097818374633789, "learning_rate": 1.4950166112956811e-07, "loss": 0.8247, "step": 9 }, { "epoch": 0.004986702127659575, "grad_norm": 5.430413246154785, "learning_rate": 1.6611295681063126e-07, "loss": 0.7879, "step": 10 }, { "epoch": 0.005485372340425532, "grad_norm": 5.13662052154541, "learning_rate": 1.827242524916944e-07, "loss": 0.7796, "step": 11 }, { "epoch": 0.005984042553191489, "grad_norm": 5.2010369300842285, "learning_rate": 1.9933554817275749e-07, "loss": 0.8083, "step": 12 }, { "epoch": 0.006482712765957447, "grad_norm": 5.279240608215332, "learning_rate": 2.159468438538206e-07, "loss": 0.826, "step": 13 }, { "epoch": 0.006981382978723404, "grad_norm": 5.220824718475342, "learning_rate": 2.3255813953488374e-07, "loss": 0.8025, "step": 14 }, { "epoch": 0.007480053191489362, "grad_norm": 4.779820442199707, "learning_rate": 2.491694352159469e-07, "loss": 0.8038, "step": 15 }, { "epoch": 0.007978723404255319, "grad_norm": 4.620970726013184, "learning_rate": 2.6578073089701e-07, "loss": 0.7979, "step": 16 }, { "epoch": 0.008477393617021276, "grad_norm": 4.727428436279297, "learning_rate": 2.823920265780731e-07, "loss": 0.7929, "step": 17 }, { "epoch": 0.008976063829787235, "grad_norm": 4.871448040008545, "learning_rate": 2.9900332225913623e-07, "loss": 0.8128, "step": 18 }, { "epoch": 0.009474734042553192, "grad_norm": 5.118088722229004, "learning_rate": 3.1561461794019934e-07, "loss": 0.8593, "step": 19 }, { "epoch": 0.00997340425531915, "grad_norm": 5.304712772369385, "learning_rate": 3.322259136212625e-07, "loss": 0.8529, "step": 20 }, { "epoch": 0.010472074468085107, "grad_norm": 4.961639881134033, "learning_rate": 3.488372093023256e-07, "loss": 0.8468, "step": 21 }, { "epoch": 0.010970744680851064, "grad_norm": 4.998758316040039, "learning_rate": 3.654485049833888e-07, "loss": 0.8022, "step": 22 }, { "epoch": 0.011469414893617021, "grad_norm": 4.689671516418457, "learning_rate": 3.8205980066445186e-07, "loss": 0.8102, "step": 23 }, { "epoch": 0.011968085106382979, "grad_norm": 4.048353672027588, "learning_rate": 3.9867109634551497e-07, "loss": 0.7943, "step": 24 }, { "epoch": 0.012466755319148936, "grad_norm": 4.141507625579834, "learning_rate": 4.1528239202657814e-07, "loss": 0.7733, "step": 25 }, { "epoch": 0.012965425531914893, "grad_norm": 3.631518840789795, "learning_rate": 4.318936877076412e-07, "loss": 0.7217, "step": 26 }, { "epoch": 0.01346409574468085, "grad_norm": 3.880974531173706, "learning_rate": 4.4850498338870437e-07, "loss": 0.7786, "step": 27 }, { "epoch": 0.013962765957446808, "grad_norm": 3.858417272567749, "learning_rate": 4.651162790697675e-07, "loss": 0.7637, "step": 28 }, { "epoch": 0.014461436170212765, "grad_norm": 3.791649580001831, "learning_rate": 4.817275747508306e-07, "loss": 0.7798, "step": 29 }, { "epoch": 0.014960106382978724, "grad_norm": 3.8120455741882324, "learning_rate": 4.983388704318938e-07, "loss": 0.8065, "step": 30 }, { "epoch": 0.015458776595744681, "grad_norm": 3.653823137283325, "learning_rate": 5.149501661129568e-07, "loss": 0.779, "step": 31 }, { "epoch": 0.015957446808510637, "grad_norm": 2.9630494117736816, "learning_rate": 5.3156146179402e-07, "loss": 0.7667, "step": 32 }, { "epoch": 0.016456117021276594, "grad_norm": 2.222691774368286, "learning_rate": 5.481727574750831e-07, "loss": 0.6813, "step": 33 }, { "epoch": 0.01695478723404255, "grad_norm": 2.2919483184814453, "learning_rate": 5.647840531561462e-07, "loss": 0.7032, "step": 34 }, { "epoch": 0.01745345744680851, "grad_norm": 2.2937912940979004, "learning_rate": 5.813953488372094e-07, "loss": 0.739, "step": 35 }, { "epoch": 0.01795212765957447, "grad_norm": 2.1467792987823486, "learning_rate": 5.980066445182725e-07, "loss": 0.7251, "step": 36 }, { "epoch": 0.018450797872340427, "grad_norm": 2.2362937927246094, "learning_rate": 6.146179401993356e-07, "loss": 0.7038, "step": 37 }, { "epoch": 0.018949468085106384, "grad_norm": 2.232919931411743, "learning_rate": 6.312292358803987e-07, "loss": 0.7498, "step": 38 }, { "epoch": 0.01944813829787234, "grad_norm": 1.8852896690368652, "learning_rate": 6.478405315614617e-07, "loss": 0.6748, "step": 39 }, { "epoch": 0.0199468085106383, "grad_norm": 1.9515573978424072, "learning_rate": 6.64451827242525e-07, "loss": 0.709, "step": 40 }, { "epoch": 0.020445478723404256, "grad_norm": 1.784574270248413, "learning_rate": 6.810631229235881e-07, "loss": 0.6899, "step": 41 }, { "epoch": 0.020944148936170214, "grad_norm": 1.8259271383285522, "learning_rate": 6.976744186046513e-07, "loss": 0.7308, "step": 42 }, { "epoch": 0.02144281914893617, "grad_norm": 1.5328142642974854, "learning_rate": 7.142857142857143e-07, "loss": 0.7231, "step": 43 }, { "epoch": 0.021941489361702128, "grad_norm": 1.3611787557601929, "learning_rate": 7.308970099667776e-07, "loss": 0.6875, "step": 44 }, { "epoch": 0.022440159574468085, "grad_norm": 1.5230084657669067, "learning_rate": 7.475083056478406e-07, "loss": 0.7223, "step": 45 }, { "epoch": 0.022938829787234043, "grad_norm": 1.5788061618804932, "learning_rate": 7.641196013289037e-07, "loss": 0.6933, "step": 46 }, { "epoch": 0.0234375, "grad_norm": 1.631176233291626, "learning_rate": 7.807308970099668e-07, "loss": 0.6744, "step": 47 }, { "epoch": 0.023936170212765957, "grad_norm": 1.815906286239624, "learning_rate": 7.973421926910299e-07, "loss": 0.7659, "step": 48 }, { "epoch": 0.024434840425531915, "grad_norm": 1.709101676940918, "learning_rate": 8.139534883720931e-07, "loss": 0.632, "step": 49 }, { "epoch": 0.024933510638297872, "grad_norm": 1.7471897602081299, "learning_rate": 8.305647840531563e-07, "loss": 0.7054, "step": 50 }, { "epoch": 0.02543218085106383, "grad_norm": 1.676865577697754, "learning_rate": 8.471760797342193e-07, "loss": 0.6766, "step": 51 }, { "epoch": 0.025930851063829786, "grad_norm": 1.496412754058838, "learning_rate": 8.637873754152824e-07, "loss": 0.6582, "step": 52 }, { "epoch": 0.026429521276595744, "grad_norm": 1.4708505868911743, "learning_rate": 8.803986710963456e-07, "loss": 0.7245, "step": 53 }, { "epoch": 0.0269281914893617, "grad_norm": 1.32184636592865, "learning_rate": 8.970099667774087e-07, "loss": 0.7004, "step": 54 }, { "epoch": 0.02742686170212766, "grad_norm": 1.0908914804458618, "learning_rate": 9.136212624584719e-07, "loss": 0.6523, "step": 55 }, { "epoch": 0.027925531914893616, "grad_norm": 1.0169106721878052, "learning_rate": 9.30232558139535e-07, "loss": 0.6542, "step": 56 }, { "epoch": 0.028424202127659573, "grad_norm": 0.8692624568939209, "learning_rate": 9.46843853820598e-07, "loss": 0.6863, "step": 57 }, { "epoch": 0.02892287234042553, "grad_norm": 0.8742132186889648, "learning_rate": 9.634551495016612e-07, "loss": 0.6386, "step": 58 }, { "epoch": 0.02942154255319149, "grad_norm": 0.9292728900909424, "learning_rate": 9.800664451827244e-07, "loss": 0.6188, "step": 59 }, { "epoch": 0.02992021276595745, "grad_norm": 0.9336127638816833, "learning_rate": 9.966777408637875e-07, "loss": 0.6654, "step": 60 }, { "epoch": 0.030418882978723406, "grad_norm": 0.8894085884094238, "learning_rate": 1.0132890365448505e-06, "loss": 0.5919, "step": 61 }, { "epoch": 0.030917553191489363, "grad_norm": 0.9941591620445251, "learning_rate": 1.0299003322259137e-06, "loss": 0.6192, "step": 62 }, { "epoch": 0.03141622340425532, "grad_norm": 0.8467283248901367, "learning_rate": 1.0465116279069768e-06, "loss": 0.6319, "step": 63 }, { "epoch": 0.031914893617021274, "grad_norm": 0.7923358678817749, "learning_rate": 1.06312292358804e-06, "loss": 0.619, "step": 64 }, { "epoch": 0.03241356382978723, "grad_norm": 0.7837202548980713, "learning_rate": 1.0797342192691032e-06, "loss": 0.6347, "step": 65 }, { "epoch": 0.03291223404255319, "grad_norm": 0.7627928256988525, "learning_rate": 1.0963455149501661e-06, "loss": 0.6224, "step": 66 }, { "epoch": 0.033410904255319146, "grad_norm": 0.7718964219093323, "learning_rate": 1.1129568106312293e-06, "loss": 0.6376, "step": 67 }, { "epoch": 0.0339095744680851, "grad_norm": 0.8981025218963623, "learning_rate": 1.1295681063122925e-06, "loss": 0.6925, "step": 68 }, { "epoch": 0.03440824468085106, "grad_norm": 0.7283675074577332, "learning_rate": 1.1461794019933556e-06, "loss": 0.6182, "step": 69 }, { "epoch": 0.03490691489361702, "grad_norm": 0.7472421526908875, "learning_rate": 1.1627906976744188e-06, "loss": 0.6203, "step": 70 }, { "epoch": 0.03540558510638298, "grad_norm": 0.7049282789230347, "learning_rate": 1.1794019933554817e-06, "loss": 0.6405, "step": 71 }, { "epoch": 0.03590425531914894, "grad_norm": 0.6997106075286865, "learning_rate": 1.196013289036545e-06, "loss": 0.6144, "step": 72 }, { "epoch": 0.0364029255319149, "grad_norm": 0.6354292631149292, "learning_rate": 1.212624584717608e-06, "loss": 0.5921, "step": 73 }, { "epoch": 0.036901595744680854, "grad_norm": 0.6182045340538025, "learning_rate": 1.2292358803986712e-06, "loss": 0.5941, "step": 74 }, { "epoch": 0.03740026595744681, "grad_norm": 0.6654198169708252, "learning_rate": 1.2458471760797342e-06, "loss": 0.5889, "step": 75 }, { "epoch": 0.03789893617021277, "grad_norm": 0.6257880926132202, "learning_rate": 1.2624584717607974e-06, "loss": 0.5932, "step": 76 }, { "epoch": 0.038397606382978726, "grad_norm": 0.6403027772903442, "learning_rate": 1.2790697674418605e-06, "loss": 0.5739, "step": 77 }, { "epoch": 0.03889627659574468, "grad_norm": 0.61916583776474, "learning_rate": 1.2956810631229235e-06, "loss": 0.603, "step": 78 }, { "epoch": 0.03939494680851064, "grad_norm": 0.6974622011184692, "learning_rate": 1.3122923588039869e-06, "loss": 0.6346, "step": 79 }, { "epoch": 0.0398936170212766, "grad_norm": 0.5871825218200684, "learning_rate": 1.32890365448505e-06, "loss": 0.58, "step": 80 }, { "epoch": 0.040392287234042555, "grad_norm": 0.6517924666404724, "learning_rate": 1.345514950166113e-06, "loss": 0.5802, "step": 81 }, { "epoch": 0.04089095744680851, "grad_norm": 0.5317974090576172, "learning_rate": 1.3621262458471762e-06, "loss": 0.5796, "step": 82 }, { "epoch": 0.04138962765957447, "grad_norm": 0.5975214838981628, "learning_rate": 1.3787375415282391e-06, "loss": 0.5989, "step": 83 }, { "epoch": 0.04188829787234043, "grad_norm": 0.5880606770515442, "learning_rate": 1.3953488372093025e-06, "loss": 0.6453, "step": 84 }, { "epoch": 0.042386968085106384, "grad_norm": 0.5232079029083252, "learning_rate": 1.4119601328903657e-06, "loss": 0.5535, "step": 85 }, { "epoch": 0.04288563829787234, "grad_norm": 0.5505807995796204, "learning_rate": 1.4285714285714286e-06, "loss": 0.5471, "step": 86 }, { "epoch": 0.0433843085106383, "grad_norm": 0.5291038155555725, "learning_rate": 1.4451827242524918e-06, "loss": 0.5986, "step": 87 }, { "epoch": 0.043882978723404256, "grad_norm": 0.6015099287033081, "learning_rate": 1.4617940199335552e-06, "loss": 0.5766, "step": 88 }, { "epoch": 0.044381648936170214, "grad_norm": 0.6161697506904602, "learning_rate": 1.4784053156146181e-06, "loss": 0.5998, "step": 89 }, { "epoch": 0.04488031914893617, "grad_norm": 0.5295201539993286, "learning_rate": 1.4950166112956813e-06, "loss": 0.5606, "step": 90 }, { "epoch": 0.04537898936170213, "grad_norm": 0.5604075789451599, "learning_rate": 1.5116279069767443e-06, "loss": 0.5713, "step": 91 }, { "epoch": 0.045877659574468085, "grad_norm": 0.6596866250038147, "learning_rate": 1.5282392026578074e-06, "loss": 0.5868, "step": 92 }, { "epoch": 0.04637632978723404, "grad_norm": 0.526219367980957, "learning_rate": 1.5448504983388706e-06, "loss": 0.5584, "step": 93 }, { "epoch": 0.046875, "grad_norm": 0.530194103717804, "learning_rate": 1.5614617940199335e-06, "loss": 0.5784, "step": 94 }, { "epoch": 0.04737367021276596, "grad_norm": 0.5239599943161011, "learning_rate": 1.578073089700997e-06, "loss": 0.5846, "step": 95 }, { "epoch": 0.047872340425531915, "grad_norm": 0.5473668575286865, "learning_rate": 1.5946843853820599e-06, "loss": 0.5229, "step": 96 }, { "epoch": 0.04837101063829787, "grad_norm": 0.553165853023529, "learning_rate": 1.611295681063123e-06, "loss": 0.5745, "step": 97 }, { "epoch": 0.04886968085106383, "grad_norm": 0.6283134818077087, "learning_rate": 1.6279069767441862e-06, "loss": 0.5645, "step": 98 }, { "epoch": 0.049368351063829786, "grad_norm": 0.6052577495574951, "learning_rate": 1.6445182724252492e-06, "loss": 0.5627, "step": 99 }, { "epoch": 0.049867021276595744, "grad_norm": 0.6121793389320374, "learning_rate": 1.6611295681063126e-06, "loss": 0.5841, "step": 100 }, { "epoch": 0.0503656914893617, "grad_norm": 0.5605393052101135, "learning_rate": 1.6777408637873755e-06, "loss": 0.5792, "step": 101 }, { "epoch": 0.05086436170212766, "grad_norm": 0.49386581778526306, "learning_rate": 1.6943521594684387e-06, "loss": 0.5546, "step": 102 }, { "epoch": 0.051363031914893616, "grad_norm": 0.5071396827697754, "learning_rate": 1.7109634551495018e-06, "loss": 0.5678, "step": 103 }, { "epoch": 0.05186170212765957, "grad_norm": 0.6183278560638428, "learning_rate": 1.7275747508305648e-06, "loss": 0.5894, "step": 104 }, { "epoch": 0.05236037234042553, "grad_norm": 0.5702501535415649, "learning_rate": 1.7441860465116282e-06, "loss": 0.5769, "step": 105 }, { "epoch": 0.05285904255319149, "grad_norm": 0.6303514242172241, "learning_rate": 1.7607973421926911e-06, "loss": 0.56, "step": 106 }, { "epoch": 0.053357712765957445, "grad_norm": 0.5506829619407654, "learning_rate": 1.7774086378737543e-06, "loss": 0.5746, "step": 107 }, { "epoch": 0.0538563829787234, "grad_norm": 0.4913070797920227, "learning_rate": 1.7940199335548175e-06, "loss": 0.5656, "step": 108 }, { "epoch": 0.05435505319148936, "grad_norm": 0.5125821232795715, "learning_rate": 1.8106312292358804e-06, "loss": 0.5224, "step": 109 }, { "epoch": 0.05485372340425532, "grad_norm": 0.5043580532073975, "learning_rate": 1.8272425249169438e-06, "loss": 0.5833, "step": 110 }, { "epoch": 0.055352393617021274, "grad_norm": 0.5058549046516418, "learning_rate": 1.8438538205980068e-06, "loss": 0.5192, "step": 111 }, { "epoch": 0.05585106382978723, "grad_norm": 0.5181486010551453, "learning_rate": 1.86046511627907e-06, "loss": 0.5641, "step": 112 }, { "epoch": 0.05634973404255319, "grad_norm": 0.5478882193565369, "learning_rate": 1.877076411960133e-06, "loss": 0.5687, "step": 113 }, { "epoch": 0.056848404255319146, "grad_norm": 0.5732421875, "learning_rate": 1.893687707641196e-06, "loss": 0.549, "step": 114 }, { "epoch": 0.0573470744680851, "grad_norm": 0.603874921798706, "learning_rate": 1.9102990033222592e-06, "loss": 0.5423, "step": 115 }, { "epoch": 0.05784574468085106, "grad_norm": 0.4818834960460663, "learning_rate": 1.9269102990033224e-06, "loss": 0.5166, "step": 116 }, { "epoch": 0.05834441489361702, "grad_norm": 0.49788013100624084, "learning_rate": 1.9435215946843856e-06, "loss": 0.5383, "step": 117 }, { "epoch": 0.05884308510638298, "grad_norm": 0.5274236798286438, "learning_rate": 1.9601328903654487e-06, "loss": 0.5521, "step": 118 }, { "epoch": 0.05934175531914894, "grad_norm": 0.5348125100135803, "learning_rate": 1.976744186046512e-06, "loss": 0.5686, "step": 119 }, { "epoch": 0.0598404255319149, "grad_norm": 0.5435785055160522, "learning_rate": 1.993355481727575e-06, "loss": 0.5428, "step": 120 }, { "epoch": 0.060339095744680854, "grad_norm": 0.5257694125175476, "learning_rate": 2.0099667774086382e-06, "loss": 0.5775, "step": 121 }, { "epoch": 0.06083776595744681, "grad_norm": 0.507828950881958, "learning_rate": 2.026578073089701e-06, "loss": 0.5661, "step": 122 }, { "epoch": 0.06133643617021277, "grad_norm": 0.5494296550750732, "learning_rate": 2.043189368770764e-06, "loss": 0.5552, "step": 123 }, { "epoch": 0.061835106382978726, "grad_norm": 0.515095055103302, "learning_rate": 2.0598006644518273e-06, "loss": 0.5456, "step": 124 }, { "epoch": 0.06233377659574468, "grad_norm": 0.5605705380439758, "learning_rate": 2.0764119601328905e-06, "loss": 0.5327, "step": 125 }, { "epoch": 0.06283244680851063, "grad_norm": 0.4932507872581482, "learning_rate": 2.0930232558139536e-06, "loss": 0.5179, "step": 126 }, { "epoch": 0.0633311170212766, "grad_norm": 0.5098196864128113, "learning_rate": 2.109634551495017e-06, "loss": 0.5562, "step": 127 }, { "epoch": 0.06382978723404255, "grad_norm": 0.5044757127761841, "learning_rate": 2.12624584717608e-06, "loss": 0.5375, "step": 128 }, { "epoch": 0.06432845744680851, "grad_norm": 0.49699127674102783, "learning_rate": 2.1428571428571427e-06, "loss": 0.542, "step": 129 }, { "epoch": 0.06482712765957446, "grad_norm": 0.5430847406387329, "learning_rate": 2.1594684385382063e-06, "loss": 0.5522, "step": 130 }, { "epoch": 0.06532579787234043, "grad_norm": 0.5421743988990784, "learning_rate": 2.1760797342192695e-06, "loss": 0.5679, "step": 131 }, { "epoch": 0.06582446808510638, "grad_norm": 0.461617648601532, "learning_rate": 2.1926910299003322e-06, "loss": 0.5664, "step": 132 }, { "epoch": 0.06632313829787234, "grad_norm": 0.5660932660102844, "learning_rate": 2.2093023255813954e-06, "loss": 0.5664, "step": 133 }, { "epoch": 0.06682180851063829, "grad_norm": 0.5421304106712341, "learning_rate": 2.2259136212624586e-06, "loss": 0.5243, "step": 134 }, { "epoch": 0.06732047872340426, "grad_norm": 0.5189849734306335, "learning_rate": 2.2425249169435217e-06, "loss": 0.5936, "step": 135 }, { "epoch": 0.0678191489361702, "grad_norm": 0.5277193188667297, "learning_rate": 2.259136212624585e-06, "loss": 0.5321, "step": 136 }, { "epoch": 0.06831781914893617, "grad_norm": 0.5615208148956299, "learning_rate": 2.275747508305648e-06, "loss": 0.5409, "step": 137 }, { "epoch": 0.06881648936170212, "grad_norm": 0.5315436124801636, "learning_rate": 2.2923588039867112e-06, "loss": 0.5094, "step": 138 }, { "epoch": 0.06931515957446809, "grad_norm": 0.5491966605186462, "learning_rate": 2.3089700996677744e-06, "loss": 0.5166, "step": 139 }, { "epoch": 0.06981382978723404, "grad_norm": 0.6505034565925598, "learning_rate": 2.3255813953488376e-06, "loss": 0.5451, "step": 140 }, { "epoch": 0.0703125, "grad_norm": 0.5390035510063171, "learning_rate": 2.3421926910299007e-06, "loss": 0.5468, "step": 141 }, { "epoch": 0.07081117021276596, "grad_norm": 0.5046887993812561, "learning_rate": 2.3588039867109635e-06, "loss": 0.5022, "step": 142 }, { "epoch": 0.07130984042553191, "grad_norm": 0.49491816759109497, "learning_rate": 2.3754152823920267e-06, "loss": 0.502, "step": 143 }, { "epoch": 0.07180851063829788, "grad_norm": 0.48867475986480713, "learning_rate": 2.39202657807309e-06, "loss": 0.5521, "step": 144 }, { "epoch": 0.07230718085106383, "grad_norm": 0.5198736786842346, "learning_rate": 2.408637873754153e-06, "loss": 0.5533, "step": 145 }, { "epoch": 0.0728058510638298, "grad_norm": 0.5489479303359985, "learning_rate": 2.425249169435216e-06, "loss": 0.5431, "step": 146 }, { "epoch": 0.07330452127659574, "grad_norm": 0.539905309677124, "learning_rate": 2.4418604651162793e-06, "loss": 0.5196, "step": 147 }, { "epoch": 0.07380319148936171, "grad_norm": 0.5448138117790222, "learning_rate": 2.4584717607973425e-06, "loss": 0.5153, "step": 148 }, { "epoch": 0.07430186170212766, "grad_norm": 0.5031436085700989, "learning_rate": 2.4750830564784057e-06, "loss": 0.5143, "step": 149 }, { "epoch": 0.07480053191489362, "grad_norm": 0.5556774735450745, "learning_rate": 2.4916943521594684e-06, "loss": 0.5188, "step": 150 }, { "epoch": 0.07529920212765957, "grad_norm": 0.48575928807258606, "learning_rate": 2.5083056478405316e-06, "loss": 0.5118, "step": 151 }, { "epoch": 0.07579787234042554, "grad_norm": 0.5565800666809082, "learning_rate": 2.5249169435215947e-06, "loss": 0.5759, "step": 152 }, { "epoch": 0.07629654255319149, "grad_norm": 0.5038402080535889, "learning_rate": 2.541528239202658e-06, "loss": 0.5394, "step": 153 }, { "epoch": 0.07679521276595745, "grad_norm": 0.5466840267181396, "learning_rate": 2.558139534883721e-06, "loss": 0.5438, "step": 154 }, { "epoch": 0.0772938829787234, "grad_norm": 0.4872336685657501, "learning_rate": 2.5747508305647847e-06, "loss": 0.5572, "step": 155 }, { "epoch": 0.07779255319148937, "grad_norm": 0.5930878520011902, "learning_rate": 2.591362126245847e-06, "loss": 0.5635, "step": 156 }, { "epoch": 0.07829122340425532, "grad_norm": 0.5293257832527161, "learning_rate": 2.6079734219269106e-06, "loss": 0.524, "step": 157 }, { "epoch": 0.07878989361702128, "grad_norm": 0.5051547884941101, "learning_rate": 2.6245847176079738e-06, "loss": 0.5187, "step": 158 }, { "epoch": 0.07928856382978723, "grad_norm": 0.5163655877113342, "learning_rate": 2.641196013289037e-06, "loss": 0.5664, "step": 159 }, { "epoch": 0.0797872340425532, "grad_norm": 0.5110372304916382, "learning_rate": 2.6578073089701e-06, "loss": 0.5507, "step": 160 }, { "epoch": 0.08028590425531915, "grad_norm": 0.5038115382194519, "learning_rate": 2.674418604651163e-06, "loss": 0.5215, "step": 161 }, { "epoch": 0.08078457446808511, "grad_norm": 0.5829526782035828, "learning_rate": 2.691029900332226e-06, "loss": 0.5294, "step": 162 }, { "epoch": 0.08128324468085106, "grad_norm": 0.49868232011795044, "learning_rate": 2.707641196013289e-06, "loss": 0.5337, "step": 163 }, { "epoch": 0.08178191489361702, "grad_norm": 0.5376306176185608, "learning_rate": 2.7242524916943523e-06, "loss": 0.5159, "step": 164 }, { "epoch": 0.08228058510638298, "grad_norm": 0.5952631235122681, "learning_rate": 2.7408637873754155e-06, "loss": 0.5412, "step": 165 }, { "epoch": 0.08277925531914894, "grad_norm": 0.5957065224647522, "learning_rate": 2.7574750830564782e-06, "loss": 0.5514, "step": 166 }, { "epoch": 0.08327792553191489, "grad_norm": 0.5493671298027039, "learning_rate": 2.7740863787375414e-06, "loss": 0.4961, "step": 167 }, { "epoch": 0.08377659574468085, "grad_norm": 0.5856424570083618, "learning_rate": 2.790697674418605e-06, "loss": 0.4906, "step": 168 }, { "epoch": 0.0842752659574468, "grad_norm": 0.518429696559906, "learning_rate": 2.807308970099668e-06, "loss": 0.534, "step": 169 }, { "epoch": 0.08477393617021277, "grad_norm": 0.5402636528015137, "learning_rate": 2.8239202657807313e-06, "loss": 0.5402, "step": 170 }, { "epoch": 0.08527260638297872, "grad_norm": 0.618583083152771, "learning_rate": 2.840531561461794e-06, "loss": 0.4997, "step": 171 }, { "epoch": 0.08577127659574468, "grad_norm": 0.5137509107589722, "learning_rate": 2.8571428571428573e-06, "loss": 0.5102, "step": 172 }, { "epoch": 0.08626994680851063, "grad_norm": 0.5550179481506348, "learning_rate": 2.8737541528239204e-06, "loss": 0.5574, "step": 173 }, { "epoch": 0.0867686170212766, "grad_norm": 0.5251392126083374, "learning_rate": 2.8903654485049836e-06, "loss": 0.5123, "step": 174 }, { "epoch": 0.08726728723404255, "grad_norm": 0.5467750430107117, "learning_rate": 2.9069767441860468e-06, "loss": 0.4897, "step": 175 }, { "epoch": 0.08776595744680851, "grad_norm": 0.5499750971794128, "learning_rate": 2.9235880398671104e-06, "loss": 0.5136, "step": 176 }, { "epoch": 0.08826462765957446, "grad_norm": 0.4759860038757324, "learning_rate": 2.9401993355481727e-06, "loss": 0.5071, "step": 177 }, { "epoch": 0.08876329787234043, "grad_norm": 0.5209143161773682, "learning_rate": 2.9568106312292363e-06, "loss": 0.524, "step": 178 }, { "epoch": 0.08926196808510638, "grad_norm": 0.5410250425338745, "learning_rate": 2.9734219269102994e-06, "loss": 0.5232, "step": 179 }, { "epoch": 0.08976063829787234, "grad_norm": 0.5178418159484863, "learning_rate": 2.9900332225913626e-06, "loss": 0.5058, "step": 180 }, { "epoch": 0.09025930851063829, "grad_norm": 0.5218544602394104, "learning_rate": 3.0066445182724258e-06, "loss": 0.485, "step": 181 }, { "epoch": 0.09075797872340426, "grad_norm": 0.5209022164344788, "learning_rate": 3.0232558139534885e-06, "loss": 0.5204, "step": 182 }, { "epoch": 0.0912566489361702, "grad_norm": 0.523345410823822, "learning_rate": 3.0398671096345517e-06, "loss": 0.4991, "step": 183 }, { "epoch": 0.09175531914893617, "grad_norm": 0.5010770559310913, "learning_rate": 3.056478405315615e-06, "loss": 0.5259, "step": 184 }, { "epoch": 0.09225398936170212, "grad_norm": 0.5434916615486145, "learning_rate": 3.073089700996678e-06, "loss": 0.5316, "step": 185 }, { "epoch": 0.09275265957446809, "grad_norm": 0.4862251877784729, "learning_rate": 3.089700996677741e-06, "loss": 0.5373, "step": 186 }, { "epoch": 0.09325132978723404, "grad_norm": 0.5355278253555298, "learning_rate": 3.106312292358804e-06, "loss": 0.5232, "step": 187 }, { "epoch": 0.09375, "grad_norm": 0.6163966655731201, "learning_rate": 3.122923588039867e-06, "loss": 0.5366, "step": 188 }, { "epoch": 0.09424867021276596, "grad_norm": 0.5975522994995117, "learning_rate": 3.1395348837209307e-06, "loss": 0.4945, "step": 189 }, { "epoch": 0.09474734042553191, "grad_norm": 0.5236570835113525, "learning_rate": 3.156146179401994e-06, "loss": 0.528, "step": 190 }, { "epoch": 0.09524601063829788, "grad_norm": 0.5212184190750122, "learning_rate": 3.172757475083057e-06, "loss": 0.4802, "step": 191 }, { "epoch": 0.09574468085106383, "grad_norm": 0.5117728114128113, "learning_rate": 3.1893687707641198e-06, "loss": 0.5134, "step": 192 }, { "epoch": 0.0962433510638298, "grad_norm": 0.6809417009353638, "learning_rate": 3.205980066445183e-06, "loss": 0.5444, "step": 193 }, { "epoch": 0.09674202127659574, "grad_norm": 0.5660489201545715, "learning_rate": 3.222591362126246e-06, "loss": 0.562, "step": 194 }, { "epoch": 0.09724069148936171, "grad_norm": 0.536969006061554, "learning_rate": 3.2392026578073093e-06, "loss": 0.5058, "step": 195 }, { "epoch": 0.09773936170212766, "grad_norm": 0.479020893573761, "learning_rate": 3.2558139534883724e-06, "loss": 0.5199, "step": 196 }, { "epoch": 0.09823803191489362, "grad_norm": 0.516930878162384, "learning_rate": 3.272425249169435e-06, "loss": 0.528, "step": 197 }, { "epoch": 0.09873670212765957, "grad_norm": 0.5619287490844727, "learning_rate": 3.2890365448504984e-06, "loss": 0.5135, "step": 198 }, { "epoch": 0.09923537234042554, "grad_norm": 0.6015617847442627, "learning_rate": 3.305647840531562e-06, "loss": 0.4757, "step": 199 }, { "epoch": 0.09973404255319149, "grad_norm": 0.5321779847145081, "learning_rate": 3.322259136212625e-06, "loss": 0.5104, "step": 200 }, { "epoch": 0.10023271276595745, "grad_norm": 0.5019960403442383, "learning_rate": 3.3388704318936883e-06, "loss": 0.4787, "step": 201 }, { "epoch": 0.1007313829787234, "grad_norm": 0.48650068044662476, "learning_rate": 3.355481727574751e-06, "loss": 0.5355, "step": 202 }, { "epoch": 0.10123005319148937, "grad_norm": 0.47486990690231323, "learning_rate": 3.372093023255814e-06, "loss": 0.4975, "step": 203 }, { "epoch": 0.10172872340425532, "grad_norm": 0.48074761033058167, "learning_rate": 3.3887043189368774e-06, "loss": 0.5043, "step": 204 }, { "epoch": 0.10222739361702128, "grad_norm": 0.5484845638275146, "learning_rate": 3.4053156146179405e-06, "loss": 0.5472, "step": 205 }, { "epoch": 0.10272606382978723, "grad_norm": 0.4848787784576416, "learning_rate": 3.4219269102990037e-06, "loss": 0.4954, "step": 206 }, { "epoch": 0.1032247340425532, "grad_norm": 0.5236577987670898, "learning_rate": 3.4385382059800664e-06, "loss": 0.5038, "step": 207 }, { "epoch": 0.10372340425531915, "grad_norm": 0.57725989818573, "learning_rate": 3.4551495016611296e-06, "loss": 0.5597, "step": 208 }, { "epoch": 0.10422207446808511, "grad_norm": 0.5717082023620605, "learning_rate": 3.4717607973421928e-06, "loss": 0.5113, "step": 209 }, { "epoch": 0.10472074468085106, "grad_norm": 0.5325572490692139, "learning_rate": 3.4883720930232564e-06, "loss": 0.5106, "step": 210 }, { "epoch": 0.10521941489361702, "grad_norm": 0.5694642663002014, "learning_rate": 3.5049833887043195e-06, "loss": 0.5049, "step": 211 }, { "epoch": 0.10571808510638298, "grad_norm": 0.541289746761322, "learning_rate": 3.5215946843853823e-06, "loss": 0.546, "step": 212 }, { "epoch": 0.10621675531914894, "grad_norm": 0.5627945065498352, "learning_rate": 3.5382059800664454e-06, "loss": 0.5659, "step": 213 }, { "epoch": 0.10671542553191489, "grad_norm": 0.5938902497291565, "learning_rate": 3.5548172757475086e-06, "loss": 0.4906, "step": 214 }, { "epoch": 0.10721409574468085, "grad_norm": 0.5777013897895813, "learning_rate": 3.5714285714285718e-06, "loss": 0.5245, "step": 215 }, { "epoch": 0.1077127659574468, "grad_norm": 0.5456171035766602, "learning_rate": 3.588039867109635e-06, "loss": 0.5168, "step": 216 }, { "epoch": 0.10821143617021277, "grad_norm": 0.5043011903762817, "learning_rate": 3.6046511627906977e-06, "loss": 0.4956, "step": 217 }, { "epoch": 0.10871010638297872, "grad_norm": 0.5057460069656372, "learning_rate": 3.621262458471761e-06, "loss": 0.5875, "step": 218 }, { "epoch": 0.10920877659574468, "grad_norm": 0.5740747451782227, "learning_rate": 3.637873754152824e-06, "loss": 0.496, "step": 219 }, { "epoch": 0.10970744680851063, "grad_norm": 0.5573886036872864, "learning_rate": 3.6544850498338876e-06, "loss": 0.519, "step": 220 }, { "epoch": 0.1102061170212766, "grad_norm": 0.5448163747787476, "learning_rate": 3.671096345514951e-06, "loss": 0.5334, "step": 221 }, { "epoch": 0.11070478723404255, "grad_norm": 0.47392332553863525, "learning_rate": 3.6877076411960135e-06, "loss": 0.5187, "step": 222 }, { "epoch": 0.11120345744680851, "grad_norm": 0.5237601399421692, "learning_rate": 3.7043189368770767e-06, "loss": 0.5053, "step": 223 }, { "epoch": 0.11170212765957446, "grad_norm": 0.5265491008758545, "learning_rate": 3.72093023255814e-06, "loss": 0.5089, "step": 224 }, { "epoch": 0.11220079787234043, "grad_norm": 0.6316487193107605, "learning_rate": 3.737541528239203e-06, "loss": 0.4968, "step": 225 }, { "epoch": 0.11269946808510638, "grad_norm": 0.49390578269958496, "learning_rate": 3.754152823920266e-06, "loss": 0.4845, "step": 226 }, { "epoch": 0.11319813829787234, "grad_norm": 0.5778462290763855, "learning_rate": 3.7707641196013294e-06, "loss": 0.5099, "step": 227 }, { "epoch": 0.11369680851063829, "grad_norm": 0.5743120908737183, "learning_rate": 3.787375415282392e-06, "loss": 0.5354, "step": 228 }, { "epoch": 0.11419547872340426, "grad_norm": 0.5942612290382385, "learning_rate": 3.8039867109634553e-06, "loss": 0.5546, "step": 229 }, { "epoch": 0.1146941489361702, "grad_norm": 0.500817596912384, "learning_rate": 3.8205980066445185e-06, "loss": 0.5625, "step": 230 }, { "epoch": 0.11519281914893617, "grad_norm": 0.5744107961654663, "learning_rate": 3.837209302325582e-06, "loss": 0.5131, "step": 231 }, { "epoch": 0.11569148936170212, "grad_norm": 0.5681580305099487, "learning_rate": 3.853820598006645e-06, "loss": 0.5361, "step": 232 }, { "epoch": 0.11619015957446809, "grad_norm": 0.5379349589347839, "learning_rate": 3.8704318936877075e-06, "loss": 0.5156, "step": 233 }, { "epoch": 0.11668882978723404, "grad_norm": 0.5775149464607239, "learning_rate": 3.887043189368771e-06, "loss": 0.5018, "step": 234 }, { "epoch": 0.1171875, "grad_norm": 0.5088963508605957, "learning_rate": 3.903654485049834e-06, "loss": 0.5259, "step": 235 }, { "epoch": 0.11768617021276596, "grad_norm": 0.5554478764533997, "learning_rate": 3.9202657807308975e-06, "loss": 0.5252, "step": 236 }, { "epoch": 0.11818484042553191, "grad_norm": 0.529472291469574, "learning_rate": 3.936877076411961e-06, "loss": 0.5014, "step": 237 }, { "epoch": 0.11868351063829788, "grad_norm": 0.6289129257202148, "learning_rate": 3.953488372093024e-06, "loss": 0.4927, "step": 238 }, { "epoch": 0.11918218085106383, "grad_norm": 0.5931803584098816, "learning_rate": 3.9700996677740865e-06, "loss": 0.5314, "step": 239 }, { "epoch": 0.1196808510638298, "grad_norm": 0.5517998933792114, "learning_rate": 3.98671096345515e-06, "loss": 0.5066, "step": 240 }, { "epoch": 0.12017952127659574, "grad_norm": 0.49503839015960693, "learning_rate": 4.003322259136213e-06, "loss": 0.5066, "step": 241 }, { "epoch": 0.12067819148936171, "grad_norm": 0.5341817140579224, "learning_rate": 4.0199335548172765e-06, "loss": 0.5243, "step": 242 }, { "epoch": 0.12117686170212766, "grad_norm": 0.5035473108291626, "learning_rate": 4.036544850498339e-06, "loss": 0.4712, "step": 243 }, { "epoch": 0.12167553191489362, "grad_norm": 0.5755202174186707, "learning_rate": 4.053156146179402e-06, "loss": 0.4876, "step": 244 }, { "epoch": 0.12217420212765957, "grad_norm": 0.5627059936523438, "learning_rate": 4.0697674418604655e-06, "loss": 0.505, "step": 245 }, { "epoch": 0.12267287234042554, "grad_norm": 0.612430989742279, "learning_rate": 4.086378737541528e-06, "loss": 0.5211, "step": 246 }, { "epoch": 0.12317154255319149, "grad_norm": 0.5359200239181519, "learning_rate": 4.102990033222592e-06, "loss": 0.5054, "step": 247 }, { "epoch": 0.12367021276595745, "grad_norm": 0.5383697152137756, "learning_rate": 4.119601328903655e-06, "loss": 0.4974, "step": 248 }, { "epoch": 0.1241688829787234, "grad_norm": 0.5147137641906738, "learning_rate": 4.136212624584718e-06, "loss": 0.5063, "step": 249 }, { "epoch": 0.12466755319148937, "grad_norm": 0.5158129930496216, "learning_rate": 4.152823920265781e-06, "loss": 0.4579, "step": 250 }, { "epoch": 0.12516622340425532, "grad_norm": 0.5460744500160217, "learning_rate": 4.1694352159468446e-06, "loss": 0.5178, "step": 251 }, { "epoch": 0.12566489361702127, "grad_norm": 0.5364257097244263, "learning_rate": 4.186046511627907e-06, "loss": 0.4881, "step": 252 }, { "epoch": 0.12616356382978725, "grad_norm": 0.4994054436683655, "learning_rate": 4.20265780730897e-06, "loss": 0.5265, "step": 253 }, { "epoch": 0.1266622340425532, "grad_norm": 0.4760734736919403, "learning_rate": 4.219269102990034e-06, "loss": 0.486, "step": 254 }, { "epoch": 0.12716090425531915, "grad_norm": 0.5598995685577393, "learning_rate": 4.235880398671096e-06, "loss": 0.4885, "step": 255 }, { "epoch": 0.1276595744680851, "grad_norm": 0.5896456837654114, "learning_rate": 4.25249169435216e-06, "loss": 0.4953, "step": 256 }, { "epoch": 0.12815824468085107, "grad_norm": 0.5594127774238586, "learning_rate": 4.2691029900332236e-06, "loss": 0.516, "step": 257 }, { "epoch": 0.12865691489361702, "grad_norm": 0.5354496240615845, "learning_rate": 4.2857142857142855e-06, "loss": 0.4726, "step": 258 }, { "epoch": 0.12915558510638298, "grad_norm": 0.559644877910614, "learning_rate": 4.302325581395349e-06, "loss": 0.5101, "step": 259 }, { "epoch": 0.12965425531914893, "grad_norm": 0.6123584508895874, "learning_rate": 4.318936877076413e-06, "loss": 0.5099, "step": 260 }, { "epoch": 0.1301529255319149, "grad_norm": 0.49750375747680664, "learning_rate": 4.335548172757475e-06, "loss": 0.5088, "step": 261 }, { "epoch": 0.13065159574468085, "grad_norm": 0.5179445147514343, "learning_rate": 4.352159468438539e-06, "loss": 0.5239, "step": 262 }, { "epoch": 0.1311502659574468, "grad_norm": 0.5309945344924927, "learning_rate": 4.368770764119602e-06, "loss": 0.5045, "step": 263 }, { "epoch": 0.13164893617021275, "grad_norm": 0.5661154985427856, "learning_rate": 4.3853820598006645e-06, "loss": 0.4846, "step": 264 }, { "epoch": 0.13214760638297873, "grad_norm": 0.5255202651023865, "learning_rate": 4.401993355481728e-06, "loss": 0.4877, "step": 265 }, { "epoch": 0.13264627659574468, "grad_norm": 0.5495125651359558, "learning_rate": 4.418604651162791e-06, "loss": 0.5299, "step": 266 }, { "epoch": 0.13314494680851063, "grad_norm": 0.5583246350288391, "learning_rate": 4.435215946843854e-06, "loss": 0.4984, "step": 267 }, { "epoch": 0.13364361702127658, "grad_norm": 0.61139315366745, "learning_rate": 4.451827242524917e-06, "loss": 0.5168, "step": 268 }, { "epoch": 0.13414228723404256, "grad_norm": 0.5519322752952576, "learning_rate": 4.46843853820598e-06, "loss": 0.5258, "step": 269 }, { "epoch": 0.1346409574468085, "grad_norm": 0.5157970190048218, "learning_rate": 4.4850498338870435e-06, "loss": 0.5393, "step": 270 }, { "epoch": 0.13513962765957446, "grad_norm": 0.5480062365531921, "learning_rate": 4.501661129568107e-06, "loss": 0.5135, "step": 271 }, { "epoch": 0.1356382978723404, "grad_norm": 0.49799618124961853, "learning_rate": 4.51827242524917e-06, "loss": 0.4904, "step": 272 }, { "epoch": 0.1361369680851064, "grad_norm": 0.5166665315628052, "learning_rate": 4.5348837209302326e-06, "loss": 0.4962, "step": 273 }, { "epoch": 0.13663563829787234, "grad_norm": 0.50180584192276, "learning_rate": 4.551495016611296e-06, "loss": 0.4603, "step": 274 }, { "epoch": 0.1371343085106383, "grad_norm": 0.5414620041847229, "learning_rate": 4.568106312292359e-06, "loss": 0.4934, "step": 275 }, { "epoch": 0.13763297872340424, "grad_norm": 0.48706889152526855, "learning_rate": 4.5847176079734225e-06, "loss": 0.5319, "step": 276 }, { "epoch": 0.13813164893617022, "grad_norm": 0.5182044506072998, "learning_rate": 4.601328903654485e-06, "loss": 0.5138, "step": 277 }, { "epoch": 0.13863031914893617, "grad_norm": 0.4896914064884186, "learning_rate": 4.617940199335549e-06, "loss": 0.5232, "step": 278 }, { "epoch": 0.13912898936170212, "grad_norm": 0.5272147059440613, "learning_rate": 4.6345514950166116e-06, "loss": 0.5069, "step": 279 }, { "epoch": 0.13962765957446807, "grad_norm": 0.5151172280311584, "learning_rate": 4.651162790697675e-06, "loss": 0.51, "step": 280 }, { "epoch": 0.14012632978723405, "grad_norm": 0.5648430585861206, "learning_rate": 4.667774086378738e-06, "loss": 0.4871, "step": 281 }, { "epoch": 0.140625, "grad_norm": 0.5634363293647766, "learning_rate": 4.6843853820598015e-06, "loss": 0.5049, "step": 282 }, { "epoch": 0.14112367021276595, "grad_norm": 0.5709763765335083, "learning_rate": 4.700996677740864e-06, "loss": 0.5318, "step": 283 }, { "epoch": 0.14162234042553193, "grad_norm": 0.5153307914733887, "learning_rate": 4.717607973421927e-06, "loss": 0.5093, "step": 284 }, { "epoch": 0.14212101063829788, "grad_norm": 0.5161314606666565, "learning_rate": 4.7342192691029906e-06, "loss": 0.5108, "step": 285 }, { "epoch": 0.14261968085106383, "grad_norm": 0.4995707869529724, "learning_rate": 4.750830564784053e-06, "loss": 0.5124, "step": 286 }, { "epoch": 0.14311835106382978, "grad_norm": 0.571901798248291, "learning_rate": 4.767441860465117e-06, "loss": 0.4918, "step": 287 }, { "epoch": 0.14361702127659576, "grad_norm": 0.5418944358825684, "learning_rate": 4.78405315614618e-06, "loss": 0.5133, "step": 288 }, { "epoch": 0.1441156914893617, "grad_norm": 0.49909308552742004, "learning_rate": 4.800664451827242e-06, "loss": 0.516, "step": 289 }, { "epoch": 0.14461436170212766, "grad_norm": 0.5122935771942139, "learning_rate": 4.817275747508306e-06, "loss": 0.5071, "step": 290 }, { "epoch": 0.1451130319148936, "grad_norm": 0.5605318546295166, "learning_rate": 4.83388704318937e-06, "loss": 0.479, "step": 291 }, { "epoch": 0.1456117021276596, "grad_norm": 0.5152603387832642, "learning_rate": 4.850498338870432e-06, "loss": 0.492, "step": 292 }, { "epoch": 0.14611037234042554, "grad_norm": 0.5619462132453918, "learning_rate": 4.867109634551496e-06, "loss": 0.532, "step": 293 }, { "epoch": 0.1466090425531915, "grad_norm": 0.5618111491203308, "learning_rate": 4.883720930232559e-06, "loss": 0.4768, "step": 294 }, { "epoch": 0.14710771276595744, "grad_norm": 0.5376899242401123, "learning_rate": 4.900332225913621e-06, "loss": 0.49, "step": 295 }, { "epoch": 0.14760638297872342, "grad_norm": 0.603203296661377, "learning_rate": 4.916943521594685e-06, "loss": 0.4933, "step": 296 }, { "epoch": 0.14810505319148937, "grad_norm": 0.5392637252807617, "learning_rate": 4.933554817275748e-06, "loss": 0.5012, "step": 297 }, { "epoch": 0.14860372340425532, "grad_norm": 0.5034738779067993, "learning_rate": 4.950166112956811e-06, "loss": 0.4863, "step": 298 }, { "epoch": 0.14910239361702127, "grad_norm": 0.5505192279815674, "learning_rate": 4.966777408637874e-06, "loss": 0.477, "step": 299 }, { "epoch": 0.14960106382978725, "grad_norm": 0.5112382173538208, "learning_rate": 4.983388704318937e-06, "loss": 0.4709, "step": 300 }, { "epoch": 0.1500997340425532, "grad_norm": 0.5732061266899109, "learning_rate": 5e-06, "loss": 0.5047, "step": 301 }, { "epoch": 0.15059840425531915, "grad_norm": 0.5068342089653015, "learning_rate": 5.016611295681063e-06, "loss": 0.5086, "step": 302 }, { "epoch": 0.1510970744680851, "grad_norm": 0.4864322543144226, "learning_rate": 5.033222591362127e-06, "loss": 0.4672, "step": 303 }, { "epoch": 0.15159574468085107, "grad_norm": 0.5796331167221069, "learning_rate": 5.0498338870431895e-06, "loss": 0.502, "step": 304 }, { "epoch": 0.15209441489361702, "grad_norm": 0.5381379723548889, "learning_rate": 5.066445182724253e-06, "loss": 0.4472, "step": 305 }, { "epoch": 0.15259308510638298, "grad_norm": 0.5281261205673218, "learning_rate": 5.083056478405316e-06, "loss": 0.4485, "step": 306 }, { "epoch": 0.15309175531914893, "grad_norm": 0.5829383134841919, "learning_rate": 5.0996677740863786e-06, "loss": 0.5017, "step": 307 }, { "epoch": 0.1535904255319149, "grad_norm": 0.5411919951438904, "learning_rate": 5.116279069767442e-06, "loss": 0.5164, "step": 308 }, { "epoch": 0.15408909574468085, "grad_norm": 0.4629952609539032, "learning_rate": 5.132890365448505e-06, "loss": 0.4905, "step": 309 }, { "epoch": 0.1545877659574468, "grad_norm": 0.5428441166877747, "learning_rate": 5.149501661129569e-06, "loss": 0.5073, "step": 310 }, { "epoch": 0.15508643617021275, "grad_norm": 0.6246626377105713, "learning_rate": 5.166112956810631e-06, "loss": 0.4957, "step": 311 }, { "epoch": 0.15558510638297873, "grad_norm": 0.6010832190513611, "learning_rate": 5.182724252491694e-06, "loss": 0.4787, "step": 312 }, { "epoch": 0.15608377659574468, "grad_norm": 0.5599226951599121, "learning_rate": 5.1993355481727584e-06, "loss": 0.4993, "step": 313 }, { "epoch": 0.15658244680851063, "grad_norm": 0.5356563329696655, "learning_rate": 5.215946843853821e-06, "loss": 0.4817, "step": 314 }, { "epoch": 0.15708111702127658, "grad_norm": 0.6082132458686829, "learning_rate": 5.232558139534885e-06, "loss": 0.4713, "step": 315 }, { "epoch": 0.15757978723404256, "grad_norm": 0.5730721354484558, "learning_rate": 5.2491694352159475e-06, "loss": 0.4315, "step": 316 }, { "epoch": 0.1580784574468085, "grad_norm": 0.5036988258361816, "learning_rate": 5.26578073089701e-06, "loss": 0.4878, "step": 317 }, { "epoch": 0.15857712765957446, "grad_norm": 0.5737367868423462, "learning_rate": 5.282392026578074e-06, "loss": 0.4983, "step": 318 }, { "epoch": 0.1590757978723404, "grad_norm": 0.6215083003044128, "learning_rate": 5.299003322259137e-06, "loss": 0.5197, "step": 319 }, { "epoch": 0.1595744680851064, "grad_norm": 0.5943993926048279, "learning_rate": 5.3156146179402e-06, "loss": 0.5184, "step": 320 }, { "epoch": 0.16007313829787234, "grad_norm": 0.596855640411377, "learning_rate": 5.332225913621263e-06, "loss": 0.5053, "step": 321 }, { "epoch": 0.1605718085106383, "grad_norm": 0.5349458456039429, "learning_rate": 5.348837209302326e-06, "loss": 0.4912, "step": 322 }, { "epoch": 0.16107047872340424, "grad_norm": 0.6385061144828796, "learning_rate": 5.365448504983389e-06, "loss": 0.4875, "step": 323 }, { "epoch": 0.16156914893617022, "grad_norm": 0.5830777287483215, "learning_rate": 5.382059800664452e-06, "loss": 0.5109, "step": 324 }, { "epoch": 0.16206781914893617, "grad_norm": 0.5612320303916931, "learning_rate": 5.398671096345516e-06, "loss": 0.4843, "step": 325 }, { "epoch": 0.16256648936170212, "grad_norm": 0.622662365436554, "learning_rate": 5.415282392026578e-06, "loss": 0.4494, "step": 326 }, { "epoch": 0.16306515957446807, "grad_norm": 0.5290312767028809, "learning_rate": 5.431893687707641e-06, "loss": 0.5092, "step": 327 }, { "epoch": 0.16356382978723405, "grad_norm": 0.5777674913406372, "learning_rate": 5.448504983388705e-06, "loss": 0.5284, "step": 328 }, { "epoch": 0.1640625, "grad_norm": 0.532122790813446, "learning_rate": 5.465116279069767e-06, "loss": 0.4904, "step": 329 }, { "epoch": 0.16456117021276595, "grad_norm": 0.5975939631462097, "learning_rate": 5.481727574750831e-06, "loss": 0.4969, "step": 330 }, { "epoch": 0.16505984042553193, "grad_norm": 0.5772044658660889, "learning_rate": 5.498338870431894e-06, "loss": 0.4764, "step": 331 }, { "epoch": 0.16555851063829788, "grad_norm": 0.5552728772163391, "learning_rate": 5.5149501661129565e-06, "loss": 0.482, "step": 332 }, { "epoch": 0.16605718085106383, "grad_norm": 0.5720219612121582, "learning_rate": 5.531561461794021e-06, "loss": 0.4768, "step": 333 }, { "epoch": 0.16655585106382978, "grad_norm": 0.5810685157775879, "learning_rate": 5.548172757475083e-06, "loss": 0.4994, "step": 334 }, { "epoch": 0.16705452127659576, "grad_norm": 0.5678473114967346, "learning_rate": 5.564784053156147e-06, "loss": 0.463, "step": 335 }, { "epoch": 0.1675531914893617, "grad_norm": 0.5789889693260193, "learning_rate": 5.58139534883721e-06, "loss": 0.4854, "step": 336 }, { "epoch": 0.16805186170212766, "grad_norm": 0.583807110786438, "learning_rate": 5.598006644518273e-06, "loss": 0.4577, "step": 337 }, { "epoch": 0.1685505319148936, "grad_norm": 0.5611709952354431, "learning_rate": 5.614617940199336e-06, "loss": 0.4861, "step": 338 }, { "epoch": 0.1690492021276596, "grad_norm": 0.5327596068382263, "learning_rate": 5.631229235880399e-06, "loss": 0.5094, "step": 339 }, { "epoch": 0.16954787234042554, "grad_norm": 0.6597830057144165, "learning_rate": 5.647840531561463e-06, "loss": 0.5084, "step": 340 }, { "epoch": 0.1700465425531915, "grad_norm": 0.650718092918396, "learning_rate": 5.6644518272425254e-06, "loss": 0.5057, "step": 341 }, { "epoch": 0.17054521276595744, "grad_norm": 0.6081916093826294, "learning_rate": 5.681063122923588e-06, "loss": 0.5035, "step": 342 }, { "epoch": 0.17104388297872342, "grad_norm": 0.5869320631027222, "learning_rate": 5.697674418604652e-06, "loss": 0.5005, "step": 343 }, { "epoch": 0.17154255319148937, "grad_norm": 0.5317733883857727, "learning_rate": 5.7142857142857145e-06, "loss": 0.4895, "step": 344 }, { "epoch": 0.17204122340425532, "grad_norm": 0.536307156085968, "learning_rate": 5.730897009966778e-06, "loss": 0.4891, "step": 345 }, { "epoch": 0.17253989361702127, "grad_norm": 0.5625855922698975, "learning_rate": 5.747508305647841e-06, "loss": 0.4969, "step": 346 }, { "epoch": 0.17303856382978725, "grad_norm": 0.629438042640686, "learning_rate": 5.764119601328904e-06, "loss": 0.4951, "step": 347 }, { "epoch": 0.1735372340425532, "grad_norm": 0.6191278696060181, "learning_rate": 5.780730897009967e-06, "loss": 0.5157, "step": 348 }, { "epoch": 0.17403590425531915, "grad_norm": 0.5686163902282715, "learning_rate": 5.79734219269103e-06, "loss": 0.4726, "step": 349 }, { "epoch": 0.1745345744680851, "grad_norm": 0.6766439080238342, "learning_rate": 5.8139534883720935e-06, "loss": 0.4631, "step": 350 }, { "epoch": 0.17503324468085107, "grad_norm": 0.5187313556671143, "learning_rate": 5.830564784053156e-06, "loss": 0.4984, "step": 351 }, { "epoch": 0.17553191489361702, "grad_norm": 0.6099503636360168, "learning_rate": 5.847176079734221e-06, "loss": 0.4933, "step": 352 }, { "epoch": 0.17603058510638298, "grad_norm": 0.5839822292327881, "learning_rate": 5.863787375415283e-06, "loss": 0.5219, "step": 353 }, { "epoch": 0.17652925531914893, "grad_norm": 0.5391499996185303, "learning_rate": 5.880398671096345e-06, "loss": 0.5262, "step": 354 }, { "epoch": 0.1770279255319149, "grad_norm": 0.5514128804206848, "learning_rate": 5.89700996677741e-06, "loss": 0.4983, "step": 355 }, { "epoch": 0.17752659574468085, "grad_norm": 0.568305492401123, "learning_rate": 5.9136212624584725e-06, "loss": 0.4751, "step": 356 }, { "epoch": 0.1780252659574468, "grad_norm": 0.555429995059967, "learning_rate": 5.930232558139536e-06, "loss": 0.4934, "step": 357 }, { "epoch": 0.17852393617021275, "grad_norm": 0.5795806050300598, "learning_rate": 5.946843853820599e-06, "loss": 0.4967, "step": 358 }, { "epoch": 0.17902260638297873, "grad_norm": 0.5295781493186951, "learning_rate": 5.963455149501662e-06, "loss": 0.4925, "step": 359 }, { "epoch": 0.17952127659574468, "grad_norm": 0.5897250771522522, "learning_rate": 5.980066445182725e-06, "loss": 0.488, "step": 360 }, { "epoch": 0.18001994680851063, "grad_norm": 0.5851795077323914, "learning_rate": 5.996677740863788e-06, "loss": 0.5003, "step": 361 }, { "epoch": 0.18051861702127658, "grad_norm": 0.585107147693634, "learning_rate": 6.0132890365448515e-06, "loss": 0.4805, "step": 362 }, { "epoch": 0.18101728723404256, "grad_norm": 0.6014913320541382, "learning_rate": 6.029900332225914e-06, "loss": 0.4925, "step": 363 }, { "epoch": 0.1815159574468085, "grad_norm": 0.5732492804527283, "learning_rate": 6.046511627906977e-06, "loss": 0.5066, "step": 364 }, { "epoch": 0.18201462765957446, "grad_norm": 0.5341842174530029, "learning_rate": 6.063122923588041e-06, "loss": 0.5018, "step": 365 }, { "epoch": 0.1825132978723404, "grad_norm": 0.5281921625137329, "learning_rate": 6.079734219269103e-06, "loss": 0.4866, "step": 366 }, { "epoch": 0.1830119680851064, "grad_norm": 0.5354170799255371, "learning_rate": 6.096345514950167e-06, "loss": 0.5093, "step": 367 }, { "epoch": 0.18351063829787234, "grad_norm": 0.6058481335639954, "learning_rate": 6.11295681063123e-06, "loss": 0.5317, "step": 368 }, { "epoch": 0.1840093085106383, "grad_norm": 0.5789216160774231, "learning_rate": 6.1295681063122924e-06, "loss": 0.5046, "step": 369 }, { "epoch": 0.18450797872340424, "grad_norm": 0.5588173866271973, "learning_rate": 6.146179401993356e-06, "loss": 0.5077, "step": 370 }, { "epoch": 0.18500664893617022, "grad_norm": 0.46870294213294983, "learning_rate": 6.162790697674419e-06, "loss": 0.4789, "step": 371 }, { "epoch": 0.18550531914893617, "grad_norm": 0.5503048300743103, "learning_rate": 6.179401993355482e-06, "loss": 0.5159, "step": 372 }, { "epoch": 0.18600398936170212, "grad_norm": 0.5455701947212219, "learning_rate": 6.196013289036545e-06, "loss": 0.5072, "step": 373 }, { "epoch": 0.18650265957446807, "grad_norm": 0.5645993947982788, "learning_rate": 6.212624584717608e-06, "loss": 0.5063, "step": 374 }, { "epoch": 0.18700132978723405, "grad_norm": 0.5072670578956604, "learning_rate": 6.229235880398672e-06, "loss": 0.42, "step": 375 }, { "epoch": 0.1875, "grad_norm": 0.5537001490592957, "learning_rate": 6.245847176079734e-06, "loss": 0.4714, "step": 376 }, { "epoch": 0.18799867021276595, "grad_norm": 0.5300961136817932, "learning_rate": 6.262458471760799e-06, "loss": 0.4807, "step": 377 }, { "epoch": 0.18849734042553193, "grad_norm": 0.5460841655731201, "learning_rate": 6.279069767441861e-06, "loss": 0.4746, "step": 378 }, { "epoch": 0.18899601063829788, "grad_norm": 0.5180844068527222, "learning_rate": 6.295681063122924e-06, "loss": 0.4822, "step": 379 }, { "epoch": 0.18949468085106383, "grad_norm": 0.5807256102561951, "learning_rate": 6.312292358803988e-06, "loss": 0.4734, "step": 380 }, { "epoch": 0.18999335106382978, "grad_norm": 0.5614348649978638, "learning_rate": 6.3289036544850505e-06, "loss": 0.48, "step": 381 }, { "epoch": 0.19049202127659576, "grad_norm": 0.5175067782402039, "learning_rate": 6.345514950166114e-06, "loss": 0.4495, "step": 382 }, { "epoch": 0.1909906914893617, "grad_norm": 0.526826798915863, "learning_rate": 6.362126245847177e-06, "loss": 0.4733, "step": 383 }, { "epoch": 0.19148936170212766, "grad_norm": 0.5284005403518677, "learning_rate": 6.3787375415282395e-06, "loss": 0.4645, "step": 384 }, { "epoch": 0.1919880319148936, "grad_norm": 0.5166677832603455, "learning_rate": 6.395348837209303e-06, "loss": 0.4856, "step": 385 }, { "epoch": 0.1924867021276596, "grad_norm": 0.48329007625579834, "learning_rate": 6.411960132890366e-06, "loss": 0.4949, "step": 386 }, { "epoch": 0.19298537234042554, "grad_norm": 0.5857308506965637, "learning_rate": 6.4285714285714295e-06, "loss": 0.4747, "step": 387 }, { "epoch": 0.1934840425531915, "grad_norm": 0.5376214385032654, "learning_rate": 6.445182724252492e-06, "loss": 0.4756, "step": 388 }, { "epoch": 0.19398271276595744, "grad_norm": 0.5097772479057312, "learning_rate": 6.461794019933555e-06, "loss": 0.4954, "step": 389 }, { "epoch": 0.19448138297872342, "grad_norm": 0.5624365210533142, "learning_rate": 6.4784053156146185e-06, "loss": 0.501, "step": 390 }, { "epoch": 0.19498005319148937, "grad_norm": 0.5303126573562622, "learning_rate": 6.495016611295681e-06, "loss": 0.4507, "step": 391 }, { "epoch": 0.19547872340425532, "grad_norm": 0.577808141708374, "learning_rate": 6.511627906976745e-06, "loss": 0.5254, "step": 392 }, { "epoch": 0.19597739361702127, "grad_norm": 0.576484203338623, "learning_rate": 6.528239202657808e-06, "loss": 0.4915, "step": 393 }, { "epoch": 0.19647606382978725, "grad_norm": 0.6179257035255432, "learning_rate": 6.54485049833887e-06, "loss": 0.5199, "step": 394 }, { "epoch": 0.1969747340425532, "grad_norm": 0.5191903114318848, "learning_rate": 6.561461794019934e-06, "loss": 0.5059, "step": 395 }, { "epoch": 0.19747340425531915, "grad_norm": 0.5592581033706665, "learning_rate": 6.578073089700997e-06, "loss": 0.5313, "step": 396 }, { "epoch": 0.1979720744680851, "grad_norm": 0.5591293573379517, "learning_rate": 6.594684385382061e-06, "loss": 0.5171, "step": 397 }, { "epoch": 0.19847074468085107, "grad_norm": 0.5669188499450684, "learning_rate": 6.611295681063124e-06, "loss": 0.4943, "step": 398 }, { "epoch": 0.19896941489361702, "grad_norm": 0.5532863736152649, "learning_rate": 6.627906976744186e-06, "loss": 0.4813, "step": 399 }, { "epoch": 0.19946808510638298, "grad_norm": 0.5185867547988892, "learning_rate": 6.64451827242525e-06, "loss": 0.4759, "step": 400 }, { "epoch": 0.19996675531914893, "grad_norm": 0.5771741271018982, "learning_rate": 6.661129568106313e-06, "loss": 0.4837, "step": 401 }, { "epoch": 0.2004654255319149, "grad_norm": 0.6745825409889221, "learning_rate": 6.6777408637873766e-06, "loss": 0.4968, "step": 402 }, { "epoch": 0.20096409574468085, "grad_norm": 0.5483439564704895, "learning_rate": 6.694352159468439e-06, "loss": 0.4448, "step": 403 }, { "epoch": 0.2014627659574468, "grad_norm": 0.5389791131019592, "learning_rate": 6.710963455149502e-06, "loss": 0.4926, "step": 404 }, { "epoch": 0.20196143617021275, "grad_norm": 0.6482044458389282, "learning_rate": 6.727574750830566e-06, "loss": 0.4844, "step": 405 }, { "epoch": 0.20246010638297873, "grad_norm": 0.636666476726532, "learning_rate": 6.744186046511628e-06, "loss": 0.4714, "step": 406 }, { "epoch": 0.20295877659574468, "grad_norm": 0.5254217982292175, "learning_rate": 6.760797342192692e-06, "loss": 0.4768, "step": 407 }, { "epoch": 0.20345744680851063, "grad_norm": 0.606842577457428, "learning_rate": 6.777408637873755e-06, "loss": 0.4398, "step": 408 }, { "epoch": 0.20395611702127658, "grad_norm": 0.8092384934425354, "learning_rate": 6.7940199335548175e-06, "loss": 0.4901, "step": 409 }, { "epoch": 0.20445478723404256, "grad_norm": 0.6256732940673828, "learning_rate": 6.810631229235881e-06, "loss": 0.4973, "step": 410 }, { "epoch": 0.2049534574468085, "grad_norm": 0.5403755903244019, "learning_rate": 6.827242524916944e-06, "loss": 0.4861, "step": 411 }, { "epoch": 0.20545212765957446, "grad_norm": 0.7433263659477234, "learning_rate": 6.843853820598007e-06, "loss": 0.4964, "step": 412 }, { "epoch": 0.2059507978723404, "grad_norm": 0.6706695556640625, "learning_rate": 6.86046511627907e-06, "loss": 0.492, "step": 413 }, { "epoch": 0.2064494680851064, "grad_norm": 0.5051175951957703, "learning_rate": 6.877076411960133e-06, "loss": 0.4614, "step": 414 }, { "epoch": 0.20694813829787234, "grad_norm": 0.4936462342739105, "learning_rate": 6.8936877076411965e-06, "loss": 0.4918, "step": 415 }, { "epoch": 0.2074468085106383, "grad_norm": 0.563370943069458, "learning_rate": 6.910299003322259e-06, "loss": 0.4557, "step": 416 }, { "epoch": 0.20794547872340424, "grad_norm": 0.5619051456451416, "learning_rate": 6.926910299003324e-06, "loss": 0.5168, "step": 417 }, { "epoch": 0.20844414893617022, "grad_norm": 0.5305076837539673, "learning_rate": 6.9435215946843855e-06, "loss": 0.4688, "step": 418 }, { "epoch": 0.20894281914893617, "grad_norm": 0.6153885126113892, "learning_rate": 6.960132890365448e-06, "loss": 0.5149, "step": 419 }, { "epoch": 0.20944148936170212, "grad_norm": 0.548520028591156, "learning_rate": 6.976744186046513e-06, "loss": 0.4757, "step": 420 }, { "epoch": 0.20994015957446807, "grad_norm": 0.6531148552894592, "learning_rate": 6.9933554817275755e-06, "loss": 0.4681, "step": 421 }, { "epoch": 0.21043882978723405, "grad_norm": 0.5211474895477295, "learning_rate": 7.009966777408639e-06, "loss": 0.4916, "step": 422 }, { "epoch": 0.2109375, "grad_norm": 0.5366091728210449, "learning_rate": 7.026578073089702e-06, "loss": 0.4705, "step": 423 }, { "epoch": 0.21143617021276595, "grad_norm": 0.5815908312797546, "learning_rate": 7.0431893687707646e-06, "loss": 0.4989, "step": 424 }, { "epoch": 0.21193484042553193, "grad_norm": 0.5356396436691284, "learning_rate": 7.059800664451828e-06, "loss": 0.5082, "step": 425 }, { "epoch": 0.21243351063829788, "grad_norm": 0.5446996688842773, "learning_rate": 7.076411960132891e-06, "loss": 0.504, "step": 426 }, { "epoch": 0.21293218085106383, "grad_norm": 0.5675705075263977, "learning_rate": 7.0930232558139545e-06, "loss": 0.4743, "step": 427 }, { "epoch": 0.21343085106382978, "grad_norm": 0.6114431619644165, "learning_rate": 7.109634551495017e-06, "loss": 0.4941, "step": 428 }, { "epoch": 0.21392952127659576, "grad_norm": 0.6084715127944946, "learning_rate": 7.12624584717608e-06, "loss": 0.4964, "step": 429 }, { "epoch": 0.2144281914893617, "grad_norm": 0.6390047669410706, "learning_rate": 7.1428571428571436e-06, "loss": 0.5102, "step": 430 }, { "epoch": 0.21492686170212766, "grad_norm": 0.595679759979248, "learning_rate": 7.159468438538206e-06, "loss": 0.458, "step": 431 }, { "epoch": 0.2154255319148936, "grad_norm": 0.5811206698417664, "learning_rate": 7.17607973421927e-06, "loss": 0.4749, "step": 432 }, { "epoch": 0.2159242021276596, "grad_norm": 0.7381309270858765, "learning_rate": 7.192691029900333e-06, "loss": 0.5011, "step": 433 }, { "epoch": 0.21642287234042554, "grad_norm": 0.6063110828399658, "learning_rate": 7.209302325581395e-06, "loss": 0.4755, "step": 434 }, { "epoch": 0.2169215425531915, "grad_norm": 0.7155268788337708, "learning_rate": 7.225913621262459e-06, "loss": 0.4845, "step": 435 }, { "epoch": 0.21742021276595744, "grad_norm": 0.6397129893302917, "learning_rate": 7.242524916943522e-06, "loss": 0.5203, "step": 436 }, { "epoch": 0.21791888297872342, "grad_norm": 0.5451716780662537, "learning_rate": 7.259136212624585e-06, "loss": 0.4989, "step": 437 }, { "epoch": 0.21841755319148937, "grad_norm": 0.6008687019348145, "learning_rate": 7.275747508305648e-06, "loss": 0.4809, "step": 438 }, { "epoch": 0.21891622340425532, "grad_norm": 0.6081984043121338, "learning_rate": 7.292358803986711e-06, "loss": 0.4717, "step": 439 }, { "epoch": 0.21941489361702127, "grad_norm": 0.48501789569854736, "learning_rate": 7.308970099667775e-06, "loss": 0.4301, "step": 440 }, { "epoch": 0.21991356382978725, "grad_norm": 0.6003995537757874, "learning_rate": 7.325581395348837e-06, "loss": 0.4997, "step": 441 }, { "epoch": 0.2204122340425532, "grad_norm": 0.5011851191520691, "learning_rate": 7.342192691029902e-06, "loss": 0.4566, "step": 442 }, { "epoch": 0.22091090425531915, "grad_norm": 0.5589715838432312, "learning_rate": 7.358803986710964e-06, "loss": 0.5097, "step": 443 }, { "epoch": 0.2214095744680851, "grad_norm": 0.561805248260498, "learning_rate": 7.375415282392027e-06, "loss": 0.4759, "step": 444 }, { "epoch": 0.22190824468085107, "grad_norm": 0.49992603063583374, "learning_rate": 7.392026578073091e-06, "loss": 0.4803, "step": 445 }, { "epoch": 0.22240691489361702, "grad_norm": 0.640761137008667, "learning_rate": 7.408637873754153e-06, "loss": 0.4594, "step": 446 }, { "epoch": 0.22290558510638298, "grad_norm": 0.6046059131622314, "learning_rate": 7.425249169435217e-06, "loss": 0.5071, "step": 447 }, { "epoch": 0.22340425531914893, "grad_norm": 0.5272344946861267, "learning_rate": 7.44186046511628e-06, "loss": 0.5103, "step": 448 }, { "epoch": 0.2239029255319149, "grad_norm": 0.5864842534065247, "learning_rate": 7.4584717607973425e-06, "loss": 0.4664, "step": 449 }, { "epoch": 0.22440159574468085, "grad_norm": 0.6174520254135132, "learning_rate": 7.475083056478406e-06, "loss": 0.5164, "step": 450 }, { "epoch": 0.2249002659574468, "grad_norm": 0.5973398089408875, "learning_rate": 7.491694352159469e-06, "loss": 0.4872, "step": 451 }, { "epoch": 0.22539893617021275, "grad_norm": 0.6010795831680298, "learning_rate": 7.508305647840532e-06, "loss": 0.5052, "step": 452 }, { "epoch": 0.22589760638297873, "grad_norm": 0.5870740413665771, "learning_rate": 7.524916943521595e-06, "loss": 0.4552, "step": 453 }, { "epoch": 0.22639627659574468, "grad_norm": 0.5431109070777893, "learning_rate": 7.541528239202659e-06, "loss": 0.4857, "step": 454 }, { "epoch": 0.22689494680851063, "grad_norm": 0.608448326587677, "learning_rate": 7.5581395348837215e-06, "loss": 0.5074, "step": 455 }, { "epoch": 0.22739361702127658, "grad_norm": 0.5661842823028564, "learning_rate": 7.574750830564784e-06, "loss": 0.4759, "step": 456 }, { "epoch": 0.22789228723404256, "grad_norm": 0.5964498519897461, "learning_rate": 7.591362126245848e-06, "loss": 0.5047, "step": 457 }, { "epoch": 0.2283909574468085, "grad_norm": 0.5776697397232056, "learning_rate": 7.6079734219269106e-06, "loss": 0.4937, "step": 458 }, { "epoch": 0.22888962765957446, "grad_norm": 0.5906480550765991, "learning_rate": 7.624584717607974e-06, "loss": 0.4814, "step": 459 }, { "epoch": 0.2293882978723404, "grad_norm": 0.5887320041656494, "learning_rate": 7.641196013289037e-06, "loss": 0.4972, "step": 460 }, { "epoch": 0.2298869680851064, "grad_norm": 0.5848200917243958, "learning_rate": 7.6578073089701e-06, "loss": 0.4759, "step": 461 }, { "epoch": 0.23038563829787234, "grad_norm": 0.6445795297622681, "learning_rate": 7.674418604651164e-06, "loss": 0.4898, "step": 462 }, { "epoch": 0.2308843085106383, "grad_norm": 0.5131809711456299, "learning_rate": 7.691029900332227e-06, "loss": 0.5193, "step": 463 }, { "epoch": 0.23138297872340424, "grad_norm": 0.5702722668647766, "learning_rate": 7.70764119601329e-06, "loss": 0.4733, "step": 464 }, { "epoch": 0.23188164893617022, "grad_norm": 0.6132248640060425, "learning_rate": 7.724252491694352e-06, "loss": 0.4575, "step": 465 }, { "epoch": 0.23238031914893617, "grad_norm": 0.5810995697975159, "learning_rate": 7.740863787375415e-06, "loss": 0.4786, "step": 466 }, { "epoch": 0.23287898936170212, "grad_norm": 0.580348789691925, "learning_rate": 7.75747508305648e-06, "loss": 0.4843, "step": 467 }, { "epoch": 0.23337765957446807, "grad_norm": 0.6053818464279175, "learning_rate": 7.774086378737542e-06, "loss": 0.4604, "step": 468 }, { "epoch": 0.23387632978723405, "grad_norm": 0.5688367486000061, "learning_rate": 7.790697674418605e-06, "loss": 0.4762, "step": 469 }, { "epoch": 0.234375, "grad_norm": 0.6015356183052063, "learning_rate": 7.807308970099668e-06, "loss": 0.511, "step": 470 }, { "epoch": 0.23487367021276595, "grad_norm": 0.5473264455795288, "learning_rate": 7.82392026578073e-06, "loss": 0.4803, "step": 471 }, { "epoch": 0.23537234042553193, "grad_norm": 0.5282036066055298, "learning_rate": 7.840531561461795e-06, "loss": 0.4869, "step": 472 }, { "epoch": 0.23587101063829788, "grad_norm": 0.5777396559715271, "learning_rate": 7.857142857142858e-06, "loss": 0.4936, "step": 473 }, { "epoch": 0.23636968085106383, "grad_norm": 0.5699732303619385, "learning_rate": 7.873754152823922e-06, "loss": 0.4938, "step": 474 }, { "epoch": 0.23686835106382978, "grad_norm": 0.5360299944877625, "learning_rate": 7.890365448504985e-06, "loss": 0.4756, "step": 475 }, { "epoch": 0.23736702127659576, "grad_norm": 0.5887415409088135, "learning_rate": 7.906976744186048e-06, "loss": 0.4815, "step": 476 }, { "epoch": 0.2378656914893617, "grad_norm": 0.5912429690361023, "learning_rate": 7.92358803986711e-06, "loss": 0.499, "step": 477 }, { "epoch": 0.23836436170212766, "grad_norm": 0.5278929471969604, "learning_rate": 7.940199335548173e-06, "loss": 0.4727, "step": 478 }, { "epoch": 0.2388630319148936, "grad_norm": 0.4800964593887329, "learning_rate": 7.956810631229238e-06, "loss": 0.4664, "step": 479 }, { "epoch": 0.2393617021276596, "grad_norm": 0.5403974652290344, "learning_rate": 7.9734219269103e-06, "loss": 0.4734, "step": 480 }, { "epoch": 0.23986037234042554, "grad_norm": 0.5089088082313538, "learning_rate": 7.990033222591363e-06, "loss": 0.5023, "step": 481 }, { "epoch": 0.2403590425531915, "grad_norm": 0.5530551671981812, "learning_rate": 8.006644518272426e-06, "loss": 0.5031, "step": 482 }, { "epoch": 0.24085771276595744, "grad_norm": 0.5354371666908264, "learning_rate": 8.023255813953488e-06, "loss": 0.5057, "step": 483 }, { "epoch": 0.24135638297872342, "grad_norm": 0.5476059317588806, "learning_rate": 8.039867109634553e-06, "loss": 0.4805, "step": 484 }, { "epoch": 0.24185505319148937, "grad_norm": 0.5368127226829529, "learning_rate": 8.056478405315616e-06, "loss": 0.4929, "step": 485 }, { "epoch": 0.24235372340425532, "grad_norm": 0.5097534656524658, "learning_rate": 8.073089700996678e-06, "loss": 0.4647, "step": 486 }, { "epoch": 0.24285239361702127, "grad_norm": 0.5807977914810181, "learning_rate": 8.089700996677741e-06, "loss": 0.4677, "step": 487 }, { "epoch": 0.24335106382978725, "grad_norm": 0.4802601933479309, "learning_rate": 8.106312292358804e-06, "loss": 0.4494, "step": 488 }, { "epoch": 0.2438497340425532, "grad_norm": 0.4422217309474945, "learning_rate": 8.122923588039868e-06, "loss": 0.4534, "step": 489 }, { "epoch": 0.24434840425531915, "grad_norm": 0.5267391800880432, "learning_rate": 8.139534883720931e-06, "loss": 0.4847, "step": 490 }, { "epoch": 0.2448470744680851, "grad_norm": 0.5414633750915527, "learning_rate": 8.156146179401994e-06, "loss": 0.4599, "step": 491 }, { "epoch": 0.24534574468085107, "grad_norm": 0.5535857081413269, "learning_rate": 8.172757475083057e-06, "loss": 0.4691, "step": 492 }, { "epoch": 0.24584441489361702, "grad_norm": 0.5273587107658386, "learning_rate": 8.18936877076412e-06, "loss": 0.4833, "step": 493 }, { "epoch": 0.24634308510638298, "grad_norm": 0.6031257510185242, "learning_rate": 8.205980066445184e-06, "loss": 0.4576, "step": 494 }, { "epoch": 0.24684175531914893, "grad_norm": 0.6738388538360596, "learning_rate": 8.222591362126247e-06, "loss": 0.4803, "step": 495 }, { "epoch": 0.2473404255319149, "grad_norm": 0.5652573108673096, "learning_rate": 8.23920265780731e-06, "loss": 0.4487, "step": 496 }, { "epoch": 0.24783909574468085, "grad_norm": 0.6972902417182922, "learning_rate": 8.255813953488374e-06, "loss": 0.5288, "step": 497 }, { "epoch": 0.2483377659574468, "grad_norm": 0.5669332146644592, "learning_rate": 8.272425249169436e-06, "loss": 0.4688, "step": 498 }, { "epoch": 0.24883643617021275, "grad_norm": 0.6684141159057617, "learning_rate": 8.2890365448505e-06, "loss": 0.4986, "step": 499 }, { "epoch": 0.24933510638297873, "grad_norm": 0.5432189702987671, "learning_rate": 8.305647840531562e-06, "loss": 0.461, "step": 500 }, { "epoch": 0.24983377659574468, "grad_norm": 0.5701374411582947, "learning_rate": 8.322259136212625e-06, "loss": 0.4708, "step": 501 }, { "epoch": 0.25033244680851063, "grad_norm": 0.5965026021003723, "learning_rate": 8.338870431893689e-06, "loss": 0.455, "step": 502 }, { "epoch": 0.2508311170212766, "grad_norm": 0.5572510957717896, "learning_rate": 8.355481727574752e-06, "loss": 0.4639, "step": 503 }, { "epoch": 0.25132978723404253, "grad_norm": 0.5693596601486206, "learning_rate": 8.372093023255815e-06, "loss": 0.4904, "step": 504 }, { "epoch": 0.2518284574468085, "grad_norm": 0.563241183757782, "learning_rate": 8.388704318936877e-06, "loss": 0.491, "step": 505 }, { "epoch": 0.2523271276595745, "grad_norm": 0.6016919016838074, "learning_rate": 8.40531561461794e-06, "loss": 0.4943, "step": 506 }, { "epoch": 0.25282579787234044, "grad_norm": 0.5994514226913452, "learning_rate": 8.421926910299005e-06, "loss": 0.4718, "step": 507 }, { "epoch": 0.2533244680851064, "grad_norm": 0.6590986251831055, "learning_rate": 8.438538205980067e-06, "loss": 0.4829, "step": 508 }, { "epoch": 0.25382313829787234, "grad_norm": 0.5817270874977112, "learning_rate": 8.45514950166113e-06, "loss": 0.4362, "step": 509 }, { "epoch": 0.2543218085106383, "grad_norm": 0.6519371867179871, "learning_rate": 8.471760797342193e-06, "loss": 0.4561, "step": 510 }, { "epoch": 0.25482047872340424, "grad_norm": 0.6955299377441406, "learning_rate": 8.488372093023256e-06, "loss": 0.5115, "step": 511 }, { "epoch": 0.2553191489361702, "grad_norm": 0.6765804290771484, "learning_rate": 8.50498338870432e-06, "loss": 0.4773, "step": 512 }, { "epoch": 0.25581781914893614, "grad_norm": 0.5947204828262329, "learning_rate": 8.521594684385383e-06, "loss": 0.4856, "step": 513 }, { "epoch": 0.25631648936170215, "grad_norm": 0.6756787896156311, "learning_rate": 8.538205980066447e-06, "loss": 0.479, "step": 514 }, { "epoch": 0.2568151595744681, "grad_norm": 0.6758587956428528, "learning_rate": 8.554817275747508e-06, "loss": 0.4645, "step": 515 }, { "epoch": 0.25731382978723405, "grad_norm": 0.6146256327629089, "learning_rate": 8.571428571428571e-06, "loss": 0.433, "step": 516 }, { "epoch": 0.2578125, "grad_norm": 0.6144259572029114, "learning_rate": 8.588039867109635e-06, "loss": 0.4487, "step": 517 }, { "epoch": 0.25831117021276595, "grad_norm": 0.6176734566688538, "learning_rate": 8.604651162790698e-06, "loss": 0.4757, "step": 518 }, { "epoch": 0.2588098404255319, "grad_norm": 0.6143619418144226, "learning_rate": 8.621262458471763e-06, "loss": 0.4749, "step": 519 }, { "epoch": 0.25930851063829785, "grad_norm": 0.612930953502655, "learning_rate": 8.637873754152825e-06, "loss": 0.4875, "step": 520 }, { "epoch": 0.25980718085106386, "grad_norm": 0.6514411568641663, "learning_rate": 8.654485049833888e-06, "loss": 0.5088, "step": 521 }, { "epoch": 0.2603058510638298, "grad_norm": 0.636424720287323, "learning_rate": 8.67109634551495e-06, "loss": 0.4998, "step": 522 }, { "epoch": 0.26080452127659576, "grad_norm": 0.556447446346283, "learning_rate": 8.687707641196014e-06, "loss": 0.4547, "step": 523 }, { "epoch": 0.2613031914893617, "grad_norm": 0.5965206623077393, "learning_rate": 8.704318936877078e-06, "loss": 0.4774, "step": 524 }, { "epoch": 0.26180186170212766, "grad_norm": 0.6961112022399902, "learning_rate": 8.72093023255814e-06, "loss": 0.4576, "step": 525 }, { "epoch": 0.2623005319148936, "grad_norm": 0.5920089483261108, "learning_rate": 8.737541528239203e-06, "loss": 0.4996, "step": 526 }, { "epoch": 0.26279920212765956, "grad_norm": 0.6414315104484558, "learning_rate": 8.754152823920266e-06, "loss": 0.48, "step": 527 }, { "epoch": 0.2632978723404255, "grad_norm": 0.6266006231307983, "learning_rate": 8.770764119601329e-06, "loss": 0.4488, "step": 528 }, { "epoch": 0.2637965425531915, "grad_norm": 0.7083429098129272, "learning_rate": 8.787375415282393e-06, "loss": 0.4587, "step": 529 }, { "epoch": 0.26429521276595747, "grad_norm": 0.5778377652168274, "learning_rate": 8.803986710963456e-06, "loss": 0.4663, "step": 530 }, { "epoch": 0.2647938829787234, "grad_norm": 0.5736879706382751, "learning_rate": 8.820598006644519e-06, "loss": 0.4823, "step": 531 }, { "epoch": 0.26529255319148937, "grad_norm": 0.6609888076782227, "learning_rate": 8.837209302325582e-06, "loss": 0.4882, "step": 532 }, { "epoch": 0.2657912234042553, "grad_norm": 0.5224712491035461, "learning_rate": 8.853820598006644e-06, "loss": 0.4529, "step": 533 }, { "epoch": 0.26628989361702127, "grad_norm": 0.6866216659545898, "learning_rate": 8.870431893687709e-06, "loss": 0.4875, "step": 534 }, { "epoch": 0.2667885638297872, "grad_norm": 0.5435084700584412, "learning_rate": 8.887043189368772e-06, "loss": 0.5161, "step": 535 }, { "epoch": 0.26728723404255317, "grad_norm": 0.5439012050628662, "learning_rate": 8.903654485049834e-06, "loss": 0.4888, "step": 536 }, { "epoch": 0.2677859042553192, "grad_norm": 0.5719301700592041, "learning_rate": 8.920265780730899e-06, "loss": 0.4372, "step": 537 }, { "epoch": 0.2682845744680851, "grad_norm": 0.523941695690155, "learning_rate": 8.93687707641196e-06, "loss": 0.4443, "step": 538 }, { "epoch": 0.2687832446808511, "grad_norm": 0.5697094798088074, "learning_rate": 8.953488372093024e-06, "loss": 0.484, "step": 539 }, { "epoch": 0.269281914893617, "grad_norm": 0.48665961623191833, "learning_rate": 8.970099667774087e-06, "loss": 0.426, "step": 540 }, { "epoch": 0.269780585106383, "grad_norm": 0.5255846977233887, "learning_rate": 8.98671096345515e-06, "loss": 0.4653, "step": 541 }, { "epoch": 0.2702792553191489, "grad_norm": 0.5466011762619019, "learning_rate": 9.003322259136214e-06, "loss": 0.4577, "step": 542 }, { "epoch": 0.2707779255319149, "grad_norm": 0.527887225151062, "learning_rate": 9.019933554817277e-06, "loss": 0.4903, "step": 543 }, { "epoch": 0.2712765957446808, "grad_norm": 0.62400221824646, "learning_rate": 9.03654485049834e-06, "loss": 0.4794, "step": 544 }, { "epoch": 0.27177526595744683, "grad_norm": 0.5406920909881592, "learning_rate": 9.053156146179402e-06, "loss": 0.506, "step": 545 }, { "epoch": 0.2722739361702128, "grad_norm": 0.5334205627441406, "learning_rate": 9.069767441860465e-06, "loss": 0.4546, "step": 546 }, { "epoch": 0.27277260638297873, "grad_norm": 0.581389307975769, "learning_rate": 9.08637873754153e-06, "loss": 0.4635, "step": 547 }, { "epoch": 0.2732712765957447, "grad_norm": 0.5529372096061707, "learning_rate": 9.102990033222592e-06, "loss": 0.4611, "step": 548 }, { "epoch": 0.27376994680851063, "grad_norm": 0.5657907724380493, "learning_rate": 9.119601328903655e-06, "loss": 0.4634, "step": 549 }, { "epoch": 0.2742686170212766, "grad_norm": 0.6249788403511047, "learning_rate": 9.136212624584718e-06, "loss": 0.4912, "step": 550 }, { "epoch": 0.27476728723404253, "grad_norm": 0.6206207275390625, "learning_rate": 9.15282392026578e-06, "loss": 0.4992, "step": 551 }, { "epoch": 0.2752659574468085, "grad_norm": 0.635506272315979, "learning_rate": 9.169435215946845e-06, "loss": 0.4728, "step": 552 }, { "epoch": 0.2757646276595745, "grad_norm": 0.5343915224075317, "learning_rate": 9.186046511627908e-06, "loss": 0.4902, "step": 553 }, { "epoch": 0.27626329787234044, "grad_norm": 0.6037989854812622, "learning_rate": 9.20265780730897e-06, "loss": 0.4649, "step": 554 }, { "epoch": 0.2767619680851064, "grad_norm": 0.6742358803749084, "learning_rate": 9.219269102990033e-06, "loss": 0.4878, "step": 555 }, { "epoch": 0.27726063829787234, "grad_norm": 0.642440140247345, "learning_rate": 9.235880398671098e-06, "loss": 0.4654, "step": 556 }, { "epoch": 0.2777593085106383, "grad_norm": 0.6338894367218018, "learning_rate": 9.25249169435216e-06, "loss": 0.5162, "step": 557 }, { "epoch": 0.27825797872340424, "grad_norm": 0.6478314995765686, "learning_rate": 9.269102990033223e-06, "loss": 0.45, "step": 558 }, { "epoch": 0.2787566489361702, "grad_norm": 0.6013652086257935, "learning_rate": 9.285714285714288e-06, "loss": 0.4836, "step": 559 }, { "epoch": 0.27925531914893614, "grad_norm": 0.6327393054962158, "learning_rate": 9.30232558139535e-06, "loss": 0.5124, "step": 560 }, { "epoch": 0.27975398936170215, "grad_norm": 0.519423246383667, "learning_rate": 9.318936877076413e-06, "loss": 0.4733, "step": 561 }, { "epoch": 0.2802526595744681, "grad_norm": 0.5928606390953064, "learning_rate": 9.335548172757476e-06, "loss": 0.4632, "step": 562 }, { "epoch": 0.28075132978723405, "grad_norm": 0.6060440540313721, "learning_rate": 9.352159468438539e-06, "loss": 0.4594, "step": 563 }, { "epoch": 0.28125, "grad_norm": 0.5044464468955994, "learning_rate": 9.368770764119603e-06, "loss": 0.4512, "step": 564 }, { "epoch": 0.28174867021276595, "grad_norm": 0.5556921362876892, "learning_rate": 9.385382059800666e-06, "loss": 0.4846, "step": 565 }, { "epoch": 0.2822473404255319, "grad_norm": 0.5368123650550842, "learning_rate": 9.401993355481728e-06, "loss": 0.4523, "step": 566 }, { "epoch": 0.28274601063829785, "grad_norm": 0.5823186039924622, "learning_rate": 9.418604651162791e-06, "loss": 0.4451, "step": 567 }, { "epoch": 0.28324468085106386, "grad_norm": 0.5219334363937378, "learning_rate": 9.435215946843854e-06, "loss": 0.4628, "step": 568 }, { "epoch": 0.2837433510638298, "grad_norm": 0.5780482888221741, "learning_rate": 9.451827242524918e-06, "loss": 0.4808, "step": 569 }, { "epoch": 0.28424202127659576, "grad_norm": 0.5711033344268799, "learning_rate": 9.468438538205981e-06, "loss": 0.4868, "step": 570 }, { "epoch": 0.2847406914893617, "grad_norm": 0.5696048736572266, "learning_rate": 9.485049833887044e-06, "loss": 0.4767, "step": 571 }, { "epoch": 0.28523936170212766, "grad_norm": 0.5156471133232117, "learning_rate": 9.501661129568107e-06, "loss": 0.478, "step": 572 }, { "epoch": 0.2857380319148936, "grad_norm": 0.6000272631645203, "learning_rate": 9.51827242524917e-06, "loss": 0.4827, "step": 573 }, { "epoch": 0.28623670212765956, "grad_norm": 0.5010817050933838, "learning_rate": 9.534883720930234e-06, "loss": 0.4706, "step": 574 }, { "epoch": 0.2867353723404255, "grad_norm": 0.5319303274154663, "learning_rate": 9.551495016611297e-06, "loss": 0.452, "step": 575 }, { "epoch": 0.2872340425531915, "grad_norm": 0.6416358351707458, "learning_rate": 9.56810631229236e-06, "loss": 0.4998, "step": 576 }, { "epoch": 0.28773271276595747, "grad_norm": 0.5986093282699585, "learning_rate": 9.584717607973422e-06, "loss": 0.4921, "step": 577 }, { "epoch": 0.2882313829787234, "grad_norm": 0.5873139500617981, "learning_rate": 9.601328903654485e-06, "loss": 0.4928, "step": 578 }, { "epoch": 0.28873005319148937, "grad_norm": 0.6173769235610962, "learning_rate": 9.61794019933555e-06, "loss": 0.426, "step": 579 }, { "epoch": 0.2892287234042553, "grad_norm": 0.6065990328788757, "learning_rate": 9.634551495016612e-06, "loss": 0.4504, "step": 580 }, { "epoch": 0.28972739361702127, "grad_norm": 0.6275486946105957, "learning_rate": 9.651162790697676e-06, "loss": 0.4809, "step": 581 }, { "epoch": 0.2902260638297872, "grad_norm": 0.6835201382637024, "learning_rate": 9.66777408637874e-06, "loss": 0.5085, "step": 582 }, { "epoch": 0.29072473404255317, "grad_norm": 0.49816128611564636, "learning_rate": 9.684385382059802e-06, "loss": 0.4505, "step": 583 }, { "epoch": 0.2912234042553192, "grad_norm": 0.5676817297935486, "learning_rate": 9.700996677740865e-06, "loss": 0.4783, "step": 584 }, { "epoch": 0.2917220744680851, "grad_norm": 0.5379215478897095, "learning_rate": 9.717607973421927e-06, "loss": 0.4928, "step": 585 }, { "epoch": 0.2922207446808511, "grad_norm": 0.574317455291748, "learning_rate": 9.734219269102992e-06, "loss": 0.458, "step": 586 }, { "epoch": 0.292719414893617, "grad_norm": 0.5125823616981506, "learning_rate": 9.750830564784055e-06, "loss": 0.482, "step": 587 }, { "epoch": 0.293218085106383, "grad_norm": 0.5408780574798584, "learning_rate": 9.767441860465117e-06, "loss": 0.4611, "step": 588 }, { "epoch": 0.2937167553191489, "grad_norm": 0.5661694407463074, "learning_rate": 9.78405315614618e-06, "loss": 0.4704, "step": 589 }, { "epoch": 0.2942154255319149, "grad_norm": 0.5051293969154358, "learning_rate": 9.800664451827243e-06, "loss": 0.4572, "step": 590 }, { "epoch": 0.2947140957446808, "grad_norm": 0.5282635688781738, "learning_rate": 9.817275747508307e-06, "loss": 0.4618, "step": 591 }, { "epoch": 0.29521276595744683, "grad_norm": 0.5339493155479431, "learning_rate": 9.83388704318937e-06, "loss": 0.4479, "step": 592 }, { "epoch": 0.2957114361702128, "grad_norm": 0.5458594560623169, "learning_rate": 9.850498338870433e-06, "loss": 0.4941, "step": 593 }, { "epoch": 0.29621010638297873, "grad_norm": 0.6659757494926453, "learning_rate": 9.867109634551495e-06, "loss": 0.5045, "step": 594 }, { "epoch": 0.2967087765957447, "grad_norm": 0.5310888290405273, "learning_rate": 9.883720930232558e-06, "loss": 0.4508, "step": 595 }, { "epoch": 0.29720744680851063, "grad_norm": 0.5901215672492981, "learning_rate": 9.900332225913623e-06, "loss": 0.457, "step": 596 }, { "epoch": 0.2977061170212766, "grad_norm": 0.6187776327133179, "learning_rate": 9.916943521594685e-06, "loss": 0.5145, "step": 597 }, { "epoch": 0.29820478723404253, "grad_norm": 0.5243094563484192, "learning_rate": 9.933554817275748e-06, "loss": 0.4777, "step": 598 }, { "epoch": 0.2987034574468085, "grad_norm": 0.6220588088035583, "learning_rate": 9.950166112956811e-06, "loss": 0.4706, "step": 599 }, { "epoch": 0.2992021276595745, "grad_norm": 0.564398467540741, "learning_rate": 9.966777408637874e-06, "loss": 0.4618, "step": 600 }, { "epoch": 0.29970079787234044, "grad_norm": 0.6300983428955078, "learning_rate": 9.983388704318938e-06, "loss": 0.498, "step": 601 }, { "epoch": 0.3001994680851064, "grad_norm": 0.5592682361602783, "learning_rate": 1e-05, "loss": 0.4799, "step": 602 }, { "epoch": 0.30069813829787234, "grad_norm": 0.652984619140625, "learning_rate": 9.999999157899962e-06, "loss": 0.4512, "step": 603 }, { "epoch": 0.3011968085106383, "grad_norm": 0.5259566903114319, "learning_rate": 9.999996631600126e-06, "loss": 0.4553, "step": 604 }, { "epoch": 0.30169547872340424, "grad_norm": 0.5591691732406616, "learning_rate": 9.999992421101348e-06, "loss": 0.449, "step": 605 }, { "epoch": 0.3021941489361702, "grad_norm": 0.6691451668739319, "learning_rate": 9.999986526405043e-06, "loss": 0.4812, "step": 606 }, { "epoch": 0.30269281914893614, "grad_norm": 0.6197488903999329, "learning_rate": 9.999978947513199e-06, "loss": 0.4879, "step": 607 }, { "epoch": 0.30319148936170215, "grad_norm": 0.5706349611282349, "learning_rate": 9.999969684428365e-06, "loss": 0.4458, "step": 608 }, { "epoch": 0.3036901595744681, "grad_norm": 0.6616134643554688, "learning_rate": 9.999958737153666e-06, "loss": 0.477, "step": 609 }, { "epoch": 0.30418882978723405, "grad_norm": 0.6390761137008667, "learning_rate": 9.999946105692784e-06, "loss": 0.4856, "step": 610 }, { "epoch": 0.3046875, "grad_norm": 0.6459905505180359, "learning_rate": 9.99993179004998e-06, "loss": 0.4835, "step": 611 }, { "epoch": 0.30518617021276595, "grad_norm": 0.6779118776321411, "learning_rate": 9.99991579023007e-06, "loss": 0.472, "step": 612 }, { "epoch": 0.3056848404255319, "grad_norm": 0.6755257844924927, "learning_rate": 9.99989810623845e-06, "loss": 0.4929, "step": 613 }, { "epoch": 0.30618351063829785, "grad_norm": 0.6918604969978333, "learning_rate": 9.999878738081072e-06, "loss": 0.4554, "step": 614 }, { "epoch": 0.30668218085106386, "grad_norm": 0.6453360319137573, "learning_rate": 9.99985768576446e-06, "loss": 0.4745, "step": 615 }, { "epoch": 0.3071808510638298, "grad_norm": 0.699151337146759, "learning_rate": 9.999834949295706e-06, "loss": 0.4602, "step": 616 }, { "epoch": 0.30767952127659576, "grad_norm": 0.580316424369812, "learning_rate": 9.99981052868247e-06, "loss": 0.4768, "step": 617 }, { "epoch": 0.3081781914893617, "grad_norm": 0.7218919396400452, "learning_rate": 9.999784423932975e-06, "loss": 0.4518, "step": 618 }, { "epoch": 0.30867686170212766, "grad_norm": 0.6588768362998962, "learning_rate": 9.999756635056017e-06, "loss": 0.4615, "step": 619 }, { "epoch": 0.3091755319148936, "grad_norm": 0.5206940770149231, "learning_rate": 9.999727162060954e-06, "loss": 0.4552, "step": 620 }, { "epoch": 0.30967420212765956, "grad_norm": 0.5767697095870972, "learning_rate": 9.999696004957717e-06, "loss": 0.454, "step": 621 }, { "epoch": 0.3101728723404255, "grad_norm": 0.6510716080665588, "learning_rate": 9.999663163756799e-06, "loss": 0.4564, "step": 622 }, { "epoch": 0.3106715425531915, "grad_norm": 0.5606602430343628, "learning_rate": 9.99962863846926e-06, "loss": 0.4643, "step": 623 }, { "epoch": 0.31117021276595747, "grad_norm": 0.6070423722267151, "learning_rate": 9.999592429106733e-06, "loss": 0.4704, "step": 624 }, { "epoch": 0.3116688829787234, "grad_norm": 0.6753053665161133, "learning_rate": 9.999554535681412e-06, "loss": 0.4861, "step": 625 }, { "epoch": 0.31216755319148937, "grad_norm": 0.63004469871521, "learning_rate": 9.999514958206064e-06, "loss": 0.4713, "step": 626 }, { "epoch": 0.3126662234042553, "grad_norm": 0.564764142036438, "learning_rate": 9.999473696694017e-06, "loss": 0.4798, "step": 627 }, { "epoch": 0.31316489361702127, "grad_norm": 0.5637955069541931, "learning_rate": 9.999430751159172e-06, "loss": 0.4869, "step": 628 }, { "epoch": 0.3136635638297872, "grad_norm": 0.5764055848121643, "learning_rate": 9.999386121615994e-06, "loss": 0.4489, "step": 629 }, { "epoch": 0.31416223404255317, "grad_norm": 0.5806159377098083, "learning_rate": 9.999339808079516e-06, "loss": 0.4601, "step": 630 }, { "epoch": 0.3146609042553192, "grad_norm": 0.6713747978210449, "learning_rate": 9.999291810565338e-06, "loss": 0.4683, "step": 631 }, { "epoch": 0.3151595744680851, "grad_norm": 0.6080219745635986, "learning_rate": 9.999242129089628e-06, "loss": 0.5183, "step": 632 }, { "epoch": 0.3156582446808511, "grad_norm": 0.7107604146003723, "learning_rate": 9.99919076366912e-06, "loss": 0.4644, "step": 633 }, { "epoch": 0.316156914893617, "grad_norm": 0.6728383898735046, "learning_rate": 9.999137714321116e-06, "loss": 0.4559, "step": 634 }, { "epoch": 0.316655585106383, "grad_norm": 0.5618370175361633, "learning_rate": 9.999082981063486e-06, "loss": 0.4606, "step": 635 }, { "epoch": 0.3171542553191489, "grad_norm": 0.7626729011535645, "learning_rate": 9.999026563914666e-06, "loss": 0.5139, "step": 636 }, { "epoch": 0.3176529255319149, "grad_norm": 0.6161646246910095, "learning_rate": 9.99896846289366e-06, "loss": 0.4449, "step": 637 }, { "epoch": 0.3181515957446808, "grad_norm": 0.5473371744155884, "learning_rate": 9.998908678020037e-06, "loss": 0.4536, "step": 638 }, { "epoch": 0.31865026595744683, "grad_norm": 0.764215350151062, "learning_rate": 9.998847209313935e-06, "loss": 0.469, "step": 639 }, { "epoch": 0.3191489361702128, "grad_norm": 0.6656177639961243, "learning_rate": 9.99878405679606e-06, "loss": 0.4997, "step": 640 }, { "epoch": 0.31964760638297873, "grad_norm": 0.7018892168998718, "learning_rate": 9.998719220487687e-06, "loss": 0.4744, "step": 641 }, { "epoch": 0.3201462765957447, "grad_norm": 0.5868011116981506, "learning_rate": 9.998652700410651e-06, "loss": 0.4929, "step": 642 }, { "epoch": 0.32064494680851063, "grad_norm": 0.6281352639198303, "learning_rate": 9.998584496587362e-06, "loss": 0.4436, "step": 643 }, { "epoch": 0.3211436170212766, "grad_norm": 0.5733711123466492, "learning_rate": 9.998514609040793e-06, "loss": 0.486, "step": 644 }, { "epoch": 0.32164228723404253, "grad_norm": 0.5793837308883667, "learning_rate": 9.998443037794483e-06, "loss": 0.4621, "step": 645 }, { "epoch": 0.3221409574468085, "grad_norm": 0.537600576877594, "learning_rate": 9.998369782872542e-06, "loss": 0.4747, "step": 646 }, { "epoch": 0.3226396276595745, "grad_norm": 0.5819153785705566, "learning_rate": 9.998294844299644e-06, "loss": 0.4241, "step": 647 }, { "epoch": 0.32313829787234044, "grad_norm": 0.5807910561561584, "learning_rate": 9.99821822210103e-06, "loss": 0.4908, "step": 648 }, { "epoch": 0.3236369680851064, "grad_norm": 0.5471644997596741, "learning_rate": 9.998139916302512e-06, "loss": 0.4931, "step": 649 }, { "epoch": 0.32413563829787234, "grad_norm": 0.6386508941650391, "learning_rate": 9.998059926930468e-06, "loss": 0.4793, "step": 650 }, { "epoch": 0.3246343085106383, "grad_norm": 0.614708662033081, "learning_rate": 9.997978254011838e-06, "loss": 0.4786, "step": 651 }, { "epoch": 0.32513297872340424, "grad_norm": 0.5915523767471313, "learning_rate": 9.997894897574134e-06, "loss": 0.4441, "step": 652 }, { "epoch": 0.3256316489361702, "grad_norm": 0.6179320216178894, "learning_rate": 9.997809857645432e-06, "loss": 0.4812, "step": 653 }, { "epoch": 0.32613031914893614, "grad_norm": 0.5775718092918396, "learning_rate": 9.997723134254382e-06, "loss": 0.4839, "step": 654 }, { "epoch": 0.32662898936170215, "grad_norm": 0.6351132988929749, "learning_rate": 9.997634727430193e-06, "loss": 0.4808, "step": 655 }, { "epoch": 0.3271276595744681, "grad_norm": 0.5736388564109802, "learning_rate": 9.997544637202641e-06, "loss": 0.4577, "step": 656 }, { "epoch": 0.32762632978723405, "grad_norm": 0.5560563802719116, "learning_rate": 9.997452863602075e-06, "loss": 0.4816, "step": 657 }, { "epoch": 0.328125, "grad_norm": 0.5185325145721436, "learning_rate": 9.997359406659408e-06, "loss": 0.4311, "step": 658 }, { "epoch": 0.32862367021276595, "grad_norm": 0.533109188079834, "learning_rate": 9.99726426640612e-06, "loss": 0.4726, "step": 659 }, { "epoch": 0.3291223404255319, "grad_norm": 0.6720905900001526, "learning_rate": 9.997167442874257e-06, "loss": 0.4353, "step": 660 }, { "epoch": 0.32962101063829785, "grad_norm": 0.5861791968345642, "learning_rate": 9.997068936096436e-06, "loss": 0.4708, "step": 661 }, { "epoch": 0.33011968085106386, "grad_norm": 0.557450532913208, "learning_rate": 9.996968746105833e-06, "loss": 0.4503, "step": 662 }, { "epoch": 0.3306183510638298, "grad_norm": 0.5669955015182495, "learning_rate": 9.9968668729362e-06, "loss": 0.4687, "step": 663 }, { "epoch": 0.33111702127659576, "grad_norm": 0.5771009922027588, "learning_rate": 9.996763316621852e-06, "loss": 0.4764, "step": 664 }, { "epoch": 0.3316156914893617, "grad_norm": 0.6825502514839172, "learning_rate": 9.996658077197667e-06, "loss": 0.4856, "step": 665 }, { "epoch": 0.33211436170212766, "grad_norm": 0.5878959894180298, "learning_rate": 9.996551154699097e-06, "loss": 0.4498, "step": 666 }, { "epoch": 0.3326130319148936, "grad_norm": 0.7261025309562683, "learning_rate": 9.99644254916216e-06, "loss": 0.4599, "step": 667 }, { "epoch": 0.33311170212765956, "grad_norm": 0.6248176693916321, "learning_rate": 9.99633226062343e-06, "loss": 0.4501, "step": 668 }, { "epoch": 0.3336103723404255, "grad_norm": 0.5613275766372681, "learning_rate": 9.996220289120069e-06, "loss": 0.4598, "step": 669 }, { "epoch": 0.3341090425531915, "grad_norm": 0.6245164275169373, "learning_rate": 9.996106634689784e-06, "loss": 0.4731, "step": 670 }, { "epoch": 0.33460771276595747, "grad_norm": 0.6533389687538147, "learning_rate": 9.995991297370861e-06, "loss": 0.5061, "step": 671 }, { "epoch": 0.3351063829787234, "grad_norm": 0.5748913884162903, "learning_rate": 9.995874277202153e-06, "loss": 0.4675, "step": 672 }, { "epoch": 0.33560505319148937, "grad_norm": 0.5511308312416077, "learning_rate": 9.995755574223071e-06, "loss": 0.4653, "step": 673 }, { "epoch": 0.3361037234042553, "grad_norm": 0.5985400080680847, "learning_rate": 9.995635188473606e-06, "loss": 0.4709, "step": 674 }, { "epoch": 0.33660239361702127, "grad_norm": 0.5875035524368286, "learning_rate": 9.995513119994304e-06, "loss": 0.5004, "step": 675 }, { "epoch": 0.3371010638297872, "grad_norm": 0.5780587792396545, "learning_rate": 9.995389368826283e-06, "loss": 0.465, "step": 676 }, { "epoch": 0.33759973404255317, "grad_norm": 0.572279691696167, "learning_rate": 9.99526393501123e-06, "loss": 0.5061, "step": 677 }, { "epoch": 0.3380984042553192, "grad_norm": 0.5495202541351318, "learning_rate": 9.995136818591394e-06, "loss": 0.4825, "step": 678 }, { "epoch": 0.3385970744680851, "grad_norm": 0.6120004057884216, "learning_rate": 9.99500801960959e-06, "loss": 0.4935, "step": 679 }, { "epoch": 0.3390957446808511, "grad_norm": 0.5573785305023193, "learning_rate": 9.99487753810921e-06, "loss": 0.4297, "step": 680 }, { "epoch": 0.339594414893617, "grad_norm": 0.5085989236831665, "learning_rate": 9.994745374134201e-06, "loss": 0.4143, "step": 681 }, { "epoch": 0.340093085106383, "grad_norm": 0.6466352343559265, "learning_rate": 9.99461152772908e-06, "loss": 0.4725, "step": 682 }, { "epoch": 0.3405917553191489, "grad_norm": 0.6122413873672485, "learning_rate": 9.994475998938934e-06, "loss": 0.4515, "step": 683 }, { "epoch": 0.3410904255319149, "grad_norm": 0.5440813899040222, "learning_rate": 9.994338787809412e-06, "loss": 0.4694, "step": 684 }, { "epoch": 0.3415890957446808, "grad_norm": 0.5586623549461365, "learning_rate": 9.994199894386736e-06, "loss": 0.4509, "step": 685 }, { "epoch": 0.34208776595744683, "grad_norm": 0.638180673122406, "learning_rate": 9.994059318717687e-06, "loss": 0.4662, "step": 686 }, { "epoch": 0.3425864361702128, "grad_norm": 0.5843632221221924, "learning_rate": 9.99391706084962e-06, "loss": 0.4391, "step": 687 }, { "epoch": 0.34308510638297873, "grad_norm": 0.5685454607009888, "learning_rate": 9.99377312083045e-06, "loss": 0.4759, "step": 688 }, { "epoch": 0.3435837765957447, "grad_norm": 0.5271984338760376, "learning_rate": 9.993627498708665e-06, "loss": 0.4711, "step": 689 }, { "epoch": 0.34408244680851063, "grad_norm": 0.5561754703521729, "learning_rate": 9.993480194533312e-06, "loss": 0.432, "step": 690 }, { "epoch": 0.3445811170212766, "grad_norm": 0.5966961979866028, "learning_rate": 9.993331208354014e-06, "loss": 0.4198, "step": 691 }, { "epoch": 0.34507978723404253, "grad_norm": 0.5815892815589905, "learning_rate": 9.993180540220952e-06, "loss": 0.4829, "step": 692 }, { "epoch": 0.3455784574468085, "grad_norm": 0.589792788028717, "learning_rate": 9.993028190184877e-06, "loss": 0.4529, "step": 693 }, { "epoch": 0.3460771276595745, "grad_norm": 0.551114022731781, "learning_rate": 9.99287415829711e-06, "loss": 0.449, "step": 694 }, { "epoch": 0.34657579787234044, "grad_norm": 0.5596604943275452, "learning_rate": 9.992718444609531e-06, "loss": 0.4463, "step": 695 }, { "epoch": 0.3470744680851064, "grad_norm": 0.6281490921974182, "learning_rate": 9.992561049174594e-06, "loss": 0.5096, "step": 696 }, { "epoch": 0.34757313829787234, "grad_norm": 0.5531086325645447, "learning_rate": 9.992401972045314e-06, "loss": 0.4626, "step": 697 }, { "epoch": 0.3480718085106383, "grad_norm": 0.5565131306648254, "learning_rate": 9.992241213275274e-06, "loss": 0.4644, "step": 698 }, { "epoch": 0.34857047872340424, "grad_norm": 0.5723236799240112, "learning_rate": 9.992078772918625e-06, "loss": 0.4622, "step": 699 }, { "epoch": 0.3490691489361702, "grad_norm": 0.562030017375946, "learning_rate": 9.991914651030084e-06, "loss": 0.4563, "step": 700 }, { "epoch": 0.34956781914893614, "grad_norm": 0.5500701665878296, "learning_rate": 9.991748847664933e-06, "loss": 0.4763, "step": 701 }, { "epoch": 0.35006648936170215, "grad_norm": 0.6455311179161072, "learning_rate": 9.991581362879022e-06, "loss": 0.463, "step": 702 }, { "epoch": 0.3505651595744681, "grad_norm": 0.6310989260673523, "learning_rate": 9.991412196728765e-06, "loss": 0.4536, "step": 703 }, { "epoch": 0.35106382978723405, "grad_norm": 0.6204677820205688, "learning_rate": 9.991241349271146e-06, "loss": 0.485, "step": 704 }, { "epoch": 0.3515625, "grad_norm": 0.6488769054412842, "learning_rate": 9.991068820563712e-06, "loss": 0.4501, "step": 705 }, { "epoch": 0.35206117021276595, "grad_norm": 0.5602238774299622, "learning_rate": 9.990894610664576e-06, "loss": 0.4559, "step": 706 }, { "epoch": 0.3525598404255319, "grad_norm": 0.6331875920295715, "learning_rate": 9.990718719632424e-06, "loss": 0.468, "step": 707 }, { "epoch": 0.35305851063829785, "grad_norm": 0.48532888293266296, "learning_rate": 9.990541147526498e-06, "loss": 0.4651, "step": 708 }, { "epoch": 0.35355718085106386, "grad_norm": 0.5617280602455139, "learning_rate": 9.990361894406612e-06, "loss": 0.4506, "step": 709 }, { "epoch": 0.3540558510638298, "grad_norm": 0.6136946082115173, "learning_rate": 9.990180960333148e-06, "loss": 0.4648, "step": 710 }, { "epoch": 0.35455452127659576, "grad_norm": 0.6063882112503052, "learning_rate": 9.989998345367051e-06, "loss": 0.4638, "step": 711 }, { "epoch": 0.3550531914893617, "grad_norm": 0.6068097949028015, "learning_rate": 9.989814049569832e-06, "loss": 0.4584, "step": 712 }, { "epoch": 0.35555186170212766, "grad_norm": 0.584631621837616, "learning_rate": 9.98962807300357e-06, "loss": 0.4763, "step": 713 }, { "epoch": 0.3560505319148936, "grad_norm": 0.6264452934265137, "learning_rate": 9.989440415730908e-06, "loss": 0.418, "step": 714 }, { "epoch": 0.35654920212765956, "grad_norm": 0.6262893676757812, "learning_rate": 9.98925107781506e-06, "loss": 0.4894, "step": 715 }, { "epoch": 0.3570478723404255, "grad_norm": 0.5517139434814453, "learning_rate": 9.989060059319799e-06, "loss": 0.4751, "step": 716 }, { "epoch": 0.3575465425531915, "grad_norm": 0.5115319490432739, "learning_rate": 9.988867360309469e-06, "loss": 0.4596, "step": 717 }, { "epoch": 0.35804521276595747, "grad_norm": 0.6034055352210999, "learning_rate": 9.988672980848979e-06, "loss": 0.4506, "step": 718 }, { "epoch": 0.3585438829787234, "grad_norm": 0.6020833253860474, "learning_rate": 9.988476921003803e-06, "loss": 0.4664, "step": 719 }, { "epoch": 0.35904255319148937, "grad_norm": 0.5445871949195862, "learning_rate": 9.988279180839982e-06, "loss": 0.4359, "step": 720 }, { "epoch": 0.3595412234042553, "grad_norm": 0.5763530135154724, "learning_rate": 9.988079760424123e-06, "loss": 0.4423, "step": 721 }, { "epoch": 0.36003989361702127, "grad_norm": 0.6003451347351074, "learning_rate": 9.9878786598234e-06, "loss": 0.4245, "step": 722 }, { "epoch": 0.3605385638297872, "grad_norm": 0.5349460244178772, "learning_rate": 9.98767587910555e-06, "loss": 0.4454, "step": 723 }, { "epoch": 0.36103723404255317, "grad_norm": 0.584007978439331, "learning_rate": 9.987471418338878e-06, "loss": 0.4963, "step": 724 }, { "epoch": 0.3615359042553192, "grad_norm": 0.5977215766906738, "learning_rate": 9.987265277592256e-06, "loss": 0.4844, "step": 725 }, { "epoch": 0.3620345744680851, "grad_norm": 0.5443600416183472, "learning_rate": 9.987057456935117e-06, "loss": 0.4563, "step": 726 }, { "epoch": 0.3625332446808511, "grad_norm": 0.6250536441802979, "learning_rate": 9.986847956437467e-06, "loss": 0.533, "step": 727 }, { "epoch": 0.363031914893617, "grad_norm": 0.6188754439353943, "learning_rate": 9.986636776169871e-06, "loss": 0.4538, "step": 728 }, { "epoch": 0.363530585106383, "grad_norm": 0.5643008947372437, "learning_rate": 9.986423916203466e-06, "loss": 0.5324, "step": 729 }, { "epoch": 0.3640292553191489, "grad_norm": 0.5582212805747986, "learning_rate": 9.986209376609953e-06, "loss": 0.4775, "step": 730 }, { "epoch": 0.3645279255319149, "grad_norm": 0.6469908356666565, "learning_rate": 9.985993157461592e-06, "loss": 0.4742, "step": 731 }, { "epoch": 0.3650265957446808, "grad_norm": 0.5398740768432617, "learning_rate": 9.985775258831215e-06, "loss": 0.4415, "step": 732 }, { "epoch": 0.36552526595744683, "grad_norm": 0.6137683987617493, "learning_rate": 9.985555680792225e-06, "loss": 0.4617, "step": 733 }, { "epoch": 0.3660239361702128, "grad_norm": 0.6158071160316467, "learning_rate": 9.98533442341858e-06, "loss": 0.4745, "step": 734 }, { "epoch": 0.36652260638297873, "grad_norm": 0.5827381610870361, "learning_rate": 9.985111486784809e-06, "loss": 0.4507, "step": 735 }, { "epoch": 0.3670212765957447, "grad_norm": 0.5740682482719421, "learning_rate": 9.984886870966007e-06, "loss": 0.4614, "step": 736 }, { "epoch": 0.36751994680851063, "grad_norm": 0.6353812217712402, "learning_rate": 9.98466057603783e-06, "loss": 0.4784, "step": 737 }, { "epoch": 0.3680186170212766, "grad_norm": 0.5654738545417786, "learning_rate": 9.984432602076507e-06, "loss": 0.4926, "step": 738 }, { "epoch": 0.36851728723404253, "grad_norm": 0.6270098090171814, "learning_rate": 9.98420294915883e-06, "loss": 0.5109, "step": 739 }, { "epoch": 0.3690159574468085, "grad_norm": 0.6867227554321289, "learning_rate": 9.983971617362151e-06, "loss": 0.4736, "step": 740 }, { "epoch": 0.3695146276595745, "grad_norm": 0.5806553363800049, "learning_rate": 9.983738606764394e-06, "loss": 0.4849, "step": 741 }, { "epoch": 0.37001329787234044, "grad_norm": 0.5998751521110535, "learning_rate": 9.983503917444047e-06, "loss": 0.4563, "step": 742 }, { "epoch": 0.3705119680851064, "grad_norm": 0.6432068943977356, "learning_rate": 9.983267549480162e-06, "loss": 0.4971, "step": 743 }, { "epoch": 0.37101063829787234, "grad_norm": 0.5526711344718933, "learning_rate": 9.983029502952357e-06, "loss": 0.4703, "step": 744 }, { "epoch": 0.3715093085106383, "grad_norm": 0.6435108780860901, "learning_rate": 9.982789777940814e-06, "loss": 0.5016, "step": 745 }, { "epoch": 0.37200797872340424, "grad_norm": 0.5635398626327515, "learning_rate": 9.982548374526285e-06, "loss": 0.4845, "step": 746 }, { "epoch": 0.3725066489361702, "grad_norm": 0.637410581111908, "learning_rate": 9.982305292790082e-06, "loss": 0.5016, "step": 747 }, { "epoch": 0.37300531914893614, "grad_norm": 0.5918897986412048, "learning_rate": 9.982060532814086e-06, "loss": 0.4563, "step": 748 }, { "epoch": 0.37350398936170215, "grad_norm": 0.529600977897644, "learning_rate": 9.981814094680741e-06, "loss": 0.4616, "step": 749 }, { "epoch": 0.3740026595744681, "grad_norm": 0.5123669505119324, "learning_rate": 9.981565978473059e-06, "loss": 0.4464, "step": 750 }, { "epoch": 0.37450132978723405, "grad_norm": 0.6056315302848816, "learning_rate": 9.981316184274614e-06, "loss": 0.4615, "step": 751 }, { "epoch": 0.375, "grad_norm": 0.5935787558555603, "learning_rate": 9.981064712169545e-06, "loss": 0.4636, "step": 752 }, { "epoch": 0.37549867021276595, "grad_norm": 0.5789679884910583, "learning_rate": 9.980811562242564e-06, "loss": 0.4589, "step": 753 }, { "epoch": 0.3759973404255319, "grad_norm": 0.5506417751312256, "learning_rate": 9.980556734578934e-06, "loss": 0.4661, "step": 754 }, { "epoch": 0.37649601063829785, "grad_norm": 0.6257126927375793, "learning_rate": 9.980300229264497e-06, "loss": 0.4728, "step": 755 }, { "epoch": 0.37699468085106386, "grad_norm": 0.5117264986038208, "learning_rate": 9.980042046385651e-06, "loss": 0.4465, "step": 756 }, { "epoch": 0.3774933510638298, "grad_norm": 0.6241040229797363, "learning_rate": 9.979782186029363e-06, "loss": 0.4681, "step": 757 }, { "epoch": 0.37799202127659576, "grad_norm": 0.6207339763641357, "learning_rate": 9.979520648283166e-06, "loss": 0.4804, "step": 758 }, { "epoch": 0.3784906914893617, "grad_norm": 0.6229996681213379, "learning_rate": 9.979257433235157e-06, "loss": 0.4724, "step": 759 }, { "epoch": 0.37898936170212766, "grad_norm": 0.6354979872703552, "learning_rate": 9.978992540973992e-06, "loss": 0.4395, "step": 760 }, { "epoch": 0.3794880319148936, "grad_norm": 0.6109357476234436, "learning_rate": 9.978725971588903e-06, "loss": 0.5008, "step": 761 }, { "epoch": 0.37998670212765956, "grad_norm": 0.5778639316558838, "learning_rate": 9.978457725169679e-06, "loss": 0.4506, "step": 762 }, { "epoch": 0.3804853723404255, "grad_norm": 0.5498069524765015, "learning_rate": 9.978187801806677e-06, "loss": 0.4589, "step": 763 }, { "epoch": 0.3809840425531915, "grad_norm": 0.5254846215248108, "learning_rate": 9.977916201590818e-06, "loss": 0.4198, "step": 764 }, { "epoch": 0.38148271276595747, "grad_norm": 0.5505985617637634, "learning_rate": 9.977642924613585e-06, "loss": 0.4635, "step": 765 }, { "epoch": 0.3819813829787234, "grad_norm": 0.611925482749939, "learning_rate": 9.97736797096703e-06, "loss": 0.4344, "step": 766 }, { "epoch": 0.38248005319148937, "grad_norm": 0.5737864375114441, "learning_rate": 9.977091340743772e-06, "loss": 0.4357, "step": 767 }, { "epoch": 0.3829787234042553, "grad_norm": 0.6597175002098083, "learning_rate": 9.976813034036987e-06, "loss": 0.4691, "step": 768 }, { "epoch": 0.38347739361702127, "grad_norm": 0.681160569190979, "learning_rate": 9.976533050940423e-06, "loss": 0.4624, "step": 769 }, { "epoch": 0.3839760638297872, "grad_norm": 0.5469726324081421, "learning_rate": 9.976251391548384e-06, "loss": 0.4412, "step": 770 }, { "epoch": 0.38447473404255317, "grad_norm": 0.575087308883667, "learning_rate": 9.97596805595575e-06, "loss": 0.4721, "step": 771 }, { "epoch": 0.3849734042553192, "grad_norm": 0.6852418184280396, "learning_rate": 9.975683044257958e-06, "loss": 0.4817, "step": 772 }, { "epoch": 0.3854720744680851, "grad_norm": 0.5879660248756409, "learning_rate": 9.975396356551011e-06, "loss": 0.4897, "step": 773 }, { "epoch": 0.3859707446808511, "grad_norm": 0.5748050212860107, "learning_rate": 9.975107992931473e-06, "loss": 0.5109, "step": 774 }, { "epoch": 0.386469414893617, "grad_norm": 0.535886824131012, "learning_rate": 9.974817953496486e-06, "loss": 0.4307, "step": 775 }, { "epoch": 0.386968085106383, "grad_norm": 0.5423892140388489, "learning_rate": 9.974526238343737e-06, "loss": 0.4537, "step": 776 }, { "epoch": 0.3874667553191489, "grad_norm": 0.5860415101051331, "learning_rate": 9.974232847571492e-06, "loss": 0.4785, "step": 777 }, { "epoch": 0.3879654255319149, "grad_norm": 0.6277981996536255, "learning_rate": 9.973937781278578e-06, "loss": 0.4531, "step": 778 }, { "epoch": 0.3884640957446808, "grad_norm": 0.5574867129325867, "learning_rate": 9.973641039564381e-06, "loss": 0.451, "step": 779 }, { "epoch": 0.38896276595744683, "grad_norm": 0.5385394096374512, "learning_rate": 9.97334262252886e-06, "loss": 0.4601, "step": 780 }, { "epoch": 0.3894614361702128, "grad_norm": 0.6366627812385559, "learning_rate": 9.97304253027253e-06, "loss": 0.4688, "step": 781 }, { "epoch": 0.38996010638297873, "grad_norm": 0.639367938041687, "learning_rate": 9.972740762896476e-06, "loss": 0.4568, "step": 782 }, { "epoch": 0.3904587765957447, "grad_norm": 0.5876588821411133, "learning_rate": 9.972437320502347e-06, "loss": 0.466, "step": 783 }, { "epoch": 0.39095744680851063, "grad_norm": 0.576861560344696, "learning_rate": 9.97213220319235e-06, "loss": 0.4566, "step": 784 }, { "epoch": 0.3914561170212766, "grad_norm": 0.6306362152099609, "learning_rate": 9.971825411069264e-06, "loss": 0.4415, "step": 785 }, { "epoch": 0.39195478723404253, "grad_norm": 0.532692015171051, "learning_rate": 9.971516944236429e-06, "loss": 0.4715, "step": 786 }, { "epoch": 0.3924534574468085, "grad_norm": 0.5569854974746704, "learning_rate": 9.971206802797746e-06, "loss": 0.4499, "step": 787 }, { "epoch": 0.3929521276595745, "grad_norm": 0.5644725561141968, "learning_rate": 9.970894986857687e-06, "loss": 0.4612, "step": 788 }, { "epoch": 0.39345079787234044, "grad_norm": 0.5768519639968872, "learning_rate": 9.970581496521282e-06, "loss": 0.4513, "step": 789 }, { "epoch": 0.3939494680851064, "grad_norm": 0.6070574522018433, "learning_rate": 9.970266331894125e-06, "loss": 0.489, "step": 790 }, { "epoch": 0.39444813829787234, "grad_norm": 0.6357319951057434, "learning_rate": 9.96994949308238e-06, "loss": 0.4445, "step": 791 }, { "epoch": 0.3949468085106383, "grad_norm": 0.5688294768333435, "learning_rate": 9.96963098019277e-06, "loss": 0.4654, "step": 792 }, { "epoch": 0.39544547872340424, "grad_norm": 0.6604008078575134, "learning_rate": 9.969310793332583e-06, "loss": 0.4871, "step": 793 }, { "epoch": 0.3959441489361702, "grad_norm": 0.5909300446510315, "learning_rate": 9.96898893260967e-06, "loss": 0.4731, "step": 794 }, { "epoch": 0.39644281914893614, "grad_norm": 0.5924171209335327, "learning_rate": 9.968665398132444e-06, "loss": 0.47, "step": 795 }, { "epoch": 0.39694148936170215, "grad_norm": 0.6169602274894714, "learning_rate": 9.968340190009888e-06, "loss": 0.4384, "step": 796 }, { "epoch": 0.3974401595744681, "grad_norm": 0.6100724935531616, "learning_rate": 9.968013308351545e-06, "loss": 0.4766, "step": 797 }, { "epoch": 0.39793882978723405, "grad_norm": 0.5711425542831421, "learning_rate": 9.967684753267522e-06, "loss": 0.4952, "step": 798 }, { "epoch": 0.3984375, "grad_norm": 0.656146764755249, "learning_rate": 9.967354524868485e-06, "loss": 0.4796, "step": 799 }, { "epoch": 0.39893617021276595, "grad_norm": 0.6182783842086792, "learning_rate": 9.967022623265674e-06, "loss": 0.4647, "step": 800 }, { "epoch": 0.3994348404255319, "grad_norm": 0.655119001865387, "learning_rate": 9.966689048570884e-06, "loss": 0.4499, "step": 801 }, { "epoch": 0.39993351063829785, "grad_norm": 0.5906473994255066, "learning_rate": 9.966353800896475e-06, "loss": 0.4639, "step": 802 }, { "epoch": 0.40043218085106386, "grad_norm": 0.6991852521896362, "learning_rate": 9.966016880355375e-06, "loss": 0.4494, "step": 803 }, { "epoch": 0.4009308510638298, "grad_norm": 0.7669057250022888, "learning_rate": 9.965678287061068e-06, "loss": 0.4696, "step": 804 }, { "epoch": 0.40142952127659576, "grad_norm": 0.5442091226577759, "learning_rate": 9.965338021127612e-06, "loss": 0.4832, "step": 805 }, { "epoch": 0.4019281914893617, "grad_norm": 0.7099936604499817, "learning_rate": 9.964996082669617e-06, "loss": 0.4602, "step": 806 }, { "epoch": 0.40242686170212766, "grad_norm": 0.6741791367530823, "learning_rate": 9.964652471802264e-06, "loss": 0.472, "step": 807 }, { "epoch": 0.4029255319148936, "grad_norm": 0.6549288034439087, "learning_rate": 9.964307188641293e-06, "loss": 0.4597, "step": 808 }, { "epoch": 0.40342420212765956, "grad_norm": 0.734392523765564, "learning_rate": 9.963960233303012e-06, "loss": 0.4889, "step": 809 }, { "epoch": 0.4039228723404255, "grad_norm": 0.5933405160903931, "learning_rate": 9.963611605904286e-06, "loss": 0.4492, "step": 810 }, { "epoch": 0.4044215425531915, "grad_norm": 0.7134311199188232, "learning_rate": 9.96326130656255e-06, "loss": 0.4632, "step": 811 }, { "epoch": 0.40492021276595747, "grad_norm": 0.6931994557380676, "learning_rate": 9.962909335395796e-06, "loss": 0.4578, "step": 812 }, { "epoch": 0.4054188829787234, "grad_norm": 0.5808475017547607, "learning_rate": 9.962555692522583e-06, "loss": 0.4425, "step": 813 }, { "epoch": 0.40591755319148937, "grad_norm": 0.7136069536209106, "learning_rate": 9.962200378062032e-06, "loss": 0.4463, "step": 814 }, { "epoch": 0.4064162234042553, "grad_norm": 0.6464163661003113, "learning_rate": 9.96184339213383e-06, "loss": 0.4595, "step": 815 }, { "epoch": 0.40691489361702127, "grad_norm": 0.6713710427284241, "learning_rate": 9.961484734858218e-06, "loss": 0.4427, "step": 816 }, { "epoch": 0.4074135638297872, "grad_norm": 0.5937074422836304, "learning_rate": 9.96112440635601e-06, "loss": 0.4455, "step": 817 }, { "epoch": 0.40791223404255317, "grad_norm": 0.7482831478118896, "learning_rate": 9.960762406748581e-06, "loss": 0.4714, "step": 818 }, { "epoch": 0.4084109042553192, "grad_norm": 0.6630957722663879, "learning_rate": 9.960398736157864e-06, "loss": 0.4577, "step": 819 }, { "epoch": 0.4089095744680851, "grad_norm": 0.5945445895195007, "learning_rate": 9.960033394706359e-06, "loss": 0.4995, "step": 820 }, { "epoch": 0.4094082446808511, "grad_norm": 0.6590966582298279, "learning_rate": 9.959666382517125e-06, "loss": 0.489, "step": 821 }, { "epoch": 0.409906914893617, "grad_norm": 0.5619611740112305, "learning_rate": 9.95929769971379e-06, "loss": 0.4814, "step": 822 }, { "epoch": 0.410405585106383, "grad_norm": 0.5990843176841736, "learning_rate": 9.958927346420539e-06, "loss": 0.4793, "step": 823 }, { "epoch": 0.4109042553191489, "grad_norm": 0.5945183038711548, "learning_rate": 9.958555322762123e-06, "loss": 0.4467, "step": 824 }, { "epoch": 0.4114029255319149, "grad_norm": 0.5929149985313416, "learning_rate": 9.958181628863854e-06, "loss": 0.4228, "step": 825 }, { "epoch": 0.4119015957446808, "grad_norm": 0.6010994911193848, "learning_rate": 9.957806264851607e-06, "loss": 0.4731, "step": 826 }, { "epoch": 0.41240026595744683, "grad_norm": 0.6486637592315674, "learning_rate": 9.957429230851818e-06, "loss": 0.4586, "step": 827 }, { "epoch": 0.4128989361702128, "grad_norm": 0.6800046563148499, "learning_rate": 9.95705052699149e-06, "loss": 0.4454, "step": 828 }, { "epoch": 0.41339760638297873, "grad_norm": 0.5605024099349976, "learning_rate": 9.956670153398185e-06, "loss": 0.4562, "step": 829 }, { "epoch": 0.4138962765957447, "grad_norm": 0.5645004510879517, "learning_rate": 9.956288110200027e-06, "loss": 0.439, "step": 830 }, { "epoch": 0.41439494680851063, "grad_norm": 0.6155285239219666, "learning_rate": 9.955904397525703e-06, "loss": 0.4917, "step": 831 }, { "epoch": 0.4148936170212766, "grad_norm": 0.5094565153121948, "learning_rate": 9.955519015504464e-06, "loss": 0.4549, "step": 832 }, { "epoch": 0.41539228723404253, "grad_norm": 0.5561363697052002, "learning_rate": 9.955131964266121e-06, "loss": 0.4709, "step": 833 }, { "epoch": 0.4158909574468085, "grad_norm": 0.5618149638175964, "learning_rate": 9.95474324394105e-06, "loss": 0.4428, "step": 834 }, { "epoch": 0.4163896276595745, "grad_norm": 0.5247511863708496, "learning_rate": 9.954352854660187e-06, "loss": 0.4515, "step": 835 }, { "epoch": 0.41688829787234044, "grad_norm": 0.5212857723236084, "learning_rate": 9.953960796555029e-06, "loss": 0.4497, "step": 836 }, { "epoch": 0.4173869680851064, "grad_norm": 0.6652053594589233, "learning_rate": 9.953567069757638e-06, "loss": 0.5007, "step": 837 }, { "epoch": 0.41788563829787234, "grad_norm": 0.603585422039032, "learning_rate": 9.953171674400638e-06, "loss": 0.4822, "step": 838 }, { "epoch": 0.4183843085106383, "grad_norm": 0.5788070559501648, "learning_rate": 9.952774610617213e-06, "loss": 0.4891, "step": 839 }, { "epoch": 0.41888297872340424, "grad_norm": 0.5356699824333191, "learning_rate": 9.952375878541109e-06, "loss": 0.4494, "step": 840 }, { "epoch": 0.4193816489361702, "grad_norm": 0.5645936131477356, "learning_rate": 9.951975478306638e-06, "loss": 0.432, "step": 841 }, { "epoch": 0.41988031914893614, "grad_norm": 0.5924050211906433, "learning_rate": 9.951573410048668e-06, "loss": 0.4664, "step": 842 }, { "epoch": 0.42037898936170215, "grad_norm": 0.5700175762176514, "learning_rate": 9.951169673902632e-06, "loss": 0.4851, "step": 843 }, { "epoch": 0.4208776595744681, "grad_norm": 0.5166407227516174, "learning_rate": 9.950764270004525e-06, "loss": 0.4721, "step": 844 }, { "epoch": 0.42137632978723405, "grad_norm": 0.5897667407989502, "learning_rate": 9.950357198490903e-06, "loss": 0.4502, "step": 845 }, { "epoch": 0.421875, "grad_norm": 0.6189143657684326, "learning_rate": 9.949948459498885e-06, "loss": 0.4509, "step": 846 }, { "epoch": 0.42237367021276595, "grad_norm": 0.5847206115722656, "learning_rate": 9.949538053166148e-06, "loss": 0.4557, "step": 847 }, { "epoch": 0.4228723404255319, "grad_norm": 0.653302788734436, "learning_rate": 9.949125979630937e-06, "loss": 0.4516, "step": 848 }, { "epoch": 0.42337101063829785, "grad_norm": 0.6379662752151489, "learning_rate": 9.948712239032052e-06, "loss": 0.4373, "step": 849 }, { "epoch": 0.42386968085106386, "grad_norm": 0.6994174122810364, "learning_rate": 9.94829683150886e-06, "loss": 0.4565, "step": 850 }, { "epoch": 0.4243683510638298, "grad_norm": 0.5767796039581299, "learning_rate": 9.947879757201282e-06, "loss": 0.4792, "step": 851 }, { "epoch": 0.42486702127659576, "grad_norm": 0.6765949130058289, "learning_rate": 9.947461016249811e-06, "loss": 0.4669, "step": 852 }, { "epoch": 0.4253656914893617, "grad_norm": 0.6781662106513977, "learning_rate": 9.947040608795492e-06, "loss": 0.4838, "step": 853 }, { "epoch": 0.42586436170212766, "grad_norm": 0.5827946662902832, "learning_rate": 9.946618534979938e-06, "loss": 0.4533, "step": 854 }, { "epoch": 0.4263630319148936, "grad_norm": 0.6133431792259216, "learning_rate": 9.946194794945319e-06, "loss": 0.4528, "step": 855 }, { "epoch": 0.42686170212765956, "grad_norm": 0.6037358045578003, "learning_rate": 9.945769388834365e-06, "loss": 0.4539, "step": 856 }, { "epoch": 0.4273603723404255, "grad_norm": 0.5878170728683472, "learning_rate": 9.945342316790374e-06, "loss": 0.4549, "step": 857 }, { "epoch": 0.4278590425531915, "grad_norm": 0.5381538271903992, "learning_rate": 9.944913578957198e-06, "loss": 0.4964, "step": 858 }, { "epoch": 0.42835771276595747, "grad_norm": 0.5875141024589539, "learning_rate": 9.944483175479255e-06, "loss": 0.4143, "step": 859 }, { "epoch": 0.4288563829787234, "grad_norm": 0.689507007598877, "learning_rate": 9.94405110650152e-06, "loss": 0.4691, "step": 860 }, { "epoch": 0.42935505319148937, "grad_norm": 0.5849044919013977, "learning_rate": 9.943617372169532e-06, "loss": 0.4944, "step": 861 }, { "epoch": 0.4298537234042553, "grad_norm": 0.5292565226554871, "learning_rate": 9.943181972629391e-06, "loss": 0.4542, "step": 862 }, { "epoch": 0.43035239361702127, "grad_norm": 0.6500378251075745, "learning_rate": 9.942744908027757e-06, "loss": 0.4666, "step": 863 }, { "epoch": 0.4308510638297872, "grad_norm": 0.5910899043083191, "learning_rate": 9.942306178511849e-06, "loss": 0.4571, "step": 864 }, { "epoch": 0.43134973404255317, "grad_norm": 0.49802353978157043, "learning_rate": 9.94186578422945e-06, "loss": 0.4411, "step": 865 }, { "epoch": 0.4318484042553192, "grad_norm": 0.5470302104949951, "learning_rate": 9.941423725328904e-06, "loss": 0.4366, "step": 866 }, { "epoch": 0.4323470744680851, "grad_norm": 0.549688458442688, "learning_rate": 9.940980001959111e-06, "loss": 0.4799, "step": 867 }, { "epoch": 0.4328457446808511, "grad_norm": 0.489200234413147, "learning_rate": 9.940534614269537e-06, "loss": 0.4174, "step": 868 }, { "epoch": 0.433344414893617, "grad_norm": 0.5537315607070923, "learning_rate": 9.940087562410205e-06, "loss": 0.4897, "step": 869 }, { "epoch": 0.433843085106383, "grad_norm": 0.6452416777610779, "learning_rate": 9.939638846531701e-06, "loss": 0.4445, "step": 870 }, { "epoch": 0.4343417553191489, "grad_norm": 0.6428449153900146, "learning_rate": 9.93918846678517e-06, "loss": 0.4799, "step": 871 }, { "epoch": 0.4348404255319149, "grad_norm": 0.6568225026130676, "learning_rate": 9.938736423322317e-06, "loss": 0.4661, "step": 872 }, { "epoch": 0.4353390957446808, "grad_norm": 0.5872544050216675, "learning_rate": 9.938282716295411e-06, "loss": 0.4808, "step": 873 }, { "epoch": 0.43583776595744683, "grad_norm": 0.6722471117973328, "learning_rate": 9.937827345857275e-06, "loss": 0.4329, "step": 874 }, { "epoch": 0.4363364361702128, "grad_norm": 0.4827336370944977, "learning_rate": 9.937370312161298e-06, "loss": 0.4369, "step": 875 }, { "epoch": 0.43683510638297873, "grad_norm": 0.6470399498939514, "learning_rate": 9.936911615361428e-06, "loss": 0.4867, "step": 876 }, { "epoch": 0.4373337765957447, "grad_norm": 0.5818237066268921, "learning_rate": 9.936451255612173e-06, "loss": 0.4855, "step": 877 }, { "epoch": 0.43783244680851063, "grad_norm": 0.5730016827583313, "learning_rate": 9.935989233068597e-06, "loss": 0.4505, "step": 878 }, { "epoch": 0.4383311170212766, "grad_norm": 0.5286703705787659, "learning_rate": 9.935525547886331e-06, "loss": 0.4742, "step": 879 }, { "epoch": 0.43882978723404253, "grad_norm": 0.5706779360771179, "learning_rate": 9.935060200221563e-06, "loss": 0.4702, "step": 880 }, { "epoch": 0.4393284574468085, "grad_norm": 0.5283784866333008, "learning_rate": 9.934593190231037e-06, "loss": 0.4818, "step": 881 }, { "epoch": 0.4398271276595745, "grad_norm": 0.6147560477256775, "learning_rate": 9.934124518072064e-06, "loss": 0.4665, "step": 882 }, { "epoch": 0.44032579787234044, "grad_norm": 0.6017918586730957, "learning_rate": 9.93365418390251e-06, "loss": 0.4785, "step": 883 }, { "epoch": 0.4408244680851064, "grad_norm": 0.5892265439033508, "learning_rate": 9.933182187880804e-06, "loss": 0.4494, "step": 884 }, { "epoch": 0.44132313829787234, "grad_norm": 0.5409145355224609, "learning_rate": 9.93270853016593e-06, "loss": 0.4434, "step": 885 }, { "epoch": 0.4418218085106383, "grad_norm": 0.6884061694145203, "learning_rate": 9.93223321091744e-06, "loss": 0.4644, "step": 886 }, { "epoch": 0.44232047872340424, "grad_norm": 0.6470730900764465, "learning_rate": 9.931756230295434e-06, "loss": 0.4574, "step": 887 }, { "epoch": 0.4428191489361702, "grad_norm": 0.5104542374610901, "learning_rate": 9.931277588460583e-06, "loss": 0.5022, "step": 888 }, { "epoch": 0.44331781914893614, "grad_norm": 0.6373216509819031, "learning_rate": 9.930797285574112e-06, "loss": 0.4598, "step": 889 }, { "epoch": 0.44381648936170215, "grad_norm": 0.5751199126243591, "learning_rate": 9.930315321797805e-06, "loss": 0.4517, "step": 890 }, { "epoch": 0.4443151595744681, "grad_norm": 0.5065343976020813, "learning_rate": 9.929831697294008e-06, "loss": 0.4663, "step": 891 }, { "epoch": 0.44481382978723405, "grad_norm": 0.5543419122695923, "learning_rate": 9.929346412225622e-06, "loss": 0.4621, "step": 892 }, { "epoch": 0.4453125, "grad_norm": 0.5836206078529358, "learning_rate": 9.928859466756115e-06, "loss": 0.4475, "step": 893 }, { "epoch": 0.44581117021276595, "grad_norm": 0.6153258085250854, "learning_rate": 9.928370861049507e-06, "loss": 0.4983, "step": 894 }, { "epoch": 0.4463098404255319, "grad_norm": 0.5092177987098694, "learning_rate": 9.92788059527038e-06, "loss": 0.4792, "step": 895 }, { "epoch": 0.44680851063829785, "grad_norm": 0.5541229844093323, "learning_rate": 9.927388669583876e-06, "loss": 0.4759, "step": 896 }, { "epoch": 0.44730718085106386, "grad_norm": 0.5747464299201965, "learning_rate": 9.926895084155694e-06, "loss": 0.4138, "step": 897 }, { "epoch": 0.4478058510638298, "grad_norm": 0.5651940107345581, "learning_rate": 9.926399839152096e-06, "loss": 0.4551, "step": 898 }, { "epoch": 0.44830452127659576, "grad_norm": 0.5643485188484192, "learning_rate": 9.925902934739897e-06, "loss": 0.4353, "step": 899 }, { "epoch": 0.4488031914893617, "grad_norm": 0.6057491302490234, "learning_rate": 9.925404371086477e-06, "loss": 0.4298, "step": 900 }, { "epoch": 0.44930186170212766, "grad_norm": 0.5665568709373474, "learning_rate": 9.92490414835977e-06, "loss": 0.4483, "step": 901 }, { "epoch": 0.4498005319148936, "grad_norm": 0.5522546768188477, "learning_rate": 9.924402266728273e-06, "loss": 0.4546, "step": 902 }, { "epoch": 0.45029920212765956, "grad_norm": 0.5380772948265076, "learning_rate": 9.923898726361037e-06, "loss": 0.422, "step": 903 }, { "epoch": 0.4507978723404255, "grad_norm": 0.6127991676330566, "learning_rate": 9.923393527427677e-06, "loss": 0.4465, "step": 904 }, { "epoch": 0.4512965425531915, "grad_norm": 0.6245388388633728, "learning_rate": 9.922886670098364e-06, "loss": 0.4244, "step": 905 }, { "epoch": 0.45179521276595747, "grad_norm": 0.632032036781311, "learning_rate": 9.922378154543828e-06, "loss": 0.4676, "step": 906 }, { "epoch": 0.4522938829787234, "grad_norm": 0.6851885914802551, "learning_rate": 9.921867980935355e-06, "loss": 0.4804, "step": 907 }, { "epoch": 0.45279255319148937, "grad_norm": 0.5241862535476685, "learning_rate": 9.921356149444795e-06, "loss": 0.4386, "step": 908 }, { "epoch": 0.4532912234042553, "grad_norm": 0.5754929184913635, "learning_rate": 9.920842660244552e-06, "loss": 0.4549, "step": 909 }, { "epoch": 0.45378989361702127, "grad_norm": 0.6384445428848267, "learning_rate": 9.92032751350759e-06, "loss": 0.4584, "step": 910 }, { "epoch": 0.4542885638297872, "grad_norm": 0.5751316547393799, "learning_rate": 9.919810709407429e-06, "loss": 0.4662, "step": 911 }, { "epoch": 0.45478723404255317, "grad_norm": 0.6750133037567139, "learning_rate": 9.919292248118153e-06, "loss": 0.4674, "step": 912 }, { "epoch": 0.4552859042553192, "grad_norm": 0.573419451713562, "learning_rate": 9.918772129814398e-06, "loss": 0.4515, "step": 913 }, { "epoch": 0.4557845744680851, "grad_norm": 0.5701740980148315, "learning_rate": 9.918250354671361e-06, "loss": 0.4366, "step": 914 }, { "epoch": 0.4562832446808511, "grad_norm": 0.6698089838027954, "learning_rate": 9.917726922864796e-06, "loss": 0.4706, "step": 915 }, { "epoch": 0.456781914893617, "grad_norm": 0.5825135707855225, "learning_rate": 9.917201834571019e-06, "loss": 0.4287, "step": 916 }, { "epoch": 0.457280585106383, "grad_norm": 0.5864697694778442, "learning_rate": 9.916675089966896e-06, "loss": 0.4555, "step": 917 }, { "epoch": 0.4577792553191489, "grad_norm": 0.6039130091667175, "learning_rate": 9.91614668922986e-06, "loss": 0.4251, "step": 918 }, { "epoch": 0.4582779255319149, "grad_norm": 0.6010239720344543, "learning_rate": 9.915616632537895e-06, "loss": 0.4447, "step": 919 }, { "epoch": 0.4587765957446808, "grad_norm": 0.6926130652427673, "learning_rate": 9.915084920069546e-06, "loss": 0.4851, "step": 920 }, { "epoch": 0.45927526595744683, "grad_norm": 0.610931932926178, "learning_rate": 9.914551552003915e-06, "loss": 0.4654, "step": 921 }, { "epoch": 0.4597739361702128, "grad_norm": 0.7406291961669922, "learning_rate": 9.914016528520662e-06, "loss": 0.4603, "step": 922 }, { "epoch": 0.46027260638297873, "grad_norm": 0.6902374029159546, "learning_rate": 9.913479849800006e-06, "loss": 0.4571, "step": 923 }, { "epoch": 0.4607712765957447, "grad_norm": 0.5054622888565063, "learning_rate": 9.912941516022716e-06, "loss": 0.4526, "step": 924 }, { "epoch": 0.46126994680851063, "grad_norm": 0.6752886176109314, "learning_rate": 9.91240152737013e-06, "loss": 0.4753, "step": 925 }, { "epoch": 0.4617686170212766, "grad_norm": 0.6302437782287598, "learning_rate": 9.911859884024137e-06, "loss": 0.4628, "step": 926 }, { "epoch": 0.46226728723404253, "grad_norm": 0.5528219938278198, "learning_rate": 9.91131658616718e-06, "loss": 0.4241, "step": 927 }, { "epoch": 0.4627659574468085, "grad_norm": 0.6522142887115479, "learning_rate": 9.91077163398227e-06, "loss": 0.4556, "step": 928 }, { "epoch": 0.4632646276595745, "grad_norm": 0.5732448697090149, "learning_rate": 9.910225027652965e-06, "loss": 0.4443, "step": 929 }, { "epoch": 0.46376329787234044, "grad_norm": 0.6012253761291504, "learning_rate": 9.909676767363383e-06, "loss": 0.4511, "step": 930 }, { "epoch": 0.4642619680851064, "grad_norm": 0.5971193313598633, "learning_rate": 9.909126853298202e-06, "loss": 0.4415, "step": 931 }, { "epoch": 0.46476063829787234, "grad_norm": 0.6436319947242737, "learning_rate": 9.908575285642654e-06, "loss": 0.5, "step": 932 }, { "epoch": 0.4652593085106383, "grad_norm": 0.6636220812797546, "learning_rate": 9.90802206458253e-06, "loss": 0.4694, "step": 933 }, { "epoch": 0.46575797872340424, "grad_norm": 0.5358365774154663, "learning_rate": 9.907467190304176e-06, "loss": 0.4577, "step": 934 }, { "epoch": 0.4662566489361702, "grad_norm": 0.6162282228469849, "learning_rate": 9.906910662994496e-06, "loss": 0.4786, "step": 935 }, { "epoch": 0.46675531914893614, "grad_norm": 0.6093978881835938, "learning_rate": 9.906352482840951e-06, "loss": 0.4183, "step": 936 }, { "epoch": 0.46725398936170215, "grad_norm": 0.5885254144668579, "learning_rate": 9.905792650031558e-06, "loss": 0.4542, "step": 937 }, { "epoch": 0.4677526595744681, "grad_norm": 0.5525876879692078, "learning_rate": 9.905231164754892e-06, "loss": 0.4607, "step": 938 }, { "epoch": 0.46825132978723405, "grad_norm": 0.5817450284957886, "learning_rate": 9.904668027200082e-06, "loss": 0.4482, "step": 939 }, { "epoch": 0.46875, "grad_norm": 0.6003032922744751, "learning_rate": 9.904103237556816e-06, "loss": 0.4627, "step": 940 }, { "epoch": 0.46924867021276595, "grad_norm": 0.6007341146469116, "learning_rate": 9.90353679601534e-06, "loss": 0.4433, "step": 941 }, { "epoch": 0.4697473404255319, "grad_norm": 0.551874041557312, "learning_rate": 9.902968702766452e-06, "loss": 0.4641, "step": 942 }, { "epoch": 0.47024601063829785, "grad_norm": 0.5690295100212097, "learning_rate": 9.902398958001509e-06, "loss": 0.4136, "step": 943 }, { "epoch": 0.47074468085106386, "grad_norm": 0.5859566926956177, "learning_rate": 9.901827561912422e-06, "loss": 0.4755, "step": 944 }, { "epoch": 0.4712433510638298, "grad_norm": 0.5340538620948792, "learning_rate": 9.901254514691663e-06, "loss": 0.4732, "step": 945 }, { "epoch": 0.47174202127659576, "grad_norm": 0.5762385725975037, "learning_rate": 9.900679816532256e-06, "loss": 0.4527, "step": 946 }, { "epoch": 0.4722406914893617, "grad_norm": 0.5510350465774536, "learning_rate": 9.900103467627781e-06, "loss": 0.4581, "step": 947 }, { "epoch": 0.47273936170212766, "grad_norm": 0.5575346350669861, "learning_rate": 9.89952546817238e-06, "loss": 0.4517, "step": 948 }, { "epoch": 0.4732380319148936, "grad_norm": 0.5643312931060791, "learning_rate": 9.898945818360739e-06, "loss": 0.4699, "step": 949 }, { "epoch": 0.47373670212765956, "grad_norm": 0.6174932718276978, "learning_rate": 9.898364518388114e-06, "loss": 0.4563, "step": 950 }, { "epoch": 0.4742353723404255, "grad_norm": 0.6141154766082764, "learning_rate": 9.897781568450306e-06, "loss": 0.4567, "step": 951 }, { "epoch": 0.4747340425531915, "grad_norm": 0.6544926166534424, "learning_rate": 9.897196968743679e-06, "loss": 0.4778, "step": 952 }, { "epoch": 0.47523271276595747, "grad_norm": 0.5472080707550049, "learning_rate": 9.896610719465145e-06, "loss": 0.4258, "step": 953 }, { "epoch": 0.4757313829787234, "grad_norm": 0.6952992081642151, "learning_rate": 9.896022820812182e-06, "loss": 0.4264, "step": 954 }, { "epoch": 0.47623005319148937, "grad_norm": 0.638692319393158, "learning_rate": 9.895433272982812e-06, "loss": 0.4488, "step": 955 }, { "epoch": 0.4767287234042553, "grad_norm": 0.6165536642074585, "learning_rate": 9.894842076175623e-06, "loss": 0.4307, "step": 956 }, { "epoch": 0.47722739361702127, "grad_norm": 0.5952292680740356, "learning_rate": 9.894249230589753e-06, "loss": 0.4913, "step": 957 }, { "epoch": 0.4777260638297872, "grad_norm": 0.5969880819320679, "learning_rate": 9.893654736424892e-06, "loss": 0.4605, "step": 958 }, { "epoch": 0.47822473404255317, "grad_norm": 0.752267599105835, "learning_rate": 9.893058593881293e-06, "loss": 0.489, "step": 959 }, { "epoch": 0.4787234042553192, "grad_norm": 0.5699406266212463, "learning_rate": 9.89246080315976e-06, "loss": 0.4376, "step": 960 }, { "epoch": 0.4792220744680851, "grad_norm": 0.6625651121139526, "learning_rate": 9.891861364461654e-06, "loss": 0.4575, "step": 961 }, { "epoch": 0.4797207446808511, "grad_norm": 0.7115068435668945, "learning_rate": 9.89126027798889e-06, "loss": 0.4501, "step": 962 }, { "epoch": 0.480219414893617, "grad_norm": 0.724976122379303, "learning_rate": 9.890657543943936e-06, "loss": 0.4542, "step": 963 }, { "epoch": 0.480718085106383, "grad_norm": 0.70829838514328, "learning_rate": 9.890053162529815e-06, "loss": 0.4727, "step": 964 }, { "epoch": 0.4812167553191489, "grad_norm": 0.6398433446884155, "learning_rate": 9.889447133950113e-06, "loss": 0.4834, "step": 965 }, { "epoch": 0.4817154255319149, "grad_norm": 0.7667595148086548, "learning_rate": 9.88883945840896e-06, "loss": 0.4535, "step": 966 }, { "epoch": 0.4822140957446808, "grad_norm": 0.5984047055244446, "learning_rate": 9.888230136111047e-06, "loss": 0.459, "step": 967 }, { "epoch": 0.48271276595744683, "grad_norm": 0.6970547437667847, "learning_rate": 9.887619167261618e-06, "loss": 0.4605, "step": 968 }, { "epoch": 0.4832114361702128, "grad_norm": 0.6445826888084412, "learning_rate": 9.887006552066471e-06, "loss": 0.4868, "step": 969 }, { "epoch": 0.48371010638297873, "grad_norm": 0.6457627415657043, "learning_rate": 9.88639229073196e-06, "loss": 0.4569, "step": 970 }, { "epoch": 0.4842087765957447, "grad_norm": 0.6434492468833923, "learning_rate": 9.885776383464994e-06, "loss": 0.4333, "step": 971 }, { "epoch": 0.48470744680851063, "grad_norm": 0.552403450012207, "learning_rate": 9.885158830473031e-06, "loss": 0.4672, "step": 972 }, { "epoch": 0.4852061170212766, "grad_norm": 0.601506233215332, "learning_rate": 9.884539631964094e-06, "loss": 0.4371, "step": 973 }, { "epoch": 0.48570478723404253, "grad_norm": 0.6108084321022034, "learning_rate": 9.883918788146749e-06, "loss": 0.4324, "step": 974 }, { "epoch": 0.4862034574468085, "grad_norm": 0.5702722668647766, "learning_rate": 9.883296299230121e-06, "loss": 0.4418, "step": 975 }, { "epoch": 0.4867021276595745, "grad_norm": 0.7844724059104919, "learning_rate": 9.882672165423893e-06, "loss": 0.4809, "step": 976 }, { "epoch": 0.48720079787234044, "grad_norm": 0.6615170836448669, "learning_rate": 9.882046386938293e-06, "loss": 0.4946, "step": 977 }, { "epoch": 0.4876994680851064, "grad_norm": 0.6998576521873474, "learning_rate": 9.881418963984114e-06, "loss": 0.446, "step": 978 }, { "epoch": 0.48819813829787234, "grad_norm": 0.6735450625419617, "learning_rate": 9.88078989677269e-06, "loss": 0.4439, "step": 979 }, { "epoch": 0.4886968085106383, "grad_norm": 0.5947353839874268, "learning_rate": 9.880159185515925e-06, "loss": 0.4678, "step": 980 }, { "epoch": 0.48919547872340424, "grad_norm": 0.6660810708999634, "learning_rate": 9.87952683042626e-06, "loss": 0.4622, "step": 981 }, { "epoch": 0.4896941489361702, "grad_norm": 0.6978661417961121, "learning_rate": 9.878892831716702e-06, "loss": 0.459, "step": 982 }, { "epoch": 0.49019281914893614, "grad_norm": 0.6377536654472351, "learning_rate": 9.878257189600804e-06, "loss": 0.4455, "step": 983 }, { "epoch": 0.49069148936170215, "grad_norm": 0.8032950758934021, "learning_rate": 9.87761990429268e-06, "loss": 0.4577, "step": 984 }, { "epoch": 0.4911901595744681, "grad_norm": 0.6370677351951599, "learning_rate": 9.876980976006987e-06, "loss": 0.478, "step": 985 }, { "epoch": 0.49168882978723405, "grad_norm": 0.5885022282600403, "learning_rate": 9.876340404958949e-06, "loss": 0.4327, "step": 986 }, { "epoch": 0.4921875, "grad_norm": 0.6617292761802673, "learning_rate": 9.875698191364328e-06, "loss": 0.4409, "step": 987 }, { "epoch": 0.49268617021276595, "grad_norm": 0.725048840045929, "learning_rate": 9.875054335439454e-06, "loss": 0.4511, "step": 988 }, { "epoch": 0.4931848404255319, "grad_norm": 0.5570661425590515, "learning_rate": 9.874408837401198e-06, "loss": 0.4305, "step": 989 }, { "epoch": 0.49368351063829785, "grad_norm": 0.583367645740509, "learning_rate": 9.873761697466997e-06, "loss": 0.4582, "step": 990 }, { "epoch": 0.49418218085106386, "grad_norm": 0.6612867712974548, "learning_rate": 9.873112915854826e-06, "loss": 0.4799, "step": 991 }, { "epoch": 0.4946808510638298, "grad_norm": 0.5954098701477051, "learning_rate": 9.872462492783223e-06, "loss": 0.4848, "step": 992 }, { "epoch": 0.49517952127659576, "grad_norm": 0.5272772908210754, "learning_rate": 9.87181042847128e-06, "loss": 0.4547, "step": 993 }, { "epoch": 0.4956781914893617, "grad_norm": 0.5894871354103088, "learning_rate": 9.871156723138633e-06, "loss": 0.4582, "step": 994 }, { "epoch": 0.49617686170212766, "grad_norm": 0.5588670969009399, "learning_rate": 9.870501377005479e-06, "loss": 0.43, "step": 995 }, { "epoch": 0.4966755319148936, "grad_norm": 0.5750675797462463, "learning_rate": 9.869844390292564e-06, "loss": 0.4614, "step": 996 }, { "epoch": 0.49717420212765956, "grad_norm": 0.5324882864952087, "learning_rate": 9.869185763221188e-06, "loss": 0.433, "step": 997 }, { "epoch": 0.4976728723404255, "grad_norm": 0.5476863980293274, "learning_rate": 9.868525496013204e-06, "loss": 0.4616, "step": 998 }, { "epoch": 0.4981715425531915, "grad_norm": 0.48413336277008057, "learning_rate": 9.867863588891013e-06, "loss": 0.4142, "step": 999 }, { "epoch": 0.49867021276595747, "grad_norm": 0.5406008958816528, "learning_rate": 9.867200042077576e-06, "loss": 0.4148, "step": 1000 }, { "epoch": 0.4991688829787234, "grad_norm": 0.5358157157897949, "learning_rate": 9.866534855796398e-06, "loss": 0.4463, "step": 1001 }, { "epoch": 0.49966755319148937, "grad_norm": 0.5778772830963135, "learning_rate": 9.865868030271544e-06, "loss": 0.4335, "step": 1002 }, { "epoch": 0.5001662234042553, "grad_norm": 0.5577998161315918, "learning_rate": 9.865199565727626e-06, "loss": 0.4573, "step": 1003 }, { "epoch": 0.5006648936170213, "grad_norm": 0.5663499236106873, "learning_rate": 9.864529462389808e-06, "loss": 0.4396, "step": 1004 }, { "epoch": 0.5011635638297872, "grad_norm": 0.5516644716262817, "learning_rate": 9.86385772048381e-06, "loss": 0.4528, "step": 1005 }, { "epoch": 0.5016622340425532, "grad_norm": 0.582984983921051, "learning_rate": 9.863184340235901e-06, "loss": 0.4601, "step": 1006 }, { "epoch": 0.5021609042553191, "grad_norm": 0.5511255860328674, "learning_rate": 9.862509321872902e-06, "loss": 0.4472, "step": 1007 }, { "epoch": 0.5026595744680851, "grad_norm": 0.5138124823570251, "learning_rate": 9.861832665622187e-06, "loss": 0.4277, "step": 1008 }, { "epoch": 0.503158244680851, "grad_norm": 0.6183996200561523, "learning_rate": 9.86115437171168e-06, "loss": 0.4678, "step": 1009 }, { "epoch": 0.503656914893617, "grad_norm": 0.5740852355957031, "learning_rate": 9.860474440369856e-06, "loss": 0.4625, "step": 1010 }, { "epoch": 0.504155585106383, "grad_norm": 0.5919680595397949, "learning_rate": 9.859792871825747e-06, "loss": 0.4847, "step": 1011 }, { "epoch": 0.504654255319149, "grad_norm": 0.5157231688499451, "learning_rate": 9.85910966630893e-06, "loss": 0.4167, "step": 1012 }, { "epoch": 0.5051529255319149, "grad_norm": 0.6342364549636841, "learning_rate": 9.858424824049536e-06, "loss": 0.4641, "step": 1013 }, { "epoch": 0.5056515957446809, "grad_norm": 0.5353963375091553, "learning_rate": 9.857738345278247e-06, "loss": 0.4211, "step": 1014 }, { "epoch": 0.5061502659574468, "grad_norm": 0.4991030991077423, "learning_rate": 9.857050230226299e-06, "loss": 0.4257, "step": 1015 }, { "epoch": 0.5066489361702128, "grad_norm": 0.6020225286483765, "learning_rate": 9.856360479125472e-06, "loss": 0.4451, "step": 1016 }, { "epoch": 0.5071476063829787, "grad_norm": 0.5796292424201965, "learning_rate": 9.855669092208108e-06, "loss": 0.4538, "step": 1017 }, { "epoch": 0.5076462765957447, "grad_norm": 0.5898211002349854, "learning_rate": 9.854976069707089e-06, "loss": 0.4111, "step": 1018 }, { "epoch": 0.5081449468085106, "grad_norm": 0.5827491879463196, "learning_rate": 9.854281411855853e-06, "loss": 0.4724, "step": 1019 }, { "epoch": 0.5086436170212766, "grad_norm": 0.5727003216743469, "learning_rate": 9.853585118888391e-06, "loss": 0.4484, "step": 1020 }, { "epoch": 0.5091422872340425, "grad_norm": 0.6156240701675415, "learning_rate": 9.85288719103924e-06, "loss": 0.4768, "step": 1021 }, { "epoch": 0.5096409574468085, "grad_norm": 0.5891485810279846, "learning_rate": 9.852187628543492e-06, "loss": 0.4364, "step": 1022 }, { "epoch": 0.5101396276595744, "grad_norm": 0.5611181855201721, "learning_rate": 9.851486431636786e-06, "loss": 0.4313, "step": 1023 }, { "epoch": 0.5106382978723404, "grad_norm": 0.6101273894309998, "learning_rate": 9.850783600555314e-06, "loss": 0.4374, "step": 1024 }, { "epoch": 0.5111369680851063, "grad_norm": 0.5118335485458374, "learning_rate": 9.850079135535817e-06, "loss": 0.4778, "step": 1025 }, { "epoch": 0.5116356382978723, "grad_norm": 0.6365725994110107, "learning_rate": 9.849373036815588e-06, "loss": 0.4659, "step": 1026 }, { "epoch": 0.5121343085106383, "grad_norm": 0.595096230506897, "learning_rate": 9.848665304632469e-06, "loss": 0.5079, "step": 1027 }, { "epoch": 0.5126329787234043, "grad_norm": 0.5607242584228516, "learning_rate": 9.847955939224853e-06, "loss": 0.4692, "step": 1028 }, { "epoch": 0.5131316489361702, "grad_norm": 0.715101957321167, "learning_rate": 9.847244940831682e-06, "loss": 0.4706, "step": 1029 }, { "epoch": 0.5136303191489362, "grad_norm": 0.5631657838821411, "learning_rate": 9.846532309692448e-06, "loss": 0.4512, "step": 1030 }, { "epoch": 0.5141289893617021, "grad_norm": 0.6842343807220459, "learning_rate": 9.845818046047194e-06, "loss": 0.454, "step": 1031 }, { "epoch": 0.5146276595744681, "grad_norm": 0.621405839920044, "learning_rate": 9.845102150136512e-06, "loss": 0.4899, "step": 1032 }, { "epoch": 0.515126329787234, "grad_norm": 0.5219696164131165, "learning_rate": 9.844384622201546e-06, "loss": 0.4336, "step": 1033 }, { "epoch": 0.515625, "grad_norm": 0.5612657070159912, "learning_rate": 9.843665462483988e-06, "loss": 0.4545, "step": 1034 }, { "epoch": 0.516123670212766, "grad_norm": 0.6294629573822021, "learning_rate": 9.842944671226081e-06, "loss": 0.4619, "step": 1035 }, { "epoch": 0.5166223404255319, "grad_norm": 0.5542619824409485, "learning_rate": 9.842222248670612e-06, "loss": 0.4356, "step": 1036 }, { "epoch": 0.5171210106382979, "grad_norm": 0.5759193897247314, "learning_rate": 9.841498195060925e-06, "loss": 0.4356, "step": 1037 }, { "epoch": 0.5176196808510638, "grad_norm": 0.5824272632598877, "learning_rate": 9.84077251064091e-06, "loss": 0.4485, "step": 1038 }, { "epoch": 0.5181183510638298, "grad_norm": 0.5242204070091248, "learning_rate": 9.840045195655006e-06, "loss": 0.4365, "step": 1039 }, { "epoch": 0.5186170212765957, "grad_norm": 0.4884873330593109, "learning_rate": 9.8393162503482e-06, "loss": 0.4248, "step": 1040 }, { "epoch": 0.5191156914893617, "grad_norm": 0.5428763628005981, "learning_rate": 9.838585674966034e-06, "loss": 0.456, "step": 1041 }, { "epoch": 0.5196143617021277, "grad_norm": 0.5338824391365051, "learning_rate": 9.837853469754595e-06, "loss": 0.4431, "step": 1042 }, { "epoch": 0.5201130319148937, "grad_norm": 0.5955228209495544, "learning_rate": 9.837119634960513e-06, "loss": 0.4438, "step": 1043 }, { "epoch": 0.5206117021276596, "grad_norm": 0.6417208313941956, "learning_rate": 9.83638417083098e-06, "loss": 0.4333, "step": 1044 }, { "epoch": 0.5211103723404256, "grad_norm": 0.5724093914031982, "learning_rate": 9.835647077613725e-06, "loss": 0.4501, "step": 1045 }, { "epoch": 0.5216090425531915, "grad_norm": 0.5465425848960876, "learning_rate": 9.834908355557031e-06, "loss": 0.4358, "step": 1046 }, { "epoch": 0.5221077127659575, "grad_norm": 0.5672125220298767, "learning_rate": 9.834168004909732e-06, "loss": 0.4354, "step": 1047 }, { "epoch": 0.5226063829787234, "grad_norm": 0.5744042992591858, "learning_rate": 9.833426025921206e-06, "loss": 0.4295, "step": 1048 }, { "epoch": 0.5231050531914894, "grad_norm": 0.5840733051300049, "learning_rate": 9.832682418841381e-06, "loss": 0.446, "step": 1049 }, { "epoch": 0.5236037234042553, "grad_norm": 0.6003197431564331, "learning_rate": 9.831937183920734e-06, "loss": 0.4715, "step": 1050 }, { "epoch": 0.5241023936170213, "grad_norm": 0.5646655559539795, "learning_rate": 9.831190321410288e-06, "loss": 0.4426, "step": 1051 }, { "epoch": 0.5246010638297872, "grad_norm": 0.570976734161377, "learning_rate": 9.830441831561622e-06, "loss": 0.5053, "step": 1052 }, { "epoch": 0.5250997340425532, "grad_norm": 0.5404282212257385, "learning_rate": 9.82969171462685e-06, "loss": 0.4381, "step": 1053 }, { "epoch": 0.5255984042553191, "grad_norm": 0.5552166104316711, "learning_rate": 9.828939970858644e-06, "loss": 0.4481, "step": 1054 }, { "epoch": 0.5260970744680851, "grad_norm": 0.5842152237892151, "learning_rate": 9.828186600510224e-06, "loss": 0.4595, "step": 1055 }, { "epoch": 0.526595744680851, "grad_norm": 0.5142593383789062, "learning_rate": 9.827431603835353e-06, "loss": 0.4442, "step": 1056 }, { "epoch": 0.527094414893617, "grad_norm": 0.5705006718635559, "learning_rate": 9.826674981088343e-06, "loss": 0.4456, "step": 1057 }, { "epoch": 0.527593085106383, "grad_norm": 0.5371757745742798, "learning_rate": 9.825916732524058e-06, "loss": 0.4608, "step": 1058 }, { "epoch": 0.528091755319149, "grad_norm": 0.5547760128974915, "learning_rate": 9.825156858397903e-06, "loss": 0.4672, "step": 1059 }, { "epoch": 0.5285904255319149, "grad_norm": 0.5241231918334961, "learning_rate": 9.824395358965838e-06, "loss": 0.4729, "step": 1060 }, { "epoch": 0.5290890957446809, "grad_norm": 0.5044342279434204, "learning_rate": 9.823632234484362e-06, "loss": 0.4385, "step": 1061 }, { "epoch": 0.5295877659574468, "grad_norm": 0.4977937936782837, "learning_rate": 9.82286748521053e-06, "loss": 0.4363, "step": 1062 }, { "epoch": 0.5300864361702128, "grad_norm": 0.5512864589691162, "learning_rate": 9.822101111401937e-06, "loss": 0.4676, "step": 1063 }, { "epoch": 0.5305851063829787, "grad_norm": 0.5564478635787964, "learning_rate": 9.82133311331673e-06, "loss": 0.4715, "step": 1064 }, { "epoch": 0.5310837765957447, "grad_norm": 0.5933271646499634, "learning_rate": 9.8205634912136e-06, "loss": 0.4648, "step": 1065 }, { "epoch": 0.5315824468085106, "grad_norm": 0.5022256970405579, "learning_rate": 9.819792245351789e-06, "loss": 0.4672, "step": 1066 }, { "epoch": 0.5320811170212766, "grad_norm": 0.572726309299469, "learning_rate": 9.819019375991082e-06, "loss": 0.4887, "step": 1067 }, { "epoch": 0.5325797872340425, "grad_norm": 0.5981096625328064, "learning_rate": 9.818244883391812e-06, "loss": 0.4729, "step": 1068 }, { "epoch": 0.5330784574468085, "grad_norm": 0.5157843828201294, "learning_rate": 9.81746876781486e-06, "loss": 0.4404, "step": 1069 }, { "epoch": 0.5335771276595744, "grad_norm": 0.6486029028892517, "learning_rate": 9.81669102952165e-06, "loss": 0.4692, "step": 1070 }, { "epoch": 0.5340757978723404, "grad_norm": 0.5353829860687256, "learning_rate": 9.815911668774161e-06, "loss": 0.4507, "step": 1071 }, { "epoch": 0.5345744680851063, "grad_norm": 0.5570482611656189, "learning_rate": 9.815130685834909e-06, "loss": 0.4606, "step": 1072 }, { "epoch": 0.5350731382978723, "grad_norm": 0.5640379190444946, "learning_rate": 9.814348080966958e-06, "loss": 0.4702, "step": 1073 }, { "epoch": 0.5355718085106383, "grad_norm": 0.6493992805480957, "learning_rate": 9.813563854433926e-06, "loss": 0.4516, "step": 1074 }, { "epoch": 0.5360704787234043, "grad_norm": 0.6467697620391846, "learning_rate": 9.812778006499968e-06, "loss": 0.4418, "step": 1075 }, { "epoch": 0.5365691489361702, "grad_norm": 0.49502697587013245, "learning_rate": 9.811990537429792e-06, "loss": 0.4614, "step": 1076 }, { "epoch": 0.5370678191489362, "grad_norm": 0.5663555264472961, "learning_rate": 9.811201447488646e-06, "loss": 0.4722, "step": 1077 }, { "epoch": 0.5375664893617021, "grad_norm": 0.664264440536499, "learning_rate": 9.810410736942328e-06, "loss": 0.4608, "step": 1078 }, { "epoch": 0.5380651595744681, "grad_norm": 0.5610332489013672, "learning_rate": 9.809618406057184e-06, "loss": 0.4131, "step": 1079 }, { "epoch": 0.538563829787234, "grad_norm": 0.5682882070541382, "learning_rate": 9.808824455100098e-06, "loss": 0.4421, "step": 1080 }, { "epoch": 0.5390625, "grad_norm": 0.6207956075668335, "learning_rate": 9.808028884338508e-06, "loss": 0.4616, "step": 1081 }, { "epoch": 0.539561170212766, "grad_norm": 0.6173178553581238, "learning_rate": 9.807231694040392e-06, "loss": 0.417, "step": 1082 }, { "epoch": 0.5400598404255319, "grad_norm": 0.46595299243927, "learning_rate": 9.806432884474278e-06, "loss": 0.4484, "step": 1083 }, { "epoch": 0.5405585106382979, "grad_norm": 0.7064598798751831, "learning_rate": 9.805632455909233e-06, "loss": 0.4558, "step": 1084 }, { "epoch": 0.5410571808510638, "grad_norm": 0.6313666105270386, "learning_rate": 9.804830408614877e-06, "loss": 0.4502, "step": 1085 }, { "epoch": 0.5415558510638298, "grad_norm": 0.5952221155166626, "learning_rate": 9.80402674286137e-06, "loss": 0.4509, "step": 1086 }, { "epoch": 0.5420545212765957, "grad_norm": 0.47275224328041077, "learning_rate": 9.80322145891942e-06, "loss": 0.4563, "step": 1087 }, { "epoch": 0.5425531914893617, "grad_norm": 0.6663647890090942, "learning_rate": 9.802414557060277e-06, "loss": 0.4503, "step": 1088 }, { "epoch": 0.5430518617021277, "grad_norm": 0.601155698299408, "learning_rate": 9.80160603755574e-06, "loss": 0.4723, "step": 1089 }, { "epoch": 0.5435505319148937, "grad_norm": 0.6282462477684021, "learning_rate": 9.800795900678148e-06, "loss": 0.4577, "step": 1090 }, { "epoch": 0.5440492021276596, "grad_norm": 0.5597252249717712, "learning_rate": 9.79998414670039e-06, "loss": 0.4746, "step": 1091 }, { "epoch": 0.5445478723404256, "grad_norm": 0.5353700518608093, "learning_rate": 9.799170775895897e-06, "loss": 0.4446, "step": 1092 }, { "epoch": 0.5450465425531915, "grad_norm": 0.5350664258003235, "learning_rate": 9.798355788538642e-06, "loss": 0.4245, "step": 1093 }, { "epoch": 0.5455452127659575, "grad_norm": 0.5677266120910645, "learning_rate": 9.79753918490315e-06, "loss": 0.4228, "step": 1094 }, { "epoch": 0.5460438829787234, "grad_norm": 0.5310558080673218, "learning_rate": 9.796720965264481e-06, "loss": 0.475, "step": 1095 }, { "epoch": 0.5465425531914894, "grad_norm": 0.4773181676864624, "learning_rate": 9.795901129898249e-06, "loss": 0.4563, "step": 1096 }, { "epoch": 0.5470412234042553, "grad_norm": 0.5279744267463684, "learning_rate": 9.795079679080603e-06, "loss": 0.4416, "step": 1097 }, { "epoch": 0.5475398936170213, "grad_norm": 0.6371231079101562, "learning_rate": 9.794256613088243e-06, "loss": 0.4609, "step": 1098 }, { "epoch": 0.5480385638297872, "grad_norm": 0.6052016019821167, "learning_rate": 9.79343193219841e-06, "loss": 0.4232, "step": 1099 }, { "epoch": 0.5485372340425532, "grad_norm": 0.5480323433876038, "learning_rate": 9.792605636688888e-06, "loss": 0.4438, "step": 1100 }, { "epoch": 0.5490359042553191, "grad_norm": 0.7398461103439331, "learning_rate": 9.79177772683801e-06, "loss": 0.4422, "step": 1101 }, { "epoch": 0.5495345744680851, "grad_norm": 0.6029170751571655, "learning_rate": 9.790948202924645e-06, "loss": 0.4664, "step": 1102 }, { "epoch": 0.550033244680851, "grad_norm": 0.6282842755317688, "learning_rate": 9.790117065228213e-06, "loss": 0.4499, "step": 1103 }, { "epoch": 0.550531914893617, "grad_norm": 0.5936135649681091, "learning_rate": 9.78928431402867e-06, "loss": 0.4433, "step": 1104 }, { "epoch": 0.551030585106383, "grad_norm": 0.5526001453399658, "learning_rate": 9.788449949606528e-06, "loss": 0.436, "step": 1105 }, { "epoch": 0.551529255319149, "grad_norm": 0.5218191742897034, "learning_rate": 9.787613972242826e-06, "loss": 0.4667, "step": 1106 }, { "epoch": 0.5520279255319149, "grad_norm": 0.6132838129997253, "learning_rate": 9.786776382219159e-06, "loss": 0.4659, "step": 1107 }, { "epoch": 0.5525265957446809, "grad_norm": 0.54330974817276, "learning_rate": 9.785937179817661e-06, "loss": 0.4067, "step": 1108 }, { "epoch": 0.5530252659574468, "grad_norm": 0.5111769437789917, "learning_rate": 9.785096365321007e-06, "loss": 0.4557, "step": 1109 }, { "epoch": 0.5535239361702128, "grad_norm": 0.43922653794288635, "learning_rate": 9.784253939012418e-06, "loss": 0.4722, "step": 1110 }, { "epoch": 0.5540226063829787, "grad_norm": 0.5476747155189514, "learning_rate": 9.783409901175657e-06, "loss": 0.463, "step": 1111 }, { "epoch": 0.5545212765957447, "grad_norm": 0.5406081676483154, "learning_rate": 9.782564252095028e-06, "loss": 0.4332, "step": 1112 }, { "epoch": 0.5550199468085106, "grad_norm": 0.5713194608688354, "learning_rate": 9.781716992055383e-06, "loss": 0.4399, "step": 1113 }, { "epoch": 0.5555186170212766, "grad_norm": 0.5721248388290405, "learning_rate": 9.78086812134211e-06, "loss": 0.4371, "step": 1114 }, { "epoch": 0.5560172872340425, "grad_norm": 0.4825245440006256, "learning_rate": 9.780017640241144e-06, "loss": 0.4418, "step": 1115 }, { "epoch": 0.5565159574468085, "grad_norm": 0.5837076306343079, "learning_rate": 9.77916554903896e-06, "loss": 0.4178, "step": 1116 }, { "epoch": 0.5570146276595744, "grad_norm": 0.6019561290740967, "learning_rate": 9.778311848022577e-06, "loss": 0.442, "step": 1117 }, { "epoch": 0.5575132978723404, "grad_norm": 0.5443748235702515, "learning_rate": 9.777456537479556e-06, "loss": 0.4503, "step": 1118 }, { "epoch": 0.5580119680851063, "grad_norm": 0.5619662404060364, "learning_rate": 9.776599617697999e-06, "loss": 0.451, "step": 1119 }, { "epoch": 0.5585106382978723, "grad_norm": 0.5049303770065308, "learning_rate": 9.775741088966553e-06, "loss": 0.4251, "step": 1120 }, { "epoch": 0.5590093085106383, "grad_norm": 0.5552359223365784, "learning_rate": 9.774880951574401e-06, "loss": 0.4505, "step": 1121 }, { "epoch": 0.5595079787234043, "grad_norm": 0.5337053537368774, "learning_rate": 9.774019205811276e-06, "loss": 0.4572, "step": 1122 }, { "epoch": 0.5600066489361702, "grad_norm": 0.5651829242706299, "learning_rate": 9.773155851967446e-06, "loss": 0.4781, "step": 1123 }, { "epoch": 0.5605053191489362, "grad_norm": 0.5181887745857239, "learning_rate": 9.772290890333724e-06, "loss": 0.4551, "step": 1124 }, { "epoch": 0.5610039893617021, "grad_norm": 0.5415905117988586, "learning_rate": 9.771424321201461e-06, "loss": 0.4671, "step": 1125 }, { "epoch": 0.5615026595744681, "grad_norm": 0.548304557800293, "learning_rate": 9.770556144862557e-06, "loss": 0.4337, "step": 1126 }, { "epoch": 0.562001329787234, "grad_norm": 0.5041640996932983, "learning_rate": 9.769686361609445e-06, "loss": 0.4629, "step": 1127 }, { "epoch": 0.5625, "grad_norm": 0.5698083639144897, "learning_rate": 9.768814971735103e-06, "loss": 0.4476, "step": 1128 }, { "epoch": 0.562998670212766, "grad_norm": 0.5583614110946655, "learning_rate": 9.767941975533054e-06, "loss": 0.4339, "step": 1129 }, { "epoch": 0.5634973404255319, "grad_norm": 0.5501581430435181, "learning_rate": 9.76706737329735e-06, "loss": 0.431, "step": 1130 }, { "epoch": 0.5639960106382979, "grad_norm": 0.5161309838294983, "learning_rate": 9.766191165322598e-06, "loss": 0.4774, "step": 1131 }, { "epoch": 0.5644946808510638, "grad_norm": 0.5192849040031433, "learning_rate": 9.76531335190394e-06, "loss": 0.4401, "step": 1132 }, { "epoch": 0.5649933510638298, "grad_norm": 0.539866030216217, "learning_rate": 9.764433933337058e-06, "loss": 0.4672, "step": 1133 }, { "epoch": 0.5654920212765957, "grad_norm": 0.5384755730628967, "learning_rate": 9.763552909918174e-06, "loss": 0.4123, "step": 1134 }, { "epoch": 0.5659906914893617, "grad_norm": 0.553204357624054, "learning_rate": 9.762670281944051e-06, "loss": 0.4444, "step": 1135 }, { "epoch": 0.5664893617021277, "grad_norm": 0.5695171356201172, "learning_rate": 9.761786049711998e-06, "loss": 0.4511, "step": 1136 }, { "epoch": 0.5669880319148937, "grad_norm": 0.5804975628852844, "learning_rate": 9.760900213519855e-06, "loss": 0.456, "step": 1137 }, { "epoch": 0.5674867021276596, "grad_norm": 0.522517204284668, "learning_rate": 9.76001277366601e-06, "loss": 0.47, "step": 1138 }, { "epoch": 0.5679853723404256, "grad_norm": 0.5244485139846802, "learning_rate": 9.759123730449386e-06, "loss": 0.4597, "step": 1139 }, { "epoch": 0.5684840425531915, "grad_norm": 0.5896297693252563, "learning_rate": 9.75823308416945e-06, "loss": 0.4373, "step": 1140 }, { "epoch": 0.5689827127659575, "grad_norm": 0.5580756664276123, "learning_rate": 9.757340835126208e-06, "loss": 0.4348, "step": 1141 }, { "epoch": 0.5694813829787234, "grad_norm": 0.5392361879348755, "learning_rate": 9.756446983620203e-06, "loss": 0.4562, "step": 1142 }, { "epoch": 0.5699800531914894, "grad_norm": 0.5194135904312134, "learning_rate": 9.75555152995252e-06, "loss": 0.4385, "step": 1143 }, { "epoch": 0.5704787234042553, "grad_norm": 0.4990376830101013, "learning_rate": 9.754654474424788e-06, "loss": 0.4402, "step": 1144 }, { "epoch": 0.5709773936170213, "grad_norm": 0.5598211884498596, "learning_rate": 9.753755817339165e-06, "loss": 0.4578, "step": 1145 }, { "epoch": 0.5714760638297872, "grad_norm": 0.6209713816642761, "learning_rate": 9.752855558998358e-06, "loss": 0.4411, "step": 1146 }, { "epoch": 0.5719747340425532, "grad_norm": 0.556668758392334, "learning_rate": 9.75195369970561e-06, "loss": 0.4271, "step": 1147 }, { "epoch": 0.5724734042553191, "grad_norm": 0.5753093361854553, "learning_rate": 9.751050239764704e-06, "loss": 0.4444, "step": 1148 }, { "epoch": 0.5729720744680851, "grad_norm": 0.592029869556427, "learning_rate": 9.750145179479959e-06, "loss": 0.4497, "step": 1149 }, { "epoch": 0.573470744680851, "grad_norm": 0.6044180393218994, "learning_rate": 9.749238519156236e-06, "loss": 0.4296, "step": 1150 }, { "epoch": 0.573969414893617, "grad_norm": 0.6095849871635437, "learning_rate": 9.748330259098937e-06, "loss": 0.4342, "step": 1151 }, { "epoch": 0.574468085106383, "grad_norm": 0.6164958477020264, "learning_rate": 9.747420399613997e-06, "loss": 0.458, "step": 1152 }, { "epoch": 0.574966755319149, "grad_norm": 0.5544493198394775, "learning_rate": 9.746508941007894e-06, "loss": 0.4249, "step": 1153 }, { "epoch": 0.5754654255319149, "grad_norm": 0.5988225340843201, "learning_rate": 9.745595883587647e-06, "loss": 0.4579, "step": 1154 }, { "epoch": 0.5759640957446809, "grad_norm": 0.5138585567474365, "learning_rate": 9.744681227660808e-06, "loss": 0.4152, "step": 1155 }, { "epoch": 0.5764627659574468, "grad_norm": 0.5398774743080139, "learning_rate": 9.743764973535468e-06, "loss": 0.4214, "step": 1156 }, { "epoch": 0.5769614361702128, "grad_norm": 0.5911434888839722, "learning_rate": 9.742847121520259e-06, "loss": 0.4466, "step": 1157 }, { "epoch": 0.5774601063829787, "grad_norm": 0.597629189491272, "learning_rate": 9.741927671924352e-06, "loss": 0.4649, "step": 1158 }, { "epoch": 0.5779587765957447, "grad_norm": 0.5441303849220276, "learning_rate": 9.741006625057452e-06, "loss": 0.436, "step": 1159 }, { "epoch": 0.5784574468085106, "grad_norm": 0.6193251013755798, "learning_rate": 9.740083981229807e-06, "loss": 0.4616, "step": 1160 }, { "epoch": 0.5789561170212766, "grad_norm": 0.5671262145042419, "learning_rate": 9.739159740752197e-06, "loss": 0.4262, "step": 1161 }, { "epoch": 0.5794547872340425, "grad_norm": 0.5708621740341187, "learning_rate": 9.738233903935948e-06, "loss": 0.4183, "step": 1162 }, { "epoch": 0.5799534574468085, "grad_norm": 0.5335114002227783, "learning_rate": 9.737306471092914e-06, "loss": 0.441, "step": 1163 }, { "epoch": 0.5804521276595744, "grad_norm": 0.624437689781189, "learning_rate": 9.736377442535493e-06, "loss": 0.4582, "step": 1164 }, { "epoch": 0.5809507978723404, "grad_norm": 0.554711103439331, "learning_rate": 9.735446818576622e-06, "loss": 0.4463, "step": 1165 }, { "epoch": 0.5814494680851063, "grad_norm": 0.5594738125801086, "learning_rate": 9.734514599529769e-06, "loss": 0.4248, "step": 1166 }, { "epoch": 0.5819481382978723, "grad_norm": 0.6053128242492676, "learning_rate": 9.733580785708942e-06, "loss": 0.4677, "step": 1167 }, { "epoch": 0.5824468085106383, "grad_norm": 0.5159712433815002, "learning_rate": 9.732645377428691e-06, "loss": 0.4387, "step": 1168 }, { "epoch": 0.5829454787234043, "grad_norm": 0.5231837034225464, "learning_rate": 9.731708375004095e-06, "loss": 0.4547, "step": 1169 }, { "epoch": 0.5834441489361702, "grad_norm": 0.5728696584701538, "learning_rate": 9.730769778750776e-06, "loss": 0.4803, "step": 1170 }, { "epoch": 0.5839428191489362, "grad_norm": 0.5904802680015564, "learning_rate": 9.729829588984889e-06, "loss": 0.4572, "step": 1171 }, { "epoch": 0.5844414893617021, "grad_norm": 0.5435386896133423, "learning_rate": 9.72888780602313e-06, "loss": 0.4425, "step": 1172 }, { "epoch": 0.5849401595744681, "grad_norm": 0.538791298866272, "learning_rate": 9.727944430182727e-06, "loss": 0.4791, "step": 1173 }, { "epoch": 0.585438829787234, "grad_norm": 0.6180341839790344, "learning_rate": 9.726999461781447e-06, "loss": 0.483, "step": 1174 }, { "epoch": 0.5859375, "grad_norm": 0.5424614548683167, "learning_rate": 9.726052901137595e-06, "loss": 0.455, "step": 1175 }, { "epoch": 0.586436170212766, "grad_norm": 0.506623387336731, "learning_rate": 9.72510474857001e-06, "loss": 0.4335, "step": 1176 }, { "epoch": 0.5869348404255319, "grad_norm": 0.5056496858596802, "learning_rate": 9.724155004398064e-06, "loss": 0.4593, "step": 1177 }, { "epoch": 0.5874335106382979, "grad_norm": 0.5244532227516174, "learning_rate": 9.723203668941675e-06, "loss": 0.4682, "step": 1178 }, { "epoch": 0.5879321808510638, "grad_norm": 0.5567191243171692, "learning_rate": 9.722250742521286e-06, "loss": 0.4552, "step": 1179 }, { "epoch": 0.5884308510638298, "grad_norm": 0.5136327743530273, "learning_rate": 9.721296225457882e-06, "loss": 0.4525, "step": 1180 }, { "epoch": 0.5889295212765957, "grad_norm": 0.5300541520118713, "learning_rate": 9.720340118072985e-06, "loss": 0.4385, "step": 1181 }, { "epoch": 0.5894281914893617, "grad_norm": 0.5434508919715881, "learning_rate": 9.719382420688645e-06, "loss": 0.4358, "step": 1182 }, { "epoch": 0.5899268617021277, "grad_norm": 0.5628609657287598, "learning_rate": 9.718423133627458e-06, "loss": 0.4453, "step": 1183 }, { "epoch": 0.5904255319148937, "grad_norm": 0.5528215169906616, "learning_rate": 9.717462257212548e-06, "loss": 0.4708, "step": 1184 }, { "epoch": 0.5909242021276596, "grad_norm": 0.51912921667099, "learning_rate": 9.716499791767578e-06, "loss": 0.4878, "step": 1185 }, { "epoch": 0.5914228723404256, "grad_norm": 0.5468912124633789, "learning_rate": 9.71553573761674e-06, "loss": 0.4334, "step": 1186 }, { "epoch": 0.5919215425531915, "grad_norm": 0.5577077865600586, "learning_rate": 9.714570095084774e-06, "loss": 0.4457, "step": 1187 }, { "epoch": 0.5924202127659575, "grad_norm": 0.5511829257011414, "learning_rate": 9.71360286449694e-06, "loss": 0.4605, "step": 1188 }, { "epoch": 0.5929188829787234, "grad_norm": 0.5743992328643799, "learning_rate": 9.712634046179046e-06, "loss": 0.4753, "step": 1189 }, { "epoch": 0.5934175531914894, "grad_norm": 0.5627368092536926, "learning_rate": 9.711663640457423e-06, "loss": 0.4278, "step": 1190 }, { "epoch": 0.5939162234042553, "grad_norm": 0.5392044186592102, "learning_rate": 9.710691647658946e-06, "loss": 0.4359, "step": 1191 }, { "epoch": 0.5944148936170213, "grad_norm": 0.5820670127868652, "learning_rate": 9.70971806811102e-06, "loss": 0.4471, "step": 1192 }, { "epoch": 0.5949135638297872, "grad_norm": 0.6391332149505615, "learning_rate": 9.708742902141584e-06, "loss": 0.4668, "step": 1193 }, { "epoch": 0.5954122340425532, "grad_norm": 0.5083509683609009, "learning_rate": 9.707766150079116e-06, "loss": 0.4327, "step": 1194 }, { "epoch": 0.5959109042553191, "grad_norm": 0.5770764946937561, "learning_rate": 9.706787812252625e-06, "loss": 0.4461, "step": 1195 }, { "epoch": 0.5964095744680851, "grad_norm": 0.581737756729126, "learning_rate": 9.705807888991651e-06, "loss": 0.4269, "step": 1196 }, { "epoch": 0.596908244680851, "grad_norm": 0.5278202891349792, "learning_rate": 9.704826380626274e-06, "loss": 0.4603, "step": 1197 }, { "epoch": 0.597406914893617, "grad_norm": 0.5645574331283569, "learning_rate": 9.703843287487105e-06, "loss": 0.4708, "step": 1198 }, { "epoch": 0.597905585106383, "grad_norm": 0.6290432810783386, "learning_rate": 9.70285860990529e-06, "loss": 0.4789, "step": 1199 }, { "epoch": 0.598404255319149, "grad_norm": 0.5180264115333557, "learning_rate": 9.701872348212505e-06, "loss": 0.4505, "step": 1200 }, { "epoch": 0.5989029255319149, "grad_norm": 0.5329393148422241, "learning_rate": 9.700884502740965e-06, "loss": 0.4579, "step": 1201 }, { "epoch": 0.5994015957446809, "grad_norm": 0.4916694462299347, "learning_rate": 9.699895073823414e-06, "loss": 0.4834, "step": 1202 }, { "epoch": 0.5999002659574468, "grad_norm": 0.5863071084022522, "learning_rate": 9.698904061793133e-06, "loss": 0.4355, "step": 1203 }, { "epoch": 0.6003989361702128, "grad_norm": 0.5349161624908447, "learning_rate": 9.697911466983934e-06, "loss": 0.4416, "step": 1204 }, { "epoch": 0.6008976063829787, "grad_norm": 0.5964974164962769, "learning_rate": 9.69691728973016e-06, "loss": 0.4432, "step": 1205 }, { "epoch": 0.6013962765957447, "grad_norm": 0.5953390002250671, "learning_rate": 9.695921530366696e-06, "loss": 0.4518, "step": 1206 }, { "epoch": 0.6018949468085106, "grad_norm": 0.5252787470817566, "learning_rate": 9.694924189228948e-06, "loss": 0.4681, "step": 1207 }, { "epoch": 0.6023936170212766, "grad_norm": 0.518551766872406, "learning_rate": 9.69392526665286e-06, "loss": 0.4492, "step": 1208 }, { "epoch": 0.6028922872340425, "grad_norm": 0.5532982349395752, "learning_rate": 9.692924762974913e-06, "loss": 0.4382, "step": 1209 }, { "epoch": 0.6033909574468085, "grad_norm": 0.5906094312667847, "learning_rate": 9.691922678532115e-06, "loss": 0.4374, "step": 1210 }, { "epoch": 0.6038896276595744, "grad_norm": 0.6127765774726868, "learning_rate": 9.690919013662007e-06, "loss": 0.4532, "step": 1211 }, { "epoch": 0.6043882978723404, "grad_norm": 0.5361376404762268, "learning_rate": 9.689913768702666e-06, "loss": 0.4827, "step": 1212 }, { "epoch": 0.6048869680851063, "grad_norm": 0.4983506500720978, "learning_rate": 9.688906943992696e-06, "loss": 0.4449, "step": 1213 }, { "epoch": 0.6053856382978723, "grad_norm": 0.5355129241943359, "learning_rate": 9.687898539871239e-06, "loss": 0.4331, "step": 1214 }, { "epoch": 0.6058843085106383, "grad_norm": 0.5766198039054871, "learning_rate": 9.68688855667796e-06, "loss": 0.4344, "step": 1215 }, { "epoch": 0.6063829787234043, "grad_norm": 0.5319749712944031, "learning_rate": 9.68587699475307e-06, "loss": 0.4376, "step": 1216 }, { "epoch": 0.6068816489361702, "grad_norm": 0.534095823764801, "learning_rate": 9.684863854437296e-06, "loss": 0.4215, "step": 1217 }, { "epoch": 0.6073803191489362, "grad_norm": 0.6370128393173218, "learning_rate": 9.683849136071907e-06, "loss": 0.4355, "step": 1218 }, { "epoch": 0.6078789893617021, "grad_norm": 0.6307505965232849, "learning_rate": 9.682832839998704e-06, "loss": 0.4515, "step": 1219 }, { "epoch": 0.6083776595744681, "grad_norm": 0.5171810388565063, "learning_rate": 9.681814966560012e-06, "loss": 0.4057, "step": 1220 }, { "epoch": 0.608876329787234, "grad_norm": 0.5881466269493103, "learning_rate": 9.680795516098693e-06, "loss": 0.4219, "step": 1221 }, { "epoch": 0.609375, "grad_norm": 0.5898534655570984, "learning_rate": 9.679774488958138e-06, "loss": 0.4215, "step": 1222 }, { "epoch": 0.609873670212766, "grad_norm": 0.5580704212188721, "learning_rate": 9.67875188548227e-06, "loss": 0.464, "step": 1223 }, { "epoch": 0.6103723404255319, "grad_norm": 0.5427054166793823, "learning_rate": 9.677727706015544e-06, "loss": 0.4526, "step": 1224 }, { "epoch": 0.6108710106382979, "grad_norm": 0.4832181930541992, "learning_rate": 9.676701950902944e-06, "loss": 0.4459, "step": 1225 }, { "epoch": 0.6113696808510638, "grad_norm": 0.5576671957969666, "learning_rate": 9.675674620489984e-06, "loss": 0.4855, "step": 1226 }, { "epoch": 0.6118683510638298, "grad_norm": 0.534922182559967, "learning_rate": 9.674645715122713e-06, "loss": 0.436, "step": 1227 }, { "epoch": 0.6123670212765957, "grad_norm": 0.5473531484603882, "learning_rate": 9.673615235147703e-06, "loss": 0.4432, "step": 1228 }, { "epoch": 0.6128656914893617, "grad_norm": 0.5949001908302307, "learning_rate": 9.672583180912065e-06, "loss": 0.4548, "step": 1229 }, { "epoch": 0.6133643617021277, "grad_norm": 0.5830024480819702, "learning_rate": 9.671549552763433e-06, "loss": 0.4273, "step": 1230 }, { "epoch": 0.6138630319148937, "grad_norm": 0.4983159899711609, "learning_rate": 9.67051435104998e-06, "loss": 0.458, "step": 1231 }, { "epoch": 0.6143617021276596, "grad_norm": 0.5538854598999023, "learning_rate": 9.669477576120396e-06, "loss": 0.4403, "step": 1232 }, { "epoch": 0.6148603723404256, "grad_norm": 0.5354077816009521, "learning_rate": 9.668439228323911e-06, "loss": 0.4349, "step": 1233 }, { "epoch": 0.6153590425531915, "grad_norm": 0.5571261048316956, "learning_rate": 9.667399308010284e-06, "loss": 0.4222, "step": 1234 }, { "epoch": 0.6158577127659575, "grad_norm": 0.5583681464195251, "learning_rate": 9.666357815529802e-06, "loss": 0.4637, "step": 1235 }, { "epoch": 0.6163563829787234, "grad_norm": 0.5630175471305847, "learning_rate": 9.665314751233277e-06, "loss": 0.47, "step": 1236 }, { "epoch": 0.6168550531914894, "grad_norm": 0.5718780755996704, "learning_rate": 9.664270115472059e-06, "loss": 0.4719, "step": 1237 }, { "epoch": 0.6173537234042553, "grad_norm": 0.5822452306747437, "learning_rate": 9.663223908598023e-06, "loss": 0.4104, "step": 1238 }, { "epoch": 0.6178523936170213, "grad_norm": 0.5688141584396362, "learning_rate": 9.66217613096357e-06, "loss": 0.466, "step": 1239 }, { "epoch": 0.6183510638297872, "grad_norm": 0.5879578590393066, "learning_rate": 9.661126782921636e-06, "loss": 0.4205, "step": 1240 }, { "epoch": 0.6188497340425532, "grad_norm": 0.5780250430107117, "learning_rate": 9.660075864825683e-06, "loss": 0.4503, "step": 1241 }, { "epoch": 0.6193484042553191, "grad_norm": 0.6391312479972839, "learning_rate": 9.659023377029704e-06, "loss": 0.484, "step": 1242 }, { "epoch": 0.6198470744680851, "grad_norm": 0.5307849645614624, "learning_rate": 9.657969319888215e-06, "loss": 0.4237, "step": 1243 }, { "epoch": 0.620345744680851, "grad_norm": 0.5803769826889038, "learning_rate": 9.656913693756267e-06, "loss": 0.4403, "step": 1244 }, { "epoch": 0.620844414893617, "grad_norm": 0.5707873702049255, "learning_rate": 9.655856498989438e-06, "loss": 0.4533, "step": 1245 }, { "epoch": 0.621343085106383, "grad_norm": 0.5518537759780884, "learning_rate": 9.65479773594383e-06, "loss": 0.437, "step": 1246 }, { "epoch": 0.621841755319149, "grad_norm": 0.5138470530509949, "learning_rate": 9.653737404976083e-06, "loss": 0.4173, "step": 1247 }, { "epoch": 0.6223404255319149, "grad_norm": 0.4745774269104004, "learning_rate": 9.652675506443352e-06, "loss": 0.4891, "step": 1248 }, { "epoch": 0.6228390957446809, "grad_norm": 0.5479373335838318, "learning_rate": 9.651612040703332e-06, "loss": 0.463, "step": 1249 }, { "epoch": 0.6233377659574468, "grad_norm": 0.49740466475486755, "learning_rate": 9.650547008114237e-06, "loss": 0.4604, "step": 1250 }, { "epoch": 0.6238364361702128, "grad_norm": 0.476203978061676, "learning_rate": 9.649480409034817e-06, "loss": 0.4509, "step": 1251 }, { "epoch": 0.6243351063829787, "grad_norm": 0.5184555053710938, "learning_rate": 9.648412243824341e-06, "loss": 0.4614, "step": 1252 }, { "epoch": 0.6248337765957447, "grad_norm": 0.5694014430046082, "learning_rate": 9.647342512842611e-06, "loss": 0.4723, "step": 1253 }, { "epoch": 0.6253324468085106, "grad_norm": 0.5121799111366272, "learning_rate": 9.646271216449956e-06, "loss": 0.4202, "step": 1254 }, { "epoch": 0.6258311170212766, "grad_norm": 0.5986043810844421, "learning_rate": 9.645198355007232e-06, "loss": 0.4454, "step": 1255 }, { "epoch": 0.6263297872340425, "grad_norm": 0.5826494693756104, "learning_rate": 9.644123928875821e-06, "loss": 0.4504, "step": 1256 }, { "epoch": 0.6268284574468085, "grad_norm": 0.557275116443634, "learning_rate": 9.643047938417632e-06, "loss": 0.4321, "step": 1257 }, { "epoch": 0.6273271276595744, "grad_norm": 0.49396181106567383, "learning_rate": 9.641970383995102e-06, "loss": 0.4169, "step": 1258 }, { "epoch": 0.6278257978723404, "grad_norm": 0.5165982246398926, "learning_rate": 9.640891265971195e-06, "loss": 0.438, "step": 1259 }, { "epoch": 0.6283244680851063, "grad_norm": 0.5464956164360046, "learning_rate": 9.639810584709401e-06, "loss": 0.4624, "step": 1260 }, { "epoch": 0.6288231382978723, "grad_norm": 0.5445046424865723, "learning_rate": 9.638728340573735e-06, "loss": 0.4352, "step": 1261 }, { "epoch": 0.6293218085106383, "grad_norm": 0.5718458890914917, "learning_rate": 9.637644533928742e-06, "loss": 0.4645, "step": 1262 }, { "epoch": 0.6298204787234043, "grad_norm": 0.5886974930763245, "learning_rate": 9.636559165139493e-06, "loss": 0.4743, "step": 1263 }, { "epoch": 0.6303191489361702, "grad_norm": 0.5487725734710693, "learning_rate": 9.63547223457158e-06, "loss": 0.4422, "step": 1264 }, { "epoch": 0.6308178191489362, "grad_norm": 0.5054807066917419, "learning_rate": 9.634383742591127e-06, "loss": 0.4222, "step": 1265 }, { "epoch": 0.6313164893617021, "grad_norm": 0.4902487099170685, "learning_rate": 9.633293689564781e-06, "loss": 0.4289, "step": 1266 }, { "epoch": 0.6318151595744681, "grad_norm": 0.6335762143135071, "learning_rate": 9.632202075859716e-06, "loss": 0.44, "step": 1267 }, { "epoch": 0.632313829787234, "grad_norm": 0.6549399495124817, "learning_rate": 9.63110890184363e-06, "loss": 0.4659, "step": 1268 }, { "epoch": 0.6328125, "grad_norm": 0.5416547656059265, "learning_rate": 9.630014167884748e-06, "loss": 0.4397, "step": 1269 }, { "epoch": 0.633311170212766, "grad_norm": 0.6040160655975342, "learning_rate": 9.628917874351822e-06, "loss": 0.4444, "step": 1270 }, { "epoch": 0.6338098404255319, "grad_norm": 0.5391263961791992, "learning_rate": 9.627820021614125e-06, "loss": 0.4438, "step": 1271 }, { "epoch": 0.6343085106382979, "grad_norm": 0.6072854995727539, "learning_rate": 9.62672061004146e-06, "loss": 0.4396, "step": 1272 }, { "epoch": 0.6348071808510638, "grad_norm": 0.5333736538887024, "learning_rate": 9.62561964000415e-06, "loss": 0.4308, "step": 1273 }, { "epoch": 0.6353058510638298, "grad_norm": 0.5501661896705627, "learning_rate": 9.624517111873048e-06, "loss": 0.4518, "step": 1274 }, { "epoch": 0.6358045212765957, "grad_norm": 0.6502571105957031, "learning_rate": 9.62341302601953e-06, "loss": 0.4616, "step": 1275 }, { "epoch": 0.6363031914893617, "grad_norm": 0.6175972819328308, "learning_rate": 9.622307382815497e-06, "loss": 0.4404, "step": 1276 }, { "epoch": 0.6368018617021277, "grad_norm": 0.6271362900733948, "learning_rate": 9.62120018263337e-06, "loss": 0.4479, "step": 1277 }, { "epoch": 0.6373005319148937, "grad_norm": 0.6011665463447571, "learning_rate": 9.6200914258461e-06, "loss": 0.4486, "step": 1278 }, { "epoch": 0.6377992021276596, "grad_norm": 0.552477240562439, "learning_rate": 9.618981112827162e-06, "loss": 0.4556, "step": 1279 }, { "epoch": 0.6382978723404256, "grad_norm": 0.5342796444892883, "learning_rate": 9.617869243950556e-06, "loss": 0.4412, "step": 1280 }, { "epoch": 0.6387965425531915, "grad_norm": 0.5208759903907776, "learning_rate": 9.616755819590797e-06, "loss": 0.405, "step": 1281 }, { "epoch": 0.6392952127659575, "grad_norm": 0.5242282748222351, "learning_rate": 9.615640840122936e-06, "loss": 0.4301, "step": 1282 }, { "epoch": 0.6397938829787234, "grad_norm": 0.5829936861991882, "learning_rate": 9.614524305922542e-06, "loss": 0.4803, "step": 1283 }, { "epoch": 0.6402925531914894, "grad_norm": 0.5775812864303589, "learning_rate": 9.613406217365708e-06, "loss": 0.4007, "step": 1284 }, { "epoch": 0.6407912234042553, "grad_norm": 0.5245973467826843, "learning_rate": 9.612286574829051e-06, "loss": 0.4382, "step": 1285 }, { "epoch": 0.6412898936170213, "grad_norm": 0.5398409962654114, "learning_rate": 9.611165378689713e-06, "loss": 0.4856, "step": 1286 }, { "epoch": 0.6417885638297872, "grad_norm": 0.6008882522583008, "learning_rate": 9.610042629325354e-06, "loss": 0.4635, "step": 1287 }, { "epoch": 0.6422872340425532, "grad_norm": 0.5827884078025818, "learning_rate": 9.608918327114163e-06, "loss": 0.4369, "step": 1288 }, { "epoch": 0.6427859042553191, "grad_norm": 0.5689192414283752, "learning_rate": 9.60779247243485e-06, "loss": 0.4623, "step": 1289 }, { "epoch": 0.6432845744680851, "grad_norm": 0.5736048817634583, "learning_rate": 9.60666506566665e-06, "loss": 0.4593, "step": 1290 }, { "epoch": 0.643783244680851, "grad_norm": 0.6110102534294128, "learning_rate": 9.605536107189314e-06, "loss": 0.4488, "step": 1291 }, { "epoch": 0.644281914893617, "grad_norm": 0.5936771631240845, "learning_rate": 9.604405597383125e-06, "loss": 0.459, "step": 1292 }, { "epoch": 0.644780585106383, "grad_norm": 0.4956464469432831, "learning_rate": 9.60327353662888e-06, "loss": 0.4782, "step": 1293 }, { "epoch": 0.645279255319149, "grad_norm": 0.5389618873596191, "learning_rate": 9.602139925307905e-06, "loss": 0.4516, "step": 1294 }, { "epoch": 0.6457779255319149, "grad_norm": 0.5478381514549255, "learning_rate": 9.601004763802046e-06, "loss": 0.4194, "step": 1295 }, { "epoch": 0.6462765957446809, "grad_norm": 0.558731734752655, "learning_rate": 9.599868052493668e-06, "loss": 0.4438, "step": 1296 }, { "epoch": 0.6467752659574468, "grad_norm": 0.5176449418067932, "learning_rate": 9.598729791765665e-06, "loss": 0.4162, "step": 1297 }, { "epoch": 0.6472739361702128, "grad_norm": 0.5428903102874756, "learning_rate": 9.597589982001445e-06, "loss": 0.4383, "step": 1298 }, { "epoch": 0.6477726063829787, "grad_norm": 0.628162682056427, "learning_rate": 9.596448623584944e-06, "loss": 0.475, "step": 1299 }, { "epoch": 0.6482712765957447, "grad_norm": 0.5643385648727417, "learning_rate": 9.595305716900614e-06, "loss": 0.4607, "step": 1300 }, { "epoch": 0.6487699468085106, "grad_norm": 0.6562557220458984, "learning_rate": 9.594161262333436e-06, "loss": 0.4671, "step": 1301 }, { "epoch": 0.6492686170212766, "grad_norm": 0.5418674945831299, "learning_rate": 9.593015260268905e-06, "loss": 0.4473, "step": 1302 }, { "epoch": 0.6497672872340425, "grad_norm": 0.5853339433670044, "learning_rate": 9.591867711093041e-06, "loss": 0.4447, "step": 1303 }, { "epoch": 0.6502659574468085, "grad_norm": 0.6694492697715759, "learning_rate": 9.590718615192386e-06, "loss": 0.4369, "step": 1304 }, { "epoch": 0.6507646276595744, "grad_norm": 0.599475085735321, "learning_rate": 9.589567972953999e-06, "loss": 0.4608, "step": 1305 }, { "epoch": 0.6512632978723404, "grad_norm": 0.5883001089096069, "learning_rate": 9.588415784765465e-06, "loss": 0.4363, "step": 1306 }, { "epoch": 0.6517619680851063, "grad_norm": 0.635637104511261, "learning_rate": 9.587262051014885e-06, "loss": 0.4382, "step": 1307 }, { "epoch": 0.6522606382978723, "grad_norm": 0.510591983795166, "learning_rate": 9.586106772090883e-06, "loss": 0.4047, "step": 1308 }, { "epoch": 0.6527593085106383, "grad_norm": 0.4985331594944, "learning_rate": 9.584949948382604e-06, "loss": 0.4494, "step": 1309 }, { "epoch": 0.6532579787234043, "grad_norm": 0.5417577028274536, "learning_rate": 9.58379158027971e-06, "loss": 0.4215, "step": 1310 }, { "epoch": 0.6537566489361702, "grad_norm": 0.5534601211547852, "learning_rate": 9.582631668172391e-06, "loss": 0.4391, "step": 1311 }, { "epoch": 0.6542553191489362, "grad_norm": 0.5676228404045105, "learning_rate": 9.581470212451346e-06, "loss": 0.4062, "step": 1312 }, { "epoch": 0.6547539893617021, "grad_norm": 0.514831006526947, "learning_rate": 9.580307213507805e-06, "loss": 0.4828, "step": 1313 }, { "epoch": 0.6552526595744681, "grad_norm": 0.6640628576278687, "learning_rate": 9.579142671733508e-06, "loss": 0.4398, "step": 1314 }, { "epoch": 0.655751329787234, "grad_norm": 0.5330972671508789, "learning_rate": 9.577976587520722e-06, "loss": 0.4114, "step": 1315 }, { "epoch": 0.65625, "grad_norm": 0.5687843561172485, "learning_rate": 9.57680896126223e-06, "loss": 0.4701, "step": 1316 }, { "epoch": 0.656748670212766, "grad_norm": 0.4923546314239502, "learning_rate": 9.575639793351335e-06, "loss": 0.4512, "step": 1317 }, { "epoch": 0.6572473404255319, "grad_norm": 0.6290971040725708, "learning_rate": 9.574469084181861e-06, "loss": 0.4412, "step": 1318 }, { "epoch": 0.6577460106382979, "grad_norm": 0.5714132785797119, "learning_rate": 9.573296834148147e-06, "loss": 0.451, "step": 1319 }, { "epoch": 0.6582446808510638, "grad_norm": 0.4696807861328125, "learning_rate": 9.572123043645055e-06, "loss": 0.4242, "step": 1320 }, { "epoch": 0.6587433510638298, "grad_norm": 0.5746036767959595, "learning_rate": 9.570947713067967e-06, "loss": 0.4157, "step": 1321 }, { "epoch": 0.6592420212765957, "grad_norm": 0.5132668018341064, "learning_rate": 9.569770842812777e-06, "loss": 0.4054, "step": 1322 }, { "epoch": 0.6597406914893617, "grad_norm": 0.5766343474388123, "learning_rate": 9.568592433275906e-06, "loss": 0.4151, "step": 1323 }, { "epoch": 0.6602393617021277, "grad_norm": 0.5444534420967102, "learning_rate": 9.567412484854287e-06, "loss": 0.4332, "step": 1324 }, { "epoch": 0.6607380319148937, "grad_norm": 0.46634721755981445, "learning_rate": 9.566230997945376e-06, "loss": 0.4311, "step": 1325 }, { "epoch": 0.6612367021276596, "grad_norm": 0.5521314740180969, "learning_rate": 9.565047972947142e-06, "loss": 0.4456, "step": 1326 }, { "epoch": 0.6617353723404256, "grad_norm": 0.5256344676017761, "learning_rate": 9.563863410258076e-06, "loss": 0.4667, "step": 1327 }, { "epoch": 0.6622340425531915, "grad_norm": 0.549075722694397, "learning_rate": 9.562677310277187e-06, "loss": 0.4435, "step": 1328 }, { "epoch": 0.6627327127659575, "grad_norm": 0.5020999312400818, "learning_rate": 9.561489673404003e-06, "loss": 0.4331, "step": 1329 }, { "epoch": 0.6632313829787234, "grad_norm": 0.4939720034599304, "learning_rate": 9.560300500038564e-06, "loss": 0.4287, "step": 1330 }, { "epoch": 0.6637300531914894, "grad_norm": 0.6009901165962219, "learning_rate": 9.559109790581433e-06, "loss": 0.4733, "step": 1331 }, { "epoch": 0.6642287234042553, "grad_norm": 0.5091086626052856, "learning_rate": 9.557917545433689e-06, "loss": 0.4616, "step": 1332 }, { "epoch": 0.6647273936170213, "grad_norm": 0.49905216693878174, "learning_rate": 9.556723764996924e-06, "loss": 0.4647, "step": 1333 }, { "epoch": 0.6652260638297872, "grad_norm": 0.5009170174598694, "learning_rate": 9.555528449673258e-06, "loss": 0.433, "step": 1334 }, { "epoch": 0.6657247340425532, "grad_norm": 0.5382608771324158, "learning_rate": 9.554331599865314e-06, "loss": 0.4525, "step": 1335 }, { "epoch": 0.6662234042553191, "grad_norm": 0.53777015209198, "learning_rate": 9.553133215976244e-06, "loss": 0.4415, "step": 1336 }, { "epoch": 0.6667220744680851, "grad_norm": 0.5454398393630981, "learning_rate": 9.55193329840971e-06, "loss": 0.4744, "step": 1337 }, { "epoch": 0.667220744680851, "grad_norm": 0.58790522813797, "learning_rate": 9.550731847569891e-06, "loss": 0.4483, "step": 1338 }, { "epoch": 0.667719414893617, "grad_norm": 0.6336485147476196, "learning_rate": 9.549528863861485e-06, "loss": 0.4663, "step": 1339 }, { "epoch": 0.668218085106383, "grad_norm": 0.5056489706039429, "learning_rate": 9.548324347689705e-06, "loss": 0.445, "step": 1340 }, { "epoch": 0.668716755319149, "grad_norm": 0.5707383155822754, "learning_rate": 9.54711829946028e-06, "loss": 0.4354, "step": 1341 }, { "epoch": 0.6692154255319149, "grad_norm": 0.5045529007911682, "learning_rate": 9.545910719579454e-06, "loss": 0.4321, "step": 1342 }, { "epoch": 0.6697140957446809, "grad_norm": 0.5513744950294495, "learning_rate": 9.544701608453991e-06, "loss": 0.4444, "step": 1343 }, { "epoch": 0.6702127659574468, "grad_norm": 0.5573916435241699, "learning_rate": 9.543490966491166e-06, "loss": 0.4259, "step": 1344 }, { "epoch": 0.6707114361702128, "grad_norm": 0.564714252948761, "learning_rate": 9.542278794098772e-06, "loss": 0.4801, "step": 1345 }, { "epoch": 0.6712101063829787, "grad_norm": 0.5929874777793884, "learning_rate": 9.541065091685118e-06, "loss": 0.4387, "step": 1346 }, { "epoch": 0.6717087765957447, "grad_norm": 0.5838549137115479, "learning_rate": 9.539849859659027e-06, "loss": 0.42, "step": 1347 }, { "epoch": 0.6722074468085106, "grad_norm": 0.540676474571228, "learning_rate": 9.538633098429837e-06, "loss": 0.4803, "step": 1348 }, { "epoch": 0.6727061170212766, "grad_norm": 0.5178366899490356, "learning_rate": 9.537414808407402e-06, "loss": 0.4848, "step": 1349 }, { "epoch": 0.6732047872340425, "grad_norm": 0.5799399614334106, "learning_rate": 9.536194990002092e-06, "loss": 0.4511, "step": 1350 }, { "epoch": 0.6737034574468085, "grad_norm": 0.5818527340888977, "learning_rate": 9.53497364362479e-06, "loss": 0.4467, "step": 1351 }, { "epoch": 0.6742021276595744, "grad_norm": 0.586786687374115, "learning_rate": 9.533750769686896e-06, "loss": 0.4562, "step": 1352 }, { "epoch": 0.6747007978723404, "grad_norm": 0.5910617709159851, "learning_rate": 9.532526368600319e-06, "loss": 0.4244, "step": 1353 }, { "epoch": 0.6751994680851063, "grad_norm": 0.6181769967079163, "learning_rate": 9.531300440777489e-06, "loss": 0.4503, "step": 1354 }, { "epoch": 0.6756981382978723, "grad_norm": 0.5236305594444275, "learning_rate": 9.530072986631348e-06, "loss": 0.4178, "step": 1355 }, { "epoch": 0.6761968085106383, "grad_norm": 0.5867089629173279, "learning_rate": 9.52884400657535e-06, "loss": 0.4218, "step": 1356 }, { "epoch": 0.6766954787234043, "grad_norm": 0.5824356079101562, "learning_rate": 9.527613501023465e-06, "loss": 0.4555, "step": 1357 }, { "epoch": 0.6771941489361702, "grad_norm": 0.55819171667099, "learning_rate": 9.526381470390178e-06, "loss": 0.4374, "step": 1358 }, { "epoch": 0.6776928191489362, "grad_norm": 0.5480185747146606, "learning_rate": 9.525147915090484e-06, "loss": 0.4405, "step": 1359 }, { "epoch": 0.6781914893617021, "grad_norm": 0.6405315399169922, "learning_rate": 9.523912835539896e-06, "loss": 0.4557, "step": 1360 }, { "epoch": 0.6786901595744681, "grad_norm": 0.5400524139404297, "learning_rate": 9.522676232154437e-06, "loss": 0.4365, "step": 1361 }, { "epoch": 0.679188829787234, "grad_norm": 0.5451029539108276, "learning_rate": 9.521438105350645e-06, "loss": 0.4185, "step": 1362 }, { "epoch": 0.6796875, "grad_norm": 0.5616662502288818, "learning_rate": 9.520198455545568e-06, "loss": 0.4461, "step": 1363 }, { "epoch": 0.680186170212766, "grad_norm": 0.6162424087524414, "learning_rate": 9.518957283156774e-06, "loss": 0.4068, "step": 1364 }, { "epoch": 0.6806848404255319, "grad_norm": 0.5489886999130249, "learning_rate": 9.517714588602336e-06, "loss": 0.4411, "step": 1365 }, { "epoch": 0.6811835106382979, "grad_norm": 0.5618600845336914, "learning_rate": 9.516470372300847e-06, "loss": 0.4494, "step": 1366 }, { "epoch": 0.6816821808510638, "grad_norm": 0.5248700976371765, "learning_rate": 9.515224634671405e-06, "loss": 0.4509, "step": 1367 }, { "epoch": 0.6821808510638298, "grad_norm": 0.5571079254150391, "learning_rate": 9.513977376133625e-06, "loss": 0.4468, "step": 1368 }, { "epoch": 0.6826795212765957, "grad_norm": 0.4955291450023651, "learning_rate": 9.512728597107637e-06, "loss": 0.4628, "step": 1369 }, { "epoch": 0.6831781914893617, "grad_norm": 0.5638875365257263, "learning_rate": 9.511478298014074e-06, "loss": 0.4382, "step": 1370 }, { "epoch": 0.6836768617021277, "grad_norm": 0.5002965331077576, "learning_rate": 9.510226479274091e-06, "loss": 0.4304, "step": 1371 }, { "epoch": 0.6841755319148937, "grad_norm": 0.5155728459358215, "learning_rate": 9.508973141309352e-06, "loss": 0.4155, "step": 1372 }, { "epoch": 0.6846742021276596, "grad_norm": 0.5294942259788513, "learning_rate": 9.507718284542026e-06, "loss": 0.4657, "step": 1373 }, { "epoch": 0.6851728723404256, "grad_norm": 0.5128902196884155, "learning_rate": 9.506461909394804e-06, "loss": 0.4136, "step": 1374 }, { "epoch": 0.6856715425531915, "grad_norm": 0.5202723741531372, "learning_rate": 9.50520401629088e-06, "loss": 0.4205, "step": 1375 }, { "epoch": 0.6861702127659575, "grad_norm": 0.5289199352264404, "learning_rate": 9.503944605653966e-06, "loss": 0.4511, "step": 1376 }, { "epoch": 0.6866688829787234, "grad_norm": 0.5159313082695007, "learning_rate": 9.502683677908277e-06, "loss": 0.4424, "step": 1377 }, { "epoch": 0.6871675531914894, "grad_norm": 0.5698240995407104, "learning_rate": 9.50142123347855e-06, "loss": 0.4479, "step": 1378 }, { "epoch": 0.6876662234042553, "grad_norm": 0.5723600387573242, "learning_rate": 9.500157272790022e-06, "loss": 0.43, "step": 1379 }, { "epoch": 0.6881648936170213, "grad_norm": 0.5333765149116516, "learning_rate": 9.498891796268449e-06, "loss": 0.4365, "step": 1380 }, { "epoch": 0.6886635638297872, "grad_norm": 0.5485024452209473, "learning_rate": 9.497624804340089e-06, "loss": 0.4416, "step": 1381 }, { "epoch": 0.6891622340425532, "grad_norm": 0.6496608853340149, "learning_rate": 9.496356297431722e-06, "loss": 0.4554, "step": 1382 }, { "epoch": 0.6896609042553191, "grad_norm": 0.5653586983680725, "learning_rate": 9.495086275970627e-06, "loss": 0.4862, "step": 1383 }, { "epoch": 0.6901595744680851, "grad_norm": 0.5873114466667175, "learning_rate": 9.493814740384601e-06, "loss": 0.4324, "step": 1384 }, { "epoch": 0.690658244680851, "grad_norm": 0.6605737209320068, "learning_rate": 9.492541691101947e-06, "loss": 0.4316, "step": 1385 }, { "epoch": 0.691156914893617, "grad_norm": 0.5788408517837524, "learning_rate": 9.491267128551478e-06, "loss": 0.4657, "step": 1386 }, { "epoch": 0.691655585106383, "grad_norm": 0.5978612899780273, "learning_rate": 9.48999105316252e-06, "loss": 0.4535, "step": 1387 }, { "epoch": 0.692154255319149, "grad_norm": 0.7798579931259155, "learning_rate": 9.488713465364902e-06, "loss": 0.463, "step": 1388 }, { "epoch": 0.6926529255319149, "grad_norm": 0.618751049041748, "learning_rate": 9.487434365588973e-06, "loss": 0.4556, "step": 1389 }, { "epoch": 0.6931515957446809, "grad_norm": 0.6937804818153381, "learning_rate": 9.48615375426558e-06, "loss": 0.4751, "step": 1390 }, { "epoch": 0.6936502659574468, "grad_norm": 0.4795111417770386, "learning_rate": 9.484871631826086e-06, "loss": 0.422, "step": 1391 }, { "epoch": 0.6941489361702128, "grad_norm": 0.5772943496704102, "learning_rate": 9.48358799870236e-06, "loss": 0.439, "step": 1392 }, { "epoch": 0.6946476063829787, "grad_norm": 0.586489200592041, "learning_rate": 9.482302855326783e-06, "loss": 0.4229, "step": 1393 }, { "epoch": 0.6951462765957447, "grad_norm": 0.5887942314147949, "learning_rate": 9.481016202132241e-06, "loss": 0.4322, "step": 1394 }, { "epoch": 0.6956449468085106, "grad_norm": 0.5716894865036011, "learning_rate": 9.479728039552129e-06, "loss": 0.4446, "step": 1395 }, { "epoch": 0.6961436170212766, "grad_norm": 0.6053922176361084, "learning_rate": 9.478438368020357e-06, "loss": 0.441, "step": 1396 }, { "epoch": 0.6966422872340425, "grad_norm": 0.5643517971038818, "learning_rate": 9.477147187971332e-06, "loss": 0.4423, "step": 1397 }, { "epoch": 0.6971409574468085, "grad_norm": 0.559736967086792, "learning_rate": 9.475854499839978e-06, "loss": 0.4279, "step": 1398 }, { "epoch": 0.6976396276595744, "grad_norm": 0.5920050144195557, "learning_rate": 9.474560304061724e-06, "loss": 0.4347, "step": 1399 }, { "epoch": 0.6981382978723404, "grad_norm": 0.5702822804450989, "learning_rate": 9.473264601072507e-06, "loss": 0.4777, "step": 1400 }, { "epoch": 0.6986369680851063, "grad_norm": 0.5753346681594849, "learning_rate": 9.47196739130877e-06, "loss": 0.4387, "step": 1401 }, { "epoch": 0.6991356382978723, "grad_norm": 0.587054431438446, "learning_rate": 9.470668675207469e-06, "loss": 0.4886, "step": 1402 }, { "epoch": 0.6996343085106383, "grad_norm": 0.5841382145881653, "learning_rate": 9.469368453206058e-06, "loss": 0.4307, "step": 1403 }, { "epoch": 0.7001329787234043, "grad_norm": 0.5677496790885925, "learning_rate": 9.468066725742507e-06, "loss": 0.4446, "step": 1404 }, { "epoch": 0.7006316489361702, "grad_norm": 0.5416389107704163, "learning_rate": 9.46676349325529e-06, "loss": 0.446, "step": 1405 }, { "epoch": 0.7011303191489362, "grad_norm": 0.6401746869087219, "learning_rate": 9.46545875618339e-06, "loss": 0.4207, "step": 1406 }, { "epoch": 0.7016289893617021, "grad_norm": 0.6305884122848511, "learning_rate": 9.46415251496629e-06, "loss": 0.4268, "step": 1407 }, { "epoch": 0.7021276595744681, "grad_norm": 0.5019261240959167, "learning_rate": 9.462844770043987e-06, "loss": 0.4528, "step": 1408 }, { "epoch": 0.702626329787234, "grad_norm": 0.5831255912780762, "learning_rate": 9.46153552185698e-06, "loss": 0.4292, "step": 1409 }, { "epoch": 0.703125, "grad_norm": 0.597933292388916, "learning_rate": 9.460224770846278e-06, "loss": 0.4265, "step": 1410 }, { "epoch": 0.703623670212766, "grad_norm": 0.5733057260513306, "learning_rate": 9.458912517453393e-06, "loss": 0.4699, "step": 1411 }, { "epoch": 0.7041223404255319, "grad_norm": 0.5399320125579834, "learning_rate": 9.457598762120348e-06, "loss": 0.4473, "step": 1412 }, { "epoch": 0.7046210106382979, "grad_norm": 0.5824705958366394, "learning_rate": 9.456283505289664e-06, "loss": 0.4393, "step": 1413 }, { "epoch": 0.7051196808510638, "grad_norm": 0.6616657376289368, "learning_rate": 9.454966747404373e-06, "loss": 0.4347, "step": 1414 }, { "epoch": 0.7056183510638298, "grad_norm": 0.6083416938781738, "learning_rate": 9.453648488908014e-06, "loss": 0.4524, "step": 1415 }, { "epoch": 0.7061170212765957, "grad_norm": 0.48476073145866394, "learning_rate": 9.452328730244628e-06, "loss": 0.4378, "step": 1416 }, { "epoch": 0.7066156914893617, "grad_norm": 0.6110156774520874, "learning_rate": 9.451007471858762e-06, "loss": 0.4375, "step": 1417 }, { "epoch": 0.7071143617021277, "grad_norm": 0.5849565267562866, "learning_rate": 9.449684714195467e-06, "loss": 0.4347, "step": 1418 }, { "epoch": 0.7076130319148937, "grad_norm": 0.5844003558158875, "learning_rate": 9.448360457700306e-06, "loss": 0.4638, "step": 1419 }, { "epoch": 0.7081117021276596, "grad_norm": 0.4879095256328583, "learning_rate": 9.447034702819336e-06, "loss": 0.4527, "step": 1420 }, { "epoch": 0.7086103723404256, "grad_norm": 0.5707655549049377, "learning_rate": 9.445707449999126e-06, "loss": 0.4187, "step": 1421 }, { "epoch": 0.7091090425531915, "grad_norm": 0.5419336557388306, "learning_rate": 9.444378699686749e-06, "loss": 0.4716, "step": 1422 }, { "epoch": 0.7096077127659575, "grad_norm": 0.5410516262054443, "learning_rate": 9.443048452329782e-06, "loss": 0.4206, "step": 1423 }, { "epoch": 0.7101063829787234, "grad_norm": 0.6171124577522278, "learning_rate": 9.441716708376303e-06, "loss": 0.453, "step": 1424 }, { "epoch": 0.7106050531914894, "grad_norm": 0.6066707968711853, "learning_rate": 9.440383468274898e-06, "loss": 0.476, "step": 1425 }, { "epoch": 0.7111037234042553, "grad_norm": 0.511742353439331, "learning_rate": 9.439048732474657e-06, "loss": 0.4455, "step": 1426 }, { "epoch": 0.7116023936170213, "grad_norm": 0.5383885502815247, "learning_rate": 9.437712501425169e-06, "loss": 0.429, "step": 1427 }, { "epoch": 0.7121010638297872, "grad_norm": 0.6137298345565796, "learning_rate": 9.436374775576533e-06, "loss": 0.4748, "step": 1428 }, { "epoch": 0.7125997340425532, "grad_norm": 0.5641465783119202, "learning_rate": 9.435035555379349e-06, "loss": 0.4484, "step": 1429 }, { "epoch": 0.7130984042553191, "grad_norm": 0.7702415585517883, "learning_rate": 9.433694841284715e-06, "loss": 0.4455, "step": 1430 }, { "epoch": 0.7135970744680851, "grad_norm": 0.6070055961608887, "learning_rate": 9.432352633744244e-06, "loss": 0.4797, "step": 1431 }, { "epoch": 0.714095744680851, "grad_norm": 0.5420405864715576, "learning_rate": 9.431008933210039e-06, "loss": 0.4259, "step": 1432 }, { "epoch": 0.714594414893617, "grad_norm": 0.6651608347892761, "learning_rate": 9.429663740134719e-06, "loss": 0.4236, "step": 1433 }, { "epoch": 0.715093085106383, "grad_norm": 0.5472536683082581, "learning_rate": 9.428317054971392e-06, "loss": 0.443, "step": 1434 }, { "epoch": 0.715591755319149, "grad_norm": 0.5587415099143982, "learning_rate": 9.426968878173679e-06, "loss": 0.4527, "step": 1435 }, { "epoch": 0.7160904255319149, "grad_norm": 0.5196306705474854, "learning_rate": 9.4256192101957e-06, "loss": 0.4335, "step": 1436 }, { "epoch": 0.7165890957446809, "grad_norm": 0.6023223400115967, "learning_rate": 9.424268051492076e-06, "loss": 0.4628, "step": 1437 }, { "epoch": 0.7170877659574468, "grad_norm": 0.5250732898712158, "learning_rate": 9.42291540251793e-06, "loss": 0.4296, "step": 1438 }, { "epoch": 0.7175864361702128, "grad_norm": 0.5384118556976318, "learning_rate": 9.421561263728892e-06, "loss": 0.4526, "step": 1439 }, { "epoch": 0.7180851063829787, "grad_norm": 0.5254924297332764, "learning_rate": 9.420205635581088e-06, "loss": 0.4095, "step": 1440 }, { "epoch": 0.7185837765957447, "grad_norm": 0.5989787578582764, "learning_rate": 9.418848518531148e-06, "loss": 0.4624, "step": 1441 }, { "epoch": 0.7190824468085106, "grad_norm": 0.5757245421409607, "learning_rate": 9.417489913036203e-06, "loss": 0.4701, "step": 1442 }, { "epoch": 0.7195811170212766, "grad_norm": 0.5648587942123413, "learning_rate": 9.416129819553886e-06, "loss": 0.443, "step": 1443 }, { "epoch": 0.7200797872340425, "grad_norm": 0.5471376180648804, "learning_rate": 9.414768238542331e-06, "loss": 0.4363, "step": 1444 }, { "epoch": 0.7205784574468085, "grad_norm": 0.6504669785499573, "learning_rate": 9.413405170460172e-06, "loss": 0.459, "step": 1445 }, { "epoch": 0.7210771276595744, "grad_norm": 0.6591322422027588, "learning_rate": 9.412040615766547e-06, "loss": 0.446, "step": 1446 }, { "epoch": 0.7215757978723404, "grad_norm": 0.5647242069244385, "learning_rate": 9.410674574921092e-06, "loss": 0.4624, "step": 1447 }, { "epoch": 0.7220744680851063, "grad_norm": 0.5709680914878845, "learning_rate": 9.409307048383944e-06, "loss": 0.4349, "step": 1448 }, { "epoch": 0.7225731382978723, "grad_norm": 0.6119711399078369, "learning_rate": 9.407938036615738e-06, "loss": 0.4127, "step": 1449 }, { "epoch": 0.7230718085106383, "grad_norm": 0.6763197183609009, "learning_rate": 9.406567540077616e-06, "loss": 0.4276, "step": 1450 }, { "epoch": 0.7235704787234043, "grad_norm": 0.4984426200389862, "learning_rate": 9.405195559231214e-06, "loss": 0.4209, "step": 1451 }, { "epoch": 0.7240691489361702, "grad_norm": 0.5726171135902405, "learning_rate": 9.40382209453867e-06, "loss": 0.4219, "step": 1452 }, { "epoch": 0.7245678191489362, "grad_norm": 0.5398706197738647, "learning_rate": 9.40244714646262e-06, "loss": 0.4043, "step": 1453 }, { "epoch": 0.7250664893617021, "grad_norm": 0.5273908376693726, "learning_rate": 9.401070715466208e-06, "loss": 0.4744, "step": 1454 }, { "epoch": 0.7255651595744681, "grad_norm": 0.670342743396759, "learning_rate": 9.399692802013064e-06, "loss": 0.4726, "step": 1455 }, { "epoch": 0.726063829787234, "grad_norm": 0.553152322769165, "learning_rate": 9.398313406567327e-06, "loss": 0.4581, "step": 1456 }, { "epoch": 0.7265625, "grad_norm": 0.5104444026947021, "learning_rate": 9.396932529593634e-06, "loss": 0.4221, "step": 1457 }, { "epoch": 0.727061170212766, "grad_norm": 0.5980516076087952, "learning_rate": 9.39555017155712e-06, "loss": 0.4203, "step": 1458 }, { "epoch": 0.7275598404255319, "grad_norm": 0.48893114924430847, "learning_rate": 9.394166332923414e-06, "loss": 0.4389, "step": 1459 }, { "epoch": 0.7280585106382979, "grad_norm": 0.5951645374298096, "learning_rate": 9.392781014158651e-06, "loss": 0.4719, "step": 1460 }, { "epoch": 0.7285571808510638, "grad_norm": 0.493314266204834, "learning_rate": 9.391394215729464e-06, "loss": 0.432, "step": 1461 }, { "epoch": 0.7290558510638298, "grad_norm": 0.6460697650909424, "learning_rate": 9.390005938102979e-06, "loss": 0.4412, "step": 1462 }, { "epoch": 0.7295545212765957, "grad_norm": 0.5218823552131653, "learning_rate": 9.388616181746825e-06, "loss": 0.4086, "step": 1463 }, { "epoch": 0.7300531914893617, "grad_norm": 0.5175458788871765, "learning_rate": 9.387224947129127e-06, "loss": 0.4143, "step": 1464 }, { "epoch": 0.7305518617021277, "grad_norm": 0.5610154867172241, "learning_rate": 9.385832234718508e-06, "loss": 0.4332, "step": 1465 }, { "epoch": 0.7310505319148937, "grad_norm": 0.548356831073761, "learning_rate": 9.38443804498409e-06, "loss": 0.4504, "step": 1466 }, { "epoch": 0.7315492021276596, "grad_norm": 0.5859161615371704, "learning_rate": 9.383042378395492e-06, "loss": 0.4731, "step": 1467 }, { "epoch": 0.7320478723404256, "grad_norm": 0.5058428049087524, "learning_rate": 9.381645235422831e-06, "loss": 0.4439, "step": 1468 }, { "epoch": 0.7325465425531915, "grad_norm": 0.5268284678459167, "learning_rate": 9.380246616536718e-06, "loss": 0.4177, "step": 1469 }, { "epoch": 0.7330452127659575, "grad_norm": 0.588035523891449, "learning_rate": 9.378846522208267e-06, "loss": 0.4674, "step": 1470 }, { "epoch": 0.7335438829787234, "grad_norm": 0.5362616777420044, "learning_rate": 9.377444952909086e-06, "loss": 0.4229, "step": 1471 }, { "epoch": 0.7340425531914894, "grad_norm": 0.4950518310070038, "learning_rate": 9.376041909111276e-06, "loss": 0.4021, "step": 1472 }, { "epoch": 0.7345412234042553, "grad_norm": 0.5025284290313721, "learning_rate": 9.37463739128744e-06, "loss": 0.4525, "step": 1473 }, { "epoch": 0.7350398936170213, "grad_norm": 0.5464311242103577, "learning_rate": 9.373231399910677e-06, "loss": 0.4585, "step": 1474 }, { "epoch": 0.7355385638297872, "grad_norm": 0.532315731048584, "learning_rate": 9.371823935454578e-06, "loss": 0.4165, "step": 1475 }, { "epoch": 0.7360372340425532, "grad_norm": 0.5106806755065918, "learning_rate": 9.370414998393238e-06, "loss": 0.4332, "step": 1476 }, { "epoch": 0.7365359042553191, "grad_norm": 0.5002481937408447, "learning_rate": 9.36900458920124e-06, "loss": 0.4214, "step": 1477 }, { "epoch": 0.7370345744680851, "grad_norm": 0.5482000708580017, "learning_rate": 9.367592708353669e-06, "loss": 0.4427, "step": 1478 }, { "epoch": 0.737533244680851, "grad_norm": 0.5548692941665649, "learning_rate": 9.3661793563261e-06, "loss": 0.4454, "step": 1479 }, { "epoch": 0.738031914893617, "grad_norm": 0.5531906485557556, "learning_rate": 9.364764533594607e-06, "loss": 0.4436, "step": 1480 }, { "epoch": 0.738530585106383, "grad_norm": 0.560247540473938, "learning_rate": 9.363348240635762e-06, "loss": 0.426, "step": 1481 }, { "epoch": 0.739029255319149, "grad_norm": 0.5026974081993103, "learning_rate": 9.361930477926624e-06, "loss": 0.4118, "step": 1482 }, { "epoch": 0.7395279255319149, "grad_norm": 0.4976430833339691, "learning_rate": 9.360511245944757e-06, "loss": 0.4278, "step": 1483 }, { "epoch": 0.7400265957446809, "grad_norm": 0.7049152255058289, "learning_rate": 9.359090545168212e-06, "loss": 0.4531, "step": 1484 }, { "epoch": 0.7405252659574468, "grad_norm": 0.52281254529953, "learning_rate": 9.35766837607554e-06, "loss": 0.4746, "step": 1485 }, { "epoch": 0.7410239361702128, "grad_norm": 0.5782875418663025, "learning_rate": 9.35624473914578e-06, "loss": 0.4417, "step": 1486 }, { "epoch": 0.7415226063829787, "grad_norm": 0.540672779083252, "learning_rate": 9.35481963485848e-06, "loss": 0.4703, "step": 1487 }, { "epoch": 0.7420212765957447, "grad_norm": 0.5192979574203491, "learning_rate": 9.353393063693659e-06, "loss": 0.4282, "step": 1488 }, { "epoch": 0.7425199468085106, "grad_norm": 0.5213472843170166, "learning_rate": 9.351965026131855e-06, "loss": 0.4373, "step": 1489 }, { "epoch": 0.7430186170212766, "grad_norm": 0.7020635604858398, "learning_rate": 9.35053552265408e-06, "loss": 0.4484, "step": 1490 }, { "epoch": 0.7435172872340425, "grad_norm": 0.5072195529937744, "learning_rate": 9.34910455374185e-06, "loss": 0.4326, "step": 1491 }, { "epoch": 0.7440159574468085, "grad_norm": 0.5542097091674805, "learning_rate": 9.347672119877175e-06, "loss": 0.4646, "step": 1492 }, { "epoch": 0.7445146276595744, "grad_norm": 0.5093257427215576, "learning_rate": 9.346238221542554e-06, "loss": 0.4488, "step": 1493 }, { "epoch": 0.7450132978723404, "grad_norm": 0.5278159379959106, "learning_rate": 9.344802859220983e-06, "loss": 0.4367, "step": 1494 }, { "epoch": 0.7455119680851063, "grad_norm": 0.49358731508255005, "learning_rate": 9.343366033395949e-06, "loss": 0.438, "step": 1495 }, { "epoch": 0.7460106382978723, "grad_norm": 0.5487825274467468, "learning_rate": 9.34192774455143e-06, "loss": 0.4347, "step": 1496 }, { "epoch": 0.7465093085106383, "grad_norm": 0.5177642703056335, "learning_rate": 9.340487993171902e-06, "loss": 0.4275, "step": 1497 }, { "epoch": 0.7470079787234043, "grad_norm": 0.5953543186187744, "learning_rate": 9.33904677974233e-06, "loss": 0.4906, "step": 1498 }, { "epoch": 0.7475066489361702, "grad_norm": 0.4716092348098755, "learning_rate": 9.337604104748171e-06, "loss": 0.4613, "step": 1499 }, { "epoch": 0.7480053191489362, "grad_norm": 0.5537077188491821, "learning_rate": 9.336159968675378e-06, "loss": 0.426, "step": 1500 }, { "epoch": 0.7485039893617021, "grad_norm": 0.5290467739105225, "learning_rate": 9.334714372010395e-06, "loss": 0.449, "step": 1501 }, { "epoch": 0.7490026595744681, "grad_norm": 0.5873563885688782, "learning_rate": 9.333267315240151e-06, "loss": 0.4161, "step": 1502 }, { "epoch": 0.749501329787234, "grad_norm": 0.5549539923667908, "learning_rate": 9.331818798852077e-06, "loss": 0.4283, "step": 1503 }, { "epoch": 0.75, "grad_norm": 0.5005287528038025, "learning_rate": 9.33036882333409e-06, "loss": 0.4586, "step": 1504 }, { "epoch": 0.750498670212766, "grad_norm": 0.5236934423446655, "learning_rate": 9.328917389174601e-06, "loss": 0.4249, "step": 1505 }, { "epoch": 0.7509973404255319, "grad_norm": 0.4735150933265686, "learning_rate": 9.327464496862512e-06, "loss": 0.4198, "step": 1506 }, { "epoch": 0.7514960106382979, "grad_norm": 0.5662949681282043, "learning_rate": 9.326010146887211e-06, "loss": 0.4073, "step": 1507 }, { "epoch": 0.7519946808510638, "grad_norm": 0.5456349849700928, "learning_rate": 9.324554339738586e-06, "loss": 0.4791, "step": 1508 }, { "epoch": 0.7524933510638298, "grad_norm": 0.5214309692382812, "learning_rate": 9.323097075907009e-06, "loss": 0.3956, "step": 1509 }, { "epoch": 0.7529920212765957, "grad_norm": 0.5750072002410889, "learning_rate": 9.321638355883342e-06, "loss": 0.4136, "step": 1510 }, { "epoch": 0.7534906914893617, "grad_norm": 0.541465699672699, "learning_rate": 9.320178180158945e-06, "loss": 0.4284, "step": 1511 }, { "epoch": 0.7539893617021277, "grad_norm": 0.5736734867095947, "learning_rate": 9.318716549225661e-06, "loss": 0.4807, "step": 1512 }, { "epoch": 0.7544880319148937, "grad_norm": 0.6445060968399048, "learning_rate": 9.317253463575829e-06, "loss": 0.4437, "step": 1513 }, { "epoch": 0.7549867021276596, "grad_norm": 0.5489020347595215, "learning_rate": 9.315788923702269e-06, "loss": 0.4458, "step": 1514 }, { "epoch": 0.7554853723404256, "grad_norm": 0.6244863271713257, "learning_rate": 9.3143229300983e-06, "loss": 0.4715, "step": 1515 }, { "epoch": 0.7559840425531915, "grad_norm": 0.5607755780220032, "learning_rate": 9.31285548325773e-06, "loss": 0.4227, "step": 1516 }, { "epoch": 0.7564827127659575, "grad_norm": 0.5300732851028442, "learning_rate": 9.311386583674848e-06, "loss": 0.4129, "step": 1517 }, { "epoch": 0.7569813829787234, "grad_norm": 0.5917513966560364, "learning_rate": 9.309916231844443e-06, "loss": 0.4477, "step": 1518 }, { "epoch": 0.7574800531914894, "grad_norm": 0.525814950466156, "learning_rate": 9.308444428261787e-06, "loss": 0.4533, "step": 1519 }, { "epoch": 0.7579787234042553, "grad_norm": 0.6202428936958313, "learning_rate": 9.30697117342264e-06, "loss": 0.4363, "step": 1520 }, { "epoch": 0.7584773936170213, "grad_norm": 0.5761476159095764, "learning_rate": 9.305496467823256e-06, "loss": 0.4317, "step": 1521 }, { "epoch": 0.7589760638297872, "grad_norm": 0.5558403730392456, "learning_rate": 9.304020311960373e-06, "loss": 0.4749, "step": 1522 }, { "epoch": 0.7594747340425532, "grad_norm": 0.5804154872894287, "learning_rate": 9.302542706331221e-06, "loss": 0.4366, "step": 1523 }, { "epoch": 0.7599734042553191, "grad_norm": 0.5234715938568115, "learning_rate": 9.301063651433516e-06, "loss": 0.3901, "step": 1524 }, { "epoch": 0.7604720744680851, "grad_norm": 0.5461035370826721, "learning_rate": 9.299583147765463e-06, "loss": 0.414, "step": 1525 }, { "epoch": 0.760970744680851, "grad_norm": 0.6122422814369202, "learning_rate": 9.298101195825755e-06, "loss": 0.4661, "step": 1526 }, { "epoch": 0.761469414893617, "grad_norm": 0.5290006399154663, "learning_rate": 9.296617796113572e-06, "loss": 0.4302, "step": 1527 }, { "epoch": 0.761968085106383, "grad_norm": 0.6330251693725586, "learning_rate": 9.295132949128582e-06, "loss": 0.3829, "step": 1528 }, { "epoch": 0.762466755319149, "grad_norm": 0.612444281578064, "learning_rate": 9.293646655370943e-06, "loss": 0.4407, "step": 1529 }, { "epoch": 0.7629654255319149, "grad_norm": 0.5106480717658997, "learning_rate": 9.292158915341295e-06, "loss": 0.4282, "step": 1530 }, { "epoch": 0.7634640957446809, "grad_norm": 0.5285942554473877, "learning_rate": 9.290669729540771e-06, "loss": 0.439, "step": 1531 }, { "epoch": 0.7639627659574468, "grad_norm": 0.6582516431808472, "learning_rate": 9.289179098470989e-06, "loss": 0.4738, "step": 1532 }, { "epoch": 0.7644614361702128, "grad_norm": 0.5532265901565552, "learning_rate": 9.28768702263405e-06, "loss": 0.4335, "step": 1533 }, { "epoch": 0.7649601063829787, "grad_norm": 0.5485312938690186, "learning_rate": 9.286193502532547e-06, "loss": 0.4215, "step": 1534 }, { "epoch": 0.7654587765957447, "grad_norm": 0.5010218024253845, "learning_rate": 9.284698538669558e-06, "loss": 0.4617, "step": 1535 }, { "epoch": 0.7659574468085106, "grad_norm": 0.5433047413825989, "learning_rate": 9.283202131548645e-06, "loss": 0.4489, "step": 1536 }, { "epoch": 0.7664561170212766, "grad_norm": 0.5139133930206299, "learning_rate": 9.28170428167386e-06, "loss": 0.3882, "step": 1537 }, { "epoch": 0.7669547872340425, "grad_norm": 0.5092138051986694, "learning_rate": 9.280204989549736e-06, "loss": 0.3973, "step": 1538 }, { "epoch": 0.7674534574468085, "grad_norm": 0.5702957510948181, "learning_rate": 9.278704255681295e-06, "loss": 0.4181, "step": 1539 }, { "epoch": 0.7679521276595744, "grad_norm": 0.471287339925766, "learning_rate": 9.277202080574047e-06, "loss": 0.4092, "step": 1540 }, { "epoch": 0.7684507978723404, "grad_norm": 0.543611466884613, "learning_rate": 9.27569846473398e-06, "loss": 0.4652, "step": 1541 }, { "epoch": 0.7689494680851063, "grad_norm": 0.5741743445396423, "learning_rate": 9.274193408667577e-06, "loss": 0.4429, "step": 1542 }, { "epoch": 0.7694481382978723, "grad_norm": 0.5663116574287415, "learning_rate": 9.272686912881799e-06, "loss": 0.4666, "step": 1543 }, { "epoch": 0.7699468085106383, "grad_norm": 0.5119589567184448, "learning_rate": 9.271178977884093e-06, "loss": 0.4264, "step": 1544 }, { "epoch": 0.7704454787234043, "grad_norm": 0.5180177092552185, "learning_rate": 9.269669604182391e-06, "loss": 0.4195, "step": 1545 }, { "epoch": 0.7709441489361702, "grad_norm": 0.5102515816688538, "learning_rate": 9.268158792285114e-06, "loss": 0.4384, "step": 1546 }, { "epoch": 0.7714428191489362, "grad_norm": 0.5203704833984375, "learning_rate": 9.266646542701163e-06, "loss": 0.451, "step": 1547 }, { "epoch": 0.7719414893617021, "grad_norm": 0.5328230857849121, "learning_rate": 9.265132855939922e-06, "loss": 0.4354, "step": 1548 }, { "epoch": 0.7724401595744681, "grad_norm": 0.5254027843475342, "learning_rate": 9.263617732511262e-06, "loss": 0.4513, "step": 1549 }, { "epoch": 0.772938829787234, "grad_norm": 0.515679657459259, "learning_rate": 9.262101172925539e-06, "loss": 0.4057, "step": 1550 }, { "epoch": 0.7734375, "grad_norm": 0.5526025295257568, "learning_rate": 9.26058317769359e-06, "loss": 0.4525, "step": 1551 }, { "epoch": 0.773936170212766, "grad_norm": 0.5139873623847961, "learning_rate": 9.259063747326735e-06, "loss": 0.4479, "step": 1552 }, { "epoch": 0.7744348404255319, "grad_norm": 0.5337789058685303, "learning_rate": 9.257542882336781e-06, "loss": 0.4183, "step": 1553 }, { "epoch": 0.7749335106382979, "grad_norm": 0.5653138756752014, "learning_rate": 9.256020583236014e-06, "loss": 0.4471, "step": 1554 }, { "epoch": 0.7754321808510638, "grad_norm": 0.5371720790863037, "learning_rate": 9.254496850537207e-06, "loss": 0.4394, "step": 1555 }, { "epoch": 0.7759308510638298, "grad_norm": 0.5776761770248413, "learning_rate": 9.252971684753615e-06, "loss": 0.4234, "step": 1556 }, { "epoch": 0.7764295212765957, "grad_norm": 0.5575884580612183, "learning_rate": 9.251445086398974e-06, "loss": 0.4484, "step": 1557 }, { "epoch": 0.7769281914893617, "grad_norm": 0.5101816654205322, "learning_rate": 9.249917055987501e-06, "loss": 0.4151, "step": 1558 }, { "epoch": 0.7774268617021277, "grad_norm": 0.5303686857223511, "learning_rate": 9.248387594033902e-06, "loss": 0.4719, "step": 1559 }, { "epoch": 0.7779255319148937, "grad_norm": 0.5390374064445496, "learning_rate": 9.246856701053357e-06, "loss": 0.4143, "step": 1560 }, { "epoch": 0.7784242021276596, "grad_norm": 0.5655892491340637, "learning_rate": 9.245324377561535e-06, "loss": 0.4136, "step": 1561 }, { "epoch": 0.7789228723404256, "grad_norm": 0.5073516368865967, "learning_rate": 9.243790624074582e-06, "loss": 0.4343, "step": 1562 }, { "epoch": 0.7794215425531915, "grad_norm": 0.4871708154678345, "learning_rate": 9.242255441109128e-06, "loss": 0.419, "step": 1563 }, { "epoch": 0.7799202127659575, "grad_norm": 0.5734376907348633, "learning_rate": 9.240718829182284e-06, "loss": 0.4165, "step": 1564 }, { "epoch": 0.7804188829787234, "grad_norm": 0.550529420375824, "learning_rate": 9.239180788811645e-06, "loss": 0.4506, "step": 1565 }, { "epoch": 0.7809175531914894, "grad_norm": 0.5881732702255249, "learning_rate": 9.23764132051528e-06, "loss": 0.4377, "step": 1566 }, { "epoch": 0.7814162234042553, "grad_norm": 0.5852780938148499, "learning_rate": 9.236100424811747e-06, "loss": 0.4676, "step": 1567 }, { "epoch": 0.7819148936170213, "grad_norm": 0.5368360280990601, "learning_rate": 9.23455810222008e-06, "loss": 0.4115, "step": 1568 }, { "epoch": 0.7824135638297872, "grad_norm": 0.6649180054664612, "learning_rate": 9.233014353259795e-06, "loss": 0.4538, "step": 1569 }, { "epoch": 0.7829122340425532, "grad_norm": 0.4981454908847809, "learning_rate": 9.231469178450887e-06, "loss": 0.4464, "step": 1570 }, { "epoch": 0.7834109042553191, "grad_norm": 0.5064942836761475, "learning_rate": 9.229922578313837e-06, "loss": 0.4186, "step": 1571 }, { "epoch": 0.7839095744680851, "grad_norm": 0.5965084433555603, "learning_rate": 9.228374553369596e-06, "loss": 0.4301, "step": 1572 }, { "epoch": 0.784408244680851, "grad_norm": 0.5406386852264404, "learning_rate": 9.226825104139605e-06, "loss": 0.4116, "step": 1573 }, { "epoch": 0.784906914893617, "grad_norm": 0.5348740816116333, "learning_rate": 9.22527423114578e-06, "loss": 0.482, "step": 1574 }, { "epoch": 0.785405585106383, "grad_norm": 0.5869398713111877, "learning_rate": 9.223721934910515e-06, "loss": 0.4449, "step": 1575 }, { "epoch": 0.785904255319149, "grad_norm": 0.6462042927742004, "learning_rate": 9.222168215956688e-06, "loss": 0.4937, "step": 1576 }, { "epoch": 0.7864029255319149, "grad_norm": 0.5794700384140015, "learning_rate": 9.220613074807653e-06, "loss": 0.4408, "step": 1577 }, { "epoch": 0.7869015957446809, "grad_norm": 0.591293454170227, "learning_rate": 9.219056511987241e-06, "loss": 0.4633, "step": 1578 }, { "epoch": 0.7874002659574468, "grad_norm": 0.533279299736023, "learning_rate": 9.217498528019769e-06, "loss": 0.4071, "step": 1579 }, { "epoch": 0.7878989361702128, "grad_norm": 0.5289615392684937, "learning_rate": 9.215939123430027e-06, "loss": 0.4314, "step": 1580 }, { "epoch": 0.7883976063829787, "grad_norm": 0.5706449747085571, "learning_rate": 9.214378298743283e-06, "loss": 0.4348, "step": 1581 }, { "epoch": 0.7888962765957447, "grad_norm": 0.5719366669654846, "learning_rate": 9.212816054485286e-06, "loss": 0.4427, "step": 1582 }, { "epoch": 0.7893949468085106, "grad_norm": 0.5458727478981018, "learning_rate": 9.211252391182263e-06, "loss": 0.4496, "step": 1583 }, { "epoch": 0.7898936170212766, "grad_norm": 0.4786149263381958, "learning_rate": 9.209687309360917e-06, "loss": 0.4231, "step": 1584 }, { "epoch": 0.7903922872340425, "grad_norm": 0.5386475324630737, "learning_rate": 9.208120809548433e-06, "loss": 0.4458, "step": 1585 }, { "epoch": 0.7908909574468085, "grad_norm": 0.513339102268219, "learning_rate": 9.206552892272468e-06, "loss": 0.4436, "step": 1586 }, { "epoch": 0.7913896276595744, "grad_norm": 0.4443117678165436, "learning_rate": 9.204983558061163e-06, "loss": 0.4431, "step": 1587 }, { "epoch": 0.7918882978723404, "grad_norm": 0.5087964534759521, "learning_rate": 9.203412807443127e-06, "loss": 0.4496, "step": 1588 }, { "epoch": 0.7923869680851063, "grad_norm": 0.48400235176086426, "learning_rate": 9.201840640947458e-06, "loss": 0.4634, "step": 1589 }, { "epoch": 0.7928856382978723, "grad_norm": 0.5220666527748108, "learning_rate": 9.200267059103718e-06, "loss": 0.4232, "step": 1590 }, { "epoch": 0.7933843085106383, "grad_norm": 0.5156209468841553, "learning_rate": 9.198692062441956e-06, "loss": 0.4507, "step": 1591 }, { "epoch": 0.7938829787234043, "grad_norm": 0.506166934967041, "learning_rate": 9.197115651492694e-06, "loss": 0.4269, "step": 1592 }, { "epoch": 0.7943816489361702, "grad_norm": 0.5877869725227356, "learning_rate": 9.19553782678693e-06, "loss": 0.4556, "step": 1593 }, { "epoch": 0.7948803191489362, "grad_norm": 0.5671528577804565, "learning_rate": 9.193958588856138e-06, "loss": 0.4179, "step": 1594 }, { "epoch": 0.7953789893617021, "grad_norm": 0.6255511045455933, "learning_rate": 9.192377938232269e-06, "loss": 0.4454, "step": 1595 }, { "epoch": 0.7958776595744681, "grad_norm": 0.569083034992218, "learning_rate": 9.190795875447748e-06, "loss": 0.4343, "step": 1596 }, { "epoch": 0.796376329787234, "grad_norm": 0.6176881194114685, "learning_rate": 9.18921240103548e-06, "loss": 0.4555, "step": 1597 }, { "epoch": 0.796875, "grad_norm": 0.7172409296035767, "learning_rate": 9.187627515528842e-06, "loss": 0.4771, "step": 1598 }, { "epoch": 0.797373670212766, "grad_norm": 0.5963289141654968, "learning_rate": 9.186041219461682e-06, "loss": 0.4418, "step": 1599 }, { "epoch": 0.7978723404255319, "grad_norm": 0.6425360441207886, "learning_rate": 9.184453513368332e-06, "loss": 0.4314, "step": 1600 }, { "epoch": 0.7983710106382979, "grad_norm": 0.6354106068611145, "learning_rate": 9.182864397783597e-06, "loss": 0.4352, "step": 1601 }, { "epoch": 0.7988696808510638, "grad_norm": 0.4926011562347412, "learning_rate": 9.181273873242751e-06, "loss": 0.4391, "step": 1602 }, { "epoch": 0.7993683510638298, "grad_norm": 0.5945559144020081, "learning_rate": 9.179681940281547e-06, "loss": 0.4341, "step": 1603 }, { "epoch": 0.7998670212765957, "grad_norm": 0.49926522374153137, "learning_rate": 9.178088599436213e-06, "loss": 0.4547, "step": 1604 }, { "epoch": 0.8003656914893617, "grad_norm": 0.6493245959281921, "learning_rate": 9.17649385124345e-06, "loss": 0.4, "step": 1605 }, { "epoch": 0.8008643617021277, "grad_norm": 0.5531352758407593, "learning_rate": 9.174897696240431e-06, "loss": 0.4229, "step": 1606 }, { "epoch": 0.8013630319148937, "grad_norm": 0.5549320578575134, "learning_rate": 9.173300134964808e-06, "loss": 0.4395, "step": 1607 }, { "epoch": 0.8018617021276596, "grad_norm": 0.6758076548576355, "learning_rate": 9.1717011679547e-06, "loss": 0.4109, "step": 1608 }, { "epoch": 0.8023603723404256, "grad_norm": 0.515856146812439, "learning_rate": 9.170100795748706e-06, "loss": 0.3923, "step": 1609 }, { "epoch": 0.8028590425531915, "grad_norm": 0.6487864851951599, "learning_rate": 9.168499018885893e-06, "loss": 0.4603, "step": 1610 }, { "epoch": 0.8033577127659575, "grad_norm": 0.5734822750091553, "learning_rate": 9.166895837905805e-06, "loss": 0.3938, "step": 1611 }, { "epoch": 0.8038563829787234, "grad_norm": 0.4829653799533844, "learning_rate": 9.165291253348458e-06, "loss": 0.426, "step": 1612 }, { "epoch": 0.8043550531914894, "grad_norm": 0.6165553331375122, "learning_rate": 9.163685265754338e-06, "loss": 0.4163, "step": 1613 }, { "epoch": 0.8048537234042553, "grad_norm": 0.6354992389678955, "learning_rate": 9.162077875664408e-06, "loss": 0.432, "step": 1614 }, { "epoch": 0.8053523936170213, "grad_norm": 0.5807990431785583, "learning_rate": 9.160469083620102e-06, "loss": 0.4288, "step": 1615 }, { "epoch": 0.8058510638297872, "grad_norm": 0.5604804158210754, "learning_rate": 9.158858890163323e-06, "loss": 0.4445, "step": 1616 }, { "epoch": 0.8063497340425532, "grad_norm": 0.4774661958217621, "learning_rate": 9.15724729583645e-06, "loss": 0.4197, "step": 1617 }, { "epoch": 0.8068484042553191, "grad_norm": 0.6342750191688538, "learning_rate": 9.155634301182333e-06, "loss": 0.4309, "step": 1618 }, { "epoch": 0.8073470744680851, "grad_norm": 0.5033733248710632, "learning_rate": 9.154019906744292e-06, "loss": 0.419, "step": 1619 }, { "epoch": 0.807845744680851, "grad_norm": 0.6049508452415466, "learning_rate": 9.15240411306612e-06, "loss": 0.4407, "step": 1620 }, { "epoch": 0.808344414893617, "grad_norm": 0.6101361513137817, "learning_rate": 9.150786920692082e-06, "loss": 0.4288, "step": 1621 }, { "epoch": 0.808843085106383, "grad_norm": 0.5979120135307312, "learning_rate": 9.149168330166912e-06, "loss": 0.411, "step": 1622 }, { "epoch": 0.809341755319149, "grad_norm": 0.6326796412467957, "learning_rate": 9.147548342035815e-06, "loss": 0.4193, "step": 1623 }, { "epoch": 0.8098404255319149, "grad_norm": 0.5727104544639587, "learning_rate": 9.145926956844468e-06, "loss": 0.427, "step": 1624 }, { "epoch": 0.8103390957446809, "grad_norm": 0.5831063985824585, "learning_rate": 9.144304175139021e-06, "loss": 0.4385, "step": 1625 }, { "epoch": 0.8108377659574468, "grad_norm": 0.6886059641838074, "learning_rate": 9.14267999746609e-06, "loss": 0.442, "step": 1626 }, { "epoch": 0.8113364361702128, "grad_norm": 0.5953529477119446, "learning_rate": 9.141054424372763e-06, "loss": 0.4515, "step": 1627 }, { "epoch": 0.8118351063829787, "grad_norm": 0.5494146943092346, "learning_rate": 9.1394274564066e-06, "loss": 0.4442, "step": 1628 }, { "epoch": 0.8123337765957447, "grad_norm": 0.6660972833633423, "learning_rate": 9.137799094115626e-06, "loss": 0.4505, "step": 1629 }, { "epoch": 0.8128324468085106, "grad_norm": 0.5076459646224976, "learning_rate": 9.136169338048341e-06, "loss": 0.44, "step": 1630 }, { "epoch": 0.8133311170212766, "grad_norm": 0.5758276581764221, "learning_rate": 9.13453818875371e-06, "loss": 0.3984, "step": 1631 }, { "epoch": 0.8138297872340425, "grad_norm": 0.6807901859283447, "learning_rate": 9.13290564678117e-06, "loss": 0.4617, "step": 1632 }, { "epoch": 0.8143284574468085, "grad_norm": 0.6157277226448059, "learning_rate": 9.131271712680629e-06, "loss": 0.4334, "step": 1633 }, { "epoch": 0.8148271276595744, "grad_norm": 0.6009770631790161, "learning_rate": 9.129636387002457e-06, "loss": 0.4587, "step": 1634 }, { "epoch": 0.8153257978723404, "grad_norm": 0.6986009478569031, "learning_rate": 9.1279996702975e-06, "loss": 0.4333, "step": 1635 }, { "epoch": 0.8158244680851063, "grad_norm": 0.6295288801193237, "learning_rate": 9.126361563117068e-06, "loss": 0.4358, "step": 1636 }, { "epoch": 0.8163231382978723, "grad_norm": 0.6701803207397461, "learning_rate": 9.124722066012943e-06, "loss": 0.4555, "step": 1637 }, { "epoch": 0.8168218085106383, "grad_norm": 0.5539750456809998, "learning_rate": 9.123081179537372e-06, "loss": 0.4384, "step": 1638 }, { "epoch": 0.8173204787234043, "grad_norm": 0.6369745135307312, "learning_rate": 9.121438904243071e-06, "loss": 0.4588, "step": 1639 }, { "epoch": 0.8178191489361702, "grad_norm": 0.7069336175918579, "learning_rate": 9.119795240683224e-06, "loss": 0.4204, "step": 1640 }, { "epoch": 0.8183178191489362, "grad_norm": 0.5597377419471741, "learning_rate": 9.118150189411485e-06, "loss": 0.4615, "step": 1641 }, { "epoch": 0.8188164893617021, "grad_norm": 0.6948033571243286, "learning_rate": 9.11650375098197e-06, "loss": 0.4297, "step": 1642 }, { "epoch": 0.8193151595744681, "grad_norm": 0.6256908178329468, "learning_rate": 9.114855925949267e-06, "loss": 0.4296, "step": 1643 }, { "epoch": 0.819813829787234, "grad_norm": 0.5449667572975159, "learning_rate": 9.113206714868429e-06, "loss": 0.443, "step": 1644 }, { "epoch": 0.8203125, "grad_norm": 0.6046210527420044, "learning_rate": 9.111556118294977e-06, "loss": 0.448, "step": 1645 }, { "epoch": 0.820811170212766, "grad_norm": 0.5785977840423584, "learning_rate": 9.109904136784896e-06, "loss": 0.4454, "step": 1646 }, { "epoch": 0.8213098404255319, "grad_norm": 0.4992258548736572, "learning_rate": 9.108250770894642e-06, "loss": 0.4145, "step": 1647 }, { "epoch": 0.8218085106382979, "grad_norm": 0.7401988506317139, "learning_rate": 9.10659602118113e-06, "loss": 0.4418, "step": 1648 }, { "epoch": 0.8223071808510638, "grad_norm": 0.5145320296287537, "learning_rate": 9.104939888201754e-06, "loss": 0.4056, "step": 1649 }, { "epoch": 0.8228058510638298, "grad_norm": 0.5856240391731262, "learning_rate": 9.103282372514359e-06, "loss": 0.4115, "step": 1650 }, { "epoch": 0.8233045212765957, "grad_norm": 0.638629674911499, "learning_rate": 9.101623474677266e-06, "loss": 0.4256, "step": 1651 }, { "epoch": 0.8238031914893617, "grad_norm": 0.5495814085006714, "learning_rate": 9.099963195249256e-06, "loss": 0.4559, "step": 1652 }, { "epoch": 0.8243018617021277, "grad_norm": 0.6292268633842468, "learning_rate": 9.09830153478958e-06, "loss": 0.4522, "step": 1653 }, { "epoch": 0.8248005319148937, "grad_norm": 0.5586514472961426, "learning_rate": 9.09663849385795e-06, "loss": 0.4507, "step": 1654 }, { "epoch": 0.8252992021276596, "grad_norm": 0.5426177382469177, "learning_rate": 9.094974073014544e-06, "loss": 0.4165, "step": 1655 }, { "epoch": 0.8257978723404256, "grad_norm": 0.6462473273277283, "learning_rate": 9.093308272820007e-06, "loss": 0.4356, "step": 1656 }, { "epoch": 0.8262965425531915, "grad_norm": 0.48958727717399597, "learning_rate": 9.091641093835447e-06, "loss": 0.4296, "step": 1657 }, { "epoch": 0.8267952127659575, "grad_norm": 0.6199252009391785, "learning_rate": 9.089972536622436e-06, "loss": 0.4532, "step": 1658 }, { "epoch": 0.8272938829787234, "grad_norm": 0.5726792812347412, "learning_rate": 9.088302601743013e-06, "loss": 0.4534, "step": 1659 }, { "epoch": 0.8277925531914894, "grad_norm": 0.6088813543319702, "learning_rate": 9.086631289759675e-06, "loss": 0.4941, "step": 1660 }, { "epoch": 0.8282912234042553, "grad_norm": 0.5545929670333862, "learning_rate": 9.08495860123539e-06, "loss": 0.434, "step": 1661 }, { "epoch": 0.8287898936170213, "grad_norm": 0.5708557367324829, "learning_rate": 9.083284536733585e-06, "loss": 0.4617, "step": 1662 }, { "epoch": 0.8292885638297872, "grad_norm": 0.5258373618125916, "learning_rate": 9.081609096818154e-06, "loss": 0.4022, "step": 1663 }, { "epoch": 0.8297872340425532, "grad_norm": 0.6195336580276489, "learning_rate": 9.079932282053449e-06, "loss": 0.4643, "step": 1664 }, { "epoch": 0.8302859042553191, "grad_norm": 0.5056676864624023, "learning_rate": 9.078254093004289e-06, "loss": 0.4084, "step": 1665 }, { "epoch": 0.8307845744680851, "grad_norm": 0.5186375975608826, "learning_rate": 9.076574530235958e-06, "loss": 0.4278, "step": 1666 }, { "epoch": 0.831283244680851, "grad_norm": 0.5764119029045105, "learning_rate": 9.074893594314195e-06, "loss": 0.4376, "step": 1667 }, { "epoch": 0.831781914893617, "grad_norm": 0.6097090840339661, "learning_rate": 9.073211285805213e-06, "loss": 0.4376, "step": 1668 }, { "epoch": 0.832280585106383, "grad_norm": 0.5237488150596619, "learning_rate": 9.071527605275673e-06, "loss": 0.426, "step": 1669 }, { "epoch": 0.832779255319149, "grad_norm": 0.5331575870513916, "learning_rate": 9.069842553292713e-06, "loss": 0.3987, "step": 1670 }, { "epoch": 0.8332779255319149, "grad_norm": 0.5140233635902405, "learning_rate": 9.068156130423924e-06, "loss": 0.4452, "step": 1671 }, { "epoch": 0.8337765957446809, "grad_norm": 0.5851209163665771, "learning_rate": 9.066468337237358e-06, "loss": 0.4391, "step": 1672 }, { "epoch": 0.8342752659574468, "grad_norm": 0.5936776995658875, "learning_rate": 9.064779174301534e-06, "loss": 0.431, "step": 1673 }, { "epoch": 0.8347739361702128, "grad_norm": 0.5087127685546875, "learning_rate": 9.063088642185429e-06, "loss": 0.4792, "step": 1674 }, { "epoch": 0.8352726063829787, "grad_norm": 0.581203281879425, "learning_rate": 9.061396741458481e-06, "loss": 0.4165, "step": 1675 }, { "epoch": 0.8357712765957447, "grad_norm": 0.5115969777107239, "learning_rate": 9.05970347269059e-06, "loss": 0.4389, "step": 1676 }, { "epoch": 0.8362699468085106, "grad_norm": 0.6015080809593201, "learning_rate": 9.058008836452119e-06, "loss": 0.4428, "step": 1677 }, { "epoch": 0.8367686170212766, "grad_norm": 0.5974856019020081, "learning_rate": 9.056312833313888e-06, "loss": 0.4299, "step": 1678 }, { "epoch": 0.8372672872340425, "grad_norm": 0.48287805914878845, "learning_rate": 9.054615463847176e-06, "loss": 0.4515, "step": 1679 }, { "epoch": 0.8377659574468085, "grad_norm": 0.6151671409606934, "learning_rate": 9.052916728623728e-06, "loss": 0.4697, "step": 1680 }, { "epoch": 0.8382646276595744, "grad_norm": 0.5713690519332886, "learning_rate": 9.051216628215744e-06, "loss": 0.4252, "step": 1681 }, { "epoch": 0.8387632978723404, "grad_norm": 0.5767151713371277, "learning_rate": 9.04951516319589e-06, "loss": 0.4162, "step": 1682 }, { "epoch": 0.8392619680851063, "grad_norm": 0.5696364045143127, "learning_rate": 9.047812334137282e-06, "loss": 0.4123, "step": 1683 }, { "epoch": 0.8397606382978723, "grad_norm": 0.5573612451553345, "learning_rate": 9.046108141613505e-06, "loss": 0.4722, "step": 1684 }, { "epoch": 0.8402593085106383, "grad_norm": 0.676445484161377, "learning_rate": 9.044402586198597e-06, "loss": 0.4253, "step": 1685 }, { "epoch": 0.8407579787234043, "grad_norm": 0.5937748551368713, "learning_rate": 9.042695668467059e-06, "loss": 0.4343, "step": 1686 }, { "epoch": 0.8412566489361702, "grad_norm": 0.5619312524795532, "learning_rate": 9.040987388993847e-06, "loss": 0.4201, "step": 1687 }, { "epoch": 0.8417553191489362, "grad_norm": 0.6084080934524536, "learning_rate": 9.03927774835438e-06, "loss": 0.4482, "step": 1688 }, { "epoch": 0.8422539893617021, "grad_norm": 0.5893582701683044, "learning_rate": 9.037566747124532e-06, "loss": 0.4629, "step": 1689 }, { "epoch": 0.8427526595744681, "grad_norm": 0.49377304315567017, "learning_rate": 9.035854385880638e-06, "loss": 0.4371, "step": 1690 }, { "epoch": 0.843251329787234, "grad_norm": 0.6046088337898254, "learning_rate": 9.034140665199488e-06, "loss": 0.4345, "step": 1691 }, { "epoch": 0.84375, "grad_norm": 0.6435823440551758, "learning_rate": 9.032425585658333e-06, "loss": 0.4523, "step": 1692 }, { "epoch": 0.844248670212766, "grad_norm": 0.5458728075027466, "learning_rate": 9.03070914783488e-06, "loss": 0.4178, "step": 1693 }, { "epoch": 0.8447473404255319, "grad_norm": 0.6577427387237549, "learning_rate": 9.028991352307294e-06, "loss": 0.4169, "step": 1694 }, { "epoch": 0.8452460106382979, "grad_norm": 0.5891067981719971, "learning_rate": 9.0272721996542e-06, "loss": 0.4218, "step": 1695 }, { "epoch": 0.8457446808510638, "grad_norm": 0.5781768560409546, "learning_rate": 9.025551690454672e-06, "loss": 0.4042, "step": 1696 }, { "epoch": 0.8462433510638298, "grad_norm": 0.5510154962539673, "learning_rate": 9.023829825288249e-06, "loss": 0.4543, "step": 1697 }, { "epoch": 0.8467420212765957, "grad_norm": 0.6280282735824585, "learning_rate": 9.022106604734927e-06, "loss": 0.4513, "step": 1698 }, { "epoch": 0.8472406914893617, "grad_norm": 0.5587658286094666, "learning_rate": 9.020382029375148e-06, "loss": 0.4391, "step": 1699 }, { "epoch": 0.8477393617021277, "grad_norm": 0.6039196252822876, "learning_rate": 9.018656099789826e-06, "loss": 0.4194, "step": 1700 }, { "epoch": 0.8482380319148937, "grad_norm": 0.5313088893890381, "learning_rate": 9.01692881656032e-06, "loss": 0.429, "step": 1701 }, { "epoch": 0.8487367021276596, "grad_norm": 0.4778323471546173, "learning_rate": 9.015200180268447e-06, "loss": 0.417, "step": 1702 }, { "epoch": 0.8492353723404256, "grad_norm": 0.6309943199157715, "learning_rate": 9.013470191496483e-06, "loss": 0.4402, "step": 1703 }, { "epoch": 0.8497340425531915, "grad_norm": 0.5178186297416687, "learning_rate": 9.011738850827156e-06, "loss": 0.4405, "step": 1704 }, { "epoch": 0.8502327127659575, "grad_norm": 0.5981805324554443, "learning_rate": 9.010006158843651e-06, "loss": 0.4554, "step": 1705 }, { "epoch": 0.8507313829787234, "grad_norm": 0.5888550281524658, "learning_rate": 9.008272116129608e-06, "loss": 0.4585, "step": 1706 }, { "epoch": 0.8512300531914894, "grad_norm": 0.5322354435920715, "learning_rate": 9.006536723269122e-06, "loss": 0.4564, "step": 1707 }, { "epoch": 0.8517287234042553, "grad_norm": 0.684849202632904, "learning_rate": 9.004799980846744e-06, "loss": 0.4461, "step": 1708 }, { "epoch": 0.8522273936170213, "grad_norm": 0.5726137161254883, "learning_rate": 9.003061889447477e-06, "loss": 0.4087, "step": 1709 }, { "epoch": 0.8527260638297872, "grad_norm": 0.6411903500556946, "learning_rate": 9.00132244965678e-06, "loss": 0.477, "step": 1710 }, { "epoch": 0.8532247340425532, "grad_norm": 0.5549492239952087, "learning_rate": 8.999581662060566e-06, "loss": 0.4132, "step": 1711 }, { "epoch": 0.8537234042553191, "grad_norm": 0.5723696351051331, "learning_rate": 8.997839527245201e-06, "loss": 0.4619, "step": 1712 }, { "epoch": 0.8542220744680851, "grad_norm": 0.6059219241142273, "learning_rate": 8.996096045797507e-06, "loss": 0.4691, "step": 1713 }, { "epoch": 0.854720744680851, "grad_norm": 0.5641019940376282, "learning_rate": 8.994351218304757e-06, "loss": 0.4291, "step": 1714 }, { "epoch": 0.855219414893617, "grad_norm": 0.549946665763855, "learning_rate": 8.992605045354684e-06, "loss": 0.4361, "step": 1715 }, { "epoch": 0.855718085106383, "grad_norm": 0.5283389091491699, "learning_rate": 8.99085752753546e-06, "loss": 0.4403, "step": 1716 }, { "epoch": 0.856216755319149, "grad_norm": 0.5289129018783569, "learning_rate": 8.989108665435724e-06, "loss": 0.4588, "step": 1717 }, { "epoch": 0.8567154255319149, "grad_norm": 0.5135754346847534, "learning_rate": 8.987358459644563e-06, "loss": 0.417, "step": 1718 }, { "epoch": 0.8572140957446809, "grad_norm": 0.5645970702171326, "learning_rate": 8.985606910751516e-06, "loss": 0.4361, "step": 1719 }, { "epoch": 0.8577127659574468, "grad_norm": 0.5532199740409851, "learning_rate": 8.983854019346574e-06, "loss": 0.468, "step": 1720 }, { "epoch": 0.8582114361702128, "grad_norm": 0.4834684133529663, "learning_rate": 8.982099786020182e-06, "loss": 0.4505, "step": 1721 }, { "epoch": 0.8587101063829787, "grad_norm": 0.4849489629268646, "learning_rate": 8.980344211363235e-06, "loss": 0.4536, "step": 1722 }, { "epoch": 0.8592087765957447, "grad_norm": 0.5635472536087036, "learning_rate": 8.97858729596708e-06, "loss": 0.436, "step": 1723 }, { "epoch": 0.8597074468085106, "grad_norm": 0.5314710140228271, "learning_rate": 8.976829040423519e-06, "loss": 0.4148, "step": 1724 }, { "epoch": 0.8602061170212766, "grad_norm": 0.5503197312355042, "learning_rate": 8.9750694453248e-06, "loss": 0.4347, "step": 1725 }, { "epoch": 0.8607047872340425, "grad_norm": 0.6028314232826233, "learning_rate": 8.973308511263625e-06, "loss": 0.4394, "step": 1726 }, { "epoch": 0.8612034574468085, "grad_norm": 0.5820469856262207, "learning_rate": 8.971546238833152e-06, "loss": 0.4441, "step": 1727 }, { "epoch": 0.8617021276595744, "grad_norm": 0.5447351932525635, "learning_rate": 8.969782628626978e-06, "loss": 0.4725, "step": 1728 }, { "epoch": 0.8622007978723404, "grad_norm": 0.5727984309196472, "learning_rate": 8.968017681239163e-06, "loss": 0.4926, "step": 1729 }, { "epoch": 0.8626994680851063, "grad_norm": 0.552836000919342, "learning_rate": 8.966251397264209e-06, "loss": 0.4146, "step": 1730 }, { "epoch": 0.8631981382978723, "grad_norm": 0.571809709072113, "learning_rate": 8.96448377729707e-06, "loss": 0.4347, "step": 1731 }, { "epoch": 0.8636968085106383, "grad_norm": 0.5289079546928406, "learning_rate": 8.962714821933157e-06, "loss": 0.4395, "step": 1732 }, { "epoch": 0.8641954787234043, "grad_norm": 0.5437042713165283, "learning_rate": 8.960944531768316e-06, "loss": 0.4265, "step": 1733 }, { "epoch": 0.8646941489361702, "grad_norm": 0.5935264229774475, "learning_rate": 8.95917290739886e-06, "loss": 0.4513, "step": 1734 }, { "epoch": 0.8651928191489362, "grad_norm": 0.5014663338661194, "learning_rate": 8.957399949421538e-06, "loss": 0.4089, "step": 1735 }, { "epoch": 0.8656914893617021, "grad_norm": 0.5982638001441956, "learning_rate": 8.955625658433555e-06, "loss": 0.4506, "step": 1736 }, { "epoch": 0.8661901595744681, "grad_norm": 0.5891695022583008, "learning_rate": 8.953850035032563e-06, "loss": 0.4183, "step": 1737 }, { "epoch": 0.866688829787234, "grad_norm": 0.6001369953155518, "learning_rate": 8.952073079816663e-06, "loss": 0.4111, "step": 1738 }, { "epoch": 0.8671875, "grad_norm": 0.5084583759307861, "learning_rate": 8.950294793384404e-06, "loss": 0.4407, "step": 1739 }, { "epoch": 0.867686170212766, "grad_norm": 0.47071096301078796, "learning_rate": 8.948515176334784e-06, "loss": 0.4055, "step": 1740 }, { "epoch": 0.8681848404255319, "grad_norm": 0.6236487627029419, "learning_rate": 8.946734229267251e-06, "loss": 0.4362, "step": 1741 }, { "epoch": 0.8686835106382979, "grad_norm": 0.5289919376373291, "learning_rate": 8.944951952781698e-06, "loss": 0.446, "step": 1742 }, { "epoch": 0.8691821808510638, "grad_norm": 0.5293678641319275, "learning_rate": 8.943168347478465e-06, "loss": 0.451, "step": 1743 }, { "epoch": 0.8696808510638298, "grad_norm": 0.5386779308319092, "learning_rate": 8.941383413958346e-06, "loss": 0.4096, "step": 1744 }, { "epoch": 0.8701795212765957, "grad_norm": 0.5561116337776184, "learning_rate": 8.939597152822574e-06, "loss": 0.4451, "step": 1745 }, { "epoch": 0.8706781914893617, "grad_norm": 0.6148067116737366, "learning_rate": 8.937809564672836e-06, "loss": 0.4288, "step": 1746 }, { "epoch": 0.8711768617021277, "grad_norm": 0.5516778230667114, "learning_rate": 8.936020650111262e-06, "loss": 0.4167, "step": 1747 }, { "epoch": 0.8716755319148937, "grad_norm": 0.5283200740814209, "learning_rate": 8.93423040974043e-06, "loss": 0.4038, "step": 1748 }, { "epoch": 0.8721742021276596, "grad_norm": 0.5996692180633545, "learning_rate": 8.932438844163364e-06, "loss": 0.4547, "step": 1749 }, { "epoch": 0.8726728723404256, "grad_norm": 0.5371984839439392, "learning_rate": 8.930645953983537e-06, "loss": 0.4301, "step": 1750 }, { "epoch": 0.8731715425531915, "grad_norm": 0.5223242044448853, "learning_rate": 8.928851739804862e-06, "loss": 0.4448, "step": 1751 }, { "epoch": 0.8736702127659575, "grad_norm": 0.6545284390449524, "learning_rate": 8.927056202231709e-06, "loss": 0.4308, "step": 1752 }, { "epoch": 0.8741688829787234, "grad_norm": 0.4987036883831024, "learning_rate": 8.925259341868881e-06, "loss": 0.4298, "step": 1753 }, { "epoch": 0.8746675531914894, "grad_norm": 0.5690712332725525, "learning_rate": 8.923461159321634e-06, "loss": 0.4241, "step": 1754 }, { "epoch": 0.8751662234042553, "grad_norm": 0.6319894790649414, "learning_rate": 8.921661655195668e-06, "loss": 0.4321, "step": 1755 }, { "epoch": 0.8756648936170213, "grad_norm": 0.5762075185775757, "learning_rate": 8.91986083009713e-06, "loss": 0.4203, "step": 1756 }, { "epoch": 0.8761635638297872, "grad_norm": 0.5194082856178284, "learning_rate": 8.918058684632606e-06, "loss": 0.427, "step": 1757 }, { "epoch": 0.8766622340425532, "grad_norm": 0.5484507083892822, "learning_rate": 8.916255219409136e-06, "loss": 0.392, "step": 1758 }, { "epoch": 0.8771609042553191, "grad_norm": 0.5465835332870483, "learning_rate": 8.914450435034193e-06, "loss": 0.4208, "step": 1759 }, { "epoch": 0.8776595744680851, "grad_norm": 0.5102187395095825, "learning_rate": 8.912644332115705e-06, "loss": 0.4629, "step": 1760 }, { "epoch": 0.878158244680851, "grad_norm": 0.5432314276695251, "learning_rate": 8.910836911262037e-06, "loss": 0.444, "step": 1761 }, { "epoch": 0.878656914893617, "grad_norm": 0.5100592374801636, "learning_rate": 8.909028173082003e-06, "loss": 0.4445, "step": 1762 }, { "epoch": 0.879155585106383, "grad_norm": 0.5752875804901123, "learning_rate": 8.90721811818486e-06, "loss": 0.4301, "step": 1763 }, { "epoch": 0.879654255319149, "grad_norm": 0.5583136081695557, "learning_rate": 8.905406747180301e-06, "loss": 0.4221, "step": 1764 }, { "epoch": 0.8801529255319149, "grad_norm": 0.4954591989517212, "learning_rate": 8.903594060678473e-06, "loss": 0.4026, "step": 1765 }, { "epoch": 0.8806515957446809, "grad_norm": 0.49431079626083374, "learning_rate": 8.90178005928996e-06, "loss": 0.4259, "step": 1766 }, { "epoch": 0.8811502659574468, "grad_norm": 0.6123365163803101, "learning_rate": 8.89996474362579e-06, "loss": 0.4363, "step": 1767 }, { "epoch": 0.8816489361702128, "grad_norm": 0.47583699226379395, "learning_rate": 8.898148114297435e-06, "loss": 0.4163, "step": 1768 }, { "epoch": 0.8821476063829787, "grad_norm": 0.5216759443283081, "learning_rate": 8.896330171916807e-06, "loss": 0.4187, "step": 1769 }, { "epoch": 0.8826462765957447, "grad_norm": 0.5199053287506104, "learning_rate": 8.894510917096263e-06, "loss": 0.4281, "step": 1770 }, { "epoch": 0.8831449468085106, "grad_norm": 0.5101550817489624, "learning_rate": 8.892690350448601e-06, "loss": 0.4253, "step": 1771 }, { "epoch": 0.8836436170212766, "grad_norm": 0.5497869253158569, "learning_rate": 8.89086847258706e-06, "loss": 0.4209, "step": 1772 }, { "epoch": 0.8841422872340425, "grad_norm": 0.5558885335922241, "learning_rate": 8.889045284125319e-06, "loss": 0.4209, "step": 1773 }, { "epoch": 0.8846409574468085, "grad_norm": 0.5868385434150696, "learning_rate": 8.887220785677506e-06, "loss": 0.4228, "step": 1774 }, { "epoch": 0.8851396276595744, "grad_norm": 0.608061671257019, "learning_rate": 8.885394977858179e-06, "loss": 0.4462, "step": 1775 }, { "epoch": 0.8856382978723404, "grad_norm": 0.6550830602645874, "learning_rate": 8.88356786128235e-06, "loss": 0.4561, "step": 1776 }, { "epoch": 0.8861369680851063, "grad_norm": 0.5386308431625366, "learning_rate": 8.881739436565458e-06, "loss": 0.4433, "step": 1777 }, { "epoch": 0.8866356382978723, "grad_norm": 0.6009330153465271, "learning_rate": 8.879909704323395e-06, "loss": 0.4027, "step": 1778 }, { "epoch": 0.8871343085106383, "grad_norm": 0.6471377611160278, "learning_rate": 8.878078665172485e-06, "loss": 0.473, "step": 1779 }, { "epoch": 0.8876329787234043, "grad_norm": 0.5537880659103394, "learning_rate": 8.876246319729498e-06, "loss": 0.4452, "step": 1780 }, { "epoch": 0.8881316489361702, "grad_norm": 0.6331372261047363, "learning_rate": 8.874412668611639e-06, "loss": 0.4059, "step": 1781 }, { "epoch": 0.8886303191489362, "grad_norm": 0.5612801313400269, "learning_rate": 8.872577712436555e-06, "loss": 0.479, "step": 1782 }, { "epoch": 0.8891289893617021, "grad_norm": 0.5882365107536316, "learning_rate": 8.870741451822331e-06, "loss": 0.4318, "step": 1783 }, { "epoch": 0.8896276595744681, "grad_norm": 0.5334587693214417, "learning_rate": 8.868903887387498e-06, "loss": 0.4494, "step": 1784 }, { "epoch": 0.890126329787234, "grad_norm": 0.5492268800735474, "learning_rate": 8.867065019751019e-06, "loss": 0.4148, "step": 1785 }, { "epoch": 0.890625, "grad_norm": 0.6310436129570007, "learning_rate": 8.865224849532298e-06, "loss": 0.4542, "step": 1786 }, { "epoch": 0.891123670212766, "grad_norm": 0.671527624130249, "learning_rate": 8.863383377351175e-06, "loss": 0.4587, "step": 1787 }, { "epoch": 0.8916223404255319, "grad_norm": 0.5489889979362488, "learning_rate": 8.861540603827936e-06, "loss": 0.442, "step": 1788 }, { "epoch": 0.8921210106382979, "grad_norm": 0.638680636882782, "learning_rate": 8.859696529583298e-06, "loss": 0.4546, "step": 1789 }, { "epoch": 0.8926196808510638, "grad_norm": 0.48033061623573303, "learning_rate": 8.857851155238421e-06, "loss": 0.4349, "step": 1790 }, { "epoch": 0.8931183510638298, "grad_norm": 0.5354960560798645, "learning_rate": 8.8560044814149e-06, "loss": 0.424, "step": 1791 }, { "epoch": 0.8936170212765957, "grad_norm": 0.5838180780410767, "learning_rate": 8.85415650873477e-06, "loss": 0.4018, "step": 1792 }, { "epoch": 0.8941156914893617, "grad_norm": 0.5189645886421204, "learning_rate": 8.8523072378205e-06, "loss": 0.4104, "step": 1793 }, { "epoch": 0.8946143617021277, "grad_norm": 0.540162980556488, "learning_rate": 8.850456669294998e-06, "loss": 0.4407, "step": 1794 }, { "epoch": 0.8951130319148937, "grad_norm": 0.5163559317588806, "learning_rate": 8.848604803781612e-06, "loss": 0.418, "step": 1795 }, { "epoch": 0.8956117021276596, "grad_norm": 0.6342364549636841, "learning_rate": 8.846751641904121e-06, "loss": 0.4268, "step": 1796 }, { "epoch": 0.8961103723404256, "grad_norm": 0.6020134091377258, "learning_rate": 8.84489718428675e-06, "loss": 0.4518, "step": 1797 }, { "epoch": 0.8966090425531915, "grad_norm": 0.5264042019844055, "learning_rate": 8.843041431554148e-06, "loss": 0.4308, "step": 1798 }, { "epoch": 0.8971077127659575, "grad_norm": 0.5610998272895813, "learning_rate": 8.841184384331409e-06, "loss": 0.448, "step": 1799 }, { "epoch": 0.8976063829787234, "grad_norm": 0.5381839871406555, "learning_rate": 8.839326043244064e-06, "loss": 0.4288, "step": 1800 }, { "epoch": 0.8981050531914894, "grad_norm": 0.5572534799575806, "learning_rate": 8.837466408918073e-06, "loss": 0.4433, "step": 1801 }, { "epoch": 0.8986037234042553, "grad_norm": 0.4978639781475067, "learning_rate": 8.835605481979837e-06, "loss": 0.4364, "step": 1802 }, { "epoch": 0.8991023936170213, "grad_norm": 0.5196175575256348, "learning_rate": 8.833743263056188e-06, "loss": 0.4425, "step": 1803 }, { "epoch": 0.8996010638297872, "grad_norm": 0.5368587970733643, "learning_rate": 8.8318797527744e-06, "loss": 0.436, "step": 1804 }, { "epoch": 0.9000997340425532, "grad_norm": 0.5756350755691528, "learning_rate": 8.830014951762173e-06, "loss": 0.42, "step": 1805 }, { "epoch": 0.9005984042553191, "grad_norm": 0.5280420184135437, "learning_rate": 8.828148860647652e-06, "loss": 0.4357, "step": 1806 }, { "epoch": 0.9010970744680851, "grad_norm": 0.6247347593307495, "learning_rate": 8.826281480059406e-06, "loss": 0.4363, "step": 1807 }, { "epoch": 0.901595744680851, "grad_norm": 0.5336983799934387, "learning_rate": 8.824412810626449e-06, "loss": 0.4156, "step": 1808 }, { "epoch": 0.902094414893617, "grad_norm": 0.6200982928276062, "learning_rate": 8.822542852978218e-06, "loss": 0.4388, "step": 1809 }, { "epoch": 0.902593085106383, "grad_norm": 0.5381795167922974, "learning_rate": 8.820671607744594e-06, "loss": 0.4196, "step": 1810 }, { "epoch": 0.903091755319149, "grad_norm": 0.5095324516296387, "learning_rate": 8.818799075555883e-06, "loss": 0.4392, "step": 1811 }, { "epoch": 0.9035904255319149, "grad_norm": 0.5294657945632935, "learning_rate": 8.816925257042833e-06, "loss": 0.4333, "step": 1812 }, { "epoch": 0.9040890957446809, "grad_norm": 0.5345436930656433, "learning_rate": 8.815050152836617e-06, "loss": 0.405, "step": 1813 }, { "epoch": 0.9045877659574468, "grad_norm": 0.5338881611824036, "learning_rate": 8.81317376356885e-06, "loss": 0.4232, "step": 1814 }, { "epoch": 0.9050864361702128, "grad_norm": 0.5898707509040833, "learning_rate": 8.81129608987157e-06, "loss": 0.4103, "step": 1815 }, { "epoch": 0.9055851063829787, "grad_norm": 0.5700839757919312, "learning_rate": 8.809417132377259e-06, "loss": 0.4501, "step": 1816 }, { "epoch": 0.9060837765957447, "grad_norm": 0.5109901428222656, "learning_rate": 8.807536891718818e-06, "loss": 0.4642, "step": 1817 }, { "epoch": 0.9065824468085106, "grad_norm": 0.5669790506362915, "learning_rate": 8.805655368529588e-06, "loss": 0.4159, "step": 1818 }, { "epoch": 0.9070811170212766, "grad_norm": 0.501448929309845, "learning_rate": 8.803772563443347e-06, "loss": 0.4331, "step": 1819 }, { "epoch": 0.9075797872340425, "grad_norm": 0.4923454821109772, "learning_rate": 8.801888477094296e-06, "loss": 0.4123, "step": 1820 }, { "epoch": 0.9080784574468085, "grad_norm": 0.49530449509620667, "learning_rate": 8.800003110117069e-06, "loss": 0.4171, "step": 1821 }, { "epoch": 0.9085771276595744, "grad_norm": 0.47009095549583435, "learning_rate": 8.798116463146735e-06, "loss": 0.4267, "step": 1822 }, { "epoch": 0.9090757978723404, "grad_norm": 0.5101699829101562, "learning_rate": 8.796228536818791e-06, "loss": 0.4135, "step": 1823 }, { "epoch": 0.9095744680851063, "grad_norm": 0.5919820070266724, "learning_rate": 8.794339331769168e-06, "loss": 0.4373, "step": 1824 }, { "epoch": 0.9100731382978723, "grad_norm": 0.496233195066452, "learning_rate": 8.792448848634224e-06, "loss": 0.4404, "step": 1825 }, { "epoch": 0.9105718085106383, "grad_norm": 0.5590314269065857, "learning_rate": 8.790557088050751e-06, "loss": 0.4544, "step": 1826 }, { "epoch": 0.9110704787234043, "grad_norm": 0.5411926507949829, "learning_rate": 8.788664050655968e-06, "loss": 0.4362, "step": 1827 }, { "epoch": 0.9115691489361702, "grad_norm": 0.5529196262359619, "learning_rate": 8.786769737087526e-06, "loss": 0.4351, "step": 1828 }, { "epoch": 0.9120678191489362, "grad_norm": 0.5015996694564819, "learning_rate": 8.784874147983507e-06, "loss": 0.4204, "step": 1829 }, { "epoch": 0.9125664893617021, "grad_norm": 0.5192598700523376, "learning_rate": 8.78297728398242e-06, "loss": 0.3969, "step": 1830 }, { "epoch": 0.9130651595744681, "grad_norm": 0.4769732356071472, "learning_rate": 8.781079145723203e-06, "loss": 0.3978, "step": 1831 }, { "epoch": 0.913563829787234, "grad_norm": 0.5453273057937622, "learning_rate": 8.779179733845229e-06, "loss": 0.4457, "step": 1832 }, { "epoch": 0.9140625, "grad_norm": 0.5465777516365051, "learning_rate": 8.777279048988292e-06, "loss": 0.4047, "step": 1833 }, { "epoch": 0.914561170212766, "grad_norm": 0.48285913467407227, "learning_rate": 8.775377091792623e-06, "loss": 0.4334, "step": 1834 }, { "epoch": 0.9150598404255319, "grad_norm": 0.5004767179489136, "learning_rate": 8.773473862898873e-06, "loss": 0.4469, "step": 1835 }, { "epoch": 0.9155585106382979, "grad_norm": 0.527439296245575, "learning_rate": 8.771569362948128e-06, "loss": 0.4284, "step": 1836 }, { "epoch": 0.9160571808510638, "grad_norm": 0.6324864625930786, "learning_rate": 8.769663592581899e-06, "loss": 0.424, "step": 1837 }, { "epoch": 0.9165558510638298, "grad_norm": 0.49343371391296387, "learning_rate": 8.767756552442128e-06, "loss": 0.4125, "step": 1838 }, { "epoch": 0.9170545212765957, "grad_norm": 0.6080456972122192, "learning_rate": 8.765848243171178e-06, "loss": 0.4869, "step": 1839 }, { "epoch": 0.9175531914893617, "grad_norm": 0.578825831413269, "learning_rate": 8.763938665411848e-06, "loss": 0.4191, "step": 1840 }, { "epoch": 0.9180518617021277, "grad_norm": 0.5405349135398865, "learning_rate": 8.762027819807357e-06, "loss": 0.4218, "step": 1841 }, { "epoch": 0.9185505319148937, "grad_norm": 0.5694636702537537, "learning_rate": 8.760115707001357e-06, "loss": 0.3911, "step": 1842 }, { "epoch": 0.9190492021276596, "grad_norm": 0.5215682983398438, "learning_rate": 8.758202327637923e-06, "loss": 0.4083, "step": 1843 }, { "epoch": 0.9195478723404256, "grad_norm": 0.6180152893066406, "learning_rate": 8.756287682361557e-06, "loss": 0.4805, "step": 1844 }, { "epoch": 0.9200465425531915, "grad_norm": 0.6321216225624084, "learning_rate": 8.75437177181719e-06, "loss": 0.4081, "step": 1845 }, { "epoch": 0.9205452127659575, "grad_norm": 0.5485541820526123, "learning_rate": 8.752454596650175e-06, "loss": 0.4155, "step": 1846 }, { "epoch": 0.9210438829787234, "grad_norm": 0.5557497143745422, "learning_rate": 8.750536157506295e-06, "loss": 0.4167, "step": 1847 }, { "epoch": 0.9215425531914894, "grad_norm": 0.5958173274993896, "learning_rate": 8.748616455031757e-06, "loss": 0.4595, "step": 1848 }, { "epoch": 0.9220412234042553, "grad_norm": 0.5650848150253296, "learning_rate": 8.746695489873194e-06, "loss": 0.4343, "step": 1849 }, { "epoch": 0.9225398936170213, "grad_norm": 0.5259332060813904, "learning_rate": 8.74477326267766e-06, "loss": 0.4004, "step": 1850 }, { "epoch": 0.9230385638297872, "grad_norm": 0.6137081384658813, "learning_rate": 8.742849774092642e-06, "loss": 0.4132, "step": 1851 }, { "epoch": 0.9235372340425532, "grad_norm": 0.5497092604637146, "learning_rate": 8.740925024766049e-06, "loss": 0.4139, "step": 1852 }, { "epoch": 0.9240359042553191, "grad_norm": 0.509395182132721, "learning_rate": 8.738999015346212e-06, "loss": 0.4281, "step": 1853 }, { "epoch": 0.9245345744680851, "grad_norm": 0.6221457123756409, "learning_rate": 8.737071746481887e-06, "loss": 0.4308, "step": 1854 }, { "epoch": 0.925033244680851, "grad_norm": 0.646846354007721, "learning_rate": 8.735143218822254e-06, "loss": 0.4707, "step": 1855 }, { "epoch": 0.925531914893617, "grad_norm": 0.5822504162788391, "learning_rate": 8.73321343301692e-06, "loss": 0.4101, "step": 1856 }, { "epoch": 0.926030585106383, "grad_norm": 0.5247465372085571, "learning_rate": 8.731282389715918e-06, "loss": 0.4309, "step": 1857 }, { "epoch": 0.926529255319149, "grad_norm": 0.5731497406959534, "learning_rate": 8.729350089569695e-06, "loss": 0.4211, "step": 1858 }, { "epoch": 0.9270279255319149, "grad_norm": 0.5734261274337769, "learning_rate": 8.727416533229129e-06, "loss": 0.4145, "step": 1859 }, { "epoch": 0.9275265957446809, "grad_norm": 0.5419289469718933, "learning_rate": 8.725481721345519e-06, "loss": 0.4405, "step": 1860 }, { "epoch": 0.9280252659574468, "grad_norm": 0.5169686675071716, "learning_rate": 8.723545654570587e-06, "loss": 0.4158, "step": 1861 }, { "epoch": 0.9285239361702128, "grad_norm": 0.6322340965270996, "learning_rate": 8.721608333556479e-06, "loss": 0.4266, "step": 1862 }, { "epoch": 0.9290226063829787, "grad_norm": 0.5308459997177124, "learning_rate": 8.71966975895576e-06, "loss": 0.4504, "step": 1863 }, { "epoch": 0.9295212765957447, "grad_norm": 0.6408416032791138, "learning_rate": 8.71772993142142e-06, "loss": 0.4118, "step": 1864 }, { "epoch": 0.9300199468085106, "grad_norm": 0.5493698716163635, "learning_rate": 8.715788851606874e-06, "loss": 0.4199, "step": 1865 }, { "epoch": 0.9305186170212766, "grad_norm": 0.5606467723846436, "learning_rate": 8.713846520165952e-06, "loss": 0.4369, "step": 1866 }, { "epoch": 0.9310172872340425, "grad_norm": 0.6236917972564697, "learning_rate": 8.711902937752908e-06, "loss": 0.4312, "step": 1867 }, { "epoch": 0.9315159574468085, "grad_norm": 0.5043035745620728, "learning_rate": 8.70995810502242e-06, "loss": 0.4416, "step": 1868 }, { "epoch": 0.9320146276595744, "grad_norm": 0.5046939849853516, "learning_rate": 8.708012022629586e-06, "loss": 0.4266, "step": 1869 }, { "epoch": 0.9325132978723404, "grad_norm": 0.6313596367835999, "learning_rate": 8.706064691229925e-06, "loss": 0.392, "step": 1870 }, { "epoch": 0.9330119680851063, "grad_norm": 0.5866550803184509, "learning_rate": 8.704116111479374e-06, "loss": 0.4089, "step": 1871 }, { "epoch": 0.9335106382978723, "grad_norm": 0.5877976417541504, "learning_rate": 8.702166284034293e-06, "loss": 0.4511, "step": 1872 }, { "epoch": 0.9340093085106383, "grad_norm": 0.6070416569709778, "learning_rate": 8.700215209551463e-06, "loss": 0.4047, "step": 1873 }, { "epoch": 0.9345079787234043, "grad_norm": 0.5563450455665588, "learning_rate": 8.698262888688083e-06, "loss": 0.4625, "step": 1874 }, { "epoch": 0.9350066489361702, "grad_norm": 0.520146369934082, "learning_rate": 8.696309322101774e-06, "loss": 0.4552, "step": 1875 }, { "epoch": 0.9355053191489362, "grad_norm": 0.7186123132705688, "learning_rate": 8.694354510450575e-06, "loss": 0.4538, "step": 1876 }, { "epoch": 0.9360039893617021, "grad_norm": 0.49595117568969727, "learning_rate": 8.692398454392943e-06, "loss": 0.4255, "step": 1877 }, { "epoch": 0.9365026595744681, "grad_norm": 0.5981892347335815, "learning_rate": 8.69044115458776e-06, "loss": 0.4133, "step": 1878 }, { "epoch": 0.937001329787234, "grad_norm": 0.5967240333557129, "learning_rate": 8.688482611694317e-06, "loss": 0.4328, "step": 1879 }, { "epoch": 0.9375, "grad_norm": 0.572664737701416, "learning_rate": 8.686522826372334e-06, "loss": 0.4597, "step": 1880 }, { "epoch": 0.937998670212766, "grad_norm": 0.5598470568656921, "learning_rate": 8.684561799281946e-06, "loss": 0.4299, "step": 1881 }, { "epoch": 0.9384973404255319, "grad_norm": 0.5437055826187134, "learning_rate": 8.682599531083701e-06, "loss": 0.4304, "step": 1882 }, { "epoch": 0.9389960106382979, "grad_norm": 0.5144897103309631, "learning_rate": 8.680636022438572e-06, "loss": 0.4373, "step": 1883 }, { "epoch": 0.9394946808510638, "grad_norm": 0.6303250789642334, "learning_rate": 8.678671274007948e-06, "loss": 0.444, "step": 1884 }, { "epoch": 0.9399933510638298, "grad_norm": 0.542549729347229, "learning_rate": 8.676705286453634e-06, "loss": 0.4367, "step": 1885 }, { "epoch": 0.9404920212765957, "grad_norm": 0.520122766494751, "learning_rate": 8.674738060437853e-06, "loss": 0.4112, "step": 1886 }, { "epoch": 0.9409906914893617, "grad_norm": 0.5679329633712769, "learning_rate": 8.672769596623246e-06, "loss": 0.4308, "step": 1887 }, { "epoch": 0.9414893617021277, "grad_norm": 0.6151743531227112, "learning_rate": 8.67079989567287e-06, "loss": 0.4099, "step": 1888 }, { "epoch": 0.9419880319148937, "grad_norm": 0.4932071566581726, "learning_rate": 8.6688289582502e-06, "loss": 0.4151, "step": 1889 }, { "epoch": 0.9424867021276596, "grad_norm": 0.5644400119781494, "learning_rate": 8.666856785019124e-06, "loss": 0.4343, "step": 1890 }, { "epoch": 0.9429853723404256, "grad_norm": 0.6330550909042358, "learning_rate": 8.664883376643953e-06, "loss": 0.4569, "step": 1891 }, { "epoch": 0.9434840425531915, "grad_norm": 0.4787265360355377, "learning_rate": 8.662908733789406e-06, "loss": 0.414, "step": 1892 }, { "epoch": 0.9439827127659575, "grad_norm": 0.6214877367019653, "learning_rate": 8.660932857120625e-06, "loss": 0.4176, "step": 1893 }, { "epoch": 0.9444813829787234, "grad_norm": 0.5425284504890442, "learning_rate": 8.65895574730316e-06, "loss": 0.4435, "step": 1894 }, { "epoch": 0.9449800531914894, "grad_norm": 0.5048279762268066, "learning_rate": 8.656977405002987e-06, "loss": 0.4269, "step": 1895 }, { "epoch": 0.9454787234042553, "grad_norm": 0.5663760304450989, "learning_rate": 8.654997830886484e-06, "loss": 0.4548, "step": 1896 }, { "epoch": 0.9459773936170213, "grad_norm": 0.5127114057540894, "learning_rate": 8.653017025620454e-06, "loss": 0.4164, "step": 1897 }, { "epoch": 0.9464760638297872, "grad_norm": 0.5135595798492432, "learning_rate": 8.651034989872113e-06, "loss": 0.413, "step": 1898 }, { "epoch": 0.9469747340425532, "grad_norm": 0.5281567573547363, "learning_rate": 8.649051724309089e-06, "loss": 0.4345, "step": 1899 }, { "epoch": 0.9474734042553191, "grad_norm": 0.4973181188106537, "learning_rate": 8.647067229599424e-06, "loss": 0.4371, "step": 1900 }, { "epoch": 0.9479720744680851, "grad_norm": 0.4592910408973694, "learning_rate": 8.645081506411574e-06, "loss": 0.4144, "step": 1901 }, { "epoch": 0.948470744680851, "grad_norm": 0.5139361619949341, "learning_rate": 8.643094555414415e-06, "loss": 0.4125, "step": 1902 }, { "epoch": 0.948969414893617, "grad_norm": 0.45974940061569214, "learning_rate": 8.641106377277227e-06, "loss": 0.4445, "step": 1903 }, { "epoch": 0.949468085106383, "grad_norm": 0.5055744647979736, "learning_rate": 8.63911697266971e-06, "loss": 0.4381, "step": 1904 }, { "epoch": 0.949966755319149, "grad_norm": 0.5336101055145264, "learning_rate": 8.637126342261974e-06, "loss": 0.4059, "step": 1905 }, { "epoch": 0.9504654255319149, "grad_norm": 0.5619093179702759, "learning_rate": 8.635134486724543e-06, "loss": 0.4485, "step": 1906 }, { "epoch": 0.9509640957446809, "grad_norm": 0.554803729057312, "learning_rate": 8.633141406728354e-06, "loss": 0.4172, "step": 1907 }, { "epoch": 0.9514627659574468, "grad_norm": 0.4925840497016907, "learning_rate": 8.631147102944757e-06, "loss": 0.3939, "step": 1908 }, { "epoch": 0.9519614361702128, "grad_norm": 0.5849730372428894, "learning_rate": 8.629151576045513e-06, "loss": 0.4492, "step": 1909 }, { "epoch": 0.9524601063829787, "grad_norm": 0.5740758776664734, "learning_rate": 8.627154826702793e-06, "loss": 0.4715, "step": 1910 }, { "epoch": 0.9529587765957447, "grad_norm": 0.48875126242637634, "learning_rate": 8.625156855589186e-06, "loss": 0.4225, "step": 1911 }, { "epoch": 0.9534574468085106, "grad_norm": 0.4681142568588257, "learning_rate": 8.623157663377686e-06, "loss": 0.4229, "step": 1912 }, { "epoch": 0.9539561170212766, "grad_norm": 0.5386026501655579, "learning_rate": 8.621157250741702e-06, "loss": 0.4475, "step": 1913 }, { "epoch": 0.9544547872340425, "grad_norm": 0.5343765020370483, "learning_rate": 8.61915561835505e-06, "loss": 0.4539, "step": 1914 }, { "epoch": 0.9549534574468085, "grad_norm": 0.484599769115448, "learning_rate": 8.617152766891965e-06, "loss": 0.4257, "step": 1915 }, { "epoch": 0.9554521276595744, "grad_norm": 0.49040400981903076, "learning_rate": 8.615148697027085e-06, "loss": 0.4279, "step": 1916 }, { "epoch": 0.9559507978723404, "grad_norm": 0.5564659833908081, "learning_rate": 8.61314340943546e-06, "loss": 0.4283, "step": 1917 }, { "epoch": 0.9564494680851063, "grad_norm": 0.5700339078903198, "learning_rate": 8.611136904792552e-06, "loss": 0.4536, "step": 1918 }, { "epoch": 0.9569481382978723, "grad_norm": 0.4975261092185974, "learning_rate": 8.609129183774235e-06, "loss": 0.4251, "step": 1919 }, { "epoch": 0.9574468085106383, "grad_norm": 0.4780798554420471, "learning_rate": 8.607120247056782e-06, "loss": 0.4368, "step": 1920 }, { "epoch": 0.9579454787234043, "grad_norm": 0.5367822647094727, "learning_rate": 8.605110095316891e-06, "loss": 0.4468, "step": 1921 }, { "epoch": 0.9584441489361702, "grad_norm": 0.5158174633979797, "learning_rate": 8.603098729231657e-06, "loss": 0.3782, "step": 1922 }, { "epoch": 0.9589428191489362, "grad_norm": 0.5037357807159424, "learning_rate": 8.60108614947859e-06, "loss": 0.4341, "step": 1923 }, { "epoch": 0.9594414893617021, "grad_norm": 0.5375674366950989, "learning_rate": 8.59907235673561e-06, "loss": 0.4145, "step": 1924 }, { "epoch": 0.9599401595744681, "grad_norm": 0.4969749450683594, "learning_rate": 8.59705735168104e-06, "loss": 0.4143, "step": 1925 }, { "epoch": 0.960438829787234, "grad_norm": 0.5093451738357544, "learning_rate": 8.595041134993613e-06, "loss": 0.4275, "step": 1926 }, { "epoch": 0.9609375, "grad_norm": 0.5785737633705139, "learning_rate": 8.593023707352474e-06, "loss": 0.449, "step": 1927 }, { "epoch": 0.961436170212766, "grad_norm": 0.5192000269889832, "learning_rate": 8.591005069437174e-06, "loss": 0.424, "step": 1928 }, { "epoch": 0.9619348404255319, "grad_norm": 0.5007727146148682, "learning_rate": 8.58898522192767e-06, "loss": 0.4354, "step": 1929 }, { "epoch": 0.9624335106382979, "grad_norm": 0.642981767654419, "learning_rate": 8.586964165504326e-06, "loss": 0.4228, "step": 1930 }, { "epoch": 0.9629321808510638, "grad_norm": 0.5628847479820251, "learning_rate": 8.584941900847917e-06, "loss": 0.4179, "step": 1931 }, { "epoch": 0.9634308510638298, "grad_norm": 0.5080211162567139, "learning_rate": 8.58291842863962e-06, "loss": 0.4277, "step": 1932 }, { "epoch": 0.9639295212765957, "grad_norm": 0.5581888556480408, "learning_rate": 8.580893749561024e-06, "loss": 0.4432, "step": 1933 }, { "epoch": 0.9644281914893617, "grad_norm": 0.4934799075126648, "learning_rate": 8.578867864294122e-06, "loss": 0.409, "step": 1934 }, { "epoch": 0.9649268617021277, "grad_norm": 0.48020532727241516, "learning_rate": 8.57684077352131e-06, "loss": 0.4313, "step": 1935 }, { "epoch": 0.9654255319148937, "grad_norm": 0.5317588448524475, "learning_rate": 8.574812477925395e-06, "loss": 0.4562, "step": 1936 }, { "epoch": 0.9659242021276596, "grad_norm": 0.5381292104721069, "learning_rate": 8.572782978189592e-06, "loss": 0.4242, "step": 1937 }, { "epoch": 0.9664228723404256, "grad_norm": 0.5465403199195862, "learning_rate": 8.570752274997511e-06, "loss": 0.4497, "step": 1938 }, { "epoch": 0.9669215425531915, "grad_norm": 0.5692149996757507, "learning_rate": 8.56872036903318e-06, "loss": 0.4369, "step": 1939 }, { "epoch": 0.9674202127659575, "grad_norm": 0.5115702748298645, "learning_rate": 8.566687260981022e-06, "loss": 0.4295, "step": 1940 }, { "epoch": 0.9679188829787234, "grad_norm": 0.531503438949585, "learning_rate": 8.56465295152587e-06, "loss": 0.4214, "step": 1941 }, { "epoch": 0.9684175531914894, "grad_norm": 0.6021542549133301, "learning_rate": 8.562617441352963e-06, "loss": 0.4269, "step": 1942 }, { "epoch": 0.9689162234042553, "grad_norm": 0.5488909482955933, "learning_rate": 8.56058073114794e-06, "loss": 0.4345, "step": 1943 }, { "epoch": 0.9694148936170213, "grad_norm": 0.5245727896690369, "learning_rate": 8.558542821596849e-06, "loss": 0.4308, "step": 1944 }, { "epoch": 0.9699135638297872, "grad_norm": 0.5206015706062317, "learning_rate": 8.556503713386136e-06, "loss": 0.4443, "step": 1945 }, { "epoch": 0.9704122340425532, "grad_norm": 0.5026476979255676, "learning_rate": 8.554463407202658e-06, "loss": 0.4194, "step": 1946 }, { "epoch": 0.9709109042553191, "grad_norm": 0.5136427283287048, "learning_rate": 8.552421903733667e-06, "loss": 0.4054, "step": 1947 }, { "epoch": 0.9714095744680851, "grad_norm": 0.5431079268455505, "learning_rate": 8.550379203666827e-06, "loss": 0.4496, "step": 1948 }, { "epoch": 0.971908244680851, "grad_norm": 0.5174503326416016, "learning_rate": 8.548335307690202e-06, "loss": 0.4304, "step": 1949 }, { "epoch": 0.972406914893617, "grad_norm": 0.5057862997055054, "learning_rate": 8.546290216492254e-06, "loss": 0.4297, "step": 1950 }, { "epoch": 0.972905585106383, "grad_norm": 0.4988548457622528, "learning_rate": 8.544243930761856e-06, "loss": 0.4049, "step": 1951 }, { "epoch": 0.973404255319149, "grad_norm": 0.5542688369750977, "learning_rate": 8.542196451188274e-06, "loss": 0.476, "step": 1952 }, { "epoch": 0.9739029255319149, "grad_norm": 0.47411736845970154, "learning_rate": 8.540147778461183e-06, "loss": 0.4184, "step": 1953 }, { "epoch": 0.9744015957446809, "grad_norm": 0.5046451091766357, "learning_rate": 8.53809791327066e-06, "loss": 0.4475, "step": 1954 }, { "epoch": 0.9749002659574468, "grad_norm": 0.46030089259147644, "learning_rate": 8.536046856307179e-06, "loss": 0.4614, "step": 1955 }, { "epoch": 0.9753989361702128, "grad_norm": 0.468783438205719, "learning_rate": 8.53399460826162e-06, "loss": 0.4539, "step": 1956 }, { "epoch": 0.9758976063829787, "grad_norm": 0.44517379999160767, "learning_rate": 8.531941169825262e-06, "loss": 0.4366, "step": 1957 }, { "epoch": 0.9763962765957447, "grad_norm": 0.5146215558052063, "learning_rate": 8.529886541689783e-06, "loss": 0.4063, "step": 1958 }, { "epoch": 0.9768949468085106, "grad_norm": 0.5454283356666565, "learning_rate": 8.527830724547269e-06, "loss": 0.4469, "step": 1959 }, { "epoch": 0.9773936170212766, "grad_norm": 0.4991472661495209, "learning_rate": 8.525773719090193e-06, "loss": 0.4257, "step": 1960 }, { "epoch": 0.9778922872340425, "grad_norm": 0.49590903520584106, "learning_rate": 8.523715526011445e-06, "loss": 0.421, "step": 1961 }, { "epoch": 0.9783909574468085, "grad_norm": 0.4924129843711853, "learning_rate": 8.521656146004304e-06, "loss": 0.4227, "step": 1962 }, { "epoch": 0.9788896276595744, "grad_norm": 0.5180701017379761, "learning_rate": 8.51959557976245e-06, "loss": 0.4266, "step": 1963 }, { "epoch": 0.9793882978723404, "grad_norm": 0.5519359111785889, "learning_rate": 8.517533827979966e-06, "loss": 0.4389, "step": 1964 }, { "epoch": 0.9798869680851063, "grad_norm": 0.5218471884727478, "learning_rate": 8.515470891351332e-06, "loss": 0.3996, "step": 1965 }, { "epoch": 0.9803856382978723, "grad_norm": 0.49602264165878296, "learning_rate": 8.513406770571426e-06, "loss": 0.4268, "step": 1966 }, { "epoch": 0.9808843085106383, "grad_norm": 0.5719467997550964, "learning_rate": 8.51134146633553e-06, "loss": 0.4537, "step": 1967 }, { "epoch": 0.9813829787234043, "grad_norm": 0.5383333563804626, "learning_rate": 8.509274979339318e-06, "loss": 0.4302, "step": 1968 }, { "epoch": 0.9818816489361702, "grad_norm": 0.49543944001197815, "learning_rate": 8.507207310278868e-06, "loss": 0.433, "step": 1969 }, { "epoch": 0.9823803191489362, "grad_norm": 0.5699578523635864, "learning_rate": 8.50513845985065e-06, "loss": 0.456, "step": 1970 }, { "epoch": 0.9828789893617021, "grad_norm": 0.5772659778594971, "learning_rate": 8.50306842875154e-06, "loss": 0.4443, "step": 1971 }, { "epoch": 0.9833776595744681, "grad_norm": 0.41696450114250183, "learning_rate": 8.500997217678803e-06, "loss": 0.3892, "step": 1972 }, { "epoch": 0.983876329787234, "grad_norm": 0.5189298987388611, "learning_rate": 8.49892482733011e-06, "loss": 0.4116, "step": 1973 }, { "epoch": 0.984375, "grad_norm": 0.559877872467041, "learning_rate": 8.496851258403522e-06, "loss": 0.4272, "step": 1974 }, { "epoch": 0.984873670212766, "grad_norm": 0.5300582051277161, "learning_rate": 8.494776511597502e-06, "loss": 0.4167, "step": 1975 }, { "epoch": 0.9853723404255319, "grad_norm": 0.5383410453796387, "learning_rate": 8.492700587610905e-06, "loss": 0.4234, "step": 1976 }, { "epoch": 0.9858710106382979, "grad_norm": 0.6280267238616943, "learning_rate": 8.49062348714299e-06, "loss": 0.4559, "step": 1977 }, { "epoch": 0.9863696808510638, "grad_norm": 0.5149670243263245, "learning_rate": 8.488545210893403e-06, "loss": 0.4112, "step": 1978 }, { "epoch": 0.9868683510638298, "grad_norm": 0.4940004050731659, "learning_rate": 8.486465759562191e-06, "loss": 0.397, "step": 1979 }, { "epoch": 0.9873670212765957, "grad_norm": 0.5150808095932007, "learning_rate": 8.484385133849797e-06, "loss": 0.4149, "step": 1980 }, { "epoch": 0.9878656914893617, "grad_norm": 0.5524197220802307, "learning_rate": 8.482303334457061e-06, "loss": 0.4057, "step": 1981 }, { "epoch": 0.9883643617021277, "grad_norm": 0.5205764770507812, "learning_rate": 8.480220362085215e-06, "loss": 0.4384, "step": 1982 }, { "epoch": 0.9888630319148937, "grad_norm": 0.5085683465003967, "learning_rate": 8.478136217435888e-06, "loss": 0.4088, "step": 1983 }, { "epoch": 0.9893617021276596, "grad_norm": 0.48622074723243713, "learning_rate": 8.4760509012111e-06, "loss": 0.3906, "step": 1984 }, { "epoch": 0.9898603723404256, "grad_norm": 0.5649114847183228, "learning_rate": 8.473964414113273e-06, "loss": 0.4291, "step": 1985 }, { "epoch": 0.9903590425531915, "grad_norm": 0.5517509579658508, "learning_rate": 8.47187675684522e-06, "loss": 0.4162, "step": 1986 }, { "epoch": 0.9908577127659575, "grad_norm": 0.5134198069572449, "learning_rate": 8.469787930110145e-06, "loss": 0.4023, "step": 1987 }, { "epoch": 0.9913563829787234, "grad_norm": 0.5099616646766663, "learning_rate": 8.467697934611646e-06, "loss": 0.4371, "step": 1988 }, { "epoch": 0.9918550531914894, "grad_norm": 0.5654440522193909, "learning_rate": 8.465606771053721e-06, "loss": 0.4494, "step": 1989 }, { "epoch": 0.9923537234042553, "grad_norm": 0.5520712733268738, "learning_rate": 8.463514440140757e-06, "loss": 0.4263, "step": 1990 }, { "epoch": 0.9928523936170213, "grad_norm": 0.5150190591812134, "learning_rate": 8.461420942577537e-06, "loss": 0.4092, "step": 1991 }, { "epoch": 0.9933510638297872, "grad_norm": 0.5533103346824646, "learning_rate": 8.45932627906923e-06, "loss": 0.4144, "step": 1992 }, { "epoch": 0.9938497340425532, "grad_norm": 0.5417529344558716, "learning_rate": 8.457230450321404e-06, "loss": 0.4321, "step": 1993 }, { "epoch": 0.9943484042553191, "grad_norm": 0.5797672271728516, "learning_rate": 8.455133457040018e-06, "loss": 0.4307, "step": 1994 }, { "epoch": 0.9948470744680851, "grad_norm": 0.4860178530216217, "learning_rate": 8.453035299931425e-06, "loss": 0.4286, "step": 1995 }, { "epoch": 0.995345744680851, "grad_norm": 0.5338885188102722, "learning_rate": 8.450935979702367e-06, "loss": 0.4341, "step": 1996 }, { "epoch": 0.995844414893617, "grad_norm": 0.5024545788764954, "learning_rate": 8.448835497059979e-06, "loss": 0.4139, "step": 1997 }, { "epoch": 0.996343085106383, "grad_norm": 0.5273353457450867, "learning_rate": 8.446733852711788e-06, "loss": 0.4082, "step": 1998 }, { "epoch": 0.996841755319149, "grad_norm": 0.469568133354187, "learning_rate": 8.444631047365712e-06, "loss": 0.437, "step": 1999 }, { "epoch": 0.9973404255319149, "grad_norm": 0.5140546560287476, "learning_rate": 8.442527081730058e-06, "loss": 0.4108, "step": 2000 }, { "epoch": 0.9978390957446809, "grad_norm": 0.5025549530982971, "learning_rate": 8.44042195651353e-06, "loss": 0.4306, "step": 2001 }, { "epoch": 0.9983377659574468, "grad_norm": 0.5385447144508362, "learning_rate": 8.438315672425214e-06, "loss": 0.4325, "step": 2002 }, { "epoch": 0.9988364361702128, "grad_norm": 0.4593009352684021, "learning_rate": 8.436208230174594e-06, "loss": 0.4088, "step": 2003 }, { "epoch": 0.9993351063829787, "grad_norm": 0.4809059202671051, "learning_rate": 8.434099630471539e-06, "loss": 0.4281, "step": 2004 }, { "epoch": 0.9998337765957447, "grad_norm": 0.4970215857028961, "learning_rate": 8.43198987402631e-06, "loss": 0.437, "step": 2005 }, { "epoch": 1.0003324468085106, "grad_norm": 1.188124179840088, "learning_rate": 8.429878961549557e-06, "loss": 0.6971, "step": 2006 }, { "epoch": 1.0008311170212767, "grad_norm": 0.6007209420204163, "learning_rate": 8.427766893752321e-06, "loss": 0.4154, "step": 2007 }, { "epoch": 1.0013297872340425, "grad_norm": 0.4965974688529968, "learning_rate": 8.425653671346031e-06, "loss": 0.3922, "step": 2008 }, { "epoch": 1.0018284574468086, "grad_norm": 0.4982292950153351, "learning_rate": 8.423539295042504e-06, "loss": 0.4263, "step": 2009 }, { "epoch": 1.0023271276595744, "grad_norm": 0.5235840678215027, "learning_rate": 8.421423765553948e-06, "loss": 0.3629, "step": 2010 }, { "epoch": 1.0028257978723405, "grad_norm": 0.574918806552887, "learning_rate": 8.419307083592956e-06, "loss": 0.4626, "step": 2011 }, { "epoch": 1.0033244680851063, "grad_norm": 0.4918566346168518, "learning_rate": 8.417189249872513e-06, "loss": 0.3624, "step": 2012 }, { "epoch": 1.0038231382978724, "grad_norm": 0.47372564673423767, "learning_rate": 8.415070265105986e-06, "loss": 0.4014, "step": 2013 }, { "epoch": 1.0043218085106382, "grad_norm": 0.4483410120010376, "learning_rate": 8.41295013000714e-06, "loss": 0.3637, "step": 2014 }, { "epoch": 1.0048204787234043, "grad_norm": 0.5668216943740845, "learning_rate": 8.410828845290118e-06, "loss": 0.4124, "step": 2015 }, { "epoch": 1.0053191489361701, "grad_norm": 0.5006892085075378, "learning_rate": 8.408706411669453e-06, "loss": 0.3904, "step": 2016 }, { "epoch": 1.0058178191489362, "grad_norm": 0.5189214944839478, "learning_rate": 8.406582829860067e-06, "loss": 0.4101, "step": 2017 }, { "epoch": 1.006316489361702, "grad_norm": 0.5083330273628235, "learning_rate": 8.404458100577265e-06, "loss": 0.4171, "step": 2018 }, { "epoch": 1.006815159574468, "grad_norm": 0.4016234874725342, "learning_rate": 8.402332224536747e-06, "loss": 0.3249, "step": 2019 }, { "epoch": 1.007313829787234, "grad_norm": 0.6799015402793884, "learning_rate": 8.400205202454586e-06, "loss": 0.4674, "step": 2020 }, { "epoch": 1.0078125, "grad_norm": 0.46074816584587097, "learning_rate": 8.398077035047253e-06, "loss": 0.3268, "step": 2021 }, { "epoch": 1.008311170212766, "grad_norm": 0.47981634736061096, "learning_rate": 8.395947723031595e-06, "loss": 0.3602, "step": 2022 }, { "epoch": 1.008809840425532, "grad_norm": 0.5231695175170898, "learning_rate": 8.393817267124854e-06, "loss": 0.4712, "step": 2023 }, { "epoch": 1.009308510638298, "grad_norm": 0.5888950228691101, "learning_rate": 8.391685668044652e-06, "loss": 0.3961, "step": 2024 }, { "epoch": 1.0098071808510638, "grad_norm": 0.49319976568222046, "learning_rate": 8.389552926508994e-06, "loss": 0.3219, "step": 2025 }, { "epoch": 1.0103058510638299, "grad_norm": 0.5250237584114075, "learning_rate": 8.387419043236276e-06, "loss": 0.4613, "step": 2026 }, { "epoch": 1.0108045212765957, "grad_norm": 0.4913465678691864, "learning_rate": 8.385284018945273e-06, "loss": 0.3224, "step": 2027 }, { "epoch": 1.0113031914893618, "grad_norm": 0.5608167052268982, "learning_rate": 8.383147854355149e-06, "loss": 0.3981, "step": 2028 }, { "epoch": 1.0118018617021276, "grad_norm": 0.5233553051948547, "learning_rate": 8.381010550185449e-06, "loss": 0.4336, "step": 2029 }, { "epoch": 1.0123005319148937, "grad_norm": 0.49837619066238403, "learning_rate": 8.378872107156099e-06, "loss": 0.4358, "step": 2030 }, { "epoch": 1.0127992021276595, "grad_norm": 0.5409919619560242, "learning_rate": 8.37673252598742e-06, "loss": 0.363, "step": 2031 }, { "epoch": 1.0132978723404256, "grad_norm": 0.5928682684898376, "learning_rate": 8.3745918074001e-06, "loss": 0.411, "step": 2032 }, { "epoch": 1.0137965425531914, "grad_norm": 0.575761079788208, "learning_rate": 8.372449952115223e-06, "loss": 0.3982, "step": 2033 }, { "epoch": 1.0142952127659575, "grad_norm": 0.4723951518535614, "learning_rate": 8.370306960854251e-06, "loss": 0.3902, "step": 2034 }, { "epoch": 1.0147938829787233, "grad_norm": 0.5252487063407898, "learning_rate": 8.368162834339029e-06, "loss": 0.4005, "step": 2035 }, { "epoch": 1.0152925531914894, "grad_norm": 0.4748563766479492, "learning_rate": 8.366017573291784e-06, "loss": 0.3626, "step": 2036 }, { "epoch": 1.0157912234042554, "grad_norm": 0.5586925745010376, "learning_rate": 8.363871178435128e-06, "loss": 0.3784, "step": 2037 }, { "epoch": 1.0162898936170213, "grad_norm": 0.4558405578136444, "learning_rate": 8.361723650492052e-06, "loss": 0.3578, "step": 2038 }, { "epoch": 1.0167885638297873, "grad_norm": 0.5931308269500732, "learning_rate": 8.359574990185926e-06, "loss": 0.4428, "step": 2039 }, { "epoch": 1.0172872340425532, "grad_norm": 0.5015847086906433, "learning_rate": 8.357425198240508e-06, "loss": 0.4041, "step": 2040 }, { "epoch": 1.0177859042553192, "grad_norm": 0.4901617467403412, "learning_rate": 8.355274275379933e-06, "loss": 0.3806, "step": 2041 }, { "epoch": 1.018284574468085, "grad_norm": 0.484463095664978, "learning_rate": 8.35312222232872e-06, "loss": 0.3495, "step": 2042 }, { "epoch": 1.0187832446808511, "grad_norm": 0.5147261619567871, "learning_rate": 8.350969039811765e-06, "loss": 0.3553, "step": 2043 }, { "epoch": 1.019281914893617, "grad_norm": 0.5648505687713623, "learning_rate": 8.348814728554345e-06, "loss": 0.4357, "step": 2044 }, { "epoch": 1.019780585106383, "grad_norm": 0.5774952173233032, "learning_rate": 8.346659289282118e-06, "loss": 0.3575, "step": 2045 }, { "epoch": 1.0202792553191489, "grad_norm": 0.5629116892814636, "learning_rate": 8.344502722721125e-06, "loss": 0.4478, "step": 2046 }, { "epoch": 1.020777925531915, "grad_norm": 0.5050501227378845, "learning_rate": 8.342345029597779e-06, "loss": 0.3919, "step": 2047 }, { "epoch": 1.0212765957446808, "grad_norm": 0.5923069715499878, "learning_rate": 8.340186210638883e-06, "loss": 0.4005, "step": 2048 }, { "epoch": 1.0217752659574468, "grad_norm": 0.5539566278457642, "learning_rate": 8.33802626657161e-06, "loss": 0.3627, "step": 2049 }, { "epoch": 1.0222739361702127, "grad_norm": 0.5116119384765625, "learning_rate": 8.335865198123519e-06, "loss": 0.4321, "step": 2050 }, { "epoch": 1.0227726063829787, "grad_norm": 0.49027442932128906, "learning_rate": 8.33370300602254e-06, "loss": 0.3217, "step": 2051 }, { "epoch": 1.0232712765957446, "grad_norm": 0.5972253084182739, "learning_rate": 8.331539690996988e-06, "loss": 0.3889, "step": 2052 }, { "epoch": 1.0237699468085106, "grad_norm": 0.5827274918556213, "learning_rate": 8.329375253775554e-06, "loss": 0.462, "step": 2053 }, { "epoch": 1.0242686170212767, "grad_norm": 0.43511003255844116, "learning_rate": 8.327209695087307e-06, "loss": 0.3332, "step": 2054 }, { "epoch": 1.0247672872340425, "grad_norm": 0.5455071926116943, "learning_rate": 8.325043015661693e-06, "loss": 0.3925, "step": 2055 }, { "epoch": 1.0252659574468086, "grad_norm": 0.5292108654975891, "learning_rate": 8.322875216228539e-06, "loss": 0.4327, "step": 2056 }, { "epoch": 1.0257646276595744, "grad_norm": 0.4999396502971649, "learning_rate": 8.320706297518044e-06, "loss": 0.4282, "step": 2057 }, { "epoch": 1.0262632978723405, "grad_norm": 0.530728816986084, "learning_rate": 8.318536260260787e-06, "loss": 0.3914, "step": 2058 }, { "epoch": 1.0267619680851063, "grad_norm": 0.5314630270004272, "learning_rate": 8.316365105187724e-06, "loss": 0.4298, "step": 2059 }, { "epoch": 1.0272606382978724, "grad_norm": 0.48941054940223694, "learning_rate": 8.314192833030189e-06, "loss": 0.3577, "step": 2060 }, { "epoch": 1.0277593085106382, "grad_norm": 0.49896833300590515, "learning_rate": 8.312019444519883e-06, "loss": 0.4085, "step": 2061 }, { "epoch": 1.0282579787234043, "grad_norm": 0.4678173363208771, "learning_rate": 8.309844940388899e-06, "loss": 0.3546, "step": 2062 }, { "epoch": 1.0287566489361701, "grad_norm": 0.5038262605667114, "learning_rate": 8.307669321369691e-06, "loss": 0.4209, "step": 2063 }, { "epoch": 1.0292553191489362, "grad_norm": 0.47053787112236023, "learning_rate": 8.305492588195098e-06, "loss": 0.3979, "step": 2064 }, { "epoch": 1.029753989361702, "grad_norm": 0.5489303469657898, "learning_rate": 8.303314741598329e-06, "loss": 0.3903, "step": 2065 }, { "epoch": 1.030252659574468, "grad_norm": 0.4888016879558563, "learning_rate": 8.30113578231297e-06, "loss": 0.3952, "step": 2066 }, { "epoch": 1.030751329787234, "grad_norm": 0.5160549283027649, "learning_rate": 8.298955711072982e-06, "loss": 0.387, "step": 2067 }, { "epoch": 1.03125, "grad_norm": 0.502030074596405, "learning_rate": 8.2967745286127e-06, "loss": 0.3908, "step": 2068 }, { "epoch": 1.031748670212766, "grad_norm": 0.4977430999279022, "learning_rate": 8.294592235666834e-06, "loss": 0.411, "step": 2069 }, { "epoch": 1.032247340425532, "grad_norm": 0.44176772236824036, "learning_rate": 8.29240883297047e-06, "loss": 0.3999, "step": 2070 }, { "epoch": 1.032746010638298, "grad_norm": 0.49290964007377625, "learning_rate": 8.29022432125906e-06, "loss": 0.389, "step": 2071 }, { "epoch": 1.0332446808510638, "grad_norm": 0.5626875758171082, "learning_rate": 8.288038701268436e-06, "loss": 0.4237, "step": 2072 }, { "epoch": 1.0337433510638299, "grad_norm": 0.4954549968242645, "learning_rate": 8.285851973734806e-06, "loss": 0.4045, "step": 2073 }, { "epoch": 1.0342420212765957, "grad_norm": 0.4594782888889313, "learning_rate": 8.283664139394747e-06, "loss": 0.3243, "step": 2074 }, { "epoch": 1.0347406914893618, "grad_norm": 0.5772703886032104, "learning_rate": 8.281475198985205e-06, "loss": 0.4614, "step": 2075 }, { "epoch": 1.0352393617021276, "grad_norm": 0.5075179934501648, "learning_rate": 8.279285153243508e-06, "loss": 0.3915, "step": 2076 }, { "epoch": 1.0357380319148937, "grad_norm": 0.5073160529136658, "learning_rate": 8.277094002907347e-06, "loss": 0.3955, "step": 2077 }, { "epoch": 1.0362367021276595, "grad_norm": 0.5080820918083191, "learning_rate": 8.27490174871479e-06, "loss": 0.3955, "step": 2078 }, { "epoch": 1.0367353723404256, "grad_norm": 0.5407707691192627, "learning_rate": 8.272708391404275e-06, "loss": 0.446, "step": 2079 }, { "epoch": 1.0372340425531914, "grad_norm": 0.47837740182876587, "learning_rate": 8.270513931714614e-06, "loss": 0.3318, "step": 2080 }, { "epoch": 1.0377327127659575, "grad_norm": 0.5119799971580505, "learning_rate": 8.268318370384991e-06, "loss": 0.3441, "step": 2081 }, { "epoch": 1.0382313829787233, "grad_norm": 0.5309589505195618, "learning_rate": 8.266121708154954e-06, "loss": 0.3742, "step": 2082 }, { "epoch": 1.0387300531914894, "grad_norm": 0.492872416973114, "learning_rate": 8.26392394576443e-06, "loss": 0.433, "step": 2083 }, { "epoch": 1.0392287234042552, "grad_norm": 0.4819408059120178, "learning_rate": 8.261725083953716e-06, "loss": 0.408, "step": 2084 }, { "epoch": 1.0397273936170213, "grad_norm": 0.5506460666656494, "learning_rate": 8.25952512346347e-06, "loss": 0.4043, "step": 2085 }, { "epoch": 1.0402260638297873, "grad_norm": 0.5866713523864746, "learning_rate": 8.25732406503473e-06, "loss": 0.4409, "step": 2086 }, { "epoch": 1.0407247340425532, "grad_norm": 0.5357630848884583, "learning_rate": 8.255121909408901e-06, "loss": 0.3866, "step": 2087 }, { "epoch": 1.0412234042553192, "grad_norm": 0.533028781414032, "learning_rate": 8.252918657327758e-06, "loss": 0.385, "step": 2088 }, { "epoch": 1.041722074468085, "grad_norm": 0.5673099160194397, "learning_rate": 8.25071430953344e-06, "loss": 0.4217, "step": 2089 }, { "epoch": 1.0422207446808511, "grad_norm": 0.5550149083137512, "learning_rate": 8.248508866768465e-06, "loss": 0.4015, "step": 2090 }, { "epoch": 1.042719414893617, "grad_norm": 0.42703986167907715, "learning_rate": 8.246302329775713e-06, "loss": 0.3561, "step": 2091 }, { "epoch": 1.043218085106383, "grad_norm": 0.5929997563362122, "learning_rate": 8.24409469929843e-06, "loss": 0.4794, "step": 2092 }, { "epoch": 1.0437167553191489, "grad_norm": 0.49752357602119446, "learning_rate": 8.24188597608024e-06, "loss": 0.3001, "step": 2093 }, { "epoch": 1.044215425531915, "grad_norm": 0.5945253372192383, "learning_rate": 8.239676160865127e-06, "loss": 0.4926, "step": 2094 }, { "epoch": 1.0447140957446808, "grad_norm": 0.5236752033233643, "learning_rate": 8.237465254397443e-06, "loss": 0.3812, "step": 2095 }, { "epoch": 1.0452127659574468, "grad_norm": 0.5149918794631958, "learning_rate": 8.235253257421912e-06, "loss": 0.3672, "step": 2096 }, { "epoch": 1.0457114361702127, "grad_norm": 0.5490847826004028, "learning_rate": 8.233040170683622e-06, "loss": 0.4633, "step": 2097 }, { "epoch": 1.0462101063829787, "grad_norm": 0.5277806520462036, "learning_rate": 8.230825994928031e-06, "loss": 0.3921, "step": 2098 }, { "epoch": 1.0467087765957448, "grad_norm": 0.5273690223693848, "learning_rate": 8.228610730900963e-06, "loss": 0.4044, "step": 2099 }, { "epoch": 1.0472074468085106, "grad_norm": 0.4852781593799591, "learning_rate": 8.226394379348603e-06, "loss": 0.3894, "step": 2100 }, { "epoch": 1.0477061170212767, "grad_norm": 0.5406553745269775, "learning_rate": 8.22417694101751e-06, "loss": 0.3932, "step": 2101 }, { "epoch": 1.0482047872340425, "grad_norm": 0.4896067678928375, "learning_rate": 8.221958416654606e-06, "loss": 0.3649, "step": 2102 }, { "epoch": 1.0487034574468086, "grad_norm": 0.48923736810684204, "learning_rate": 8.219738807007175e-06, "loss": 0.3512, "step": 2103 }, { "epoch": 1.0492021276595744, "grad_norm": 0.5667054057121277, "learning_rate": 8.217518112822876e-06, "loss": 0.3981, "step": 2104 }, { "epoch": 1.0497007978723405, "grad_norm": 0.4736813008785248, "learning_rate": 8.215296334849725e-06, "loss": 0.3894, "step": 2105 }, { "epoch": 1.0501994680851063, "grad_norm": 0.5818187594413757, "learning_rate": 8.213073473836106e-06, "loss": 0.4045, "step": 2106 }, { "epoch": 1.0506981382978724, "grad_norm": 0.5304458737373352, "learning_rate": 8.210849530530767e-06, "loss": 0.4209, "step": 2107 }, { "epoch": 1.0511968085106382, "grad_norm": 0.4857777953147888, "learning_rate": 8.208624505682824e-06, "loss": 0.336, "step": 2108 }, { "epoch": 1.0516954787234043, "grad_norm": 0.5406140685081482, "learning_rate": 8.206398400041748e-06, "loss": 0.4229, "step": 2109 }, { "epoch": 1.0521941489361701, "grad_norm": 0.40948009490966797, "learning_rate": 8.204171214357385e-06, "loss": 0.3207, "step": 2110 }, { "epoch": 1.0526928191489362, "grad_norm": 0.45707932114601135, "learning_rate": 8.20194294937994e-06, "loss": 0.3968, "step": 2111 }, { "epoch": 1.053191489361702, "grad_norm": 0.5037049651145935, "learning_rate": 8.199713605859983e-06, "loss": 0.434, "step": 2112 }, { "epoch": 1.053690159574468, "grad_norm": 0.5181276798248291, "learning_rate": 8.197483184548443e-06, "loss": 0.4139, "step": 2113 }, { "epoch": 1.054188829787234, "grad_norm": 0.4474653899669647, "learning_rate": 8.195251686196617e-06, "loss": 0.3605, "step": 2114 }, { "epoch": 1.0546875, "grad_norm": 0.5378538966178894, "learning_rate": 8.193019111556163e-06, "loss": 0.3997, "step": 2115 }, { "epoch": 1.055186170212766, "grad_norm": 0.5020760893821716, "learning_rate": 8.1907854613791e-06, "loss": 0.3519, "step": 2116 }, { "epoch": 1.055684840425532, "grad_norm": 0.5173994898796082, "learning_rate": 8.188550736417813e-06, "loss": 0.3498, "step": 2117 }, { "epoch": 1.056183510638298, "grad_norm": 0.5051339268684387, "learning_rate": 8.186314937425046e-06, "loss": 0.3916, "step": 2118 }, { "epoch": 1.0566821808510638, "grad_norm": 0.5022916197776794, "learning_rate": 8.184078065153905e-06, "loss": 0.4098, "step": 2119 }, { "epoch": 1.0571808510638299, "grad_norm": 0.587871253490448, "learning_rate": 8.181840120357859e-06, "loss": 0.4256, "step": 2120 }, { "epoch": 1.0576795212765957, "grad_norm": 0.4457314908504486, "learning_rate": 8.179601103790735e-06, "loss": 0.3592, "step": 2121 }, { "epoch": 1.0581781914893618, "grad_norm": 0.5103181004524231, "learning_rate": 8.177361016206726e-06, "loss": 0.3471, "step": 2122 }, { "epoch": 1.0586768617021276, "grad_norm": 0.5870183706283569, "learning_rate": 8.175119858360381e-06, "loss": 0.3978, "step": 2123 }, { "epoch": 1.0591755319148937, "grad_norm": 0.5685930848121643, "learning_rate": 8.172877631006614e-06, "loss": 0.4118, "step": 2124 }, { "epoch": 1.0596742021276595, "grad_norm": 0.4848538339138031, "learning_rate": 8.170634334900695e-06, "loss": 0.4187, "step": 2125 }, { "epoch": 1.0601728723404256, "grad_norm": 0.5086630582809448, "learning_rate": 8.168389970798257e-06, "loss": 0.3622, "step": 2126 }, { "epoch": 1.0606715425531914, "grad_norm": 0.5524936318397522, "learning_rate": 8.166144539455292e-06, "loss": 0.3655, "step": 2127 }, { "epoch": 1.0611702127659575, "grad_norm": 0.5181760191917419, "learning_rate": 8.163898041628148e-06, "loss": 0.355, "step": 2128 }, { "epoch": 1.0616688829787233, "grad_norm": 0.5733748078346252, "learning_rate": 8.161650478073542e-06, "loss": 0.4557, "step": 2129 }, { "epoch": 1.0621675531914894, "grad_norm": 0.45855438709259033, "learning_rate": 8.159401849548536e-06, "loss": 0.3365, "step": 2130 }, { "epoch": 1.0626662234042552, "grad_norm": 0.6167062520980835, "learning_rate": 8.157152156810562e-06, "loss": 0.4487, "step": 2131 }, { "epoch": 1.0631648936170213, "grad_norm": 0.4543485939502716, "learning_rate": 8.154901400617405e-06, "loss": 0.3622, "step": 2132 }, { "epoch": 1.0636635638297873, "grad_norm": 0.484325647354126, "learning_rate": 8.152649581727212e-06, "loss": 0.3783, "step": 2133 }, { "epoch": 1.0641622340425532, "grad_norm": 0.5495913624763489, "learning_rate": 8.150396700898481e-06, "loss": 0.4439, "step": 2134 }, { "epoch": 1.0646609042553192, "grad_norm": 0.5312252044677734, "learning_rate": 8.148142758890078e-06, "loss": 0.3938, "step": 2135 }, { "epoch": 1.065159574468085, "grad_norm": 0.4773842692375183, "learning_rate": 8.145887756461219e-06, "loss": 0.3776, "step": 2136 }, { "epoch": 1.0656582446808511, "grad_norm": 0.4700307548046112, "learning_rate": 8.143631694371478e-06, "loss": 0.4102, "step": 2137 }, { "epoch": 1.066156914893617, "grad_norm": 0.5658053755760193, "learning_rate": 8.141374573380787e-06, "loss": 0.4061, "step": 2138 }, { "epoch": 1.066655585106383, "grad_norm": 0.5367952585220337, "learning_rate": 8.139116394249436e-06, "loss": 0.3982, "step": 2139 }, { "epoch": 1.0671542553191489, "grad_norm": 0.48670125007629395, "learning_rate": 8.136857157738068e-06, "loss": 0.3746, "step": 2140 }, { "epoch": 1.067652925531915, "grad_norm": 0.526080846786499, "learning_rate": 8.134596864607687e-06, "loss": 0.3586, "step": 2141 }, { "epoch": 1.0681515957446808, "grad_norm": 0.5533256530761719, "learning_rate": 8.132335515619647e-06, "loss": 0.3794, "step": 2142 }, { "epoch": 1.0686502659574468, "grad_norm": 0.49007660150527954, "learning_rate": 8.130073111535663e-06, "loss": 0.4398, "step": 2143 }, { "epoch": 1.0691489361702127, "grad_norm": 0.5141827464103699, "learning_rate": 8.127809653117803e-06, "loss": 0.3539, "step": 2144 }, { "epoch": 1.0696476063829787, "grad_norm": 0.571532130241394, "learning_rate": 8.12554514112849e-06, "loss": 0.5081, "step": 2145 }, { "epoch": 1.0701462765957448, "grad_norm": 0.5187571048736572, "learning_rate": 8.123279576330502e-06, "loss": 0.3752, "step": 2146 }, { "epoch": 1.0706449468085106, "grad_norm": 0.5041022896766663, "learning_rate": 8.121012959486973e-06, "loss": 0.3931, "step": 2147 }, { "epoch": 1.0711436170212767, "grad_norm": 0.531309187412262, "learning_rate": 8.11874529136139e-06, "loss": 0.3902, "step": 2148 }, { "epoch": 1.0716422872340425, "grad_norm": 0.46935275197029114, "learning_rate": 8.116476572717594e-06, "loss": 0.3582, "step": 2149 }, { "epoch": 1.0721409574468086, "grad_norm": 0.5377678275108337, "learning_rate": 8.11420680431978e-06, "loss": 0.4706, "step": 2150 }, { "epoch": 1.0726396276595744, "grad_norm": 0.5237680077552795, "learning_rate": 8.111935986932495e-06, "loss": 0.4285, "step": 2151 }, { "epoch": 1.0731382978723405, "grad_norm": 0.5930753350257874, "learning_rate": 8.109664121320645e-06, "loss": 0.3794, "step": 2152 }, { "epoch": 1.0736369680851063, "grad_norm": 0.49182453751564026, "learning_rate": 8.107391208249482e-06, "loss": 0.367, "step": 2153 }, { "epoch": 1.0741356382978724, "grad_norm": 0.4952145218849182, "learning_rate": 8.105117248484616e-06, "loss": 0.4451, "step": 2154 }, { "epoch": 1.0746343085106382, "grad_norm": 0.4845575988292694, "learning_rate": 8.102842242792007e-06, "loss": 0.3889, "step": 2155 }, { "epoch": 1.0751329787234043, "grad_norm": 0.5500548481941223, "learning_rate": 8.100566191937968e-06, "loss": 0.3796, "step": 2156 }, { "epoch": 1.0756316489361701, "grad_norm": 0.5599648356437683, "learning_rate": 8.098289096689165e-06, "loss": 0.3605, "step": 2157 }, { "epoch": 1.0761303191489362, "grad_norm": 0.5828022360801697, "learning_rate": 8.096010957812613e-06, "loss": 0.3968, "step": 2158 }, { "epoch": 1.076628989361702, "grad_norm": 0.5338253974914551, "learning_rate": 8.093731776075681e-06, "loss": 0.3915, "step": 2159 }, { "epoch": 1.077127659574468, "grad_norm": 0.5997441411018372, "learning_rate": 8.091451552246088e-06, "loss": 0.4412, "step": 2160 }, { "epoch": 1.077626329787234, "grad_norm": 0.5035185217857361, "learning_rate": 8.089170287091907e-06, "loss": 0.3933, "step": 2161 }, { "epoch": 1.078125, "grad_norm": 0.5232201218605042, "learning_rate": 8.086887981381558e-06, "loss": 0.3582, "step": 2162 }, { "epoch": 1.078623670212766, "grad_norm": 0.5684865713119507, "learning_rate": 8.08460463588381e-06, "loss": 0.4094, "step": 2163 }, { "epoch": 1.079122340425532, "grad_norm": 0.4867194592952728, "learning_rate": 8.08232025136779e-06, "loss": 0.3543, "step": 2164 }, { "epoch": 1.079621010638298, "grad_norm": 0.5236110687255859, "learning_rate": 8.080034828602967e-06, "loss": 0.4003, "step": 2165 }, { "epoch": 1.0801196808510638, "grad_norm": 0.48304063081741333, "learning_rate": 8.077748368359164e-06, "loss": 0.4195, "step": 2166 }, { "epoch": 1.0806183510638299, "grad_norm": 0.519856870174408, "learning_rate": 8.075460871406552e-06, "loss": 0.4024, "step": 2167 }, { "epoch": 1.0811170212765957, "grad_norm": 0.6462239623069763, "learning_rate": 8.073172338515653e-06, "loss": 0.4096, "step": 2168 }, { "epoch": 1.0816156914893618, "grad_norm": 0.47278138995170593, "learning_rate": 8.070882770457333e-06, "loss": 0.386, "step": 2169 }, { "epoch": 1.0821143617021276, "grad_norm": 0.6540125012397766, "learning_rate": 8.068592168002813e-06, "loss": 0.4104, "step": 2170 }, { "epoch": 1.0826130319148937, "grad_norm": 0.6709726452827454, "learning_rate": 8.06630053192366e-06, "loss": 0.3415, "step": 2171 }, { "epoch": 1.0831117021276595, "grad_norm": 0.5357152819633484, "learning_rate": 8.064007862991785e-06, "loss": 0.4113, "step": 2172 }, { "epoch": 1.0836103723404256, "grad_norm": 0.6099688410758972, "learning_rate": 8.061714161979455e-06, "loss": 0.4034, "step": 2173 }, { "epoch": 1.0841090425531914, "grad_norm": 0.6431339383125305, "learning_rate": 8.059419429659277e-06, "loss": 0.4009, "step": 2174 }, { "epoch": 1.0846077127659575, "grad_norm": 0.6161786317825317, "learning_rate": 8.057123666804212e-06, "loss": 0.3908, "step": 2175 }, { "epoch": 1.0851063829787233, "grad_norm": 0.5059568881988525, "learning_rate": 8.05482687418756e-06, "loss": 0.431, "step": 2176 }, { "epoch": 1.0856050531914894, "grad_norm": 0.6099360585212708, "learning_rate": 8.052529052582978e-06, "loss": 0.403, "step": 2177 }, { "epoch": 1.0861037234042552, "grad_norm": 0.535208523273468, "learning_rate": 8.05023020276446e-06, "loss": 0.3615, "step": 2178 }, { "epoch": 1.0866023936170213, "grad_norm": 0.5634099245071411, "learning_rate": 8.047930325506354e-06, "loss": 0.4118, "step": 2179 }, { "epoch": 1.0871010638297873, "grad_norm": 0.6021304726600647, "learning_rate": 8.045629421583347e-06, "loss": 0.4442, "step": 2180 }, { "epoch": 1.0875997340425532, "grad_norm": 0.5542697310447693, "learning_rate": 8.04332749177048e-06, "loss": 0.422, "step": 2181 }, { "epoch": 1.0880984042553192, "grad_norm": 0.4689388871192932, "learning_rate": 8.041024536843131e-06, "loss": 0.3542, "step": 2182 }, { "epoch": 1.088597074468085, "grad_norm": 0.5013720989227295, "learning_rate": 8.03872055757703e-06, "loss": 0.3559, "step": 2183 }, { "epoch": 1.0890957446808511, "grad_norm": 0.5362561345100403, "learning_rate": 8.03641555474825e-06, "loss": 0.3855, "step": 2184 }, { "epoch": 1.089594414893617, "grad_norm": 0.5487497448921204, "learning_rate": 8.034109529133204e-06, "loss": 0.4146, "step": 2185 }, { "epoch": 1.090093085106383, "grad_norm": 0.47844618558883667, "learning_rate": 8.031802481508658e-06, "loss": 0.4132, "step": 2186 }, { "epoch": 1.0905917553191489, "grad_norm": 0.49924299120903015, "learning_rate": 8.029494412651714e-06, "loss": 0.4077, "step": 2187 }, { "epoch": 1.091090425531915, "grad_norm": 0.47173887491226196, "learning_rate": 8.027185323339826e-06, "loss": 0.3933, "step": 2188 }, { "epoch": 1.0915890957446808, "grad_norm": 0.45585352182388306, "learning_rate": 8.024875214350784e-06, "loss": 0.379, "step": 2189 }, { "epoch": 1.0920877659574468, "grad_norm": 0.5473808646202087, "learning_rate": 8.02256408646273e-06, "loss": 0.4469, "step": 2190 }, { "epoch": 1.0925864361702127, "grad_norm": 0.5064433813095093, "learning_rate": 8.02025194045414e-06, "loss": 0.3718, "step": 2191 }, { "epoch": 1.0930851063829787, "grad_norm": 0.504137396812439, "learning_rate": 8.017938777103839e-06, "loss": 0.3988, "step": 2192 }, { "epoch": 1.0935837765957448, "grad_norm": 0.5102059841156006, "learning_rate": 8.015624597190993e-06, "loss": 0.4024, "step": 2193 }, { "epoch": 1.0940824468085106, "grad_norm": 0.5549782514572144, "learning_rate": 8.01330940149511e-06, "loss": 0.3941, "step": 2194 }, { "epoch": 1.0945811170212767, "grad_norm": 0.47368794679641724, "learning_rate": 8.01099319079604e-06, "loss": 0.4003, "step": 2195 }, { "epoch": 1.0950797872340425, "grad_norm": 0.5290214419364929, "learning_rate": 8.008675965873977e-06, "loss": 0.3885, "step": 2196 }, { "epoch": 1.0955784574468086, "grad_norm": 0.5353811383247375, "learning_rate": 8.006357727509454e-06, "loss": 0.3605, "step": 2197 }, { "epoch": 1.0960771276595744, "grad_norm": 0.5109184980392456, "learning_rate": 8.004038476483347e-06, "loss": 0.4821, "step": 2198 }, { "epoch": 1.0965757978723405, "grad_norm": 0.4409053325653076, "learning_rate": 8.001718213576872e-06, "loss": 0.351, "step": 2199 }, { "epoch": 1.0970744680851063, "grad_norm": 0.49870532751083374, "learning_rate": 7.999396939571585e-06, "loss": 0.3427, "step": 2200 }, { "epoch": 1.0975731382978724, "grad_norm": 0.5460168123245239, "learning_rate": 7.997074655249388e-06, "loss": 0.4347, "step": 2201 }, { "epoch": 1.0980718085106382, "grad_norm": 0.5638095736503601, "learning_rate": 7.994751361392515e-06, "loss": 0.3934, "step": 2202 }, { "epoch": 1.0985704787234043, "grad_norm": 0.6145226359367371, "learning_rate": 7.992427058783547e-06, "loss": 0.4097, "step": 2203 }, { "epoch": 1.0990691489361701, "grad_norm": 0.5463929176330566, "learning_rate": 7.990101748205401e-06, "loss": 0.3911, "step": 2204 }, { "epoch": 1.0995678191489362, "grad_norm": 0.44593849778175354, "learning_rate": 7.987775430441336e-06, "loss": 0.3278, "step": 2205 }, { "epoch": 1.100066489361702, "grad_norm": 0.7100638747215271, "learning_rate": 7.985448106274947e-06, "loss": 0.508, "step": 2206 }, { "epoch": 1.100565159574468, "grad_norm": 0.4716928005218506, "learning_rate": 7.983119776490172e-06, "loss": 0.3738, "step": 2207 }, { "epoch": 1.101063829787234, "grad_norm": 0.5721107125282288, "learning_rate": 7.980790441871282e-06, "loss": 0.4061, "step": 2208 }, { "epoch": 1.1015625, "grad_norm": 0.5303715467453003, "learning_rate": 7.978460103202895e-06, "loss": 0.3989, "step": 2209 }, { "epoch": 1.102061170212766, "grad_norm": 0.46184760332107544, "learning_rate": 7.97612876126996e-06, "loss": 0.4154, "step": 2210 }, { "epoch": 1.102559840425532, "grad_norm": 0.5748103260993958, "learning_rate": 7.973796416857763e-06, "loss": 0.4595, "step": 2211 }, { "epoch": 1.103058510638298, "grad_norm": 0.5434631109237671, "learning_rate": 7.971463070751936e-06, "loss": 0.3476, "step": 2212 }, { "epoch": 1.1035571808510638, "grad_norm": 0.5684844255447388, "learning_rate": 7.969128723738441e-06, "loss": 0.4244, "step": 2213 }, { "epoch": 1.1040558510638299, "grad_norm": 0.4965466260910034, "learning_rate": 7.966793376603581e-06, "loss": 0.3366, "step": 2214 }, { "epoch": 1.1045545212765957, "grad_norm": 0.5168955326080322, "learning_rate": 7.964457030133992e-06, "loss": 0.4333, "step": 2215 }, { "epoch": 1.1050531914893618, "grad_norm": 0.5268712043762207, "learning_rate": 7.962119685116651e-06, "loss": 0.4148, "step": 2216 }, { "epoch": 1.1055518617021276, "grad_norm": 0.45096370577812195, "learning_rate": 7.959781342338868e-06, "loss": 0.3276, "step": 2217 }, { "epoch": 1.1060505319148937, "grad_norm": 0.5638201832771301, "learning_rate": 7.957442002588291e-06, "loss": 0.4085, "step": 2218 }, { "epoch": 1.1065492021276595, "grad_norm": 0.5895987153053284, "learning_rate": 7.955101666652904e-06, "loss": 0.4039, "step": 2219 }, { "epoch": 1.1070478723404256, "grad_norm": 0.5608728528022766, "learning_rate": 7.952760335321023e-06, "loss": 0.4518, "step": 2220 }, { "epoch": 1.1075465425531914, "grad_norm": 0.5034863352775574, "learning_rate": 7.950418009381307e-06, "loss": 0.3713, "step": 2221 }, { "epoch": 1.1080452127659575, "grad_norm": 0.6578441262245178, "learning_rate": 7.94807468962274e-06, "loss": 0.3619, "step": 2222 }, { "epoch": 1.1085438829787233, "grad_norm": 0.611061692237854, "learning_rate": 7.945730376834648e-06, "loss": 0.4117, "step": 2223 }, { "epoch": 1.1090425531914894, "grad_norm": 0.5116378664970398, "learning_rate": 7.943385071806691e-06, "loss": 0.3839, "step": 2224 }, { "epoch": 1.1095412234042552, "grad_norm": 0.6153894662857056, "learning_rate": 7.94103877532886e-06, "loss": 0.4357, "step": 2225 }, { "epoch": 1.1100398936170213, "grad_norm": 0.6489970684051514, "learning_rate": 7.93869148819148e-06, "loss": 0.405, "step": 2226 }, { "epoch": 1.1105385638297873, "grad_norm": 0.6484565138816833, "learning_rate": 7.936343211185214e-06, "loss": 0.4071, "step": 2227 }, { "epoch": 1.1110372340425532, "grad_norm": 0.6041762828826904, "learning_rate": 7.933993945101053e-06, "loss": 0.377, "step": 2228 }, { "epoch": 1.1115359042553192, "grad_norm": 0.6171084046363831, "learning_rate": 7.931643690730327e-06, "loss": 0.3589, "step": 2229 }, { "epoch": 1.112034574468085, "grad_norm": 0.536956250667572, "learning_rate": 7.929292448864692e-06, "loss": 0.3699, "step": 2230 }, { "epoch": 1.1125332446808511, "grad_norm": 0.5510492920875549, "learning_rate": 7.926940220296144e-06, "loss": 0.4275, "step": 2231 }, { "epoch": 1.113031914893617, "grad_norm": 0.4565505385398865, "learning_rate": 7.924587005817005e-06, "loss": 0.3921, "step": 2232 }, { "epoch": 1.113530585106383, "grad_norm": 0.6326431035995483, "learning_rate": 7.922232806219935e-06, "loss": 0.411, "step": 2233 }, { "epoch": 1.1140292553191489, "grad_norm": 0.5657100081443787, "learning_rate": 7.919877622297917e-06, "loss": 0.4181, "step": 2234 }, { "epoch": 1.114527925531915, "grad_norm": 0.4579087793827057, "learning_rate": 7.917521454844278e-06, "loss": 0.3702, "step": 2235 }, { "epoch": 1.1150265957446808, "grad_norm": 0.5842156410217285, "learning_rate": 7.915164304652662e-06, "loss": 0.4528, "step": 2236 }, { "epoch": 1.1155252659574468, "grad_norm": 0.5848935842514038, "learning_rate": 7.91280617251706e-06, "loss": 0.4105, "step": 2237 }, { "epoch": 1.1160239361702127, "grad_norm": 0.4890473186969757, "learning_rate": 7.910447059231779e-06, "loss": 0.348, "step": 2238 }, { "epoch": 1.1165226063829787, "grad_norm": 0.5157329440116882, "learning_rate": 7.908086965591466e-06, "loss": 0.4079, "step": 2239 }, { "epoch": 1.1170212765957448, "grad_norm": 0.547523021697998, "learning_rate": 7.905725892391092e-06, "loss": 0.3692, "step": 2240 }, { "epoch": 1.1175199468085106, "grad_norm": 0.5580873489379883, "learning_rate": 7.903363840425963e-06, "loss": 0.3915, "step": 2241 }, { "epoch": 1.1180186170212767, "grad_norm": 0.4852254092693329, "learning_rate": 7.901000810491713e-06, "loss": 0.3737, "step": 2242 }, { "epoch": 1.1185172872340425, "grad_norm": 0.5852223038673401, "learning_rate": 7.898636803384304e-06, "loss": 0.3938, "step": 2243 }, { "epoch": 1.1190159574468086, "grad_norm": 0.5272003412246704, "learning_rate": 7.896271819900027e-06, "loss": 0.4347, "step": 2244 }, { "epoch": 1.1195146276595744, "grad_norm": 0.4669896066188812, "learning_rate": 7.893905860835504e-06, "loss": 0.387, "step": 2245 }, { "epoch": 1.1200132978723405, "grad_norm": 0.48069053888320923, "learning_rate": 7.891538926987687e-06, "loss": 0.4155, "step": 2246 }, { "epoch": 1.1205119680851063, "grad_norm": 0.4849315583705902, "learning_rate": 7.889171019153852e-06, "loss": 0.4283, "step": 2247 }, { "epoch": 1.1210106382978724, "grad_norm": 0.4994777739048004, "learning_rate": 7.886802138131606e-06, "loss": 0.3435, "step": 2248 }, { "epoch": 1.1215093085106382, "grad_norm": 0.5703366994857788, "learning_rate": 7.884432284718881e-06, "loss": 0.4236, "step": 2249 }, { "epoch": 1.1220079787234043, "grad_norm": 0.5431674718856812, "learning_rate": 7.88206145971394e-06, "loss": 0.3562, "step": 2250 }, { "epoch": 1.1225066489361701, "grad_norm": 0.47353076934814453, "learning_rate": 7.879689663915371e-06, "loss": 0.4022, "step": 2251 }, { "epoch": 1.1230053191489362, "grad_norm": 0.4548666775226593, "learning_rate": 7.877316898122092e-06, "loss": 0.3791, "step": 2252 }, { "epoch": 1.123503989361702, "grad_norm": 0.48902657628059387, "learning_rate": 7.874943163133342e-06, "loss": 0.4218, "step": 2253 }, { "epoch": 1.124002659574468, "grad_norm": 0.5458621382713318, "learning_rate": 7.872568459748693e-06, "loss": 0.3826, "step": 2254 }, { "epoch": 1.124501329787234, "grad_norm": 0.554494321346283, "learning_rate": 7.870192788768037e-06, "loss": 0.3925, "step": 2255 }, { "epoch": 1.125, "grad_norm": 0.500878632068634, "learning_rate": 7.867816150991597e-06, "loss": 0.4062, "step": 2256 }, { "epoch": 1.125498670212766, "grad_norm": 0.4805998206138611, "learning_rate": 7.865438547219921e-06, "loss": 0.343, "step": 2257 }, { "epoch": 1.125997340425532, "grad_norm": 0.5435221791267395, "learning_rate": 7.863059978253877e-06, "loss": 0.4122, "step": 2258 }, { "epoch": 1.126496010638298, "grad_norm": 0.49712738394737244, "learning_rate": 7.860680444894668e-06, "loss": 0.396, "step": 2259 }, { "epoch": 1.1269946808510638, "grad_norm": 0.47068479657173157, "learning_rate": 7.858299947943812e-06, "loss": 0.371, "step": 2260 }, { "epoch": 1.1274933510638299, "grad_norm": 0.5337076783180237, "learning_rate": 7.855918488203155e-06, "loss": 0.3928, "step": 2261 }, { "epoch": 1.1279920212765957, "grad_norm": 0.6051372289657593, "learning_rate": 7.85353606647487e-06, "loss": 0.3969, "step": 2262 }, { "epoch": 1.1284906914893618, "grad_norm": 0.566285252571106, "learning_rate": 7.851152683561452e-06, "loss": 0.4253, "step": 2263 }, { "epoch": 1.1289893617021276, "grad_norm": 0.4712604582309723, "learning_rate": 7.848768340265718e-06, "loss": 0.3941, "step": 2264 }, { "epoch": 1.1294880319148937, "grad_norm": 0.5483136773109436, "learning_rate": 7.846383037390813e-06, "loss": 0.3904, "step": 2265 }, { "epoch": 1.1299867021276595, "grad_norm": 0.5351081490516663, "learning_rate": 7.843996775740199e-06, "loss": 0.3792, "step": 2266 }, { "epoch": 1.1304853723404256, "grad_norm": 0.5074359774589539, "learning_rate": 7.841609556117666e-06, "loss": 0.4323, "step": 2267 }, { "epoch": 1.1309840425531914, "grad_norm": 0.48690593242645264, "learning_rate": 7.839221379327326e-06, "loss": 0.3847, "step": 2268 }, { "epoch": 1.1314827127659575, "grad_norm": 0.5349803566932678, "learning_rate": 7.836832246173614e-06, "loss": 0.4056, "step": 2269 }, { "epoch": 1.1319813829787235, "grad_norm": 0.4608069658279419, "learning_rate": 7.834442157461279e-06, "loss": 0.3496, "step": 2270 }, { "epoch": 1.1324800531914894, "grad_norm": 0.5046423077583313, "learning_rate": 7.832051113995407e-06, "loss": 0.3495, "step": 2271 }, { "epoch": 1.1329787234042552, "grad_norm": 0.537604570388794, "learning_rate": 7.829659116581392e-06, "loss": 0.4366, "step": 2272 }, { "epoch": 1.1334773936170213, "grad_norm": 0.5777360796928406, "learning_rate": 7.827266166024953e-06, "loss": 0.432, "step": 2273 }, { "epoch": 1.1339760638297873, "grad_norm": 0.5333641767501831, "learning_rate": 7.824872263132137e-06, "loss": 0.4291, "step": 2274 }, { "epoch": 1.1344747340425532, "grad_norm": 0.4550599455833435, "learning_rate": 7.822477408709303e-06, "loss": 0.3657, "step": 2275 }, { "epoch": 1.1349734042553192, "grad_norm": 0.491338849067688, "learning_rate": 7.820081603563133e-06, "loss": 0.3874, "step": 2276 }, { "epoch": 1.135472074468085, "grad_norm": 0.57607501745224, "learning_rate": 7.817684848500631e-06, "loss": 0.3851, "step": 2277 }, { "epoch": 1.1359707446808511, "grad_norm": 0.49949002265930176, "learning_rate": 7.815287144329123e-06, "loss": 0.4092, "step": 2278 }, { "epoch": 1.136469414893617, "grad_norm": 0.45826974511146545, "learning_rate": 7.812888491856245e-06, "loss": 0.3837, "step": 2279 }, { "epoch": 1.136968085106383, "grad_norm": 0.572182834148407, "learning_rate": 7.810488891889967e-06, "loss": 0.4519, "step": 2280 }, { "epoch": 1.1374667553191489, "grad_norm": 0.4783734977245331, "learning_rate": 7.808088345238562e-06, "loss": 0.3761, "step": 2281 }, { "epoch": 1.137965425531915, "grad_norm": 0.4660688042640686, "learning_rate": 7.805686852710638e-06, "loss": 0.3423, "step": 2282 }, { "epoch": 1.1384640957446808, "grad_norm": 0.4834793508052826, "learning_rate": 7.80328441511511e-06, "loss": 0.3428, "step": 2283 }, { "epoch": 1.1389627659574468, "grad_norm": 0.5313360095024109, "learning_rate": 7.800881033261213e-06, "loss": 0.4113, "step": 2284 }, { "epoch": 1.1394614361702127, "grad_norm": 0.5265020728111267, "learning_rate": 7.798476707958507e-06, "loss": 0.391, "step": 2285 }, { "epoch": 1.1399601063829787, "grad_norm": 0.48208388686180115, "learning_rate": 7.79607144001686e-06, "loss": 0.3705, "step": 2286 }, { "epoch": 1.1404587765957448, "grad_norm": 0.5474538207054138, "learning_rate": 7.793665230246468e-06, "loss": 0.4294, "step": 2287 }, { "epoch": 1.1409574468085106, "grad_norm": 0.5305712223052979, "learning_rate": 7.791258079457834e-06, "loss": 0.4568, "step": 2288 }, { "epoch": 1.1414561170212765, "grad_norm": 0.44528305530548096, "learning_rate": 7.788849988461786e-06, "loss": 0.3569, "step": 2289 }, { "epoch": 1.1419547872340425, "grad_norm": 0.48740148544311523, "learning_rate": 7.786440958069461e-06, "loss": 0.4139, "step": 2290 }, { "epoch": 1.1424534574468086, "grad_norm": 0.5424089431762695, "learning_rate": 7.784030989092323e-06, "loss": 0.3944, "step": 2291 }, { "epoch": 1.1429521276595744, "grad_norm": 0.48635026812553406, "learning_rate": 7.781620082342141e-06, "loss": 0.3435, "step": 2292 }, { "epoch": 1.1434507978723405, "grad_norm": 0.4609752297401428, "learning_rate": 7.779208238631006e-06, "loss": 0.3548, "step": 2293 }, { "epoch": 1.1439494680851063, "grad_norm": 0.5284197926521301, "learning_rate": 7.776795458771326e-06, "loss": 0.3965, "step": 2294 }, { "epoch": 1.1444481382978724, "grad_norm": 0.4729102551937103, "learning_rate": 7.774381743575819e-06, "loss": 0.4127, "step": 2295 }, { "epoch": 1.1449468085106382, "grad_norm": 0.5361143350601196, "learning_rate": 7.771967093857523e-06, "loss": 0.4303, "step": 2296 }, { "epoch": 1.1454454787234043, "grad_norm": 0.5491675138473511, "learning_rate": 7.769551510429784e-06, "loss": 0.4073, "step": 2297 }, { "epoch": 1.1459441489361701, "grad_norm": 0.4306163191795349, "learning_rate": 7.767134994106273e-06, "loss": 0.3528, "step": 2298 }, { "epoch": 1.1464428191489362, "grad_norm": 0.43187183141708374, "learning_rate": 7.764717545700966e-06, "loss": 0.3463, "step": 2299 }, { "epoch": 1.146941489361702, "grad_norm": 0.4760376811027527, "learning_rate": 7.762299166028157e-06, "loss": 0.3742, "step": 2300 }, { "epoch": 1.147440159574468, "grad_norm": 0.47050637006759644, "learning_rate": 7.759879855902454e-06, "loss": 0.3518, "step": 2301 }, { "epoch": 1.147938829787234, "grad_norm": 0.4939165711402893, "learning_rate": 7.757459616138779e-06, "loss": 0.3307, "step": 2302 }, { "epoch": 1.1484375, "grad_norm": 0.5239924788475037, "learning_rate": 7.755038447552362e-06, "loss": 0.4472, "step": 2303 }, { "epoch": 1.148936170212766, "grad_norm": 0.4522842764854431, "learning_rate": 7.752616350958748e-06, "loss": 0.341, "step": 2304 }, { "epoch": 1.149434840425532, "grad_norm": 0.5366731882095337, "learning_rate": 7.750193327173801e-06, "loss": 0.4152, "step": 2305 }, { "epoch": 1.149933510638298, "grad_norm": 0.4814930558204651, "learning_rate": 7.747769377013691e-06, "loss": 0.3792, "step": 2306 }, { "epoch": 1.1504321808510638, "grad_norm": 0.4999622702598572, "learning_rate": 7.7453445012949e-06, "loss": 0.4002, "step": 2307 }, { "epoch": 1.1509308510638299, "grad_norm": 0.4526660442352295, "learning_rate": 7.742918700834225e-06, "loss": 0.3868, "step": 2308 }, { "epoch": 1.1514295212765957, "grad_norm": 0.482592910528183, "learning_rate": 7.74049197644877e-06, "loss": 0.3886, "step": 2309 }, { "epoch": 1.1519281914893618, "grad_norm": 0.5662731528282166, "learning_rate": 7.738064328955954e-06, "loss": 0.408, "step": 2310 }, { "epoch": 1.1524268617021276, "grad_norm": 0.4741404056549072, "learning_rate": 7.735635759173508e-06, "loss": 0.366, "step": 2311 }, { "epoch": 1.1529255319148937, "grad_norm": 0.5183899998664856, "learning_rate": 7.73320626791947e-06, "loss": 0.4065, "step": 2312 }, { "epoch": 1.1534242021276595, "grad_norm": 0.47834137082099915, "learning_rate": 7.73077585601219e-06, "loss": 0.4156, "step": 2313 }, { "epoch": 1.1539228723404256, "grad_norm": 0.4993215799331665, "learning_rate": 7.728344524270325e-06, "loss": 0.4233, "step": 2314 }, { "epoch": 1.1544215425531914, "grad_norm": 0.4953966736793518, "learning_rate": 7.725912273512847e-06, "loss": 0.4156, "step": 2315 }, { "epoch": 1.1549202127659575, "grad_norm": 0.47415778040885925, "learning_rate": 7.72347910455904e-06, "loss": 0.4035, "step": 2316 }, { "epoch": 1.1554188829787235, "grad_norm": 0.46065986156463623, "learning_rate": 7.721045018228484e-06, "loss": 0.3399, "step": 2317 }, { "epoch": 1.1559175531914894, "grad_norm": 0.5009950995445251, "learning_rate": 7.718610015341083e-06, "loss": 0.3996, "step": 2318 }, { "epoch": 1.1564162234042552, "grad_norm": 0.46986034512519836, "learning_rate": 7.716174096717042e-06, "loss": 0.3844, "step": 2319 }, { "epoch": 1.1569148936170213, "grad_norm": 0.5196350812911987, "learning_rate": 7.713737263176875e-06, "loss": 0.4138, "step": 2320 }, { "epoch": 1.1574135638297873, "grad_norm": 0.5225589871406555, "learning_rate": 7.711299515541407e-06, "loss": 0.4222, "step": 2321 }, { "epoch": 1.1579122340425532, "grad_norm": 0.5080852508544922, "learning_rate": 7.708860854631765e-06, "loss": 0.3901, "step": 2322 }, { "epoch": 1.1584109042553192, "grad_norm": 0.45691239833831787, "learning_rate": 7.706421281269391e-06, "loss": 0.4019, "step": 2323 }, { "epoch": 1.158909574468085, "grad_norm": 0.5177048444747925, "learning_rate": 7.703980796276029e-06, "loss": 0.3953, "step": 2324 }, { "epoch": 1.1594082446808511, "grad_norm": 0.44681888818740845, "learning_rate": 7.701539400473734e-06, "loss": 0.3866, "step": 2325 }, { "epoch": 1.159906914893617, "grad_norm": 0.5316444039344788, "learning_rate": 7.699097094684865e-06, "loss": 0.4326, "step": 2326 }, { "epoch": 1.160405585106383, "grad_norm": 0.5225628614425659, "learning_rate": 7.696653879732088e-06, "loss": 0.3801, "step": 2327 }, { "epoch": 1.1609042553191489, "grad_norm": 0.5169273018836975, "learning_rate": 7.694209756438375e-06, "loss": 0.4206, "step": 2328 }, { "epoch": 1.161402925531915, "grad_norm": 0.4967573881149292, "learning_rate": 7.691764725627006e-06, "loss": 0.3882, "step": 2329 }, { "epoch": 1.1619015957446808, "grad_norm": 0.6025100946426392, "learning_rate": 7.689318788121564e-06, "loss": 0.3882, "step": 2330 }, { "epoch": 1.1624002659574468, "grad_norm": 0.5247310996055603, "learning_rate": 7.686871944745938e-06, "loss": 0.3983, "step": 2331 }, { "epoch": 1.1628989361702127, "grad_norm": 0.47615712881088257, "learning_rate": 7.684424196324324e-06, "loss": 0.3526, "step": 2332 }, { "epoch": 1.1633976063829787, "grad_norm": 0.5405635833740234, "learning_rate": 7.681975543681221e-06, "loss": 0.3849, "step": 2333 }, { "epoch": 1.1638962765957448, "grad_norm": 0.5389652848243713, "learning_rate": 7.679525987641436e-06, "loss": 0.3724, "step": 2334 }, { "epoch": 1.1643949468085106, "grad_norm": 0.5535398125648499, "learning_rate": 7.677075529030073e-06, "loss": 0.386, "step": 2335 }, { "epoch": 1.1648936170212765, "grad_norm": 0.4491320252418518, "learning_rate": 7.674624168672547e-06, "loss": 0.3522, "step": 2336 }, { "epoch": 1.1653922872340425, "grad_norm": 0.5575441122055054, "learning_rate": 7.672171907394575e-06, "loss": 0.4209, "step": 2337 }, { "epoch": 1.1658909574468086, "grad_norm": 0.5491884350776672, "learning_rate": 7.669718746022174e-06, "loss": 0.4242, "step": 2338 }, { "epoch": 1.1663896276595744, "grad_norm": 0.4332101047039032, "learning_rate": 7.66726468538167e-06, "loss": 0.4199, "step": 2339 }, { "epoch": 1.1668882978723405, "grad_norm": 0.5183558464050293, "learning_rate": 7.664809726299684e-06, "loss": 0.4381, "step": 2340 }, { "epoch": 1.1673869680851063, "grad_norm": 0.4771330952644348, "learning_rate": 7.662353869603153e-06, "loss": 0.3659, "step": 2341 }, { "epoch": 1.1678856382978724, "grad_norm": 0.494902104139328, "learning_rate": 7.659897116119299e-06, "loss": 0.371, "step": 2342 }, { "epoch": 1.1683843085106382, "grad_norm": 0.5012110471725464, "learning_rate": 7.657439466675657e-06, "loss": 0.3984, "step": 2343 }, { "epoch": 1.1688829787234043, "grad_norm": 0.49653559923171997, "learning_rate": 7.654980922100069e-06, "loss": 0.4302, "step": 2344 }, { "epoch": 1.1693816489361701, "grad_norm": 0.4659462571144104, "learning_rate": 7.65252148322066e-06, "loss": 0.332, "step": 2345 }, { "epoch": 1.1698803191489362, "grad_norm": 0.6413690447807312, "learning_rate": 7.650061150865875e-06, "loss": 0.403, "step": 2346 }, { "epoch": 1.170378989361702, "grad_norm": 0.47489553689956665, "learning_rate": 7.647599925864449e-06, "loss": 0.3891, "step": 2347 }, { "epoch": 1.170877659574468, "grad_norm": 0.5471943616867065, "learning_rate": 7.645137809045422e-06, "loss": 0.3562, "step": 2348 }, { "epoch": 1.171376329787234, "grad_norm": 0.6311076879501343, "learning_rate": 7.642674801238134e-06, "loss": 0.4258, "step": 2349 }, { "epoch": 1.171875, "grad_norm": 0.5300273895263672, "learning_rate": 7.640210903272224e-06, "loss": 0.4204, "step": 2350 }, { "epoch": 1.172373670212766, "grad_norm": 0.5635346174240112, "learning_rate": 7.637746115977634e-06, "loss": 0.3819, "step": 2351 }, { "epoch": 1.172872340425532, "grad_norm": 0.4311618208885193, "learning_rate": 7.635280440184598e-06, "loss": 0.295, "step": 2352 }, { "epoch": 1.173371010638298, "grad_norm": 0.5375059843063354, "learning_rate": 7.632813876723657e-06, "loss": 0.4251, "step": 2353 }, { "epoch": 1.1738696808510638, "grad_norm": 0.5317449569702148, "learning_rate": 7.63034642642565e-06, "loss": 0.3968, "step": 2354 }, { "epoch": 1.1743683510638299, "grad_norm": 0.5219212174415588, "learning_rate": 7.627878090121712e-06, "loss": 0.3826, "step": 2355 }, { "epoch": 1.1748670212765957, "grad_norm": 0.5672245025634766, "learning_rate": 7.625408868643274e-06, "loss": 0.4656, "step": 2356 }, { "epoch": 1.1753656914893618, "grad_norm": 0.4617955982685089, "learning_rate": 7.622938762822074e-06, "loss": 0.3736, "step": 2357 }, { "epoch": 1.1758643617021276, "grad_norm": 0.4961754381656647, "learning_rate": 7.620467773490138e-06, "loss": 0.4121, "step": 2358 }, { "epoch": 1.1763630319148937, "grad_norm": 0.4814682602882385, "learning_rate": 7.617995901479796e-06, "loss": 0.3656, "step": 2359 }, { "epoch": 1.1768617021276595, "grad_norm": 0.537654459476471, "learning_rate": 7.615523147623673e-06, "loss": 0.4345, "step": 2360 }, { "epoch": 1.1773603723404256, "grad_norm": 0.48838010430336, "learning_rate": 7.613049512754692e-06, "loss": 0.4184, "step": 2361 }, { "epoch": 1.1778590425531914, "grad_norm": 0.455883651971817, "learning_rate": 7.6105749977060725e-06, "loss": 0.3432, "step": 2362 }, { "epoch": 1.1783577127659575, "grad_norm": 0.5475975275039673, "learning_rate": 7.608099603311328e-06, "loss": 0.4366, "step": 2363 }, { "epoch": 1.1788563829787235, "grad_norm": 0.49015364050865173, "learning_rate": 7.605623330404274e-06, "loss": 0.386, "step": 2364 }, { "epoch": 1.1793550531914894, "grad_norm": 0.5244724154472351, "learning_rate": 7.603146179819016e-06, "loss": 0.3793, "step": 2365 }, { "epoch": 1.1798537234042552, "grad_norm": 0.5920262336730957, "learning_rate": 7.600668152389956e-06, "loss": 0.4483, "step": 2366 }, { "epoch": 1.1803523936170213, "grad_norm": 0.47688987851142883, "learning_rate": 7.598189248951796e-06, "loss": 0.3889, "step": 2367 }, { "epoch": 1.1808510638297873, "grad_norm": 0.5336822271347046, "learning_rate": 7.595709470339527e-06, "loss": 0.3669, "step": 2368 }, { "epoch": 1.1813497340425532, "grad_norm": 0.5579977631568909, "learning_rate": 7.59322881738844e-06, "loss": 0.4152, "step": 2369 }, { "epoch": 1.1818484042553192, "grad_norm": 0.498778760433197, "learning_rate": 7.590747290934118e-06, "loss": 0.3828, "step": 2370 }, { "epoch": 1.182347074468085, "grad_norm": 0.5272276401519775, "learning_rate": 7.588264891812438e-06, "loss": 0.3853, "step": 2371 }, { "epoch": 1.1828457446808511, "grad_norm": 0.4861461818218231, "learning_rate": 7.585781620859569e-06, "loss": 0.3666, "step": 2372 }, { "epoch": 1.183344414893617, "grad_norm": 0.5493713617324829, "learning_rate": 7.5832974789119795e-06, "loss": 0.4005, "step": 2373 }, { "epoch": 1.183843085106383, "grad_norm": 0.45943912863731384, "learning_rate": 7.580812466806427e-06, "loss": 0.3442, "step": 2374 }, { "epoch": 1.1843417553191489, "grad_norm": 0.6316801905632019, "learning_rate": 7.5783265853799605e-06, "loss": 0.4578, "step": 2375 }, { "epoch": 1.184840425531915, "grad_norm": 0.5774199366569519, "learning_rate": 7.575839835469929e-06, "loss": 0.3839, "step": 2376 }, { "epoch": 1.1853390957446808, "grad_norm": 0.6118056178092957, "learning_rate": 7.5733522179139655e-06, "loss": 0.4215, "step": 2377 }, { "epoch": 1.1858377659574468, "grad_norm": 0.5116792321205139, "learning_rate": 7.570863733549999e-06, "loss": 0.3593, "step": 2378 }, { "epoch": 1.1863364361702127, "grad_norm": 0.5425112247467041, "learning_rate": 7.568374383216253e-06, "loss": 0.3772, "step": 2379 }, { "epoch": 1.1868351063829787, "grad_norm": 0.5407917499542236, "learning_rate": 7.565884167751238e-06, "loss": 0.3407, "step": 2380 }, { "epoch": 1.1873337765957448, "grad_norm": 0.6117902398109436, "learning_rate": 7.563393087993761e-06, "loss": 0.453, "step": 2381 }, { "epoch": 1.1878324468085106, "grad_norm": 0.4773917496204376, "learning_rate": 7.560901144782915e-06, "loss": 0.4135, "step": 2382 }, { "epoch": 1.1883311170212765, "grad_norm": 0.5058537125587463, "learning_rate": 7.558408338958087e-06, "loss": 0.3578, "step": 2383 }, { "epoch": 1.1888297872340425, "grad_norm": 0.5838953256607056, "learning_rate": 7.555914671358955e-06, "loss": 0.3895, "step": 2384 }, { "epoch": 1.1893284574468086, "grad_norm": 0.4412024915218353, "learning_rate": 7.553420142825485e-06, "loss": 0.3461, "step": 2385 }, { "epoch": 1.1898271276595744, "grad_norm": 0.5709052681922913, "learning_rate": 7.550924754197933e-06, "loss": 0.4211, "step": 2386 }, { "epoch": 1.1903257978723405, "grad_norm": 0.5246788859367371, "learning_rate": 7.548428506316845e-06, "loss": 0.3581, "step": 2387 }, { "epoch": 1.1908244680851063, "grad_norm": 0.628332793712616, "learning_rate": 7.545931400023061e-06, "loss": 0.4851, "step": 2388 }, { "epoch": 1.1913231382978724, "grad_norm": 0.5767329335212708, "learning_rate": 7.543433436157703e-06, "loss": 0.4176, "step": 2389 }, { "epoch": 1.1918218085106382, "grad_norm": 0.5040484666824341, "learning_rate": 7.540934615562185e-06, "loss": 0.3633, "step": 2390 }, { "epoch": 1.1923204787234043, "grad_norm": 0.66192626953125, "learning_rate": 7.538434939078213e-06, "loss": 0.4013, "step": 2391 }, { "epoch": 1.1928191489361701, "grad_norm": 0.5622859597206116, "learning_rate": 7.5359344075477755e-06, "loss": 0.3997, "step": 2392 }, { "epoch": 1.1933178191489362, "grad_norm": 0.5266722440719604, "learning_rate": 7.5334330218131525e-06, "loss": 0.3665, "step": 2393 }, { "epoch": 1.193816489361702, "grad_norm": 0.6420471668243408, "learning_rate": 7.5309307827169095e-06, "loss": 0.4383, "step": 2394 }, { "epoch": 1.194315159574468, "grad_norm": 0.5355828404426575, "learning_rate": 7.528427691101903e-06, "loss": 0.3212, "step": 2395 }, { "epoch": 1.194813829787234, "grad_norm": 0.5621774792671204, "learning_rate": 7.525923747811272e-06, "loss": 0.4605, "step": 2396 }, { "epoch": 1.1953125, "grad_norm": 0.5224775075912476, "learning_rate": 7.523418953688446e-06, "loss": 0.3632, "step": 2397 }, { "epoch": 1.195811170212766, "grad_norm": 0.5363511443138123, "learning_rate": 7.520913309577142e-06, "loss": 0.4159, "step": 2398 }, { "epoch": 1.196309840425532, "grad_norm": 0.6175137758255005, "learning_rate": 7.5184068163213576e-06, "loss": 0.3745, "step": 2399 }, { "epoch": 1.196808510638298, "grad_norm": 0.584683358669281, "learning_rate": 7.515899474765381e-06, "loss": 0.4601, "step": 2400 }, { "epoch": 1.1973071808510638, "grad_norm": 0.4835186004638672, "learning_rate": 7.513391285753786e-06, "loss": 0.3419, "step": 2401 }, { "epoch": 1.1978058510638299, "grad_norm": 0.6120791435241699, "learning_rate": 7.510882250131431e-06, "loss": 0.4628, "step": 2402 }, { "epoch": 1.1983045212765957, "grad_norm": 0.4967234134674072, "learning_rate": 7.508372368743458e-06, "loss": 0.3264, "step": 2403 }, { "epoch": 1.1988031914893618, "grad_norm": 0.608807384967804, "learning_rate": 7.505861642435297e-06, "loss": 0.4034, "step": 2404 }, { "epoch": 1.1993018617021276, "grad_norm": 0.5980563759803772, "learning_rate": 7.503350072052662e-06, "loss": 0.3613, "step": 2405 }, { "epoch": 1.1998005319148937, "grad_norm": 0.5674591064453125, "learning_rate": 7.500837658441548e-06, "loss": 0.4128, "step": 2406 }, { "epoch": 1.2002992021276595, "grad_norm": 0.44529375433921814, "learning_rate": 7.498324402448238e-06, "loss": 0.3236, "step": 2407 }, { "epoch": 1.2007978723404256, "grad_norm": 0.6776750683784485, "learning_rate": 7.495810304919298e-06, "loss": 0.4625, "step": 2408 }, { "epoch": 1.2012965425531914, "grad_norm": 0.5250003337860107, "learning_rate": 7.493295366701575e-06, "loss": 0.3587, "step": 2409 }, { "epoch": 1.2017952127659575, "grad_norm": 0.4988134801387787, "learning_rate": 7.490779588642202e-06, "loss": 0.4259, "step": 2410 }, { "epoch": 1.2022938829787235, "grad_norm": 0.4654931128025055, "learning_rate": 7.488262971588591e-06, "loss": 0.3785, "step": 2411 }, { "epoch": 1.2027925531914894, "grad_norm": 0.5141465067863464, "learning_rate": 7.485745516388443e-06, "loss": 0.3671, "step": 2412 }, { "epoch": 1.2032912234042552, "grad_norm": 0.6147392988204956, "learning_rate": 7.483227223889736e-06, "loss": 0.4587, "step": 2413 }, { "epoch": 1.2037898936170213, "grad_norm": 0.4835354685783386, "learning_rate": 7.480708094940732e-06, "loss": 0.3535, "step": 2414 }, { "epoch": 1.2042885638297873, "grad_norm": 0.5569568872451782, "learning_rate": 7.478188130389973e-06, "loss": 0.4433, "step": 2415 }, { "epoch": 1.2047872340425532, "grad_norm": 0.5980169177055359, "learning_rate": 7.475667331086286e-06, "loss": 0.3758, "step": 2416 }, { "epoch": 1.2052859042553192, "grad_norm": 0.5945726037025452, "learning_rate": 7.473145697878775e-06, "loss": 0.4031, "step": 2417 }, { "epoch": 1.205784574468085, "grad_norm": 0.4676496386528015, "learning_rate": 7.470623231616829e-06, "loss": 0.4143, "step": 2418 }, { "epoch": 1.2062832446808511, "grad_norm": 0.5440909266471863, "learning_rate": 7.468099933150116e-06, "loss": 0.3624, "step": 2419 }, { "epoch": 1.206781914893617, "grad_norm": 0.5931169986724854, "learning_rate": 7.465575803328582e-06, "loss": 0.4135, "step": 2420 }, { "epoch": 1.207280585106383, "grad_norm": 0.5145664811134338, "learning_rate": 7.463050843002453e-06, "loss": 0.4029, "step": 2421 }, { "epoch": 1.2077792553191489, "grad_norm": 0.49539729952812195, "learning_rate": 7.460525053022241e-06, "loss": 0.3812, "step": 2422 }, { "epoch": 1.208277925531915, "grad_norm": 0.5223681926727295, "learning_rate": 7.457998434238732e-06, "loss": 0.339, "step": 2423 }, { "epoch": 1.2087765957446808, "grad_norm": 0.5151605010032654, "learning_rate": 7.455470987502991e-06, "loss": 0.4378, "step": 2424 }, { "epoch": 1.2092752659574468, "grad_norm": 0.4755402207374573, "learning_rate": 7.452942713666362e-06, "loss": 0.3683, "step": 2425 }, { "epoch": 1.2097739361702127, "grad_norm": 0.5391697287559509, "learning_rate": 7.450413613580474e-06, "loss": 0.4119, "step": 2426 }, { "epoch": 1.2102726063829787, "grad_norm": 0.525365948677063, "learning_rate": 7.447883688097224e-06, "loss": 0.3932, "step": 2427 }, { "epoch": 1.2107712765957448, "grad_norm": 0.5165908932685852, "learning_rate": 7.4453529380687935e-06, "loss": 0.3524, "step": 2428 }, { "epoch": 1.2112699468085106, "grad_norm": 0.5484421253204346, "learning_rate": 7.442821364347641e-06, "loss": 0.3692, "step": 2429 }, { "epoch": 1.2117686170212765, "grad_norm": 0.5161662101745605, "learning_rate": 7.440288967786503e-06, "loss": 0.4214, "step": 2430 }, { "epoch": 1.2122672872340425, "grad_norm": 0.5141539573669434, "learning_rate": 7.43775574923839e-06, "loss": 0.4061, "step": 2431 }, { "epoch": 1.2127659574468086, "grad_norm": 0.4401189088821411, "learning_rate": 7.4352217095565916e-06, "loss": 0.3846, "step": 2432 }, { "epoch": 1.2132646276595744, "grad_norm": 0.5341564416885376, "learning_rate": 7.432686849594675e-06, "loss": 0.4159, "step": 2433 }, { "epoch": 1.2137632978723405, "grad_norm": 0.4744667112827301, "learning_rate": 7.43015117020648e-06, "loss": 0.3458, "step": 2434 }, { "epoch": 1.2142619680851063, "grad_norm": 0.4689410924911499, "learning_rate": 7.427614672246127e-06, "loss": 0.3916, "step": 2435 }, { "epoch": 1.2147606382978724, "grad_norm": 0.4988466799259186, "learning_rate": 7.42507735656801e-06, "loss": 0.369, "step": 2436 }, { "epoch": 1.2152593085106382, "grad_norm": 0.4822193682193756, "learning_rate": 7.422539224026797e-06, "loss": 0.3744, "step": 2437 }, { "epoch": 1.2157579787234043, "grad_norm": 0.5197915434837341, "learning_rate": 7.420000275477435e-06, "loss": 0.4785, "step": 2438 }, { "epoch": 1.2162566489361701, "grad_norm": 0.4557332694530487, "learning_rate": 7.417460511775141e-06, "loss": 0.3476, "step": 2439 }, { "epoch": 1.2167553191489362, "grad_norm": 0.5645822882652283, "learning_rate": 7.4149199337754105e-06, "loss": 0.4243, "step": 2440 }, { "epoch": 1.217253989361702, "grad_norm": 0.4526466131210327, "learning_rate": 7.412378542334011e-06, "loss": 0.3417, "step": 2441 }, { "epoch": 1.217752659574468, "grad_norm": 0.48428159952163696, "learning_rate": 7.409836338306986e-06, "loss": 0.3609, "step": 2442 }, { "epoch": 1.218251329787234, "grad_norm": 0.6034298539161682, "learning_rate": 7.4072933225506505e-06, "loss": 0.4511, "step": 2443 }, { "epoch": 1.21875, "grad_norm": 0.5103329420089722, "learning_rate": 7.404749495921594e-06, "loss": 0.417, "step": 2444 }, { "epoch": 1.219248670212766, "grad_norm": 0.5215536952018738, "learning_rate": 7.402204859276679e-06, "loss": 0.368, "step": 2445 }, { "epoch": 1.219747340425532, "grad_norm": 0.45633506774902344, "learning_rate": 7.399659413473043e-06, "loss": 0.3753, "step": 2446 }, { "epoch": 1.220246010638298, "grad_norm": 0.47350645065307617, "learning_rate": 7.39711315936809e-06, "loss": 0.3735, "step": 2447 }, { "epoch": 1.2207446808510638, "grad_norm": 0.5931663513183594, "learning_rate": 7.394566097819504e-06, "loss": 0.3541, "step": 2448 }, { "epoch": 1.2212433510638299, "grad_norm": 0.5106021165847778, "learning_rate": 7.392018229685236e-06, "loss": 0.4014, "step": 2449 }, { "epoch": 1.2217420212765957, "grad_norm": 0.5234706401824951, "learning_rate": 7.38946955582351e-06, "loss": 0.3795, "step": 2450 }, { "epoch": 1.2222406914893618, "grad_norm": 0.5277416706085205, "learning_rate": 7.386920077092821e-06, "loss": 0.3963, "step": 2451 }, { "epoch": 1.2227393617021276, "grad_norm": 0.5684831738471985, "learning_rate": 7.384369794351936e-06, "loss": 0.4128, "step": 2452 }, { "epoch": 1.2232380319148937, "grad_norm": 0.46902599930763245, "learning_rate": 7.3818187084598905e-06, "loss": 0.3394, "step": 2453 }, { "epoch": 1.2237367021276595, "grad_norm": 0.4561915695667267, "learning_rate": 7.379266820275993e-06, "loss": 0.4091, "step": 2454 }, { "epoch": 1.2242353723404256, "grad_norm": 0.5560210943222046, "learning_rate": 7.376714130659824e-06, "loss": 0.4423, "step": 2455 }, { "epoch": 1.2247340425531914, "grad_norm": 0.480296790599823, "learning_rate": 7.374160640471228e-06, "loss": 0.3532, "step": 2456 }, { "epoch": 1.2252327127659575, "grad_norm": 0.5120042562484741, "learning_rate": 7.371606350570326e-06, "loss": 0.3828, "step": 2457 }, { "epoch": 1.2257313829787235, "grad_norm": 0.5611795783042908, "learning_rate": 7.369051261817503e-06, "loss": 0.407, "step": 2458 }, { "epoch": 1.2262300531914894, "grad_norm": 0.5495477318763733, "learning_rate": 7.366495375073417e-06, "loss": 0.3888, "step": 2459 }, { "epoch": 1.2267287234042552, "grad_norm": 0.5567547678947449, "learning_rate": 7.363938691198988e-06, "loss": 0.3947, "step": 2460 }, { "epoch": 1.2272273936170213, "grad_norm": 0.4972410500049591, "learning_rate": 7.361381211055416e-06, "loss": 0.3716, "step": 2461 }, { "epoch": 1.2277260638297873, "grad_norm": 0.535511314868927, "learning_rate": 7.3588229355041596e-06, "loss": 0.4172, "step": 2462 }, { "epoch": 1.2282247340425532, "grad_norm": 0.5498426556587219, "learning_rate": 7.3562638654069475e-06, "loss": 0.3748, "step": 2463 }, { "epoch": 1.2287234042553192, "grad_norm": 0.5115609169006348, "learning_rate": 7.353704001625778e-06, "loss": 0.4064, "step": 2464 }, { "epoch": 1.229222074468085, "grad_norm": 0.5806992650032043, "learning_rate": 7.3511433450229154e-06, "loss": 0.4016, "step": 2465 }, { "epoch": 1.2297207446808511, "grad_norm": 0.46468108892440796, "learning_rate": 7.348581896460892e-06, "loss": 0.3378, "step": 2466 }, { "epoch": 1.230219414893617, "grad_norm": 0.5258749723434448, "learning_rate": 7.346019656802504e-06, "loss": 0.4269, "step": 2467 }, { "epoch": 1.230718085106383, "grad_norm": 0.4524332284927368, "learning_rate": 7.343456626910819e-06, "loss": 0.3955, "step": 2468 }, { "epoch": 1.2312167553191489, "grad_norm": 0.5054327249526978, "learning_rate": 7.340892807649167e-06, "loss": 0.3966, "step": 2469 }, { "epoch": 1.231715425531915, "grad_norm": 0.5612490177154541, "learning_rate": 7.338328199881144e-06, "loss": 0.4108, "step": 2470 }, { "epoch": 1.2322140957446808, "grad_norm": 0.5165700912475586, "learning_rate": 7.335762804470613e-06, "loss": 0.3903, "step": 2471 }, { "epoch": 1.2327127659574468, "grad_norm": 0.42891836166381836, "learning_rate": 7.333196622281703e-06, "loss": 0.3803, "step": 2472 }, { "epoch": 1.2332114361702127, "grad_norm": 0.5208513140678406, "learning_rate": 7.330629654178807e-06, "loss": 0.4081, "step": 2473 }, { "epoch": 1.2337101063829787, "grad_norm": 0.4826310873031616, "learning_rate": 7.32806190102658e-06, "loss": 0.3797, "step": 2474 }, { "epoch": 1.2342087765957448, "grad_norm": 0.658449113368988, "learning_rate": 7.325493363689946e-06, "loss": 0.4284, "step": 2475 }, { "epoch": 1.2347074468085106, "grad_norm": 0.49155789613723755, "learning_rate": 7.322924043034089e-06, "loss": 0.3997, "step": 2476 }, { "epoch": 1.2352061170212765, "grad_norm": 0.5845656394958496, "learning_rate": 7.320353939924463e-06, "loss": 0.3664, "step": 2477 }, { "epoch": 1.2357047872340425, "grad_norm": 0.6149432063102722, "learning_rate": 7.317783055226777e-06, "loss": 0.3503, "step": 2478 }, { "epoch": 1.2362034574468086, "grad_norm": 0.5802918672561646, "learning_rate": 7.315211389807012e-06, "loss": 0.3663, "step": 2479 }, { "epoch": 1.2367021276595744, "grad_norm": 0.5260826945304871, "learning_rate": 7.312638944531405e-06, "loss": 0.3834, "step": 2480 }, { "epoch": 1.2372007978723405, "grad_norm": 0.6135730743408203, "learning_rate": 7.31006572026646e-06, "loss": 0.4109, "step": 2481 }, { "epoch": 1.2376994680851063, "grad_norm": 0.6645888090133667, "learning_rate": 7.307491717878942e-06, "loss": 0.3505, "step": 2482 }, { "epoch": 1.2381981382978724, "grad_norm": 0.5103488564491272, "learning_rate": 7.304916938235876e-06, "loss": 0.3947, "step": 2483 }, { "epoch": 1.2386968085106382, "grad_norm": 0.4425683617591858, "learning_rate": 7.302341382204554e-06, "loss": 0.3516, "step": 2484 }, { "epoch": 1.2391954787234043, "grad_norm": 0.543150782585144, "learning_rate": 7.299765050652523e-06, "loss": 0.4084, "step": 2485 }, { "epoch": 1.2396941489361701, "grad_norm": 0.5410576462745667, "learning_rate": 7.2971879444475964e-06, "loss": 0.362, "step": 2486 }, { "epoch": 1.2401928191489362, "grad_norm": 0.4989149868488312, "learning_rate": 7.294610064457846e-06, "loss": 0.35, "step": 2487 }, { "epoch": 1.240691489361702, "grad_norm": 0.4894559383392334, "learning_rate": 7.292031411551607e-06, "loss": 0.384, "step": 2488 }, { "epoch": 1.241190159574468, "grad_norm": 0.5268387794494629, "learning_rate": 7.289451986597469e-06, "loss": 0.3927, "step": 2489 }, { "epoch": 1.241688829787234, "grad_norm": 0.5861169695854187, "learning_rate": 7.286871790464289e-06, "loss": 0.4173, "step": 2490 }, { "epoch": 1.2421875, "grad_norm": 0.4851814806461334, "learning_rate": 7.284290824021179e-06, "loss": 0.3496, "step": 2491 }, { "epoch": 1.242686170212766, "grad_norm": 0.506546139717102, "learning_rate": 7.281709088137511e-06, "loss": 0.4224, "step": 2492 }, { "epoch": 1.243184840425532, "grad_norm": 0.4618370831012726, "learning_rate": 7.2791265836829185e-06, "loss": 0.3807, "step": 2493 }, { "epoch": 1.243683510638298, "grad_norm": 0.5226013660430908, "learning_rate": 7.276543311527291e-06, "loss": 0.408, "step": 2494 }, { "epoch": 1.2441821808510638, "grad_norm": 0.5067238211631775, "learning_rate": 7.2739592725407795e-06, "loss": 0.3495, "step": 2495 }, { "epoch": 1.2446808510638299, "grad_norm": 0.49740540981292725, "learning_rate": 7.271374467593791e-06, "loss": 0.3844, "step": 2496 }, { "epoch": 1.2451795212765957, "grad_norm": 0.6064431667327881, "learning_rate": 7.268788897556989e-06, "loss": 0.4398, "step": 2497 }, { "epoch": 1.2456781914893618, "grad_norm": 0.4635569453239441, "learning_rate": 7.2662025633013e-06, "loss": 0.389, "step": 2498 }, { "epoch": 1.2461768617021276, "grad_norm": 0.5058096647262573, "learning_rate": 7.263615465697904e-06, "loss": 0.3874, "step": 2499 }, { "epoch": 1.2466755319148937, "grad_norm": 0.47291818261146545, "learning_rate": 7.261027605618238e-06, "loss": 0.3909, "step": 2500 }, { "epoch": 1.2471742021276595, "grad_norm": 0.5051248073577881, "learning_rate": 7.258438983933999e-06, "loss": 0.3744, "step": 2501 }, { "epoch": 1.2476728723404256, "grad_norm": 0.5855334997177124, "learning_rate": 7.255849601517136e-06, "loss": 0.4101, "step": 2502 }, { "epoch": 1.2481715425531914, "grad_norm": 0.4871473014354706, "learning_rate": 7.253259459239859e-06, "loss": 0.3828, "step": 2503 }, { "epoch": 1.2486702127659575, "grad_norm": 0.5501198768615723, "learning_rate": 7.250668557974629e-06, "loss": 0.4173, "step": 2504 }, { "epoch": 1.2491688829787235, "grad_norm": 0.43567955493927, "learning_rate": 7.248076898594167e-06, "loss": 0.3006, "step": 2505 }, { "epoch": 1.2496675531914894, "grad_norm": 0.5351576805114746, "learning_rate": 7.245484481971448e-06, "loss": 0.4382, "step": 2506 }, { "epoch": 1.2501662234042552, "grad_norm": 0.5575712323188782, "learning_rate": 7.2428913089796994e-06, "loss": 0.4591, "step": 2507 }, { "epoch": 1.2506648936170213, "grad_norm": 0.4388269782066345, "learning_rate": 7.240297380492407e-06, "loss": 0.3825, "step": 2508 }, { "epoch": 1.2511635638297873, "grad_norm": 0.4990021288394928, "learning_rate": 7.237702697383309e-06, "loss": 0.3599, "step": 2509 }, { "epoch": 1.2516622340425532, "grad_norm": 0.5128387808799744, "learning_rate": 7.2351072605264e-06, "loss": 0.4509, "step": 2510 }, { "epoch": 1.2521609042553192, "grad_norm": 0.5540261268615723, "learning_rate": 7.232511070795926e-06, "loss": 0.3954, "step": 2511 }, { "epoch": 1.252659574468085, "grad_norm": 0.43615397810935974, "learning_rate": 7.229914129066388e-06, "loss": 0.3742, "step": 2512 }, { "epoch": 1.2531582446808511, "grad_norm": 0.5031366348266602, "learning_rate": 7.227316436212539e-06, "loss": 0.3751, "step": 2513 }, { "epoch": 1.253656914893617, "grad_norm": 0.47168830037117004, "learning_rate": 7.224717993109386e-06, "loss": 0.3822, "step": 2514 }, { "epoch": 1.254155585106383, "grad_norm": 0.553049623966217, "learning_rate": 7.222118800632189e-06, "loss": 0.4264, "step": 2515 }, { "epoch": 1.2546542553191489, "grad_norm": 0.5044378042221069, "learning_rate": 7.219518859656462e-06, "loss": 0.3833, "step": 2516 }, { "epoch": 1.255152925531915, "grad_norm": 0.5617294311523438, "learning_rate": 7.216918171057966e-06, "loss": 0.4154, "step": 2517 }, { "epoch": 1.2556515957446808, "grad_norm": 0.40011367201805115, "learning_rate": 7.214316735712719e-06, "loss": 0.3521, "step": 2518 }, { "epoch": 1.2561502659574468, "grad_norm": 0.46087348461151123, "learning_rate": 7.211714554496987e-06, "loss": 0.4266, "step": 2519 }, { "epoch": 1.2566489361702127, "grad_norm": 0.4532568156719208, "learning_rate": 7.209111628287289e-06, "loss": 0.3549, "step": 2520 }, { "epoch": 1.2571476063829787, "grad_norm": 0.4956798553466797, "learning_rate": 7.206507957960397e-06, "loss": 0.3897, "step": 2521 }, { "epoch": 1.2576462765957448, "grad_norm": 0.4586049020290375, "learning_rate": 7.203903544393329e-06, "loss": 0.378, "step": 2522 }, { "epoch": 1.2581449468085106, "grad_norm": 0.5898687243461609, "learning_rate": 7.2012983884633555e-06, "loss": 0.4828, "step": 2523 }, { "epoch": 1.2586436170212765, "grad_norm": 0.43577253818511963, "learning_rate": 7.198692491047999e-06, "loss": 0.3409, "step": 2524 }, { "epoch": 1.2591422872340425, "grad_norm": 0.518892765045166, "learning_rate": 7.19608585302503e-06, "loss": 0.3721, "step": 2525 }, { "epoch": 1.2596409574468086, "grad_norm": 0.5531961917877197, "learning_rate": 7.1934784752724666e-06, "loss": 0.4036, "step": 2526 }, { "epoch": 1.2601396276595744, "grad_norm": 0.49896344542503357, "learning_rate": 7.19087035866858e-06, "loss": 0.3712, "step": 2527 }, { "epoch": 1.2606382978723405, "grad_norm": 0.5180708169937134, "learning_rate": 7.188261504091886e-06, "loss": 0.394, "step": 2528 }, { "epoch": 1.2611369680851063, "grad_norm": 0.5703560709953308, "learning_rate": 7.185651912421152e-06, "loss": 0.4622, "step": 2529 }, { "epoch": 1.2616356382978724, "grad_norm": 0.4341517388820648, "learning_rate": 7.183041584535395e-06, "loss": 0.3638, "step": 2530 }, { "epoch": 1.2621343085106382, "grad_norm": 0.45214778184890747, "learning_rate": 7.180430521313875e-06, "loss": 0.3551, "step": 2531 }, { "epoch": 1.2626329787234043, "grad_norm": 0.5246992111206055, "learning_rate": 7.177818723636103e-06, "loss": 0.3915, "step": 2532 }, { "epoch": 1.2631316489361701, "grad_norm": 0.49411121010780334, "learning_rate": 7.175206192381838e-06, "loss": 0.3811, "step": 2533 }, { "epoch": 1.2636303191489362, "grad_norm": 0.4632044732570648, "learning_rate": 7.172592928431087e-06, "loss": 0.3676, "step": 2534 }, { "epoch": 1.2641289893617023, "grad_norm": 0.46390724182128906, "learning_rate": 7.169978932664098e-06, "loss": 0.442, "step": 2535 }, { "epoch": 1.264627659574468, "grad_norm": 0.47734329104423523, "learning_rate": 7.167364205961371e-06, "loss": 0.3438, "step": 2536 }, { "epoch": 1.265126329787234, "grad_norm": 0.46559998393058777, "learning_rate": 7.164748749203653e-06, "loss": 0.3769, "step": 2537 }, { "epoch": 1.265625, "grad_norm": 0.5477844476699829, "learning_rate": 7.162132563271931e-06, "loss": 0.4559, "step": 2538 }, { "epoch": 1.266123670212766, "grad_norm": 0.5213722586631775, "learning_rate": 7.159515649047442e-06, "loss": 0.3949, "step": 2539 }, { "epoch": 1.266622340425532, "grad_norm": 0.46531349420547485, "learning_rate": 7.156898007411666e-06, "loss": 0.422, "step": 2540 }, { "epoch": 1.2671210106382977, "grad_norm": 0.5392589569091797, "learning_rate": 7.154279639246333e-06, "loss": 0.4097, "step": 2541 }, { "epoch": 1.2676196808510638, "grad_norm": 0.4574038088321686, "learning_rate": 7.151660545433412e-06, "loss": 0.3801, "step": 2542 }, { "epoch": 1.2681183510638299, "grad_norm": 0.4694654643535614, "learning_rate": 7.149040726855119e-06, "loss": 0.3939, "step": 2543 }, { "epoch": 1.2686170212765957, "grad_norm": 0.48495256900787354, "learning_rate": 7.146420184393913e-06, "loss": 0.397, "step": 2544 }, { "epoch": 1.2691156914893618, "grad_norm": 0.49752581119537354, "learning_rate": 7.143798918932498e-06, "loss": 0.401, "step": 2545 }, { "epoch": 1.2696143617021276, "grad_norm": 0.4955989122390747, "learning_rate": 7.141176931353824e-06, "loss": 0.3937, "step": 2546 }, { "epoch": 1.2701130319148937, "grad_norm": 0.5080209970474243, "learning_rate": 7.138554222541076e-06, "loss": 0.376, "step": 2547 }, { "epoch": 1.2706117021276595, "grad_norm": 0.5192602276802063, "learning_rate": 7.135930793377691e-06, "loss": 0.3952, "step": 2548 }, { "epoch": 1.2711103723404256, "grad_norm": 0.464322566986084, "learning_rate": 7.133306644747343e-06, "loss": 0.4304, "step": 2549 }, { "epoch": 1.2716090425531914, "grad_norm": 0.5567820072174072, "learning_rate": 7.130681777533952e-06, "loss": 0.3796, "step": 2550 }, { "epoch": 1.2721077127659575, "grad_norm": 0.5275539755821228, "learning_rate": 7.128056192621677e-06, "loss": 0.3559, "step": 2551 }, { "epoch": 1.2726063829787235, "grad_norm": 0.49094629287719727, "learning_rate": 7.125429890894922e-06, "loss": 0.3832, "step": 2552 }, { "epoch": 1.2731050531914894, "grad_norm": 0.5130817890167236, "learning_rate": 7.122802873238327e-06, "loss": 0.3624, "step": 2553 }, { "epoch": 1.2736037234042552, "grad_norm": 0.49832484126091003, "learning_rate": 7.120175140536779e-06, "loss": 0.3973, "step": 2554 }, { "epoch": 1.2741023936170213, "grad_norm": 0.5167518258094788, "learning_rate": 7.117546693675404e-06, "loss": 0.3894, "step": 2555 }, { "epoch": 1.2746010638297873, "grad_norm": 0.5068657398223877, "learning_rate": 7.114917533539567e-06, "loss": 0.3477, "step": 2556 }, { "epoch": 1.2750997340425532, "grad_norm": 0.5776275396347046, "learning_rate": 7.112287661014875e-06, "loss": 0.4314, "step": 2557 }, { "epoch": 1.2755984042553192, "grad_norm": 0.4798465967178345, "learning_rate": 7.109657076987173e-06, "loss": 0.3731, "step": 2558 }, { "epoch": 1.276097074468085, "grad_norm": 0.5218187570571899, "learning_rate": 7.107025782342549e-06, "loss": 0.3311, "step": 2559 }, { "epoch": 1.2765957446808511, "grad_norm": 0.6218535304069519, "learning_rate": 7.104393777967328e-06, "loss": 0.4437, "step": 2560 }, { "epoch": 1.277094414893617, "grad_norm": 0.48659297823905945, "learning_rate": 7.101761064748072e-06, "loss": 0.3704, "step": 2561 }, { "epoch": 1.277593085106383, "grad_norm": 0.47123900055885315, "learning_rate": 7.099127643571586e-06, "loss": 0.3614, "step": 2562 }, { "epoch": 1.2780917553191489, "grad_norm": 0.5338267683982849, "learning_rate": 7.096493515324912e-06, "loss": 0.3858, "step": 2563 }, { "epoch": 1.278590425531915, "grad_norm": 0.532927393913269, "learning_rate": 7.093858680895329e-06, "loss": 0.3962, "step": 2564 }, { "epoch": 1.2790890957446808, "grad_norm": 0.47333863377571106, "learning_rate": 7.091223141170354e-06, "loss": 0.3617, "step": 2565 }, { "epoch": 1.2795877659574468, "grad_norm": 0.49034789204597473, "learning_rate": 7.088586897037745e-06, "loss": 0.3791, "step": 2566 }, { "epoch": 1.2800864361702127, "grad_norm": 0.549291729927063, "learning_rate": 7.0859499493854915e-06, "loss": 0.4313, "step": 2567 }, { "epoch": 1.2805851063829787, "grad_norm": 0.4439654052257538, "learning_rate": 7.083312299101823e-06, "loss": 0.289, "step": 2568 }, { "epoch": 1.2810837765957448, "grad_norm": 0.5449049472808838, "learning_rate": 7.080673947075208e-06, "loss": 0.4321, "step": 2569 }, { "epoch": 1.2815824468085106, "grad_norm": 0.5500699281692505, "learning_rate": 7.078034894194349e-06, "loss": 0.4145, "step": 2570 }, { "epoch": 1.2820811170212765, "grad_norm": 0.4278993010520935, "learning_rate": 7.075395141348182e-06, "loss": 0.3363, "step": 2571 }, { "epoch": 1.2825797872340425, "grad_norm": 0.5069887042045593, "learning_rate": 7.072754689425884e-06, "loss": 0.3999, "step": 2572 }, { "epoch": 1.2830784574468086, "grad_norm": 0.5029987692832947, "learning_rate": 7.070113539316863e-06, "loss": 0.4246, "step": 2573 }, { "epoch": 1.2835771276595744, "grad_norm": 0.48560482263565063, "learning_rate": 7.067471691910763e-06, "loss": 0.3695, "step": 2574 }, { "epoch": 1.2840757978723405, "grad_norm": 0.4975932836532593, "learning_rate": 7.064829148097468e-06, "loss": 0.3473, "step": 2575 }, { "epoch": 1.2845744680851063, "grad_norm": 0.47243520617485046, "learning_rate": 7.06218590876709e-06, "loss": 0.3937, "step": 2576 }, { "epoch": 1.2850731382978724, "grad_norm": 0.4825849235057831, "learning_rate": 7.059541974809978e-06, "loss": 0.3945, "step": 2577 }, { "epoch": 1.2855718085106382, "grad_norm": 0.4803483486175537, "learning_rate": 7.0568973471167135e-06, "loss": 0.3685, "step": 2578 }, { "epoch": 1.2860704787234043, "grad_norm": 0.504999577999115, "learning_rate": 7.054252026578116e-06, "loss": 0.364, "step": 2579 }, { "epoch": 1.2865691489361701, "grad_norm": 0.5010576844215393, "learning_rate": 7.051606014085233e-06, "loss": 0.3928, "step": 2580 }, { "epoch": 1.2870678191489362, "grad_norm": 0.5038896203041077, "learning_rate": 7.048959310529347e-06, "loss": 0.3933, "step": 2581 }, { "epoch": 1.2875664893617023, "grad_norm": 0.506787121295929, "learning_rate": 7.046311916801975e-06, "loss": 0.348, "step": 2582 }, { "epoch": 1.288065159574468, "grad_norm": 0.5016459822654724, "learning_rate": 7.043663833794865e-06, "loss": 0.3987, "step": 2583 }, { "epoch": 1.288563829787234, "grad_norm": 0.5518744587898254, "learning_rate": 7.041015062399996e-06, "loss": 0.3796, "step": 2584 }, { "epoch": 1.2890625, "grad_norm": 0.5704479813575745, "learning_rate": 7.038365603509582e-06, "loss": 0.3885, "step": 2585 }, { "epoch": 1.289561170212766, "grad_norm": 0.46974217891693115, "learning_rate": 7.035715458016066e-06, "loss": 0.3571, "step": 2586 }, { "epoch": 1.290059840425532, "grad_norm": 0.5196946263313293, "learning_rate": 7.033064626812121e-06, "loss": 0.3839, "step": 2587 }, { "epoch": 1.2905585106382977, "grad_norm": 0.5639484524726868, "learning_rate": 7.030413110790657e-06, "loss": 0.4079, "step": 2588 }, { "epoch": 1.2910571808510638, "grad_norm": 0.46215060353279114, "learning_rate": 7.027760910844808e-06, "loss": 0.3765, "step": 2589 }, { "epoch": 1.2915558510638299, "grad_norm": 0.5369568467140198, "learning_rate": 7.025108027867941e-06, "loss": 0.3799, "step": 2590 }, { "epoch": 1.2920545212765957, "grad_norm": 0.5377950668334961, "learning_rate": 7.022454462753658e-06, "loss": 0.3964, "step": 2591 }, { "epoch": 1.2925531914893618, "grad_norm": 0.48269030451774597, "learning_rate": 7.019800216395779e-06, "loss": 0.3701, "step": 2592 }, { "epoch": 1.2930518617021276, "grad_norm": 0.5446078181266785, "learning_rate": 7.017145289688363e-06, "loss": 0.418, "step": 2593 }, { "epoch": 1.2935505319148937, "grad_norm": 0.4939419627189636, "learning_rate": 7.014489683525696e-06, "loss": 0.3774, "step": 2594 }, { "epoch": 1.2940492021276595, "grad_norm": 0.5017947554588318, "learning_rate": 7.011833398802292e-06, "loss": 0.4083, "step": 2595 }, { "epoch": 1.2945478723404256, "grad_norm": 0.4953480660915375, "learning_rate": 7.009176436412896e-06, "loss": 0.3695, "step": 2596 }, { "epoch": 1.2950465425531914, "grad_norm": 0.4672160744667053, "learning_rate": 7.006518797252475e-06, "loss": 0.4095, "step": 2597 }, { "epoch": 1.2955452127659575, "grad_norm": 0.4819324016571045, "learning_rate": 7.003860482216232e-06, "loss": 0.3759, "step": 2598 }, { "epoch": 1.2960438829787235, "grad_norm": 0.4879697859287262, "learning_rate": 7.001201492199592e-06, "loss": 0.4384, "step": 2599 }, { "epoch": 1.2965425531914894, "grad_norm": 0.48428547382354736, "learning_rate": 6.998541828098211e-06, "loss": 0.3728, "step": 2600 }, { "epoch": 1.2970412234042552, "grad_norm": 0.5178827047348022, "learning_rate": 6.9958814908079676e-06, "loss": 0.3913, "step": 2601 }, { "epoch": 1.2975398936170213, "grad_norm": 0.4749337434768677, "learning_rate": 6.993220481224974e-06, "loss": 0.3342, "step": 2602 }, { "epoch": 1.2980385638297873, "grad_norm": 0.4729580581188202, "learning_rate": 6.990558800245559e-06, "loss": 0.344, "step": 2603 }, { "epoch": 1.2985372340425532, "grad_norm": 0.5356308221817017, "learning_rate": 6.987896448766287e-06, "loss": 0.4185, "step": 2604 }, { "epoch": 1.2990359042553192, "grad_norm": 0.5065489411354065, "learning_rate": 6.985233427683944e-06, "loss": 0.4103, "step": 2605 }, { "epoch": 1.299534574468085, "grad_norm": 0.6362532377243042, "learning_rate": 6.982569737895542e-06, "loss": 0.4603, "step": 2606 }, { "epoch": 1.3000332446808511, "grad_norm": 0.5284350514411926, "learning_rate": 6.9799053802983186e-06, "loss": 0.3353, "step": 2607 }, { "epoch": 1.300531914893617, "grad_norm": 0.492896169424057, "learning_rate": 6.977240355789736e-06, "loss": 0.3808, "step": 2608 }, { "epoch": 1.301030585106383, "grad_norm": 0.4509643018245697, "learning_rate": 6.9745746652674795e-06, "loss": 0.3566, "step": 2609 }, { "epoch": 1.3015292553191489, "grad_norm": 0.5628342628479004, "learning_rate": 6.971908309629463e-06, "loss": 0.394, "step": 2610 }, { "epoch": 1.302027925531915, "grad_norm": 0.5812793374061584, "learning_rate": 6.969241289773819e-06, "loss": 0.3794, "step": 2611 }, { "epoch": 1.3025265957446808, "grad_norm": 0.44402316212654114, "learning_rate": 6.96657360659891e-06, "loss": 0.3794, "step": 2612 }, { "epoch": 1.3030252659574468, "grad_norm": 0.4680258333683014, "learning_rate": 6.963905261003315e-06, "loss": 0.3676, "step": 2613 }, { "epoch": 1.3035239361702127, "grad_norm": 0.5320431590080261, "learning_rate": 6.961236253885841e-06, "loss": 0.4221, "step": 2614 }, { "epoch": 1.3040226063829787, "grad_norm": 0.4725250005722046, "learning_rate": 6.958566586145516e-06, "loss": 0.3964, "step": 2615 }, { "epoch": 1.3045212765957448, "grad_norm": 0.4285363256931305, "learning_rate": 6.955896258681592e-06, "loss": 0.291, "step": 2616 }, { "epoch": 1.3050199468085106, "grad_norm": 0.6669567227363586, "learning_rate": 6.95322527239354e-06, "loss": 0.4963, "step": 2617 }, { "epoch": 1.3055186170212765, "grad_norm": 0.521815299987793, "learning_rate": 6.950553628181058e-06, "loss": 0.437, "step": 2618 }, { "epoch": 1.3060172872340425, "grad_norm": 0.48539999127388, "learning_rate": 6.94788132694406e-06, "loss": 0.4131, "step": 2619 }, { "epoch": 1.3065159574468086, "grad_norm": 0.4542159140110016, "learning_rate": 6.945208369582685e-06, "loss": 0.3382, "step": 2620 }, { "epoch": 1.3070146276595744, "grad_norm": 0.553623378276825, "learning_rate": 6.9425347569972935e-06, "loss": 0.387, "step": 2621 }, { "epoch": 1.3075132978723405, "grad_norm": 0.4516220986843109, "learning_rate": 6.939860490088462e-06, "loss": 0.3148, "step": 2622 }, { "epoch": 1.3080119680851063, "grad_norm": 0.5389663577079773, "learning_rate": 6.937185569756993e-06, "loss": 0.4047, "step": 2623 }, { "epoch": 1.3085106382978724, "grad_norm": 0.48725903034210205, "learning_rate": 6.9345099969039075e-06, "loss": 0.3697, "step": 2624 }, { "epoch": 1.3090093085106382, "grad_norm": 0.4306032955646515, "learning_rate": 6.931833772430443e-06, "loss": 0.3841, "step": 2625 }, { "epoch": 1.3095079787234043, "grad_norm": 0.48465555906295776, "learning_rate": 6.929156897238061e-06, "loss": 0.3874, "step": 2626 }, { "epoch": 1.3100066489361701, "grad_norm": 0.5391957759857178, "learning_rate": 6.926479372228437e-06, "loss": 0.3996, "step": 2627 }, { "epoch": 1.3105053191489362, "grad_norm": 0.47909724712371826, "learning_rate": 6.923801198303472e-06, "loss": 0.4046, "step": 2628 }, { "epoch": 1.3110039893617023, "grad_norm": 0.45776742696762085, "learning_rate": 6.9211223763652825e-06, "loss": 0.389, "step": 2629 }, { "epoch": 1.311502659574468, "grad_norm": 0.4850049614906311, "learning_rate": 6.9184429073162e-06, "loss": 0.378, "step": 2630 }, { "epoch": 1.312001329787234, "grad_norm": 0.47021162509918213, "learning_rate": 6.915762792058779e-06, "loss": 0.4002, "step": 2631 }, { "epoch": 1.3125, "grad_norm": 0.47732287645339966, "learning_rate": 6.913082031495789e-06, "loss": 0.3967, "step": 2632 }, { "epoch": 1.312998670212766, "grad_norm": 0.44748562574386597, "learning_rate": 6.910400626530217e-06, "loss": 0.3195, "step": 2633 }, { "epoch": 1.313497340425532, "grad_norm": 0.6233523488044739, "learning_rate": 6.907718578065269e-06, "loss": 0.4833, "step": 2634 }, { "epoch": 1.3139960106382977, "grad_norm": 0.44651567935943604, "learning_rate": 6.905035887004363e-06, "loss": 0.341, "step": 2635 }, { "epoch": 1.3144946808510638, "grad_norm": 0.5440595149993896, "learning_rate": 6.902352554251141e-06, "loss": 0.405, "step": 2636 }, { "epoch": 1.3149933510638299, "grad_norm": 0.5148130059242249, "learning_rate": 6.899668580709453e-06, "loss": 0.3978, "step": 2637 }, { "epoch": 1.3154920212765957, "grad_norm": 0.4795103669166565, "learning_rate": 6.896983967283371e-06, "loss": 0.3273, "step": 2638 }, { "epoch": 1.3159906914893618, "grad_norm": 0.6270907521247864, "learning_rate": 6.894298714877179e-06, "loss": 0.46, "step": 2639 }, { "epoch": 1.3164893617021276, "grad_norm": 0.5539616346359253, "learning_rate": 6.891612824395377e-06, "loss": 0.3948, "step": 2640 }, { "epoch": 1.3169880319148937, "grad_norm": 0.4718599319458008, "learning_rate": 6.888926296742683e-06, "loss": 0.3504, "step": 2641 }, { "epoch": 1.3174867021276595, "grad_norm": 0.4652242958545685, "learning_rate": 6.886239132824025e-06, "loss": 0.4008, "step": 2642 }, { "epoch": 1.3179853723404256, "grad_norm": 0.5131797194480896, "learning_rate": 6.883551333544547e-06, "loss": 0.4022, "step": 2643 }, { "epoch": 1.3184840425531914, "grad_norm": 0.4797559082508087, "learning_rate": 6.880862899809609e-06, "loss": 0.3699, "step": 2644 }, { "epoch": 1.3189827127659575, "grad_norm": 0.5202837586402893, "learning_rate": 6.878173832524783e-06, "loss": 0.4528, "step": 2645 }, { "epoch": 1.3194813829787235, "grad_norm": 0.46714073419570923, "learning_rate": 6.875484132595852e-06, "loss": 0.33, "step": 2646 }, { "epoch": 1.3199800531914894, "grad_norm": 0.5296721458435059, "learning_rate": 6.872793800928817e-06, "loss": 0.4414, "step": 2647 }, { "epoch": 1.3204787234042552, "grad_norm": 0.5183969140052795, "learning_rate": 6.870102838429889e-06, "loss": 0.3849, "step": 2648 }, { "epoch": 1.3209773936170213, "grad_norm": 0.4572626054286957, "learning_rate": 6.867411246005491e-06, "loss": 0.3632, "step": 2649 }, { "epoch": 1.3214760638297873, "grad_norm": 0.56482994556427, "learning_rate": 6.864719024562259e-06, "loss": 0.4007, "step": 2650 }, { "epoch": 1.3219747340425532, "grad_norm": 0.5433633327484131, "learning_rate": 6.862026175007042e-06, "loss": 0.3817, "step": 2651 }, { "epoch": 1.3224734042553192, "grad_norm": 0.4524591863155365, "learning_rate": 6.859332698246898e-06, "loss": 0.3446, "step": 2652 }, { "epoch": 1.322972074468085, "grad_norm": 0.6046844124794006, "learning_rate": 6.856638595189099e-06, "loss": 0.448, "step": 2653 }, { "epoch": 1.3234707446808511, "grad_norm": 0.48235392570495605, "learning_rate": 6.853943866741126e-06, "loss": 0.3594, "step": 2654 }, { "epoch": 1.323969414893617, "grad_norm": 0.5087786912918091, "learning_rate": 6.851248513810673e-06, "loss": 0.3557, "step": 2655 }, { "epoch": 1.324468085106383, "grad_norm": 0.6224291324615479, "learning_rate": 6.848552537305641e-06, "loss": 0.3995, "step": 2656 }, { "epoch": 1.3249667553191489, "grad_norm": 0.5537227392196655, "learning_rate": 6.845855938134143e-06, "loss": 0.3985, "step": 2657 }, { "epoch": 1.325465425531915, "grad_norm": 0.5284193754196167, "learning_rate": 6.843158717204501e-06, "loss": 0.3481, "step": 2658 }, { "epoch": 1.3259640957446808, "grad_norm": 0.6146899461746216, "learning_rate": 6.840460875425247e-06, "loss": 0.4192, "step": 2659 }, { "epoch": 1.3264627659574468, "grad_norm": 0.45568162202835083, "learning_rate": 6.8377624137051235e-06, "loss": 0.367, "step": 2660 }, { "epoch": 1.3269614361702127, "grad_norm": 0.5242910981178284, "learning_rate": 6.83506333295308e-06, "loss": 0.3648, "step": 2661 }, { "epoch": 1.3274601063829787, "grad_norm": 0.4851008355617523, "learning_rate": 6.832363634078274e-06, "loss": 0.3321, "step": 2662 }, { "epoch": 1.3279587765957448, "grad_norm": 0.513672947883606, "learning_rate": 6.829663317990073e-06, "loss": 0.4079, "step": 2663 }, { "epoch": 1.3284574468085106, "grad_norm": 0.5710715055465698, "learning_rate": 6.82696238559805e-06, "loss": 0.4183, "step": 2664 }, { "epoch": 1.3289561170212765, "grad_norm": 0.4580070972442627, "learning_rate": 6.8242608378119905e-06, "loss": 0.4031, "step": 2665 }, { "epoch": 1.3294547872340425, "grad_norm": 0.5811893343925476, "learning_rate": 6.82155867554188e-06, "loss": 0.363, "step": 2666 }, { "epoch": 1.3299534574468086, "grad_norm": 0.5823785066604614, "learning_rate": 6.818855899697918e-06, "loss": 0.3657, "step": 2667 }, { "epoch": 1.3304521276595744, "grad_norm": 0.485531210899353, "learning_rate": 6.816152511190505e-06, "loss": 0.3634, "step": 2668 }, { "epoch": 1.3309507978723405, "grad_norm": 0.5258744359016418, "learning_rate": 6.81344851093025e-06, "loss": 0.4102, "step": 2669 }, { "epoch": 1.3314494680851063, "grad_norm": 0.4765852391719818, "learning_rate": 6.810743899827972e-06, "loss": 0.3589, "step": 2670 }, { "epoch": 1.3319481382978724, "grad_norm": 0.48334190249443054, "learning_rate": 6.8080386787946895e-06, "loss": 0.3423, "step": 2671 }, { "epoch": 1.3324468085106382, "grad_norm": 0.5042089223861694, "learning_rate": 6.80533284874163e-06, "loss": 0.3371, "step": 2672 }, { "epoch": 1.3329454787234043, "grad_norm": 0.48801642656326294, "learning_rate": 6.802626410580225e-06, "loss": 0.4066, "step": 2673 }, { "epoch": 1.3334441489361701, "grad_norm": 0.5287362337112427, "learning_rate": 6.799919365222112e-06, "loss": 0.4186, "step": 2674 }, { "epoch": 1.3339428191489362, "grad_norm": 0.533532440662384, "learning_rate": 6.797211713579131e-06, "loss": 0.3795, "step": 2675 }, { "epoch": 1.3344414893617023, "grad_norm": 0.5668146014213562, "learning_rate": 6.794503456563328e-06, "loss": 0.4188, "step": 2676 }, { "epoch": 1.334940159574468, "grad_norm": 0.4732092618942261, "learning_rate": 6.791794595086954e-06, "loss": 0.3234, "step": 2677 }, { "epoch": 1.335438829787234, "grad_norm": 0.5250312089920044, "learning_rate": 6.7890851300624585e-06, "loss": 0.4144, "step": 2678 }, { "epoch": 1.3359375, "grad_norm": 0.4785168766975403, "learning_rate": 6.7863750624025e-06, "loss": 0.3598, "step": 2679 }, { "epoch": 1.336436170212766, "grad_norm": 0.5225127339363098, "learning_rate": 6.783664393019937e-06, "loss": 0.4209, "step": 2680 }, { "epoch": 1.336934840425532, "grad_norm": 0.5006566047668457, "learning_rate": 6.780953122827832e-06, "loss": 0.3134, "step": 2681 }, { "epoch": 1.3374335106382977, "grad_norm": 0.4957466721534729, "learning_rate": 6.77824125273945e-06, "loss": 0.3839, "step": 2682 }, { "epoch": 1.3379321808510638, "grad_norm": 0.5030401349067688, "learning_rate": 6.775528783668256e-06, "loss": 0.4076, "step": 2683 }, { "epoch": 1.3384308510638299, "grad_norm": 0.4611317217350006, "learning_rate": 6.772815716527917e-06, "loss": 0.3622, "step": 2684 }, { "epoch": 1.3389295212765957, "grad_norm": 0.5490438342094421, "learning_rate": 6.770102052232306e-06, "loss": 0.4523, "step": 2685 }, { "epoch": 1.3394281914893618, "grad_norm": 0.5147542357444763, "learning_rate": 6.767387791695491e-06, "loss": 0.3696, "step": 2686 }, { "epoch": 1.3399268617021276, "grad_norm": 0.6102638244628906, "learning_rate": 6.764672935831745e-06, "loss": 0.4164, "step": 2687 }, { "epoch": 1.3404255319148937, "grad_norm": 0.5732012987136841, "learning_rate": 6.761957485555539e-06, "loss": 0.3612, "step": 2688 }, { "epoch": 1.3409242021276595, "grad_norm": 0.6162610650062561, "learning_rate": 6.759241441781546e-06, "loss": 0.4086, "step": 2689 }, { "epoch": 1.3414228723404256, "grad_norm": 0.5689775943756104, "learning_rate": 6.756524805424638e-06, "loss": 0.4189, "step": 2690 }, { "epoch": 1.3419215425531914, "grad_norm": 0.5029281973838806, "learning_rate": 6.753807577399885e-06, "loss": 0.3914, "step": 2691 }, { "epoch": 1.3424202127659575, "grad_norm": 0.437070369720459, "learning_rate": 6.751089758622562e-06, "loss": 0.3618, "step": 2692 }, { "epoch": 1.3429188829787235, "grad_norm": 0.5256999135017395, "learning_rate": 6.748371350008136e-06, "loss": 0.3509, "step": 2693 }, { "epoch": 1.3434175531914894, "grad_norm": 0.5274101495742798, "learning_rate": 6.745652352472278e-06, "loss": 0.424, "step": 2694 }, { "epoch": 1.3439162234042552, "grad_norm": 0.43487057089805603, "learning_rate": 6.742932766930854e-06, "loss": 0.3354, "step": 2695 }, { "epoch": 1.3444148936170213, "grad_norm": 0.489083468914032, "learning_rate": 6.7402125942999285e-06, "loss": 0.4046, "step": 2696 }, { "epoch": 1.3449135638297873, "grad_norm": 0.5390194058418274, "learning_rate": 6.737491835495765e-06, "loss": 0.4037, "step": 2697 }, { "epoch": 1.3454122340425532, "grad_norm": 0.45483508706092834, "learning_rate": 6.734770491434825e-06, "loss": 0.3601, "step": 2698 }, { "epoch": 1.3459109042553192, "grad_norm": 0.506681501865387, "learning_rate": 6.7320485630337654e-06, "loss": 0.3923, "step": 2699 }, { "epoch": 1.346409574468085, "grad_norm": 0.5151975750923157, "learning_rate": 6.729326051209441e-06, "loss": 0.3753, "step": 2700 }, { "epoch": 1.3469082446808511, "grad_norm": 0.4632432460784912, "learning_rate": 6.7266029568789e-06, "loss": 0.3482, "step": 2701 }, { "epoch": 1.347406914893617, "grad_norm": 0.5354889631271362, "learning_rate": 6.723879280959393e-06, "loss": 0.4555, "step": 2702 }, { "epoch": 1.347905585106383, "grad_norm": 0.5245124101638794, "learning_rate": 6.721155024368363e-06, "loss": 0.3639, "step": 2703 }, { "epoch": 1.3484042553191489, "grad_norm": 0.475216269493103, "learning_rate": 6.718430188023446e-06, "loss": 0.334, "step": 2704 }, { "epoch": 1.348902925531915, "grad_norm": 0.5235077738761902, "learning_rate": 6.715704772842478e-06, "loss": 0.3874, "step": 2705 }, { "epoch": 1.3494015957446808, "grad_norm": 0.4676906168460846, "learning_rate": 6.7129787797434875e-06, "loss": 0.3592, "step": 2706 }, { "epoch": 1.3499002659574468, "grad_norm": 0.5393826365470886, "learning_rate": 6.710252209644698e-06, "loss": 0.3843, "step": 2707 }, { "epoch": 1.3503989361702127, "grad_norm": 0.5767315626144409, "learning_rate": 6.707525063464526e-06, "loss": 0.3951, "step": 2708 }, { "epoch": 1.3508976063829787, "grad_norm": 0.5138171315193176, "learning_rate": 6.704797342121586e-06, "loss": 0.4217, "step": 2709 }, { "epoch": 1.3513962765957448, "grad_norm": 0.4896242022514343, "learning_rate": 6.702069046534681e-06, "loss": 0.3494, "step": 2710 }, { "epoch": 1.3518949468085106, "grad_norm": 0.5788851380348206, "learning_rate": 6.699340177622813e-06, "loss": 0.4322, "step": 2711 }, { "epoch": 1.3523936170212765, "grad_norm": 0.5549031496047974, "learning_rate": 6.696610736305173e-06, "loss": 0.4034, "step": 2712 }, { "epoch": 1.3528922872340425, "grad_norm": 0.630164623260498, "learning_rate": 6.693880723501144e-06, "loss": 0.4099, "step": 2713 }, { "epoch": 1.3533909574468086, "grad_norm": 0.5303331017494202, "learning_rate": 6.691150140130307e-06, "loss": 0.3922, "step": 2714 }, { "epoch": 1.3538896276595744, "grad_norm": 0.4690812826156616, "learning_rate": 6.688418987112428e-06, "loss": 0.3426, "step": 2715 }, { "epoch": 1.3543882978723405, "grad_norm": 0.5587769746780396, "learning_rate": 6.685687265367472e-06, "loss": 0.3719, "step": 2716 }, { "epoch": 1.3548869680851063, "grad_norm": 0.525146484375, "learning_rate": 6.682954975815591e-06, "loss": 0.4078, "step": 2717 }, { "epoch": 1.3553856382978724, "grad_norm": 0.49509677290916443, "learning_rate": 6.680222119377129e-06, "loss": 0.4088, "step": 2718 }, { "epoch": 1.3558843085106382, "grad_norm": 0.49147889018058777, "learning_rate": 6.677488696972624e-06, "loss": 0.3399, "step": 2719 }, { "epoch": 1.3563829787234043, "grad_norm": 0.5483800768852234, "learning_rate": 6.674754709522798e-06, "loss": 0.4396, "step": 2720 }, { "epoch": 1.3568816489361701, "grad_norm": 0.4966241717338562, "learning_rate": 6.6720201579485685e-06, "loss": 0.4204, "step": 2721 }, { "epoch": 1.3573803191489362, "grad_norm": 0.4987573027610779, "learning_rate": 6.669285043171045e-06, "loss": 0.3554, "step": 2722 }, { "epoch": 1.3578789893617023, "grad_norm": 0.44153276085853577, "learning_rate": 6.666549366111519e-06, "loss": 0.4105, "step": 2723 }, { "epoch": 1.358377659574468, "grad_norm": 0.5390828251838684, "learning_rate": 6.663813127691478e-06, "loss": 0.3646, "step": 2724 }, { "epoch": 1.358876329787234, "grad_norm": 0.4715540409088135, "learning_rate": 6.661076328832599e-06, "loss": 0.32, "step": 2725 }, { "epoch": 1.359375, "grad_norm": 0.44496336579322815, "learning_rate": 6.658338970456741e-06, "loss": 0.3686, "step": 2726 }, { "epoch": 1.359873670212766, "grad_norm": 0.4510694444179535, "learning_rate": 6.65560105348596e-06, "loss": 0.3859, "step": 2727 }, { "epoch": 1.360372340425532, "grad_norm": 0.5066868662834167, "learning_rate": 6.652862578842493e-06, "loss": 0.4392, "step": 2728 }, { "epoch": 1.3608710106382977, "grad_norm": 0.4047623574733734, "learning_rate": 6.650123547448769e-06, "loss": 0.3052, "step": 2729 }, { "epoch": 1.3613696808510638, "grad_norm": 0.5104616284370422, "learning_rate": 6.647383960227405e-06, "loss": 0.3901, "step": 2730 }, { "epoch": 1.3618683510638299, "grad_norm": 0.4698323905467987, "learning_rate": 6.644643818101199e-06, "loss": 0.3996, "step": 2731 }, { "epoch": 1.3623670212765957, "grad_norm": 0.4710681736469269, "learning_rate": 6.641903121993144e-06, "loss": 0.4004, "step": 2732 }, { "epoch": 1.3628656914893618, "grad_norm": 0.5145568251609802, "learning_rate": 6.639161872826416e-06, "loss": 0.3676, "step": 2733 }, { "epoch": 1.3633643617021276, "grad_norm": 0.5511937141418457, "learning_rate": 6.636420071524376e-06, "loss": 0.4229, "step": 2734 }, { "epoch": 1.3638630319148937, "grad_norm": 0.5292395949363708, "learning_rate": 6.633677719010574e-06, "loss": 0.398, "step": 2735 }, { "epoch": 1.3643617021276595, "grad_norm": 0.49865323305130005, "learning_rate": 6.630934816208744e-06, "loss": 0.4189, "step": 2736 }, { "epoch": 1.3648603723404256, "grad_norm": 0.4627142548561096, "learning_rate": 6.628191364042804e-06, "loss": 0.3597, "step": 2737 }, { "epoch": 1.3653590425531914, "grad_norm": 0.44648799300193787, "learning_rate": 6.62544736343686e-06, "loss": 0.3714, "step": 2738 }, { "epoch": 1.3658577127659575, "grad_norm": 0.558134913444519, "learning_rate": 6.6227028153151995e-06, "loss": 0.4113, "step": 2739 }, { "epoch": 1.3663563829787235, "grad_norm": 0.4583185315132141, "learning_rate": 6.619957720602298e-06, "loss": 0.3502, "step": 2740 }, { "epoch": 1.3668550531914894, "grad_norm": 0.5234525203704834, "learning_rate": 6.6172120802228126e-06, "loss": 0.3982, "step": 2741 }, { "epoch": 1.3673537234042552, "grad_norm": 0.45428600907325745, "learning_rate": 6.614465895101585e-06, "loss": 0.3722, "step": 2742 }, { "epoch": 1.3678523936170213, "grad_norm": 0.47490814328193665, "learning_rate": 6.611719166163638e-06, "loss": 0.3864, "step": 2743 }, { "epoch": 1.3683510638297873, "grad_norm": 0.47630611062049866, "learning_rate": 6.608971894334183e-06, "loss": 0.3745, "step": 2744 }, { "epoch": 1.3688497340425532, "grad_norm": 0.5918691158294678, "learning_rate": 6.606224080538609e-06, "loss": 0.4438, "step": 2745 }, { "epoch": 1.3693484042553192, "grad_norm": 0.46492984890937805, "learning_rate": 6.6034757257024905e-06, "loss": 0.3856, "step": 2746 }, { "epoch": 1.369847074468085, "grad_norm": 0.4963337182998657, "learning_rate": 6.600726830751583e-06, "loss": 0.4075, "step": 2747 }, { "epoch": 1.3703457446808511, "grad_norm": 0.5010890364646912, "learning_rate": 6.597977396611825e-06, "loss": 0.387, "step": 2748 }, { "epoch": 1.370844414893617, "grad_norm": 0.534563422203064, "learning_rate": 6.595227424209335e-06, "loss": 0.4336, "step": 2749 }, { "epoch": 1.371343085106383, "grad_norm": 0.4263268709182739, "learning_rate": 6.592476914470415e-06, "loss": 0.3414, "step": 2750 }, { "epoch": 1.3718417553191489, "grad_norm": 0.5360243916511536, "learning_rate": 6.589725868321546e-06, "loss": 0.4485, "step": 2751 }, { "epoch": 1.372340425531915, "grad_norm": 0.45036330819129944, "learning_rate": 6.586974286689389e-06, "loss": 0.3524, "step": 2752 }, { "epoch": 1.3728390957446808, "grad_norm": 0.5017392635345459, "learning_rate": 6.584222170500791e-06, "loss": 0.4193, "step": 2753 }, { "epoch": 1.3733377659574468, "grad_norm": 0.5132947564125061, "learning_rate": 6.58146952068277e-06, "loss": 0.4165, "step": 2754 }, { "epoch": 1.3738364361702127, "grad_norm": 0.5023021101951599, "learning_rate": 6.57871633816253e-06, "loss": 0.4171, "step": 2755 }, { "epoch": 1.3743351063829787, "grad_norm": 0.40880897641181946, "learning_rate": 6.575962623867456e-06, "loss": 0.3192, "step": 2756 }, { "epoch": 1.3748337765957448, "grad_norm": 0.595392644405365, "learning_rate": 6.5732083787251065e-06, "loss": 0.4546, "step": 2757 }, { "epoch": 1.3753324468085106, "grad_norm": 0.46306130290031433, "learning_rate": 6.57045360366322e-06, "loss": 0.3916, "step": 2758 }, { "epoch": 1.3758311170212765, "grad_norm": 0.5148966908454895, "learning_rate": 6.567698299609719e-06, "loss": 0.3687, "step": 2759 }, { "epoch": 1.3763297872340425, "grad_norm": 0.5423327088356018, "learning_rate": 6.5649424674926975e-06, "loss": 0.4268, "step": 2760 }, { "epoch": 1.3768284574468086, "grad_norm": 0.4425452649593353, "learning_rate": 6.56218610824043e-06, "loss": 0.3603, "step": 2761 }, { "epoch": 1.3773271276595744, "grad_norm": 0.5049194693565369, "learning_rate": 6.559429222781371e-06, "loss": 0.3822, "step": 2762 }, { "epoch": 1.3778257978723405, "grad_norm": 0.5627256631851196, "learning_rate": 6.556671812044147e-06, "loss": 0.4405, "step": 2763 }, { "epoch": 1.3783244680851063, "grad_norm": 0.47653427720069885, "learning_rate": 6.553913876957564e-06, "loss": 0.4038, "step": 2764 }, { "epoch": 1.3788231382978724, "grad_norm": 0.41815847158432007, "learning_rate": 6.551155418450606e-06, "loss": 0.3605, "step": 2765 }, { "epoch": 1.3793218085106382, "grad_norm": 0.5706623196601868, "learning_rate": 6.548396437452434e-06, "loss": 0.4246, "step": 2766 }, { "epoch": 1.3798204787234043, "grad_norm": 0.4373432993888855, "learning_rate": 6.5456369348923805e-06, "loss": 0.3772, "step": 2767 }, { "epoch": 1.3803191489361701, "grad_norm": 0.5155590176582336, "learning_rate": 6.542876911699958e-06, "loss": 0.4054, "step": 2768 }, { "epoch": 1.3808178191489362, "grad_norm": 0.5197233557701111, "learning_rate": 6.540116368804852e-06, "loss": 0.3605, "step": 2769 }, { "epoch": 1.3813164893617023, "grad_norm": 0.537132740020752, "learning_rate": 6.537355307136925e-06, "loss": 0.3916, "step": 2770 }, { "epoch": 1.381815159574468, "grad_norm": 0.4772527813911438, "learning_rate": 6.53459372762621e-06, "loss": 0.3929, "step": 2771 }, { "epoch": 1.382313829787234, "grad_norm": 0.5163897275924683, "learning_rate": 6.53183163120292e-06, "loss": 0.3327, "step": 2772 }, { "epoch": 1.3828125, "grad_norm": 0.5801275968551636, "learning_rate": 6.529069018797442e-06, "loss": 0.4003, "step": 2773 }, { "epoch": 1.383311170212766, "grad_norm": 0.5072594285011292, "learning_rate": 6.5263058913403285e-06, "loss": 0.3999, "step": 2774 }, { "epoch": 1.383809840425532, "grad_norm": 0.592331051826477, "learning_rate": 6.523542249762316e-06, "loss": 0.4141, "step": 2775 }, { "epoch": 1.3843085106382977, "grad_norm": 0.5534159541130066, "learning_rate": 6.520778094994308e-06, "loss": 0.3926, "step": 2776 }, { "epoch": 1.3848071808510638, "grad_norm": 0.540675163269043, "learning_rate": 6.5180134279673826e-06, "loss": 0.3806, "step": 2777 }, { "epoch": 1.3853058510638299, "grad_norm": 0.5299066305160522, "learning_rate": 6.515248249612789e-06, "loss": 0.4123, "step": 2778 }, { "epoch": 1.3858045212765957, "grad_norm": 0.4926382601261139, "learning_rate": 6.512482560861953e-06, "loss": 0.3452, "step": 2779 }, { "epoch": 1.3863031914893618, "grad_norm": 0.5036928653717041, "learning_rate": 6.509716362646465e-06, "loss": 0.3799, "step": 2780 }, { "epoch": 1.3868018617021276, "grad_norm": 0.5280914902687073, "learning_rate": 6.506949655898095e-06, "loss": 0.3846, "step": 2781 }, { "epoch": 1.3873005319148937, "grad_norm": 0.507045567035675, "learning_rate": 6.504182441548779e-06, "loss": 0.4219, "step": 2782 }, { "epoch": 1.3877992021276595, "grad_norm": 0.5724549293518066, "learning_rate": 6.501414720530626e-06, "loss": 0.387, "step": 2783 }, { "epoch": 1.3882978723404256, "grad_norm": 0.5483629107475281, "learning_rate": 6.498646493775915e-06, "loss": 0.4128, "step": 2784 }, { "epoch": 1.3887965425531914, "grad_norm": 0.4404490292072296, "learning_rate": 6.495877762217094e-06, "loss": 0.3622, "step": 2785 }, { "epoch": 1.3892952127659575, "grad_norm": 0.4895956218242645, "learning_rate": 6.4931085267867856e-06, "loss": 0.4087, "step": 2786 }, { "epoch": 1.3897938829787235, "grad_norm": 0.5351875424385071, "learning_rate": 6.490338788417776e-06, "loss": 0.3666, "step": 2787 }, { "epoch": 1.3902925531914894, "grad_norm": 0.48545077443122864, "learning_rate": 6.487568548043026e-06, "loss": 0.3734, "step": 2788 }, { "epoch": 1.3907912234042552, "grad_norm": 0.4858388304710388, "learning_rate": 6.484797806595663e-06, "loss": 0.3646, "step": 2789 }, { "epoch": 1.3912898936170213, "grad_norm": 0.5103653073310852, "learning_rate": 6.482026565008983e-06, "loss": 0.4311, "step": 2790 }, { "epoch": 1.3917885638297873, "grad_norm": 0.4590196907520294, "learning_rate": 6.479254824216452e-06, "loss": 0.377, "step": 2791 }, { "epoch": 1.3922872340425532, "grad_norm": 0.5185592770576477, "learning_rate": 6.476482585151702e-06, "loss": 0.4017, "step": 2792 }, { "epoch": 1.3927859042553192, "grad_norm": 0.4625968337059021, "learning_rate": 6.473709848748535e-06, "loss": 0.3488, "step": 2793 }, { "epoch": 1.393284574468085, "grad_norm": 0.5203240513801575, "learning_rate": 6.470936615940921e-06, "loss": 0.4096, "step": 2794 }, { "epoch": 1.3937832446808511, "grad_norm": 0.4753365218639374, "learning_rate": 6.468162887662992e-06, "loss": 0.3964, "step": 2795 }, { "epoch": 1.394281914893617, "grad_norm": 0.6019706130027771, "learning_rate": 6.4653886648490535e-06, "loss": 0.3822, "step": 2796 }, { "epoch": 1.394780585106383, "grad_norm": 0.4692594110965729, "learning_rate": 6.462613948433575e-06, "loss": 0.345, "step": 2797 }, { "epoch": 1.3952792553191489, "grad_norm": 0.5117987990379333, "learning_rate": 6.45983873935119e-06, "loss": 0.4232, "step": 2798 }, { "epoch": 1.395777925531915, "grad_norm": 0.4879116117954254, "learning_rate": 6.457063038536701e-06, "loss": 0.4185, "step": 2799 }, { "epoch": 1.3962765957446808, "grad_norm": 0.533001184463501, "learning_rate": 6.454286846925075e-06, "loss": 0.3329, "step": 2800 }, { "epoch": 1.3967752659574468, "grad_norm": 0.5328335165977478, "learning_rate": 6.451510165451444e-06, "loss": 0.3899, "step": 2801 }, { "epoch": 1.3972739361702127, "grad_norm": 0.4920451045036316, "learning_rate": 6.448732995051107e-06, "loss": 0.3396, "step": 2802 }, { "epoch": 1.3977726063829787, "grad_norm": 0.5356482863426208, "learning_rate": 6.445955336659524e-06, "loss": 0.4199, "step": 2803 }, { "epoch": 1.3982712765957448, "grad_norm": 0.49285098910331726, "learning_rate": 6.443177191212324e-06, "loss": 0.3604, "step": 2804 }, { "epoch": 1.3987699468085106, "grad_norm": 0.5314611792564392, "learning_rate": 6.440398559645296e-06, "loss": 0.3774, "step": 2805 }, { "epoch": 1.3992686170212765, "grad_norm": 0.4840170741081238, "learning_rate": 6.437619442894394e-06, "loss": 0.3107, "step": 2806 }, { "epoch": 1.3997672872340425, "grad_norm": 0.5606661438941956, "learning_rate": 6.434839841895735e-06, "loss": 0.3668, "step": 2807 }, { "epoch": 1.4002659574468086, "grad_norm": 0.5044364333152771, "learning_rate": 6.432059757585602e-06, "loss": 0.3859, "step": 2808 }, { "epoch": 1.4007646276595744, "grad_norm": 0.5253499150276184, "learning_rate": 6.429279190900438e-06, "loss": 0.3817, "step": 2809 }, { "epoch": 1.4012632978723405, "grad_norm": 0.49364757537841797, "learning_rate": 6.426498142776849e-06, "loss": 0.409, "step": 2810 }, { "epoch": 1.4017619680851063, "grad_norm": 0.5836560130119324, "learning_rate": 6.423716614151603e-06, "loss": 0.3849, "step": 2811 }, { "epoch": 1.4022606382978724, "grad_norm": 0.5015041828155518, "learning_rate": 6.42093460596163e-06, "loss": 0.396, "step": 2812 }, { "epoch": 1.4027593085106382, "grad_norm": 0.4639809727668762, "learning_rate": 6.418152119144021e-06, "loss": 0.3561, "step": 2813 }, { "epoch": 1.4032579787234043, "grad_norm": 0.5609797239303589, "learning_rate": 6.41536915463603e-06, "loss": 0.4199, "step": 2814 }, { "epoch": 1.4037566489361701, "grad_norm": 0.5417338013648987, "learning_rate": 6.4125857133750725e-06, "loss": 0.3715, "step": 2815 }, { "epoch": 1.4042553191489362, "grad_norm": 0.4794056713581085, "learning_rate": 6.40980179629872e-06, "loss": 0.3786, "step": 2816 }, { "epoch": 1.4047539893617023, "grad_norm": 0.44960421323776245, "learning_rate": 6.407017404344708e-06, "loss": 0.3766, "step": 2817 }, { "epoch": 1.405252659574468, "grad_norm": 0.5204979181289673, "learning_rate": 6.404232538450932e-06, "loss": 0.4337, "step": 2818 }, { "epoch": 1.405751329787234, "grad_norm": 0.5133688449859619, "learning_rate": 6.401447199555445e-06, "loss": 0.3961, "step": 2819 }, { "epoch": 1.40625, "grad_norm": 0.4722800850868225, "learning_rate": 6.398661388596461e-06, "loss": 0.4342, "step": 2820 }, { "epoch": 1.406748670212766, "grad_norm": 0.5221198201179504, "learning_rate": 6.395875106512353e-06, "loss": 0.4044, "step": 2821 }, { "epoch": 1.407247340425532, "grad_norm": 0.5054382681846619, "learning_rate": 6.393088354241652e-06, "loss": 0.3675, "step": 2822 }, { "epoch": 1.4077460106382977, "grad_norm": 0.4840163588523865, "learning_rate": 6.3903011327230476e-06, "loss": 0.4369, "step": 2823 }, { "epoch": 1.4082446808510638, "grad_norm": 0.5392292141914368, "learning_rate": 6.387513442895389e-06, "loss": 0.4148, "step": 2824 }, { "epoch": 1.4087433510638299, "grad_norm": 0.4798351228237152, "learning_rate": 6.384725285697679e-06, "loss": 0.326, "step": 2825 }, { "epoch": 1.4092420212765957, "grad_norm": 0.5543808937072754, "learning_rate": 6.3819366620690815e-06, "loss": 0.4278, "step": 2826 }, { "epoch": 1.4097406914893618, "grad_norm": 0.46569132804870605, "learning_rate": 6.379147572948918e-06, "loss": 0.3088, "step": 2827 }, { "epoch": 1.4102393617021276, "grad_norm": 0.5486361980438232, "learning_rate": 6.376358019276665e-06, "loss": 0.4579, "step": 2828 }, { "epoch": 1.4107380319148937, "grad_norm": 0.5736048221588135, "learning_rate": 6.373568001991954e-06, "loss": 0.4573, "step": 2829 }, { "epoch": 1.4112367021276595, "grad_norm": 0.5490643978118896, "learning_rate": 6.3707775220345755e-06, "loss": 0.3429, "step": 2830 }, { "epoch": 1.4117353723404256, "grad_norm": 0.5443843007087708, "learning_rate": 6.3679865803444755e-06, "loss": 0.376, "step": 2831 }, { "epoch": 1.4122340425531914, "grad_norm": 0.6193292140960693, "learning_rate": 6.365195177861754e-06, "loss": 0.4038, "step": 2832 }, { "epoch": 1.4127327127659575, "grad_norm": 0.46930304169654846, "learning_rate": 6.362403315526666e-06, "loss": 0.3465, "step": 2833 }, { "epoch": 1.4132313829787235, "grad_norm": 0.5520800948143005, "learning_rate": 6.359610994279625e-06, "loss": 0.3938, "step": 2834 }, { "epoch": 1.4137300531914894, "grad_norm": 0.4679763615131378, "learning_rate": 6.356818215061196e-06, "loss": 0.3337, "step": 2835 }, { "epoch": 1.4142287234042552, "grad_norm": 0.49522390961647034, "learning_rate": 6.3540249788120965e-06, "loss": 0.3831, "step": 2836 }, { "epoch": 1.4147273936170213, "grad_norm": 0.5047772526741028, "learning_rate": 6.351231286473203e-06, "loss": 0.3821, "step": 2837 }, { "epoch": 1.4152260638297873, "grad_norm": 0.47942376136779785, "learning_rate": 6.348437138985541e-06, "loss": 0.3841, "step": 2838 }, { "epoch": 1.4157247340425532, "grad_norm": 0.48258182406425476, "learning_rate": 6.345642537290292e-06, "loss": 0.3835, "step": 2839 }, { "epoch": 1.4162234042553192, "grad_norm": 0.49857446551322937, "learning_rate": 6.342847482328789e-06, "loss": 0.3827, "step": 2840 }, { "epoch": 1.416722074468085, "grad_norm": 0.4825318157672882, "learning_rate": 6.340051975042519e-06, "loss": 0.3456, "step": 2841 }, { "epoch": 1.4172207446808511, "grad_norm": 0.507548451423645, "learning_rate": 6.337256016373121e-06, "loss": 0.3905, "step": 2842 }, { "epoch": 1.417719414893617, "grad_norm": 0.47346022725105286, "learning_rate": 6.334459607262384e-06, "loss": 0.3097, "step": 2843 }, { "epoch": 1.418218085106383, "grad_norm": 0.5349465012550354, "learning_rate": 6.331662748652253e-06, "loss": 0.4712, "step": 2844 }, { "epoch": 1.4187167553191489, "grad_norm": 0.4990655779838562, "learning_rate": 6.3288654414848205e-06, "loss": 0.4344, "step": 2845 }, { "epoch": 1.419215425531915, "grad_norm": 0.46758994460105896, "learning_rate": 6.326067686702332e-06, "loss": 0.3886, "step": 2846 }, { "epoch": 1.4197140957446808, "grad_norm": 0.47979840636253357, "learning_rate": 6.323269485247183e-06, "loss": 0.3739, "step": 2847 }, { "epoch": 1.4202127659574468, "grad_norm": 0.5590842962265015, "learning_rate": 6.320470838061921e-06, "loss": 0.4151, "step": 2848 }, { "epoch": 1.4207114361702127, "grad_norm": 0.4752959907054901, "learning_rate": 6.31767174608924e-06, "loss": 0.3786, "step": 2849 }, { "epoch": 1.4212101063829787, "grad_norm": 0.4650644361972809, "learning_rate": 6.3148722102719874e-06, "loss": 0.3589, "step": 2850 }, { "epoch": 1.4217087765957448, "grad_norm": 0.5205345153808594, "learning_rate": 6.312072231553158e-06, "loss": 0.3831, "step": 2851 }, { "epoch": 1.4222074468085106, "grad_norm": 0.5733615159988403, "learning_rate": 6.309271810875898e-06, "loss": 0.4069, "step": 2852 }, { "epoch": 1.4227061170212765, "grad_norm": 0.5057272911071777, "learning_rate": 6.3064709491835e-06, "loss": 0.3414, "step": 2853 }, { "epoch": 1.4232047872340425, "grad_norm": 0.4647471010684967, "learning_rate": 6.303669647419407e-06, "loss": 0.3927, "step": 2854 }, { "epoch": 1.4237034574468086, "grad_norm": 0.45703646540641785, "learning_rate": 6.30086790652721e-06, "loss": 0.3834, "step": 2855 }, { "epoch": 1.4242021276595744, "grad_norm": 0.5111451745033264, "learning_rate": 6.298065727450646e-06, "loss": 0.4179, "step": 2856 }, { "epoch": 1.4247007978723405, "grad_norm": 0.472693532705307, "learning_rate": 6.295263111133603e-06, "loss": 0.4118, "step": 2857 }, { "epoch": 1.4251994680851063, "grad_norm": 0.44298312067985535, "learning_rate": 6.292460058520112e-06, "loss": 0.3304, "step": 2858 }, { "epoch": 1.4256981382978724, "grad_norm": 0.5544400215148926, "learning_rate": 6.289656570554356e-06, "loss": 0.4232, "step": 2859 }, { "epoch": 1.4261968085106382, "grad_norm": 0.5701576471328735, "learning_rate": 6.2868526481806595e-06, "loss": 0.3961, "step": 2860 }, { "epoch": 1.4266954787234043, "grad_norm": 0.43903565406799316, "learning_rate": 6.284048292343496e-06, "loss": 0.3519, "step": 2861 }, { "epoch": 1.4271941489361701, "grad_norm": 0.554401695728302, "learning_rate": 6.281243503987486e-06, "loss": 0.4041, "step": 2862 }, { "epoch": 1.4276928191489362, "grad_norm": 0.562389075756073, "learning_rate": 6.278438284057394e-06, "loss": 0.3773, "step": 2863 }, { "epoch": 1.4281914893617023, "grad_norm": 0.5066273212432861, "learning_rate": 6.275632633498131e-06, "loss": 0.3553, "step": 2864 }, { "epoch": 1.428690159574468, "grad_norm": 0.5353230237960815, "learning_rate": 6.272826553254751e-06, "loss": 0.4, "step": 2865 }, { "epoch": 1.429188829787234, "grad_norm": 0.5228533744812012, "learning_rate": 6.2700200442724545e-06, "loss": 0.4473, "step": 2866 }, { "epoch": 1.4296875, "grad_norm": 0.49971434473991394, "learning_rate": 6.267213107496586e-06, "loss": 0.3378, "step": 2867 }, { "epoch": 1.430186170212766, "grad_norm": 0.49643969535827637, "learning_rate": 6.264405743872634e-06, "loss": 0.3541, "step": 2868 }, { "epoch": 1.430684840425532, "grad_norm": 0.4410136640071869, "learning_rate": 6.2615979543462335e-06, "loss": 0.3775, "step": 2869 }, { "epoch": 1.4311835106382977, "grad_norm": 0.44661107659339905, "learning_rate": 6.258789739863158e-06, "loss": 0.4026, "step": 2870 }, { "epoch": 1.4316821808510638, "grad_norm": 0.47907575964927673, "learning_rate": 6.255981101369325e-06, "loss": 0.4061, "step": 2871 }, { "epoch": 1.4321808510638299, "grad_norm": 0.4681423604488373, "learning_rate": 6.253172039810799e-06, "loss": 0.3542, "step": 2872 }, { "epoch": 1.4326795212765957, "grad_norm": 0.5110459923744202, "learning_rate": 6.2503625561337825e-06, "loss": 0.3774, "step": 2873 }, { "epoch": 1.4331781914893618, "grad_norm": 0.5473583340644836, "learning_rate": 6.247552651284623e-06, "loss": 0.4166, "step": 2874 }, { "epoch": 1.4336768617021276, "grad_norm": 0.4841413199901581, "learning_rate": 6.244742326209809e-06, "loss": 0.3742, "step": 2875 }, { "epoch": 1.4341755319148937, "grad_norm": 0.482741117477417, "learning_rate": 6.24193158185597e-06, "loss": 0.4044, "step": 2876 }, { "epoch": 1.4346742021276595, "grad_norm": 0.479532390832901, "learning_rate": 6.239120419169878e-06, "loss": 0.3987, "step": 2877 }, { "epoch": 1.4351728723404256, "grad_norm": 0.45999398827552795, "learning_rate": 6.236308839098444e-06, "loss": 0.393, "step": 2878 }, { "epoch": 1.4356715425531914, "grad_norm": 0.47697386145591736, "learning_rate": 6.233496842588721e-06, "loss": 0.3755, "step": 2879 }, { "epoch": 1.4361702127659575, "grad_norm": 0.5133532285690308, "learning_rate": 6.230684430587902e-06, "loss": 0.3472, "step": 2880 }, { "epoch": 1.4366688829787235, "grad_norm": 0.4431968033313751, "learning_rate": 6.227871604043321e-06, "loss": 0.3366, "step": 2881 }, { "epoch": 1.4371675531914894, "grad_norm": 0.48146316409111023, "learning_rate": 6.225058363902449e-06, "loss": 0.4129, "step": 2882 }, { "epoch": 1.4376662234042552, "grad_norm": 0.4861428737640381, "learning_rate": 6.222244711112898e-06, "loss": 0.3722, "step": 2883 }, { "epoch": 1.4381648936170213, "grad_norm": 0.5341479778289795, "learning_rate": 6.219430646622419e-06, "loss": 0.4108, "step": 2884 }, { "epoch": 1.4386635638297873, "grad_norm": 0.4365323781967163, "learning_rate": 6.216616171378902e-06, "loss": 0.3281, "step": 2885 }, { "epoch": 1.4391622340425532, "grad_norm": 0.62403804063797, "learning_rate": 6.213801286330375e-06, "loss": 0.4231, "step": 2886 }, { "epoch": 1.4396609042553192, "grad_norm": 0.4516286551952362, "learning_rate": 6.210985992425003e-06, "loss": 0.3669, "step": 2887 }, { "epoch": 1.440159574468085, "grad_norm": 0.4826277196407318, "learning_rate": 6.208170290611091e-06, "loss": 0.3905, "step": 2888 }, { "epoch": 1.4406582446808511, "grad_norm": 0.4891817569732666, "learning_rate": 6.205354181837079e-06, "loss": 0.3707, "step": 2889 }, { "epoch": 1.441156914893617, "grad_norm": 0.4543576240539551, "learning_rate": 6.202537667051544e-06, "loss": 0.3855, "step": 2890 }, { "epoch": 1.441655585106383, "grad_norm": 0.48687735199928284, "learning_rate": 6.199720747203205e-06, "loss": 0.3946, "step": 2891 }, { "epoch": 1.4421542553191489, "grad_norm": 0.47392717003822327, "learning_rate": 6.19690342324091e-06, "loss": 0.3703, "step": 2892 }, { "epoch": 1.442652925531915, "grad_norm": 0.46513620018959045, "learning_rate": 6.194085696113645e-06, "loss": 0.3734, "step": 2893 }, { "epoch": 1.4431515957446808, "grad_norm": 0.4405065178871155, "learning_rate": 6.191267566770537e-06, "loss": 0.3538, "step": 2894 }, { "epoch": 1.4436502659574468, "grad_norm": 0.5405802130699158, "learning_rate": 6.1884490361608426e-06, "loss": 0.4436, "step": 2895 }, { "epoch": 1.4441489361702127, "grad_norm": 0.4857207238674164, "learning_rate": 6.185630105233956e-06, "loss": 0.3857, "step": 2896 }, { "epoch": 1.4446476063829787, "grad_norm": 0.43444615602493286, "learning_rate": 6.182810774939406e-06, "loss": 0.331, "step": 2897 }, { "epoch": 1.4451462765957448, "grad_norm": 0.4974764287471771, "learning_rate": 6.179991046226854e-06, "loss": 0.3214, "step": 2898 }, { "epoch": 1.4456449468085106, "grad_norm": 0.47787994146347046, "learning_rate": 6.177170920046102e-06, "loss": 0.3808, "step": 2899 }, { "epoch": 1.4461436170212765, "grad_norm": 0.5596787929534912, "learning_rate": 6.174350397347078e-06, "loss": 0.4169, "step": 2900 }, { "epoch": 1.4466422872340425, "grad_norm": 0.4705331027507782, "learning_rate": 6.171529479079847e-06, "loss": 0.3811, "step": 2901 }, { "epoch": 1.4471409574468086, "grad_norm": 0.5689371824264526, "learning_rate": 6.16870816619461e-06, "loss": 0.3389, "step": 2902 }, { "epoch": 1.4476396276595744, "grad_norm": 0.5405276417732239, "learning_rate": 6.165886459641694e-06, "loss": 0.3891, "step": 2903 }, { "epoch": 1.4481382978723405, "grad_norm": 0.5401028394699097, "learning_rate": 6.163064360371566e-06, "loss": 0.4126, "step": 2904 }, { "epoch": 1.4486369680851063, "grad_norm": 0.5171117186546326, "learning_rate": 6.160241869334819e-06, "loss": 0.3985, "step": 2905 }, { "epoch": 1.4491356382978724, "grad_norm": 0.5599316954612732, "learning_rate": 6.157418987482182e-06, "loss": 0.4238, "step": 2906 }, { "epoch": 1.4496343085106382, "grad_norm": 0.5056845545768738, "learning_rate": 6.154595715764517e-06, "loss": 0.3843, "step": 2907 }, { "epoch": 1.4501329787234043, "grad_norm": 0.47909802198410034, "learning_rate": 6.151772055132811e-06, "loss": 0.3757, "step": 2908 }, { "epoch": 1.4506316489361701, "grad_norm": 0.48597148060798645, "learning_rate": 6.148948006538189e-06, "loss": 0.4068, "step": 2909 }, { "epoch": 1.4511303191489362, "grad_norm": 0.534533679485321, "learning_rate": 6.1461235709319015e-06, "loss": 0.3903, "step": 2910 }, { "epoch": 1.4516289893617023, "grad_norm": 0.49111244082450867, "learning_rate": 6.143298749265333e-06, "loss": 0.3737, "step": 2911 }, { "epoch": 1.452127659574468, "grad_norm": 0.5026515126228333, "learning_rate": 6.1404735424899966e-06, "loss": 0.3676, "step": 2912 }, { "epoch": 1.452626329787234, "grad_norm": 0.5416674017906189, "learning_rate": 6.137647951557533e-06, "loss": 0.4681, "step": 2913 }, { "epoch": 1.453125, "grad_norm": 0.4475714862346649, "learning_rate": 6.134821977419715e-06, "loss": 0.2978, "step": 2914 }, { "epoch": 1.453623670212766, "grad_norm": 0.4732111692428589, "learning_rate": 6.131995621028443e-06, "loss": 0.3677, "step": 2915 }, { "epoch": 1.454122340425532, "grad_norm": 0.47576847672462463, "learning_rate": 6.12916888333575e-06, "loss": 0.4001, "step": 2916 }, { "epoch": 1.4546210106382977, "grad_norm": 0.545638918876648, "learning_rate": 6.126341765293793e-06, "loss": 0.413, "step": 2917 }, { "epoch": 1.4551196808510638, "grad_norm": 0.5433212518692017, "learning_rate": 6.123514267854856e-06, "loss": 0.4435, "step": 2918 }, { "epoch": 1.4556183510638299, "grad_norm": 0.5062620639801025, "learning_rate": 6.120686391971358e-06, "loss": 0.4019, "step": 2919 }, { "epoch": 1.4561170212765957, "grad_norm": 0.44775813817977905, "learning_rate": 6.117858138595836e-06, "loss": 0.364, "step": 2920 }, { "epoch": 1.4566156914893618, "grad_norm": 0.53187495470047, "learning_rate": 6.115029508680962e-06, "loss": 0.4422, "step": 2921 }, { "epoch": 1.4571143617021276, "grad_norm": 0.5500067472457886, "learning_rate": 6.112200503179531e-06, "loss": 0.367, "step": 2922 }, { "epoch": 1.4576130319148937, "grad_norm": 0.5061266422271729, "learning_rate": 6.109371123044465e-06, "loss": 0.3993, "step": 2923 }, { "epoch": 1.4581117021276595, "grad_norm": 0.4529741108417511, "learning_rate": 6.106541369228811e-06, "loss": 0.3679, "step": 2924 }, { "epoch": 1.4586103723404256, "grad_norm": 0.47072601318359375, "learning_rate": 6.103711242685746e-06, "loss": 0.3775, "step": 2925 }, { "epoch": 1.4591090425531914, "grad_norm": 0.5183207988739014, "learning_rate": 6.100880744368568e-06, "loss": 0.3952, "step": 2926 }, { "epoch": 1.4596077127659575, "grad_norm": 0.49877700209617615, "learning_rate": 6.098049875230703e-06, "loss": 0.3662, "step": 2927 }, { "epoch": 1.4601063829787235, "grad_norm": 0.4734641909599304, "learning_rate": 6.0952186362257e-06, "loss": 0.4106, "step": 2928 }, { "epoch": 1.4606050531914894, "grad_norm": 0.47313109040260315, "learning_rate": 6.092387028307235e-06, "loss": 0.4163, "step": 2929 }, { "epoch": 1.4611037234042552, "grad_norm": 0.4419264495372772, "learning_rate": 6.089555052429106e-06, "loss": 0.3587, "step": 2930 }, { "epoch": 1.4616023936170213, "grad_norm": 0.48683419823646545, "learning_rate": 6.086722709545237e-06, "loss": 0.4217, "step": 2931 }, { "epoch": 1.4621010638297873, "grad_norm": 0.43290287256240845, "learning_rate": 6.083890000609671e-06, "loss": 0.3836, "step": 2932 }, { "epoch": 1.4625997340425532, "grad_norm": 0.44376468658447266, "learning_rate": 6.081056926576582e-06, "loss": 0.3253, "step": 2933 }, { "epoch": 1.4630984042553192, "grad_norm": 0.4601297676563263, "learning_rate": 6.07822348840026e-06, "loss": 0.3993, "step": 2934 }, { "epoch": 1.463597074468085, "grad_norm": 0.4465351104736328, "learning_rate": 6.075389687035121e-06, "loss": 0.3279, "step": 2935 }, { "epoch": 1.4640957446808511, "grad_norm": 0.47235628962516785, "learning_rate": 6.072555523435703e-06, "loss": 0.434, "step": 2936 }, { "epoch": 1.464594414893617, "grad_norm": 0.5000429749488831, "learning_rate": 6.069720998556667e-06, "loss": 0.4426, "step": 2937 }, { "epoch": 1.465093085106383, "grad_norm": 0.45099741220474243, "learning_rate": 6.066886113352791e-06, "loss": 0.3476, "step": 2938 }, { "epoch": 1.4655917553191489, "grad_norm": 0.5089630484580994, "learning_rate": 6.064050868778981e-06, "loss": 0.3677, "step": 2939 }, { "epoch": 1.466090425531915, "grad_norm": 0.4930156171321869, "learning_rate": 6.061215265790259e-06, "loss": 0.42, "step": 2940 }, { "epoch": 1.4665890957446808, "grad_norm": 0.4378209114074707, "learning_rate": 6.05837930534177e-06, "loss": 0.339, "step": 2941 }, { "epoch": 1.4670877659574468, "grad_norm": 0.45111921429634094, "learning_rate": 6.055542988388779e-06, "loss": 0.4103, "step": 2942 }, { "epoch": 1.4675864361702127, "grad_norm": 0.47300854325294495, "learning_rate": 6.052706315886673e-06, "loss": 0.4065, "step": 2943 }, { "epoch": 1.4680851063829787, "grad_norm": 0.5198253393173218, "learning_rate": 6.0498692887909524e-06, "loss": 0.3945, "step": 2944 }, { "epoch": 1.4685837765957448, "grad_norm": 0.5119727849960327, "learning_rate": 6.047031908057247e-06, "loss": 0.3719, "step": 2945 }, { "epoch": 1.4690824468085106, "grad_norm": 0.4480006992816925, "learning_rate": 6.0441941746412945e-06, "loss": 0.3525, "step": 2946 }, { "epoch": 1.4695811170212765, "grad_norm": 0.4245603680610657, "learning_rate": 6.0413560894989606e-06, "loss": 0.362, "step": 2947 }, { "epoch": 1.4700797872340425, "grad_norm": 0.6045058369636536, "learning_rate": 6.038517653586226e-06, "loss": 0.4606, "step": 2948 }, { "epoch": 1.4705784574468086, "grad_norm": 0.5055937767028809, "learning_rate": 6.035678867859187e-06, "loss": 0.3886, "step": 2949 }, { "epoch": 1.4710771276595744, "grad_norm": 0.4720878303050995, "learning_rate": 6.032839733274063e-06, "loss": 0.3699, "step": 2950 }, { "epoch": 1.4715757978723405, "grad_norm": 0.4792514741420746, "learning_rate": 6.0300002507871855e-06, "loss": 0.3783, "step": 2951 }, { "epoch": 1.4720744680851063, "grad_norm": 0.5594308376312256, "learning_rate": 6.027160421355008e-06, "loss": 0.4013, "step": 2952 }, { "epoch": 1.4725731382978724, "grad_norm": 0.4606393277645111, "learning_rate": 6.024320245934099e-06, "loss": 0.3489, "step": 2953 }, { "epoch": 1.4730718085106382, "grad_norm": 0.4734369218349457, "learning_rate": 6.02147972548114e-06, "loss": 0.3968, "step": 2954 }, { "epoch": 1.4735704787234043, "grad_norm": 0.431611567735672, "learning_rate": 6.018638860952937e-06, "loss": 0.3355, "step": 2955 }, { "epoch": 1.4740691489361701, "grad_norm": 0.5769503712654114, "learning_rate": 6.015797653306401e-06, "loss": 0.4139, "step": 2956 }, { "epoch": 1.4745678191489362, "grad_norm": 0.4101065397262573, "learning_rate": 6.01295610349857e-06, "loss": 0.3187, "step": 2957 }, { "epoch": 1.4750664893617023, "grad_norm": 0.53859943151474, "learning_rate": 6.010114212486587e-06, "loss": 0.432, "step": 2958 }, { "epoch": 1.475565159574468, "grad_norm": 0.4389691650867462, "learning_rate": 6.007271981227718e-06, "loss": 0.3524, "step": 2959 }, { "epoch": 1.476063829787234, "grad_norm": 0.5205132365226746, "learning_rate": 6.0044294106793384e-06, "loss": 0.4182, "step": 2960 }, { "epoch": 1.4765625, "grad_norm": 0.4753202795982361, "learning_rate": 6.001586501798941e-06, "loss": 0.3608, "step": 2961 }, { "epoch": 1.477061170212766, "grad_norm": 0.5297532677650452, "learning_rate": 5.99874325554413e-06, "loss": 0.439, "step": 2962 }, { "epoch": 1.477559840425532, "grad_norm": 0.4820711314678192, "learning_rate": 5.995899672872626e-06, "loss": 0.3552, "step": 2963 }, { "epoch": 1.4780585106382977, "grad_norm": 0.532762348651886, "learning_rate": 5.99305575474226e-06, "loss": 0.3757, "step": 2964 }, { "epoch": 1.4785571808510638, "grad_norm": 0.4470153748989105, "learning_rate": 5.990211502110976e-06, "loss": 0.3771, "step": 2965 }, { "epoch": 1.4790558510638299, "grad_norm": 0.4973207414150238, "learning_rate": 5.987366915936838e-06, "loss": 0.3608, "step": 2966 }, { "epoch": 1.4795545212765957, "grad_norm": 0.46493294835090637, "learning_rate": 5.98452199717801e-06, "loss": 0.4582, "step": 2967 }, { "epoch": 1.4800531914893618, "grad_norm": 0.3786752223968506, "learning_rate": 5.981676746792777e-06, "loss": 0.3444, "step": 2968 }, { "epoch": 1.4805518617021276, "grad_norm": 0.5306916832923889, "learning_rate": 5.978831165739532e-06, "loss": 0.4922, "step": 2969 }, { "epoch": 1.4810505319148937, "grad_norm": 0.4216006100177765, "learning_rate": 5.9759852549767824e-06, "loss": 0.3221, "step": 2970 }, { "epoch": 1.4815492021276595, "grad_norm": 0.4735146462917328, "learning_rate": 5.973139015463145e-06, "loss": 0.3535, "step": 2971 }, { "epoch": 1.4820478723404256, "grad_norm": 0.49853819608688354, "learning_rate": 5.970292448157346e-06, "loss": 0.3711, "step": 2972 }, { "epoch": 1.4825465425531914, "grad_norm": 0.4655818045139313, "learning_rate": 5.967445554018223e-06, "loss": 0.3384, "step": 2973 }, { "epoch": 1.4830452127659575, "grad_norm": 0.4966394305229187, "learning_rate": 5.9645983340047245e-06, "loss": 0.4209, "step": 2974 }, { "epoch": 1.4835438829787235, "grad_norm": 0.4929766356945038, "learning_rate": 5.961750789075909e-06, "loss": 0.3937, "step": 2975 }, { "epoch": 1.4840425531914894, "grad_norm": 0.48668166995048523, "learning_rate": 5.958902920190941e-06, "loss": 0.3232, "step": 2976 }, { "epoch": 1.4845412234042552, "grad_norm": 0.5776094794273376, "learning_rate": 5.9560547283091e-06, "loss": 0.4771, "step": 2977 }, { "epoch": 1.4850398936170213, "grad_norm": 0.4275185763835907, "learning_rate": 5.953206214389767e-06, "loss": 0.3134, "step": 2978 }, { "epoch": 1.4855385638297873, "grad_norm": 0.5079979300498962, "learning_rate": 5.95035737939244e-06, "loss": 0.3925, "step": 2979 }, { "epoch": 1.4860372340425532, "grad_norm": 0.5326768755912781, "learning_rate": 5.947508224276717e-06, "loss": 0.4326, "step": 2980 }, { "epoch": 1.4865359042553192, "grad_norm": 0.4241275489330292, "learning_rate": 5.944658750002309e-06, "loss": 0.3758, "step": 2981 }, { "epoch": 1.487034574468085, "grad_norm": 0.4527067542076111, "learning_rate": 5.941808957529033e-06, "loss": 0.3996, "step": 2982 }, { "epoch": 1.4875332446808511, "grad_norm": 0.44110986590385437, "learning_rate": 5.938958847816811e-06, "loss": 0.3074, "step": 2983 }, { "epoch": 1.488031914893617, "grad_norm": 0.5798968076705933, "learning_rate": 5.936108421825678e-06, "loss": 0.4409, "step": 2984 }, { "epoch": 1.488530585106383, "grad_norm": 0.4448283314704895, "learning_rate": 5.93325768051577e-06, "loss": 0.3547, "step": 2985 }, { "epoch": 1.4890292553191489, "grad_norm": 0.5274401903152466, "learning_rate": 5.930406624847328e-06, "loss": 0.398, "step": 2986 }, { "epoch": 1.489527925531915, "grad_norm": 0.4319194555282593, "learning_rate": 5.927555255780706e-06, "loss": 0.3983, "step": 2987 }, { "epoch": 1.4900265957446808, "grad_norm": 0.4633127748966217, "learning_rate": 5.9247035742763545e-06, "loss": 0.3695, "step": 2988 }, { "epoch": 1.4905252659574468, "grad_norm": 0.48601990938186646, "learning_rate": 5.921851581294838e-06, "loss": 0.4001, "step": 2989 }, { "epoch": 1.4910239361702127, "grad_norm": 0.4647945165634155, "learning_rate": 5.918999277796821e-06, "loss": 0.3485, "step": 2990 }, { "epoch": 1.4915226063829787, "grad_norm": 0.4951007664203644, "learning_rate": 5.916146664743072e-06, "loss": 0.3912, "step": 2991 }, { "epoch": 1.4920212765957448, "grad_norm": 0.4069438874721527, "learning_rate": 5.913293743094466e-06, "loss": 0.3695, "step": 2992 }, { "epoch": 1.4925199468085106, "grad_norm": 0.4664281904697418, "learning_rate": 5.910440513811981e-06, "loss": 0.424, "step": 2993 }, { "epoch": 1.4930186170212765, "grad_norm": 0.4932718873023987, "learning_rate": 5.907586977856699e-06, "loss": 0.3623, "step": 2994 }, { "epoch": 1.4935172872340425, "grad_norm": 0.47377079725265503, "learning_rate": 5.904733136189805e-06, "loss": 0.3853, "step": 2995 }, { "epoch": 1.4940159574468086, "grad_norm": 0.4201277494430542, "learning_rate": 5.901878989772587e-06, "loss": 0.3754, "step": 2996 }, { "epoch": 1.4945146276595744, "grad_norm": 0.5463282465934753, "learning_rate": 5.899024539566437e-06, "loss": 0.4487, "step": 2997 }, { "epoch": 1.4950132978723405, "grad_norm": 0.44982030987739563, "learning_rate": 5.896169786532847e-06, "loss": 0.3864, "step": 2998 }, { "epoch": 1.4955119680851063, "grad_norm": 0.45155069231987, "learning_rate": 5.8933147316334105e-06, "loss": 0.3489, "step": 2999 }, { "epoch": 1.4960106382978724, "grad_norm": 0.4440456032752991, "learning_rate": 5.890459375829827e-06, "loss": 0.4063, "step": 3000 }, { "epoch": 1.4965093085106382, "grad_norm": 0.467811644077301, "learning_rate": 5.887603720083892e-06, "loss": 0.404, "step": 3001 }, { "epoch": 1.4970079787234043, "grad_norm": 0.4784848392009735, "learning_rate": 5.884747765357507e-06, "loss": 0.3872, "step": 3002 }, { "epoch": 1.4975066489361701, "grad_norm": 0.45840418338775635, "learning_rate": 5.88189151261267e-06, "loss": 0.347, "step": 3003 }, { "epoch": 1.4980053191489362, "grad_norm": 0.482599675655365, "learning_rate": 5.8790349628114825e-06, "loss": 0.4355, "step": 3004 }, { "epoch": 1.4985039893617023, "grad_norm": 0.45780959725379944, "learning_rate": 5.876178116916144e-06, "loss": 0.3777, "step": 3005 }, { "epoch": 1.499002659574468, "grad_norm": 0.5321133732795715, "learning_rate": 5.873320975888954e-06, "loss": 0.3857, "step": 3006 }, { "epoch": 1.499501329787234, "grad_norm": 0.41460874676704407, "learning_rate": 5.870463540692314e-06, "loss": 0.3143, "step": 3007 }, { "epoch": 1.5, "grad_norm": 0.4988599419593811, "learning_rate": 5.867605812288721e-06, "loss": 0.4051, "step": 3008 }, { "epoch": 1.500498670212766, "grad_norm": 0.46177545189857483, "learning_rate": 5.864747791640772e-06, "loss": 0.3886, "step": 3009 }, { "epoch": 1.500997340425532, "grad_norm": 0.5025841593742371, "learning_rate": 5.861889479711164e-06, "loss": 0.3782, "step": 3010 }, { "epoch": 1.5014960106382977, "grad_norm": 0.4546986222267151, "learning_rate": 5.8590308774626894e-06, "loss": 0.3483, "step": 3011 }, { "epoch": 1.5019946808510638, "grad_norm": 0.5038589239120483, "learning_rate": 5.85617198585824e-06, "loss": 0.4183, "step": 3012 }, { "epoch": 1.5024933510638299, "grad_norm": 0.45726752281188965, "learning_rate": 5.853312805860806e-06, "loss": 0.3678, "step": 3013 }, { "epoch": 1.5029920212765957, "grad_norm": 0.5287886261940002, "learning_rate": 5.8504533384334726e-06, "loss": 0.3561, "step": 3014 }, { "epoch": 1.5034906914893615, "grad_norm": 0.5144929885864258, "learning_rate": 5.847593584539424e-06, "loss": 0.3643, "step": 3015 }, { "epoch": 1.5039893617021276, "grad_norm": 0.5380445122718811, "learning_rate": 5.844733545141938e-06, "loss": 0.41, "step": 3016 }, { "epoch": 1.5044880319148937, "grad_norm": 0.4918644428253174, "learning_rate": 5.8418732212043925e-06, "loss": 0.3252, "step": 3017 }, { "epoch": 1.5049867021276597, "grad_norm": 0.6114775538444519, "learning_rate": 5.839012613690258e-06, "loss": 0.4113, "step": 3018 }, { "epoch": 1.5054853723404256, "grad_norm": 0.42172950506210327, "learning_rate": 5.8361517235631e-06, "loss": 0.3164, "step": 3019 }, { "epoch": 1.5059840425531914, "grad_norm": 0.565361738204956, "learning_rate": 5.833290551786584e-06, "loss": 0.4568, "step": 3020 }, { "epoch": 1.5064827127659575, "grad_norm": 0.5262414216995239, "learning_rate": 5.830429099324466e-06, "loss": 0.3969, "step": 3021 }, { "epoch": 1.5069813829787235, "grad_norm": 0.5103164911270142, "learning_rate": 5.827567367140595e-06, "loss": 0.365, "step": 3022 }, { "epoch": 1.5074800531914894, "grad_norm": 0.48676496744155884, "learning_rate": 5.82470535619892e-06, "loss": 0.3959, "step": 3023 }, { "epoch": 1.5079787234042552, "grad_norm": 0.6067127585411072, "learning_rate": 5.82184306746348e-06, "loss": 0.3867, "step": 3024 }, { "epoch": 1.5084773936170213, "grad_norm": 0.4873794615268707, "learning_rate": 5.818980501898407e-06, "loss": 0.3802, "step": 3025 }, { "epoch": 1.5089760638297873, "grad_norm": 0.5327600836753845, "learning_rate": 5.816117660467928e-06, "loss": 0.3932, "step": 3026 }, { "epoch": 1.5094747340425532, "grad_norm": 0.5096736550331116, "learning_rate": 5.813254544136365e-06, "loss": 0.4003, "step": 3027 }, { "epoch": 1.509973404255319, "grad_norm": 0.4179621934890747, "learning_rate": 5.810391153868129e-06, "loss": 0.3648, "step": 3028 }, { "epoch": 1.510472074468085, "grad_norm": 0.46990281343460083, "learning_rate": 5.807527490627722e-06, "loss": 0.3996, "step": 3029 }, { "epoch": 1.5109707446808511, "grad_norm": 0.4839146137237549, "learning_rate": 5.804663555379746e-06, "loss": 0.3596, "step": 3030 }, { "epoch": 1.511469414893617, "grad_norm": 0.45438218116760254, "learning_rate": 5.801799349088883e-06, "loss": 0.3743, "step": 3031 }, { "epoch": 1.511968085106383, "grad_norm": 0.48179349303245544, "learning_rate": 5.7989348727199136e-06, "loss": 0.2881, "step": 3032 }, { "epoch": 1.5124667553191489, "grad_norm": 0.5368114709854126, "learning_rate": 5.7960701272377105e-06, "loss": 0.4342, "step": 3033 }, { "epoch": 1.512965425531915, "grad_norm": 0.4696180522441864, "learning_rate": 5.793205113607232e-06, "loss": 0.3331, "step": 3034 }, { "epoch": 1.513464095744681, "grad_norm": 0.4694848954677582, "learning_rate": 5.790339832793533e-06, "loss": 0.3905, "step": 3035 }, { "epoch": 1.5139627659574468, "grad_norm": 0.4565582275390625, "learning_rate": 5.787474285761749e-06, "loss": 0.3828, "step": 3036 }, { "epoch": 1.5144614361702127, "grad_norm": 0.49756839871406555, "learning_rate": 5.784608473477116e-06, "loss": 0.4158, "step": 3037 }, { "epoch": 1.5149601063829787, "grad_norm": 0.5912104845046997, "learning_rate": 5.781742396904952e-06, "loss": 0.4178, "step": 3038 }, { "epoch": 1.5154587765957448, "grad_norm": 0.4062100946903229, "learning_rate": 5.778876057010667e-06, "loss": 0.3424, "step": 3039 }, { "epoch": 1.5159574468085106, "grad_norm": 0.47705531120300293, "learning_rate": 5.776009454759758e-06, "loss": 0.4133, "step": 3040 }, { "epoch": 1.5164561170212765, "grad_norm": 0.525937020778656, "learning_rate": 5.773142591117814e-06, "loss": 0.3948, "step": 3041 }, { "epoch": 1.5169547872340425, "grad_norm": 0.43194180727005005, "learning_rate": 5.770275467050505e-06, "loss": 0.3617, "step": 3042 }, { "epoch": 1.5174534574468086, "grad_norm": 0.5035325884819031, "learning_rate": 5.767408083523596e-06, "loss": 0.3908, "step": 3043 }, { "epoch": 1.5179521276595744, "grad_norm": 0.4815022349357605, "learning_rate": 5.764540441502936e-06, "loss": 0.4068, "step": 3044 }, { "epoch": 1.5184507978723403, "grad_norm": 0.5103643536567688, "learning_rate": 5.761672541954463e-06, "loss": 0.3447, "step": 3045 }, { "epoch": 1.5189494680851063, "grad_norm": 0.4559434652328491, "learning_rate": 5.758804385844198e-06, "loss": 0.3589, "step": 3046 }, { "epoch": 1.5194481382978724, "grad_norm": 0.5262813568115234, "learning_rate": 5.7559359741382515e-06, "loss": 0.4061, "step": 3047 }, { "epoch": 1.5199468085106385, "grad_norm": 0.533911943435669, "learning_rate": 5.75306730780282e-06, "loss": 0.4202, "step": 3048 }, { "epoch": 1.5204454787234043, "grad_norm": 0.49298083782196045, "learning_rate": 5.750198387804185e-06, "loss": 0.3937, "step": 3049 }, { "epoch": 1.5209441489361701, "grad_norm": 0.4833589494228363, "learning_rate": 5.747329215108712e-06, "loss": 0.3468, "step": 3050 }, { "epoch": 1.5214428191489362, "grad_norm": 0.4926168918609619, "learning_rate": 5.744459790682855e-06, "loss": 0.4309, "step": 3051 }, { "epoch": 1.5219414893617023, "grad_norm": 0.5066229701042175, "learning_rate": 5.741590115493152e-06, "loss": 0.3791, "step": 3052 }, { "epoch": 1.522440159574468, "grad_norm": 0.5631579756736755, "learning_rate": 5.73872019050622e-06, "loss": 0.4475, "step": 3053 }, { "epoch": 1.522938829787234, "grad_norm": 0.49344363808631897, "learning_rate": 5.735850016688769e-06, "loss": 0.3586, "step": 3054 }, { "epoch": 1.5234375, "grad_norm": 0.5222611427307129, "learning_rate": 5.732979595007586e-06, "loss": 0.4012, "step": 3055 }, { "epoch": 1.523936170212766, "grad_norm": 0.5215228199958801, "learning_rate": 5.7301089264295465e-06, "loss": 0.379, "step": 3056 }, { "epoch": 1.524434840425532, "grad_norm": 0.4871769845485687, "learning_rate": 5.727238011921603e-06, "loss": 0.3731, "step": 3057 }, { "epoch": 1.5249335106382977, "grad_norm": 0.5340349078178406, "learning_rate": 5.724366852450795e-06, "loss": 0.4153, "step": 3058 }, { "epoch": 1.5254321808510638, "grad_norm": 0.500727117061615, "learning_rate": 5.721495448984247e-06, "loss": 0.4288, "step": 3059 }, { "epoch": 1.5259308510638299, "grad_norm": 0.5078384280204773, "learning_rate": 5.71862380248916e-06, "loss": 0.4093, "step": 3060 }, { "epoch": 1.5264295212765957, "grad_norm": 0.4363429546356201, "learning_rate": 5.7157519139328195e-06, "loss": 0.3115, "step": 3061 }, { "epoch": 1.5269281914893615, "grad_norm": 0.5634632706642151, "learning_rate": 5.712879784282594e-06, "loss": 0.4467, "step": 3062 }, { "epoch": 1.5274268617021276, "grad_norm": 0.4505891501903534, "learning_rate": 5.710007414505931e-06, "loss": 0.322, "step": 3063 }, { "epoch": 1.5279255319148937, "grad_norm": 0.538398802280426, "learning_rate": 5.707134805570358e-06, "loss": 0.4133, "step": 3064 }, { "epoch": 1.5284242021276597, "grad_norm": 0.4392928183078766, "learning_rate": 5.704261958443487e-06, "loss": 0.3917, "step": 3065 }, { "epoch": 1.5289228723404256, "grad_norm": 0.5703918933868408, "learning_rate": 5.701388874093005e-06, "loss": 0.3807, "step": 3066 }, { "epoch": 1.5294215425531914, "grad_norm": 0.4971027076244354, "learning_rate": 5.698515553486685e-06, "loss": 0.3756, "step": 3067 }, { "epoch": 1.5299202127659575, "grad_norm": 0.5500981211662292, "learning_rate": 5.695641997592375e-06, "loss": 0.4155, "step": 3068 }, { "epoch": 1.5304188829787235, "grad_norm": 0.4647640883922577, "learning_rate": 5.692768207378003e-06, "loss": 0.3437, "step": 3069 }, { "epoch": 1.5309175531914894, "grad_norm": 0.5221693515777588, "learning_rate": 5.689894183811579e-06, "loss": 0.3967, "step": 3070 }, { "epoch": 1.5314162234042552, "grad_norm": 0.4165395498275757, "learning_rate": 5.687019927861185e-06, "loss": 0.3273, "step": 3071 }, { "epoch": 1.5319148936170213, "grad_norm": 0.46380341053009033, "learning_rate": 5.684145440494988e-06, "loss": 0.3866, "step": 3072 }, { "epoch": 1.5324135638297873, "grad_norm": 0.48316535353660583, "learning_rate": 5.681270722681232e-06, "loss": 0.4077, "step": 3073 }, { "epoch": 1.5329122340425532, "grad_norm": 0.4879753589630127, "learning_rate": 5.6783957753882335e-06, "loss": 0.3883, "step": 3074 }, { "epoch": 1.533410904255319, "grad_norm": 0.5115413069725037, "learning_rate": 5.675520599584391e-06, "loss": 0.3904, "step": 3075 }, { "epoch": 1.533909574468085, "grad_norm": 0.3809710144996643, "learning_rate": 5.672645196238179e-06, "loss": 0.3225, "step": 3076 }, { "epoch": 1.5344082446808511, "grad_norm": 0.47413015365600586, "learning_rate": 5.669769566318149e-06, "loss": 0.4018, "step": 3077 }, { "epoch": 1.534906914893617, "grad_norm": 0.4939810037612915, "learning_rate": 5.666893710792928e-06, "loss": 0.3895, "step": 3078 }, { "epoch": 1.535405585106383, "grad_norm": 0.45511120557785034, "learning_rate": 5.6640176306312175e-06, "loss": 0.3192, "step": 3079 }, { "epoch": 1.5359042553191489, "grad_norm": 0.4911753535270691, "learning_rate": 5.6611413268018e-06, "loss": 0.4202, "step": 3080 }, { "epoch": 1.536402925531915, "grad_norm": 0.4737807810306549, "learning_rate": 5.6582648002735255e-06, "loss": 0.4406, "step": 3081 }, { "epoch": 1.536901595744681, "grad_norm": 0.46114033460617065, "learning_rate": 5.655388052015325e-06, "loss": 0.3876, "step": 3082 }, { "epoch": 1.5374002659574468, "grad_norm": 0.46059200167655945, "learning_rate": 5.652511082996204e-06, "loss": 0.3853, "step": 3083 }, { "epoch": 1.5378989361702127, "grad_norm": 0.5045183300971985, "learning_rate": 5.649633894185241e-06, "loss": 0.3867, "step": 3084 }, { "epoch": 1.5383976063829787, "grad_norm": 0.43868786096572876, "learning_rate": 5.646756486551586e-06, "loss": 0.3796, "step": 3085 }, { "epoch": 1.5388962765957448, "grad_norm": 0.4706818461418152, "learning_rate": 5.643878861064465e-06, "loss": 0.3581, "step": 3086 }, { "epoch": 1.5393949468085106, "grad_norm": 0.5239452123641968, "learning_rate": 5.641001018693177e-06, "loss": 0.4296, "step": 3087 }, { "epoch": 1.5398936170212765, "grad_norm": 0.5268704295158386, "learning_rate": 5.638122960407097e-06, "loss": 0.3801, "step": 3088 }, { "epoch": 1.5403922872340425, "grad_norm": 0.452516108751297, "learning_rate": 5.635244687175669e-06, "loss": 0.3547, "step": 3089 }, { "epoch": 1.5408909574468086, "grad_norm": 0.4830460548400879, "learning_rate": 5.632366199968411e-06, "loss": 0.403, "step": 3090 }, { "epoch": 1.5413896276595744, "grad_norm": 0.48244336247444153, "learning_rate": 5.629487499754912e-06, "loss": 0.3886, "step": 3091 }, { "epoch": 1.5418882978723403, "grad_norm": 0.48905372619628906, "learning_rate": 5.626608587504834e-06, "loss": 0.3844, "step": 3092 }, { "epoch": 1.5423869680851063, "grad_norm": 0.44537830352783203, "learning_rate": 5.623729464187909e-06, "loss": 0.3186, "step": 3093 }, { "epoch": 1.5428856382978724, "grad_norm": 0.5473161935806274, "learning_rate": 5.620850130773942e-06, "loss": 0.4039, "step": 3094 }, { "epoch": 1.5433843085106385, "grad_norm": 0.4393376410007477, "learning_rate": 5.617970588232807e-06, "loss": 0.3745, "step": 3095 }, { "epoch": 1.5438829787234043, "grad_norm": 0.45979949831962585, "learning_rate": 5.615090837534449e-06, "loss": 0.4103, "step": 3096 }, { "epoch": 1.5443816489361701, "grad_norm": 0.5179013013839722, "learning_rate": 5.6122108796488836e-06, "loss": 0.3775, "step": 3097 }, { "epoch": 1.5448803191489362, "grad_norm": 0.5023601651191711, "learning_rate": 5.609330715546195e-06, "loss": 0.4051, "step": 3098 }, { "epoch": 1.5453789893617023, "grad_norm": 0.45172539353370667, "learning_rate": 5.606450346196539e-06, "loss": 0.3486, "step": 3099 }, { "epoch": 1.545877659574468, "grad_norm": 0.5136901140213013, "learning_rate": 5.603569772570139e-06, "loss": 0.4014, "step": 3100 }, { "epoch": 1.546376329787234, "grad_norm": 0.5295348167419434, "learning_rate": 5.600688995637288e-06, "loss": 0.3858, "step": 3101 }, { "epoch": 1.546875, "grad_norm": 0.46364450454711914, "learning_rate": 5.597808016368346e-06, "loss": 0.4117, "step": 3102 }, { "epoch": 1.547373670212766, "grad_norm": 0.3973535895347595, "learning_rate": 5.594926835733742e-06, "loss": 0.3522, "step": 3103 }, { "epoch": 1.547872340425532, "grad_norm": 0.4590149521827698, "learning_rate": 5.592045454703972e-06, "loss": 0.3788, "step": 3104 }, { "epoch": 1.5483710106382977, "grad_norm": 0.4985351860523224, "learning_rate": 5.589163874249604e-06, "loss": 0.3872, "step": 3105 }, { "epoch": 1.5488696808510638, "grad_norm": 0.4611496329307556, "learning_rate": 5.586282095341268e-06, "loss": 0.3889, "step": 3106 }, { "epoch": 1.5493683510638299, "grad_norm": 0.5933761596679688, "learning_rate": 5.583400118949659e-06, "loss": 0.4351, "step": 3107 }, { "epoch": 1.5498670212765957, "grad_norm": 0.463481068611145, "learning_rate": 5.5805179460455465e-06, "loss": 0.3447, "step": 3108 }, { "epoch": 1.5503656914893615, "grad_norm": 0.5273258686065674, "learning_rate": 5.577635577599759e-06, "loss": 0.3962, "step": 3109 }, { "epoch": 1.5508643617021276, "grad_norm": 0.5007009506225586, "learning_rate": 5.5747530145831945e-06, "loss": 0.3725, "step": 3110 }, { "epoch": 1.5513630319148937, "grad_norm": 0.5099436640739441, "learning_rate": 5.571870257966816e-06, "loss": 0.3874, "step": 3111 }, { "epoch": 1.5518617021276597, "grad_norm": 0.5374582409858704, "learning_rate": 5.5689873087216506e-06, "loss": 0.4096, "step": 3112 }, { "epoch": 1.5523603723404256, "grad_norm": 0.5388182997703552, "learning_rate": 5.566104167818791e-06, "loss": 0.3835, "step": 3113 }, { "epoch": 1.5528590425531914, "grad_norm": 0.4451906681060791, "learning_rate": 5.563220836229394e-06, "loss": 0.3348, "step": 3114 }, { "epoch": 1.5533577127659575, "grad_norm": 0.5385422110557556, "learning_rate": 5.560337314924682e-06, "loss": 0.3772, "step": 3115 }, { "epoch": 1.5538563829787235, "grad_norm": 0.48247775435447693, "learning_rate": 5.557453604875941e-06, "loss": 0.3468, "step": 3116 }, { "epoch": 1.5543550531914894, "grad_norm": 0.5285221934318542, "learning_rate": 5.554569707054518e-06, "loss": 0.3973, "step": 3117 }, { "epoch": 1.5548537234042552, "grad_norm": 0.4557757079601288, "learning_rate": 5.551685622431826e-06, "loss": 0.3467, "step": 3118 }, { "epoch": 1.5553523936170213, "grad_norm": 0.49602586030960083, "learning_rate": 5.548801351979341e-06, "loss": 0.3834, "step": 3119 }, { "epoch": 1.5558510638297873, "grad_norm": 0.4848610460758209, "learning_rate": 5.5459168966685995e-06, "loss": 0.3899, "step": 3120 }, { "epoch": 1.5563497340425532, "grad_norm": 0.44102901220321655, "learning_rate": 5.543032257471202e-06, "loss": 0.3704, "step": 3121 }, { "epoch": 1.556848404255319, "grad_norm": 0.41619575023651123, "learning_rate": 5.540147435358809e-06, "loss": 0.3914, "step": 3122 }, { "epoch": 1.557347074468085, "grad_norm": 0.45828402042388916, "learning_rate": 5.537262431303147e-06, "loss": 0.3872, "step": 3123 }, { "epoch": 1.5578457446808511, "grad_norm": 0.4526064693927765, "learning_rate": 5.5343772462759995e-06, "loss": 0.3815, "step": 3124 }, { "epoch": 1.558344414893617, "grad_norm": 0.4333901107311249, "learning_rate": 5.531491881249211e-06, "loss": 0.393, "step": 3125 }, { "epoch": 1.558843085106383, "grad_norm": 0.576759397983551, "learning_rate": 5.528606337194688e-06, "loss": 0.4101, "step": 3126 }, { "epoch": 1.5593417553191489, "grad_norm": 0.47069084644317627, "learning_rate": 5.525720615084401e-06, "loss": 0.395, "step": 3127 }, { "epoch": 1.559840425531915, "grad_norm": 0.4717172384262085, "learning_rate": 5.522834715890372e-06, "loss": 0.4066, "step": 3128 }, { "epoch": 1.560339095744681, "grad_norm": 0.5257126092910767, "learning_rate": 5.519948640584688e-06, "loss": 0.4079, "step": 3129 }, { "epoch": 1.5608377659574468, "grad_norm": 0.503730833530426, "learning_rate": 5.517062390139497e-06, "loss": 0.3637, "step": 3130 }, { "epoch": 1.5613364361702127, "grad_norm": 0.4263148903846741, "learning_rate": 5.514175965527002e-06, "loss": 0.3655, "step": 3131 }, { "epoch": 1.5618351063829787, "grad_norm": 0.45620137453079224, "learning_rate": 5.5112893677194665e-06, "loss": 0.383, "step": 3132 }, { "epoch": 1.5623337765957448, "grad_norm": 0.5051462650299072, "learning_rate": 5.508402597689212e-06, "loss": 0.4211, "step": 3133 }, { "epoch": 1.5628324468085106, "grad_norm": 0.4921666979789734, "learning_rate": 5.505515656408619e-06, "loss": 0.3867, "step": 3134 }, { "epoch": 1.5633311170212765, "grad_norm": 0.5335116386413574, "learning_rate": 5.502628544850124e-06, "loss": 0.3692, "step": 3135 }, { "epoch": 1.5638297872340425, "grad_norm": 0.4724314510822296, "learning_rate": 5.499741263986223e-06, "loss": 0.37, "step": 3136 }, { "epoch": 1.5643284574468086, "grad_norm": 0.4950323700904846, "learning_rate": 5.496853814789465e-06, "loss": 0.4188, "step": 3137 }, { "epoch": 1.5648271276595744, "grad_norm": 0.5237119793891907, "learning_rate": 5.493966198232461e-06, "loss": 0.3953, "step": 3138 }, { "epoch": 1.5653257978723403, "grad_norm": 0.5295531749725342, "learning_rate": 5.491078415287876e-06, "loss": 0.381, "step": 3139 }, { "epoch": 1.5658244680851063, "grad_norm": 0.42507562041282654, "learning_rate": 5.488190466928429e-06, "loss": 0.371, "step": 3140 }, { "epoch": 1.5663231382978724, "grad_norm": 0.5833573937416077, "learning_rate": 5.485302354126897e-06, "loss": 0.4787, "step": 3141 }, { "epoch": 1.5668218085106385, "grad_norm": 0.4335073232650757, "learning_rate": 5.482414077856113e-06, "loss": 0.3316, "step": 3142 }, { "epoch": 1.5673204787234043, "grad_norm": 0.4947368800640106, "learning_rate": 5.479525639088963e-06, "loss": 0.3769, "step": 3143 }, { "epoch": 1.5678191489361701, "grad_norm": 0.4483374357223511, "learning_rate": 5.476637038798389e-06, "loss": 0.3226, "step": 3144 }, { "epoch": 1.5683178191489362, "grad_norm": 0.5881880521774292, "learning_rate": 5.473748277957388e-06, "loss": 0.401, "step": 3145 }, { "epoch": 1.5688164893617023, "grad_norm": 0.542151927947998, "learning_rate": 5.470859357539009e-06, "loss": 0.4259, "step": 3146 }, { "epoch": 1.569315159574468, "grad_norm": 0.43145692348480225, "learning_rate": 5.467970278516357e-06, "loss": 0.3171, "step": 3147 }, { "epoch": 1.569813829787234, "grad_norm": 0.5011803507804871, "learning_rate": 5.46508104186259e-06, "loss": 0.405, "step": 3148 }, { "epoch": 1.5703125, "grad_norm": 0.5254731178283691, "learning_rate": 5.462191648550916e-06, "loss": 0.3649, "step": 3149 }, { "epoch": 1.570811170212766, "grad_norm": 0.5697242617607117, "learning_rate": 5.459302099554601e-06, "loss": 0.4127, "step": 3150 }, { "epoch": 1.571309840425532, "grad_norm": 0.42184630036354065, "learning_rate": 5.4564123958469586e-06, "loss": 0.3671, "step": 3151 }, { "epoch": 1.5718085106382977, "grad_norm": 0.45539313554763794, "learning_rate": 5.453522538401358e-06, "loss": 0.3505, "step": 3152 }, { "epoch": 1.5723071808510638, "grad_norm": 0.5988955497741699, "learning_rate": 5.450632528191218e-06, "loss": 0.4261, "step": 3153 }, { "epoch": 1.5728058510638299, "grad_norm": 0.46889615058898926, "learning_rate": 5.447742366190008e-06, "loss": 0.3826, "step": 3154 }, { "epoch": 1.5733045212765957, "grad_norm": 0.48868364095687866, "learning_rate": 5.444852053371256e-06, "loss": 0.4114, "step": 3155 }, { "epoch": 1.5738031914893615, "grad_norm": 0.40782424807548523, "learning_rate": 5.44196159070853e-06, "loss": 0.3646, "step": 3156 }, { "epoch": 1.5743018617021276, "grad_norm": 0.5421667098999023, "learning_rate": 5.439070979175454e-06, "loss": 0.3785, "step": 3157 }, { "epoch": 1.5748005319148937, "grad_norm": 0.47960758209228516, "learning_rate": 5.436180219745703e-06, "loss": 0.3588, "step": 3158 }, { "epoch": 1.5752992021276597, "grad_norm": 0.4522923529148102, "learning_rate": 5.433289313393002e-06, "loss": 0.39, "step": 3159 }, { "epoch": 1.5757978723404256, "grad_norm": 0.43358755111694336, "learning_rate": 5.43039826109112e-06, "loss": 0.3539, "step": 3160 }, { "epoch": 1.5762965425531914, "grad_norm": 0.5159251093864441, "learning_rate": 5.42750706381388e-06, "loss": 0.3732, "step": 3161 }, { "epoch": 1.5767952127659575, "grad_norm": 0.5313467383384705, "learning_rate": 5.424615722535155e-06, "loss": 0.4459, "step": 3162 }, { "epoch": 1.5772938829787235, "grad_norm": 0.4542351961135864, "learning_rate": 5.4217242382288625e-06, "loss": 0.3482, "step": 3163 }, { "epoch": 1.5777925531914894, "grad_norm": 0.5030403137207031, "learning_rate": 5.4188326118689715e-06, "loss": 0.4043, "step": 3164 }, { "epoch": 1.5782912234042552, "grad_norm": 0.48404088616371155, "learning_rate": 5.415940844429497e-06, "loss": 0.3489, "step": 3165 }, { "epoch": 1.5787898936170213, "grad_norm": 0.4467640221118927, "learning_rate": 5.413048936884502e-06, "loss": 0.4039, "step": 3166 }, { "epoch": 1.5792885638297873, "grad_norm": 0.43598878383636475, "learning_rate": 5.410156890208097e-06, "loss": 0.3655, "step": 3167 }, { "epoch": 1.5797872340425532, "grad_norm": 0.46472978591918945, "learning_rate": 5.407264705374438e-06, "loss": 0.3702, "step": 3168 }, { "epoch": 1.580285904255319, "grad_norm": 0.4718741178512573, "learning_rate": 5.404372383357729e-06, "loss": 0.4217, "step": 3169 }, { "epoch": 1.580784574468085, "grad_norm": 0.42164841294288635, "learning_rate": 5.40147992513222e-06, "loss": 0.3517, "step": 3170 }, { "epoch": 1.5812832446808511, "grad_norm": 0.4439725875854492, "learning_rate": 5.398587331672207e-06, "loss": 0.3444, "step": 3171 }, { "epoch": 1.581781914893617, "grad_norm": 0.5040503740310669, "learning_rate": 5.395694603952031e-06, "loss": 0.4202, "step": 3172 }, { "epoch": 1.582280585106383, "grad_norm": 0.4620877206325531, "learning_rate": 5.392801742946079e-06, "loss": 0.3408, "step": 3173 }, { "epoch": 1.5827792553191489, "grad_norm": 0.47254225611686707, "learning_rate": 5.389908749628781e-06, "loss": 0.3917, "step": 3174 }, { "epoch": 1.583277925531915, "grad_norm": 0.44340232014656067, "learning_rate": 5.387015624974613e-06, "loss": 0.3749, "step": 3175 }, { "epoch": 1.583776595744681, "grad_norm": 0.48526066541671753, "learning_rate": 5.384122369958096e-06, "loss": 0.4221, "step": 3176 }, { "epoch": 1.5842752659574468, "grad_norm": 0.44475212693214417, "learning_rate": 5.381228985553795e-06, "loss": 0.3858, "step": 3177 }, { "epoch": 1.5847739361702127, "grad_norm": 0.4538457691669464, "learning_rate": 5.3783354727363145e-06, "loss": 0.4039, "step": 3178 }, { "epoch": 1.5852726063829787, "grad_norm": 0.405709832906723, "learning_rate": 5.3754418324803095e-06, "loss": 0.3693, "step": 3179 }, { "epoch": 1.5857712765957448, "grad_norm": 0.447581946849823, "learning_rate": 5.372548065760471e-06, "loss": 0.3535, "step": 3180 }, { "epoch": 1.5862699468085106, "grad_norm": 0.48996442556381226, "learning_rate": 5.369654173551536e-06, "loss": 0.4243, "step": 3181 }, { "epoch": 1.5867686170212765, "grad_norm": 0.47253984212875366, "learning_rate": 5.366760156828284e-06, "loss": 0.4106, "step": 3182 }, { "epoch": 1.5872672872340425, "grad_norm": 0.4582904875278473, "learning_rate": 5.363866016565534e-06, "loss": 0.3666, "step": 3183 }, { "epoch": 1.5877659574468086, "grad_norm": 0.4416930079460144, "learning_rate": 5.36097175373815e-06, "loss": 0.3694, "step": 3184 }, { "epoch": 1.5882646276595744, "grad_norm": 0.5515949130058289, "learning_rate": 5.358077369321034e-06, "loss": 0.4144, "step": 3185 }, { "epoch": 1.5887632978723403, "grad_norm": 0.48669353127479553, "learning_rate": 5.355182864289131e-06, "loss": 0.3982, "step": 3186 }, { "epoch": 1.5892619680851063, "grad_norm": 0.42139387130737305, "learning_rate": 5.352288239617427e-06, "loss": 0.352, "step": 3187 }, { "epoch": 1.5897606382978724, "grad_norm": 0.5246798396110535, "learning_rate": 5.3493934962809465e-06, "loss": 0.3828, "step": 3188 }, { "epoch": 1.5902593085106385, "grad_norm": 0.4870327115058899, "learning_rate": 5.346498635254757e-06, "loss": 0.4018, "step": 3189 }, { "epoch": 1.5907579787234043, "grad_norm": 0.474894255399704, "learning_rate": 5.343603657513959e-06, "loss": 0.347, "step": 3190 }, { "epoch": 1.5912566489361701, "grad_norm": 0.47591468691825867, "learning_rate": 5.340708564033701e-06, "loss": 0.3684, "step": 3191 }, { "epoch": 1.5917553191489362, "grad_norm": 0.4638276994228363, "learning_rate": 5.3378133557891656e-06, "loss": 0.3783, "step": 3192 }, { "epoch": 1.5922539893617023, "grad_norm": 0.47108155488967896, "learning_rate": 5.334918033755573e-06, "loss": 0.3966, "step": 3193 }, { "epoch": 1.592752659574468, "grad_norm": 0.43009883165359497, "learning_rate": 5.3320225989081845e-06, "loss": 0.3498, "step": 3194 }, { "epoch": 1.593251329787234, "grad_norm": 0.5969514846801758, "learning_rate": 5.329127052222299e-06, "loss": 0.4122, "step": 3195 }, { "epoch": 1.59375, "grad_norm": 0.5293123722076416, "learning_rate": 5.326231394673252e-06, "loss": 0.4158, "step": 3196 }, { "epoch": 1.594248670212766, "grad_norm": 0.47010454535484314, "learning_rate": 5.323335627236416e-06, "loss": 0.3834, "step": 3197 }, { "epoch": 1.594747340425532, "grad_norm": 0.48813894391059875, "learning_rate": 5.320439750887204e-06, "loss": 0.3795, "step": 3198 }, { "epoch": 1.5952460106382977, "grad_norm": 0.5626384615898132, "learning_rate": 5.31754376660106e-06, "loss": 0.4308, "step": 3199 }, { "epoch": 1.5957446808510638, "grad_norm": 0.44768255949020386, "learning_rate": 5.3146476753534695e-06, "loss": 0.36, "step": 3200 }, { "epoch": 1.5962433510638299, "grad_norm": 0.4740574359893799, "learning_rate": 5.31175147811995e-06, "loss": 0.3614, "step": 3201 }, { "epoch": 1.5967420212765957, "grad_norm": 0.47889262437820435, "learning_rate": 5.30885517587606e-06, "loss": 0.4381, "step": 3202 }, { "epoch": 1.5972406914893615, "grad_norm": 0.5017000436782837, "learning_rate": 5.305958769597386e-06, "loss": 0.3619, "step": 3203 }, { "epoch": 1.5977393617021276, "grad_norm": 0.5075222253799438, "learning_rate": 5.303062260259555e-06, "loss": 0.4688, "step": 3204 }, { "epoch": 1.5982380319148937, "grad_norm": 0.4181979298591614, "learning_rate": 5.300165648838227e-06, "loss": 0.3458, "step": 3205 }, { "epoch": 1.5987367021276597, "grad_norm": 0.4887390732765198, "learning_rate": 5.297268936309098e-06, "loss": 0.3634, "step": 3206 }, { "epoch": 1.5992353723404256, "grad_norm": 0.5216825008392334, "learning_rate": 5.294372123647895e-06, "loss": 0.4129, "step": 3207 }, { "epoch": 1.5997340425531914, "grad_norm": 0.5068340301513672, "learning_rate": 5.291475211830383e-06, "loss": 0.3696, "step": 3208 }, { "epoch": 1.6002327127659575, "grad_norm": 0.43354782462120056, "learning_rate": 5.288578201832353e-06, "loss": 0.3416, "step": 3209 }, { "epoch": 1.6007313829787235, "grad_norm": 0.47524717450141907, "learning_rate": 5.285681094629639e-06, "loss": 0.3415, "step": 3210 }, { "epoch": 1.6012300531914894, "grad_norm": 0.4850345551967621, "learning_rate": 5.2827838911981e-06, "loss": 0.4029, "step": 3211 }, { "epoch": 1.6017287234042552, "grad_norm": 0.44757533073425293, "learning_rate": 5.279886592513631e-06, "loss": 0.4125, "step": 3212 }, { "epoch": 1.6022273936170213, "grad_norm": 0.5079305171966553, "learning_rate": 5.276989199552159e-06, "loss": 0.4659, "step": 3213 }, { "epoch": 1.6027260638297873, "grad_norm": 0.47321414947509766, "learning_rate": 5.2740917132896395e-06, "loss": 0.3962, "step": 3214 }, { "epoch": 1.6032247340425532, "grad_norm": 0.5188072323799133, "learning_rate": 5.271194134702062e-06, "loss": 0.3864, "step": 3215 }, { "epoch": 1.603723404255319, "grad_norm": 0.48887327313423157, "learning_rate": 5.268296464765449e-06, "loss": 0.3872, "step": 3216 }, { "epoch": 1.604222074468085, "grad_norm": 0.4345913529396057, "learning_rate": 5.265398704455851e-06, "loss": 0.3488, "step": 3217 }, { "epoch": 1.6047207446808511, "grad_norm": 0.4936148226261139, "learning_rate": 5.262500854749348e-06, "loss": 0.4027, "step": 3218 }, { "epoch": 1.605219414893617, "grad_norm": 0.44994083046913147, "learning_rate": 5.259602916622053e-06, "loss": 0.3856, "step": 3219 }, { "epoch": 1.605718085106383, "grad_norm": 0.4601457118988037, "learning_rate": 5.256704891050108e-06, "loss": 0.3758, "step": 3220 }, { "epoch": 1.6062167553191489, "grad_norm": 0.475036084651947, "learning_rate": 5.253806779009684e-06, "loss": 0.39, "step": 3221 }, { "epoch": 1.606715425531915, "grad_norm": 0.4485069513320923, "learning_rate": 5.25090858147698e-06, "loss": 0.3244, "step": 3222 }, { "epoch": 1.607214095744681, "grad_norm": 0.5286679863929749, "learning_rate": 5.248010299428227e-06, "loss": 0.3838, "step": 3223 }, { "epoch": 1.6077127659574468, "grad_norm": 0.43542173504829407, "learning_rate": 5.245111933839678e-06, "loss": 0.3677, "step": 3224 }, { "epoch": 1.6082114361702127, "grad_norm": 0.5116370320320129, "learning_rate": 5.242213485687623e-06, "loss": 0.402, "step": 3225 }, { "epoch": 1.6087101063829787, "grad_norm": 0.490854412317276, "learning_rate": 5.239314955948372e-06, "loss": 0.3145, "step": 3226 }, { "epoch": 1.6092087765957448, "grad_norm": 0.468414306640625, "learning_rate": 5.2364163455982685e-06, "loss": 0.377, "step": 3227 }, { "epoch": 1.6097074468085106, "grad_norm": 0.44141721725463867, "learning_rate": 5.233517655613681e-06, "loss": 0.396, "step": 3228 }, { "epoch": 1.6102061170212765, "grad_norm": 0.465471476316452, "learning_rate": 5.230618886971001e-06, "loss": 0.397, "step": 3229 }, { "epoch": 1.6107047872340425, "grad_norm": 0.47847139835357666, "learning_rate": 5.227720040646651e-06, "loss": 0.3703, "step": 3230 }, { "epoch": 1.6112034574468086, "grad_norm": 0.43266013264656067, "learning_rate": 5.22482111761708e-06, "loss": 0.3844, "step": 3231 }, { "epoch": 1.6117021276595744, "grad_norm": 0.46343961358070374, "learning_rate": 5.22192211885876e-06, "loss": 0.3603, "step": 3232 }, { "epoch": 1.6122007978723403, "grad_norm": 0.540951669216156, "learning_rate": 5.219023045348189e-06, "loss": 0.4095, "step": 3233 }, { "epoch": 1.6126994680851063, "grad_norm": 0.49725818634033203, "learning_rate": 5.216123898061894e-06, "loss": 0.3956, "step": 3234 }, { "epoch": 1.6131981382978724, "grad_norm": 0.4342799186706543, "learning_rate": 5.213224677976421e-06, "loss": 0.3284, "step": 3235 }, { "epoch": 1.6136968085106385, "grad_norm": 0.5338433980941772, "learning_rate": 5.210325386068345e-06, "loss": 0.3634, "step": 3236 }, { "epoch": 1.6141954787234043, "grad_norm": 0.46561914682388306, "learning_rate": 5.207426023314261e-06, "loss": 0.4082, "step": 3237 }, { "epoch": 1.6146941489361701, "grad_norm": 0.47157540917396545, "learning_rate": 5.204526590690792e-06, "loss": 0.4369, "step": 3238 }, { "epoch": 1.6151928191489362, "grad_norm": 0.39179396629333496, "learning_rate": 5.2016270891745834e-06, "loss": 0.3212, "step": 3239 }, { "epoch": 1.6156914893617023, "grad_norm": 0.46289822459220886, "learning_rate": 5.198727519742303e-06, "loss": 0.3384, "step": 3240 }, { "epoch": 1.616190159574468, "grad_norm": 0.5215504169464111, "learning_rate": 5.195827883370641e-06, "loss": 0.4086, "step": 3241 }, { "epoch": 1.616688829787234, "grad_norm": 0.4428079128265381, "learning_rate": 5.192928181036312e-06, "loss": 0.308, "step": 3242 }, { "epoch": 1.6171875, "grad_norm": 0.52088862657547, "learning_rate": 5.190028413716052e-06, "loss": 0.4232, "step": 3243 }, { "epoch": 1.617686170212766, "grad_norm": 0.45960530638694763, "learning_rate": 5.187128582386618e-06, "loss": 0.3914, "step": 3244 }, { "epoch": 1.618184840425532, "grad_norm": 0.4490441083908081, "learning_rate": 5.184228688024789e-06, "loss": 0.367, "step": 3245 }, { "epoch": 1.6186835106382977, "grad_norm": 0.5134024620056152, "learning_rate": 5.181328731607366e-06, "loss": 0.3954, "step": 3246 }, { "epoch": 1.6191821808510638, "grad_norm": 0.49582669138908386, "learning_rate": 5.17842871411117e-06, "loss": 0.4198, "step": 3247 }, { "epoch": 1.6196808510638299, "grad_norm": 0.4416256248950958, "learning_rate": 5.175528636513044e-06, "loss": 0.3054, "step": 3248 }, { "epoch": 1.6201795212765957, "grad_norm": 0.5165151953697205, "learning_rate": 5.172628499789848e-06, "loss": 0.3986, "step": 3249 }, { "epoch": 1.6206781914893615, "grad_norm": 0.5079205632209778, "learning_rate": 5.169728304918467e-06, "loss": 0.3636, "step": 3250 }, { "epoch": 1.6211768617021276, "grad_norm": 0.4670352339744568, "learning_rate": 5.1668280528758e-06, "loss": 0.4268, "step": 3251 }, { "epoch": 1.6216755319148937, "grad_norm": 0.4447503089904785, "learning_rate": 5.163927744638769e-06, "loss": 0.3227, "step": 3252 }, { "epoch": 1.6221742021276597, "grad_norm": 0.5275959372520447, "learning_rate": 5.161027381184314e-06, "loss": 0.4068, "step": 3253 }, { "epoch": 1.6226728723404256, "grad_norm": 0.47398805618286133, "learning_rate": 5.158126963489394e-06, "loss": 0.3046, "step": 3254 }, { "epoch": 1.6231715425531914, "grad_norm": 0.5097455382347107, "learning_rate": 5.155226492530985e-06, "loss": 0.4209, "step": 3255 }, { "epoch": 1.6236702127659575, "grad_norm": 0.5130666494369507, "learning_rate": 5.152325969286081e-06, "loss": 0.4291, "step": 3256 }, { "epoch": 1.6241688829787235, "grad_norm": 0.40825799107551575, "learning_rate": 5.1494253947316965e-06, "loss": 0.3242, "step": 3257 }, { "epoch": 1.6246675531914894, "grad_norm": 0.4768308401107788, "learning_rate": 5.146524769844859e-06, "loss": 0.4136, "step": 3258 }, { "epoch": 1.6251662234042552, "grad_norm": 0.4793086051940918, "learning_rate": 5.143624095602614e-06, "loss": 0.4228, "step": 3259 }, { "epoch": 1.6256648936170213, "grad_norm": 0.4529089331626892, "learning_rate": 5.140723372982028e-06, "loss": 0.3787, "step": 3260 }, { "epoch": 1.6261635638297873, "grad_norm": 0.46134570240974426, "learning_rate": 5.1378226029601785e-06, "loss": 0.3834, "step": 3261 }, { "epoch": 1.6266622340425532, "grad_norm": 0.5034077167510986, "learning_rate": 5.134921786514162e-06, "loss": 0.409, "step": 3262 }, { "epoch": 1.627160904255319, "grad_norm": 0.4956651031970978, "learning_rate": 5.132020924621088e-06, "loss": 0.4026, "step": 3263 }, { "epoch": 1.627659574468085, "grad_norm": 0.4547511637210846, "learning_rate": 5.1291200182580835e-06, "loss": 0.3454, "step": 3264 }, { "epoch": 1.6281582446808511, "grad_norm": 0.47477948665618896, "learning_rate": 5.126219068402289e-06, "loss": 0.3764, "step": 3265 }, { "epoch": 1.628656914893617, "grad_norm": 0.4961487352848053, "learning_rate": 5.123318076030865e-06, "loss": 0.3966, "step": 3266 }, { "epoch": 1.629155585106383, "grad_norm": 0.4957907795906067, "learning_rate": 5.120417042120976e-06, "loss": 0.3684, "step": 3267 }, { "epoch": 1.6296542553191489, "grad_norm": 0.5488330125808716, "learning_rate": 5.117515967649809e-06, "loss": 0.4171, "step": 3268 }, { "epoch": 1.630152925531915, "grad_norm": 0.42043301463127136, "learning_rate": 5.114614853594561e-06, "loss": 0.3487, "step": 3269 }, { "epoch": 1.630651595744681, "grad_norm": 0.5244244337081909, "learning_rate": 5.111713700932443e-06, "loss": 0.3279, "step": 3270 }, { "epoch": 1.6311502659574468, "grad_norm": 0.5437098741531372, "learning_rate": 5.108812510640683e-06, "loss": 0.4223, "step": 3271 }, { "epoch": 1.6316489361702127, "grad_norm": 0.48778098821640015, "learning_rate": 5.105911283696513e-06, "loss": 0.3883, "step": 3272 }, { "epoch": 1.6321476063829787, "grad_norm": 0.4349970519542694, "learning_rate": 5.103010021077186e-06, "loss": 0.3804, "step": 3273 }, { "epoch": 1.6326462765957448, "grad_norm": 0.5115519165992737, "learning_rate": 5.1001087237599614e-06, "loss": 0.4424, "step": 3274 }, { "epoch": 1.6331449468085106, "grad_norm": 0.4928317070007324, "learning_rate": 5.097207392722112e-06, "loss": 0.3588, "step": 3275 }, { "epoch": 1.6336436170212765, "grad_norm": 0.4968772828578949, "learning_rate": 5.094306028940923e-06, "loss": 0.4203, "step": 3276 }, { "epoch": 1.6341422872340425, "grad_norm": 0.4725695252418518, "learning_rate": 5.091404633393692e-06, "loss": 0.3545, "step": 3277 }, { "epoch": 1.6346409574468086, "grad_norm": 0.4489421844482422, "learning_rate": 5.08850320705772e-06, "loss": 0.3741, "step": 3278 }, { "epoch": 1.6351396276595744, "grad_norm": 0.4971451163291931, "learning_rate": 5.085601750910327e-06, "loss": 0.4033, "step": 3279 }, { "epoch": 1.6356382978723403, "grad_norm": 0.47382646799087524, "learning_rate": 5.0827002659288395e-06, "loss": 0.3701, "step": 3280 }, { "epoch": 1.6361369680851063, "grad_norm": 0.4536781311035156, "learning_rate": 5.079798753090591e-06, "loss": 0.3745, "step": 3281 }, { "epoch": 1.6366356382978724, "grad_norm": 0.5720585584640503, "learning_rate": 5.076897213372932e-06, "loss": 0.4224, "step": 3282 }, { "epoch": 1.6371343085106385, "grad_norm": 0.42274966835975647, "learning_rate": 5.073995647753212e-06, "loss": 0.3471, "step": 3283 }, { "epoch": 1.6376329787234043, "grad_norm": 0.465618371963501, "learning_rate": 5.0710940572088e-06, "loss": 0.3857, "step": 3284 }, { "epoch": 1.6381316489361701, "grad_norm": 0.49015915393829346, "learning_rate": 5.068192442717063e-06, "loss": 0.3812, "step": 3285 }, { "epoch": 1.6386303191489362, "grad_norm": 0.46080517768859863, "learning_rate": 5.065290805255382e-06, "loss": 0.3511, "step": 3286 }, { "epoch": 1.6391289893617023, "grad_norm": 0.4662431478500366, "learning_rate": 5.062389145801146e-06, "loss": 0.3672, "step": 3287 }, { "epoch": 1.639627659574468, "grad_norm": 0.5237916707992554, "learning_rate": 5.059487465331749e-06, "loss": 0.3776, "step": 3288 }, { "epoch": 1.640126329787234, "grad_norm": 0.5536121129989624, "learning_rate": 5.056585764824593e-06, "loss": 0.3773, "step": 3289 }, { "epoch": 1.640625, "grad_norm": 0.47722572088241577, "learning_rate": 5.053684045257086e-06, "loss": 0.4138, "step": 3290 }, { "epoch": 1.641123670212766, "grad_norm": 0.4663361608982086, "learning_rate": 5.050782307606646e-06, "loss": 0.345, "step": 3291 }, { "epoch": 1.641622340425532, "grad_norm": 0.5041977167129517, "learning_rate": 5.047880552850692e-06, "loss": 0.3653, "step": 3292 }, { "epoch": 1.6421210106382977, "grad_norm": 0.4747192859649658, "learning_rate": 5.044978781966653e-06, "loss": 0.3651, "step": 3293 }, { "epoch": 1.6426196808510638, "grad_norm": 0.42897164821624756, "learning_rate": 5.042076995931959e-06, "loss": 0.3698, "step": 3294 }, { "epoch": 1.6431183510638299, "grad_norm": 0.47379183769226074, "learning_rate": 5.039175195724049e-06, "loss": 0.3841, "step": 3295 }, { "epoch": 1.6436170212765957, "grad_norm": 0.4924936592578888, "learning_rate": 5.036273382320365e-06, "loss": 0.3943, "step": 3296 }, { "epoch": 1.6441156914893615, "grad_norm": 0.5475698709487915, "learning_rate": 5.033371556698356e-06, "loss": 0.4227, "step": 3297 }, { "epoch": 1.6446143617021276, "grad_norm": 0.4492906332015991, "learning_rate": 5.0304697198354704e-06, "loss": 0.3671, "step": 3298 }, { "epoch": 1.6451130319148937, "grad_norm": 0.4895496964454651, "learning_rate": 5.027567872709163e-06, "loss": 0.3661, "step": 3299 }, { "epoch": 1.6456117021276597, "grad_norm": 0.4739525318145752, "learning_rate": 5.024666016296895e-06, "loss": 0.3513, "step": 3300 }, { "epoch": 1.6461103723404256, "grad_norm": 0.5049176216125488, "learning_rate": 5.021764151576124e-06, "loss": 0.3912, "step": 3301 }, { "epoch": 1.6466090425531914, "grad_norm": 0.44023334980010986, "learning_rate": 5.018862279524317e-06, "loss": 0.3664, "step": 3302 }, { "epoch": 1.6471077127659575, "grad_norm": 0.5342357158660889, "learning_rate": 5.015960401118939e-06, "loss": 0.3841, "step": 3303 }, { "epoch": 1.6476063829787235, "grad_norm": 0.6028374433517456, "learning_rate": 5.013058517337458e-06, "loss": 0.4161, "step": 3304 }, { "epoch": 1.6481050531914894, "grad_norm": 0.45772987604141235, "learning_rate": 5.010156629157347e-06, "loss": 0.3752, "step": 3305 }, { "epoch": 1.6486037234042552, "grad_norm": 0.44299960136413574, "learning_rate": 5.007254737556078e-06, "loss": 0.3773, "step": 3306 }, { "epoch": 1.6491023936170213, "grad_norm": 0.510478675365448, "learning_rate": 5.004352843511121e-06, "loss": 0.3667, "step": 3307 }, { "epoch": 1.6496010638297873, "grad_norm": 0.5422745943069458, "learning_rate": 5.001450947999953e-06, "loss": 0.4293, "step": 3308 }, { "epoch": 1.6500997340425532, "grad_norm": 0.5044219493865967, "learning_rate": 4.9985490520000484e-06, "loss": 0.3571, "step": 3309 }, { "epoch": 1.650598404255319, "grad_norm": 0.4276781380176544, "learning_rate": 4.995647156488879e-06, "loss": 0.3254, "step": 3310 }, { "epoch": 1.651097074468085, "grad_norm": 0.5449880957603455, "learning_rate": 4.992745262443925e-06, "loss": 0.4138, "step": 3311 }, { "epoch": 1.6515957446808511, "grad_norm": 0.4423791468143463, "learning_rate": 4.989843370842653e-06, "loss": 0.3038, "step": 3312 }, { "epoch": 1.652094414893617, "grad_norm": 0.5069630742073059, "learning_rate": 4.986941482662543e-06, "loss": 0.3976, "step": 3313 }, { "epoch": 1.652593085106383, "grad_norm": 0.4963846504688263, "learning_rate": 4.984039598881063e-06, "loss": 0.3829, "step": 3314 }, { "epoch": 1.6530917553191489, "grad_norm": 0.5251636505126953, "learning_rate": 4.981137720475685e-06, "loss": 0.3975, "step": 3315 }, { "epoch": 1.653590425531915, "grad_norm": 0.4866626560688019, "learning_rate": 4.978235848423878e-06, "loss": 0.377, "step": 3316 }, { "epoch": 1.654089095744681, "grad_norm": 0.5025191903114319, "learning_rate": 4.9753339837031066e-06, "loss": 0.4184, "step": 3317 }, { "epoch": 1.6545877659574468, "grad_norm": 0.47457778453826904, "learning_rate": 4.9724321272908374e-06, "loss": 0.3925, "step": 3318 }, { "epoch": 1.6550864361702127, "grad_norm": 0.4731907248497009, "learning_rate": 4.969530280164531e-06, "loss": 0.3557, "step": 3319 }, { "epoch": 1.6555851063829787, "grad_norm": 0.4479469656944275, "learning_rate": 4.966628443301645e-06, "loss": 0.3539, "step": 3320 }, { "epoch": 1.6560837765957448, "grad_norm": 0.5829287171363831, "learning_rate": 4.9637266176796346e-06, "loss": 0.4115, "step": 3321 }, { "epoch": 1.6565824468085106, "grad_norm": 0.5210765600204468, "learning_rate": 4.960824804275952e-06, "loss": 0.3927, "step": 3322 }, { "epoch": 1.6570811170212765, "grad_norm": 0.4922831654548645, "learning_rate": 4.957923004068042e-06, "loss": 0.4048, "step": 3323 }, { "epoch": 1.6575797872340425, "grad_norm": 0.47586748003959656, "learning_rate": 4.955021218033349e-06, "loss": 0.4037, "step": 3324 }, { "epoch": 1.6580784574468086, "grad_norm": 0.47294652462005615, "learning_rate": 4.95211944714931e-06, "loss": 0.3507, "step": 3325 }, { "epoch": 1.6585771276595744, "grad_norm": 0.4798189401626587, "learning_rate": 4.949217692393355e-06, "loss": 0.3515, "step": 3326 }, { "epoch": 1.6590757978723403, "grad_norm": 0.5403212308883667, "learning_rate": 4.946315954742915e-06, "loss": 0.4548, "step": 3327 }, { "epoch": 1.6595744680851063, "grad_norm": 0.4689825177192688, "learning_rate": 4.943414235175409e-06, "loss": 0.38, "step": 3328 }, { "epoch": 1.6600731382978724, "grad_norm": 0.4135172963142395, "learning_rate": 4.940512534668253e-06, "loss": 0.3366, "step": 3329 }, { "epoch": 1.6605718085106385, "grad_norm": 0.48607534170150757, "learning_rate": 4.937610854198855e-06, "loss": 0.4653, "step": 3330 }, { "epoch": 1.6610704787234043, "grad_norm": 0.45841100811958313, "learning_rate": 4.9347091947446195e-06, "loss": 0.3839, "step": 3331 }, { "epoch": 1.6615691489361701, "grad_norm": 0.465248703956604, "learning_rate": 4.931807557282938e-06, "loss": 0.3714, "step": 3332 }, { "epoch": 1.6620678191489362, "grad_norm": 0.47332653403282166, "learning_rate": 4.9289059427912026e-06, "loss": 0.3835, "step": 3333 }, { "epoch": 1.6625664893617023, "grad_norm": 0.434819757938385, "learning_rate": 4.926004352246787e-06, "loss": 0.3469, "step": 3334 }, { "epoch": 1.663065159574468, "grad_norm": 0.48399391770362854, "learning_rate": 4.92310278662707e-06, "loss": 0.3974, "step": 3335 }, { "epoch": 1.663563829787234, "grad_norm": 0.4823490381240845, "learning_rate": 4.92020124690941e-06, "loss": 0.413, "step": 3336 }, { "epoch": 1.6640625, "grad_norm": 0.46536529064178467, "learning_rate": 4.917299734071162e-06, "loss": 0.4012, "step": 3337 }, { "epoch": 1.664561170212766, "grad_norm": 0.4519573450088501, "learning_rate": 4.914398249089675e-06, "loss": 0.3522, "step": 3338 }, { "epoch": 1.665059840425532, "grad_norm": 0.4388159215450287, "learning_rate": 4.9114967929422815e-06, "loss": 0.3553, "step": 3339 }, { "epoch": 1.6655585106382977, "grad_norm": 0.4741784930229187, "learning_rate": 4.9085953666063115e-06, "loss": 0.4144, "step": 3340 }, { "epoch": 1.6660571808510638, "grad_norm": 0.43463578820228577, "learning_rate": 4.905693971059077e-06, "loss": 0.3321, "step": 3341 }, { "epoch": 1.6665558510638299, "grad_norm": 0.4981335401535034, "learning_rate": 4.902792607277889e-06, "loss": 0.3743, "step": 3342 }, { "epoch": 1.6670545212765957, "grad_norm": 0.49953946471214294, "learning_rate": 4.899891276240039e-06, "loss": 0.3958, "step": 3343 }, { "epoch": 1.6675531914893615, "grad_norm": 0.4518425166606903, "learning_rate": 4.896989978922816e-06, "loss": 0.3698, "step": 3344 }, { "epoch": 1.6680518617021276, "grad_norm": 0.445557564496994, "learning_rate": 4.894088716303487e-06, "loss": 0.348, "step": 3345 }, { "epoch": 1.6685505319148937, "grad_norm": 0.4784151017665863, "learning_rate": 4.891187489359319e-06, "loss": 0.4039, "step": 3346 }, { "epoch": 1.6690492021276597, "grad_norm": 0.4236558675765991, "learning_rate": 4.888286299067558e-06, "loss": 0.3732, "step": 3347 }, { "epoch": 1.6695478723404256, "grad_norm": 0.4813937544822693, "learning_rate": 4.885385146405441e-06, "loss": 0.4008, "step": 3348 }, { "epoch": 1.6700465425531914, "grad_norm": 0.4510667622089386, "learning_rate": 4.882484032350195e-06, "loss": 0.3939, "step": 3349 }, { "epoch": 1.6705452127659575, "grad_norm": 0.5423458814620972, "learning_rate": 4.879582957879026e-06, "loss": 0.3892, "step": 3350 }, { "epoch": 1.6710438829787235, "grad_norm": 0.46496787667274475, "learning_rate": 4.876681923969138e-06, "loss": 0.3514, "step": 3351 }, { "epoch": 1.6715425531914894, "grad_norm": 0.6261228919029236, "learning_rate": 4.873780931597711e-06, "loss": 0.4325, "step": 3352 }, { "epoch": 1.6720412234042552, "grad_norm": 0.4396975636482239, "learning_rate": 4.870879981741917e-06, "loss": 0.3814, "step": 3353 }, { "epoch": 1.6725398936170213, "grad_norm": 0.47884419560432434, "learning_rate": 4.867979075378914e-06, "loss": 0.3893, "step": 3354 }, { "epoch": 1.6730385638297873, "grad_norm": 0.44452783465385437, "learning_rate": 4.86507821348584e-06, "loss": 0.3904, "step": 3355 }, { "epoch": 1.6735372340425532, "grad_norm": 0.48230352997779846, "learning_rate": 4.862177397039823e-06, "loss": 0.396, "step": 3356 }, { "epoch": 1.674035904255319, "grad_norm": 0.5151346325874329, "learning_rate": 4.8592766270179734e-06, "loss": 0.3195, "step": 3357 }, { "epoch": 1.674534574468085, "grad_norm": 0.47485584020614624, "learning_rate": 4.856375904397388e-06, "loss": 0.3766, "step": 3358 }, { "epoch": 1.6750332446808511, "grad_norm": 0.46977564692497253, "learning_rate": 4.853475230155144e-06, "loss": 0.3498, "step": 3359 }, { "epoch": 1.675531914893617, "grad_norm": 0.47038301825523376, "learning_rate": 4.850574605268307e-06, "loss": 0.4119, "step": 3360 }, { "epoch": 1.676030585106383, "grad_norm": 0.4519514739513397, "learning_rate": 4.84767403071392e-06, "loss": 0.386, "step": 3361 }, { "epoch": 1.6765292553191489, "grad_norm": 0.5089487433433533, "learning_rate": 4.844773507469018e-06, "loss": 0.3603, "step": 3362 }, { "epoch": 1.677027925531915, "grad_norm": 0.530449628829956, "learning_rate": 4.841873036510607e-06, "loss": 0.4106, "step": 3363 }, { "epoch": 1.677526595744681, "grad_norm": 0.473112553358078, "learning_rate": 4.838972618815688e-06, "loss": 0.3911, "step": 3364 }, { "epoch": 1.6780252659574468, "grad_norm": 0.4773564636707306, "learning_rate": 4.836072255361232e-06, "loss": 0.3474, "step": 3365 }, { "epoch": 1.6785239361702127, "grad_norm": 0.5792908072471619, "learning_rate": 4.833171947124202e-06, "loss": 0.4591, "step": 3366 }, { "epoch": 1.6790226063829787, "grad_norm": 0.46712735295295715, "learning_rate": 4.830271695081533e-06, "loss": 0.3453, "step": 3367 }, { "epoch": 1.6795212765957448, "grad_norm": 0.529646635055542, "learning_rate": 4.827371500210153e-06, "loss": 0.3936, "step": 3368 }, { "epoch": 1.6800199468085106, "grad_norm": 0.46670159697532654, "learning_rate": 4.824471363486959e-06, "loss": 0.356, "step": 3369 }, { "epoch": 1.6805186170212765, "grad_norm": 0.5418618321418762, "learning_rate": 4.821571285888831e-06, "loss": 0.3899, "step": 3370 }, { "epoch": 1.6810172872340425, "grad_norm": 0.4927017390727997, "learning_rate": 4.818671268392635e-06, "loss": 0.3904, "step": 3371 }, { "epoch": 1.6815159574468086, "grad_norm": 0.4487558901309967, "learning_rate": 4.815771311975212e-06, "loss": 0.3512, "step": 3372 }, { "epoch": 1.6820146276595744, "grad_norm": 0.5817146301269531, "learning_rate": 4.812871417613383e-06, "loss": 0.4231, "step": 3373 }, { "epoch": 1.6825132978723403, "grad_norm": 0.497183620929718, "learning_rate": 4.809971586283949e-06, "loss": 0.347, "step": 3374 }, { "epoch": 1.6830119680851063, "grad_norm": 0.5156857371330261, "learning_rate": 4.807071818963689e-06, "loss": 0.4191, "step": 3375 }, { "epoch": 1.6835106382978724, "grad_norm": 0.4486006200313568, "learning_rate": 4.80417211662936e-06, "loss": 0.3668, "step": 3376 }, { "epoch": 1.6840093085106385, "grad_norm": 0.5270839333534241, "learning_rate": 4.801272480257699e-06, "loss": 0.3615, "step": 3377 }, { "epoch": 1.6845079787234043, "grad_norm": 0.5548658967018127, "learning_rate": 4.798372910825417e-06, "loss": 0.4146, "step": 3378 }, { "epoch": 1.6850066489361701, "grad_norm": 0.48205557465553284, "learning_rate": 4.79547340930921e-06, "loss": 0.362, "step": 3379 }, { "epoch": 1.6855053191489362, "grad_norm": 0.46804237365722656, "learning_rate": 4.792573976685742e-06, "loss": 0.4177, "step": 3380 }, { "epoch": 1.6860039893617023, "grad_norm": 0.5049431920051575, "learning_rate": 4.789674613931658e-06, "loss": 0.3953, "step": 3381 }, { "epoch": 1.686502659574468, "grad_norm": 0.445559024810791, "learning_rate": 4.786775322023581e-06, "loss": 0.3355, "step": 3382 }, { "epoch": 1.687001329787234, "grad_norm": 0.4492751657962799, "learning_rate": 4.783876101938107e-06, "loss": 0.3547, "step": 3383 }, { "epoch": 1.6875, "grad_norm": 0.5307989716529846, "learning_rate": 4.780976954651812e-06, "loss": 0.4275, "step": 3384 }, { "epoch": 1.687998670212766, "grad_norm": 0.44544529914855957, "learning_rate": 4.778077881141241e-06, "loss": 0.2982, "step": 3385 }, { "epoch": 1.688497340425532, "grad_norm": 0.4668685495853424, "learning_rate": 4.7751788823829214e-06, "loss": 0.4124, "step": 3386 }, { "epoch": 1.6889960106382977, "grad_norm": 0.48278042674064636, "learning_rate": 4.772279959353349e-06, "loss": 0.3813, "step": 3387 }, { "epoch": 1.6894946808510638, "grad_norm": 0.4886055886745453, "learning_rate": 4.769381113029002e-06, "loss": 0.3678, "step": 3388 }, { "epoch": 1.6899933510638299, "grad_norm": 0.47585487365722656, "learning_rate": 4.766482344386321e-06, "loss": 0.4013, "step": 3389 }, { "epoch": 1.6904920212765957, "grad_norm": 0.4666983187198639, "learning_rate": 4.763583654401733e-06, "loss": 0.3489, "step": 3390 }, { "epoch": 1.6909906914893615, "grad_norm": 0.48016229271888733, "learning_rate": 4.76068504405163e-06, "loss": 0.4449, "step": 3391 }, { "epoch": 1.6914893617021276, "grad_norm": 0.40169987082481384, "learning_rate": 4.757786514312378e-06, "loss": 0.3152, "step": 3392 }, { "epoch": 1.6919880319148937, "grad_norm": 0.4799787402153015, "learning_rate": 4.754888066160324e-06, "loss": 0.3988, "step": 3393 }, { "epoch": 1.6924867021276597, "grad_norm": 0.6232563853263855, "learning_rate": 4.751989700571775e-06, "loss": 0.4495, "step": 3394 }, { "epoch": 1.6929853723404256, "grad_norm": 0.38306817412376404, "learning_rate": 4.749091418523022e-06, "loss": 0.3053, "step": 3395 }, { "epoch": 1.6934840425531914, "grad_norm": 0.49315330386161804, "learning_rate": 4.746193220990317e-06, "loss": 0.4242, "step": 3396 }, { "epoch": 1.6939827127659575, "grad_norm": 0.4577007591724396, "learning_rate": 4.743295108949893e-06, "loss": 0.3551, "step": 3397 }, { "epoch": 1.6944813829787235, "grad_norm": 0.4860519766807556, "learning_rate": 4.740397083377947e-06, "loss": 0.4342, "step": 3398 }, { "epoch": 1.6949800531914894, "grad_norm": 0.45013630390167236, "learning_rate": 4.737499145250654e-06, "loss": 0.3539, "step": 3399 }, { "epoch": 1.6954787234042552, "grad_norm": 0.4648714065551758, "learning_rate": 4.734601295544152e-06, "loss": 0.3486, "step": 3400 }, { "epoch": 1.6959773936170213, "grad_norm": 0.5200973749160767, "learning_rate": 4.731703535234552e-06, "loss": 0.4301, "step": 3401 }, { "epoch": 1.6964760638297873, "grad_norm": 0.45266619324684143, "learning_rate": 4.72880586529794e-06, "loss": 0.4229, "step": 3402 }, { "epoch": 1.6969747340425532, "grad_norm": 0.41455936431884766, "learning_rate": 4.725908286710362e-06, "loss": 0.3411, "step": 3403 }, { "epoch": 1.697473404255319, "grad_norm": 0.45154812932014465, "learning_rate": 4.723010800447844e-06, "loss": 0.4029, "step": 3404 }, { "epoch": 1.697972074468085, "grad_norm": 0.4525272250175476, "learning_rate": 4.72011340748637e-06, "loss": 0.4183, "step": 3405 }, { "epoch": 1.6984707446808511, "grad_norm": 0.4914379119873047, "learning_rate": 4.717216108801901e-06, "loss": 0.3971, "step": 3406 }, { "epoch": 1.698969414893617, "grad_norm": 0.4055275619029999, "learning_rate": 4.714318905370362e-06, "loss": 0.323, "step": 3407 }, { "epoch": 1.699468085106383, "grad_norm": 0.4218687415122986, "learning_rate": 4.711421798167648e-06, "loss": 0.3716, "step": 3408 }, { "epoch": 1.6999667553191489, "grad_norm": 0.42190393805503845, "learning_rate": 4.70852478816962e-06, "loss": 0.4233, "step": 3409 }, { "epoch": 1.700465425531915, "grad_norm": 0.46803268790245056, "learning_rate": 4.705627876352106e-06, "loss": 0.3651, "step": 3410 }, { "epoch": 1.700964095744681, "grad_norm": 0.521584689617157, "learning_rate": 4.702731063690905e-06, "loss": 0.352, "step": 3411 }, { "epoch": 1.7014627659574468, "grad_norm": 0.4741959273815155, "learning_rate": 4.699834351161774e-06, "loss": 0.3818, "step": 3412 }, { "epoch": 1.7019614361702127, "grad_norm": 0.40654298663139343, "learning_rate": 4.696937739740448e-06, "loss": 0.3407, "step": 3413 }, { "epoch": 1.7024601063829787, "grad_norm": 0.518452525138855, "learning_rate": 4.694041230402616e-06, "loss": 0.3948, "step": 3414 }, { "epoch": 1.7029587765957448, "grad_norm": 0.4426954686641693, "learning_rate": 4.691144824123944e-06, "loss": 0.3399, "step": 3415 }, { "epoch": 1.7034574468085106, "grad_norm": 0.4847259521484375, "learning_rate": 4.68824852188005e-06, "loss": 0.3907, "step": 3416 }, { "epoch": 1.7039561170212765, "grad_norm": 0.48057618737220764, "learning_rate": 4.685352324646532e-06, "loss": 0.4153, "step": 3417 }, { "epoch": 1.7044547872340425, "grad_norm": 0.46074023842811584, "learning_rate": 4.68245623339894e-06, "loss": 0.3385, "step": 3418 }, { "epoch": 1.7049534574468086, "grad_norm": 0.49484068155288696, "learning_rate": 4.679560249112798e-06, "loss": 0.4202, "step": 3419 }, { "epoch": 1.7054521276595744, "grad_norm": 0.5208789110183716, "learning_rate": 4.676664372763584e-06, "loss": 0.4084, "step": 3420 }, { "epoch": 1.7059507978723403, "grad_norm": 0.4769379496574402, "learning_rate": 4.673768605326749e-06, "loss": 0.4016, "step": 3421 }, { "epoch": 1.7064494680851063, "grad_norm": 0.4594416320323944, "learning_rate": 4.670872947777704e-06, "loss": 0.3614, "step": 3422 }, { "epoch": 1.7069481382978724, "grad_norm": 0.5087977051734924, "learning_rate": 4.667977401091816e-06, "loss": 0.3552, "step": 3423 }, { "epoch": 1.7074468085106385, "grad_norm": 0.4641670286655426, "learning_rate": 4.66508196624443e-06, "loss": 0.4114, "step": 3424 }, { "epoch": 1.7079454787234043, "grad_norm": 0.431601345539093, "learning_rate": 4.662186644210837e-06, "loss": 0.3431, "step": 3425 }, { "epoch": 1.7084441489361701, "grad_norm": 0.48105260729789734, "learning_rate": 4.6592914359663e-06, "loss": 0.3838, "step": 3426 }, { "epoch": 1.7089428191489362, "grad_norm": 0.5112370252609253, "learning_rate": 4.656396342486042e-06, "loss": 0.387, "step": 3427 }, { "epoch": 1.7094414893617023, "grad_norm": 0.5582600831985474, "learning_rate": 4.653501364745246e-06, "loss": 0.3931, "step": 3428 }, { "epoch": 1.709940159574468, "grad_norm": 0.4672035574913025, "learning_rate": 4.650606503719054e-06, "loss": 0.3535, "step": 3429 }, { "epoch": 1.710438829787234, "grad_norm": 0.56452876329422, "learning_rate": 4.647711760382574e-06, "loss": 0.4113, "step": 3430 }, { "epoch": 1.7109375, "grad_norm": 0.547181248664856, "learning_rate": 4.64481713571087e-06, "loss": 0.3804, "step": 3431 }, { "epoch": 1.711436170212766, "grad_norm": 0.4341307580471039, "learning_rate": 4.641922630678968e-06, "loss": 0.3447, "step": 3432 }, { "epoch": 1.711934840425532, "grad_norm": 0.4981434941291809, "learning_rate": 4.639028246261853e-06, "loss": 0.4575, "step": 3433 }, { "epoch": 1.7124335106382977, "grad_norm": 0.506562352180481, "learning_rate": 4.6361339834344674e-06, "loss": 0.3927, "step": 3434 }, { "epoch": 1.7129321808510638, "grad_norm": 0.5093063712120056, "learning_rate": 4.633239843171719e-06, "loss": 0.3444, "step": 3435 }, { "epoch": 1.7134308510638299, "grad_norm": 0.5971559882164001, "learning_rate": 4.630345826448465e-06, "loss": 0.4114, "step": 3436 }, { "epoch": 1.7139295212765957, "grad_norm": 0.4336391091346741, "learning_rate": 4.62745193423953e-06, "loss": 0.34, "step": 3437 }, { "epoch": 1.7144281914893615, "grad_norm": 0.46589338779449463, "learning_rate": 4.624558167519691e-06, "loss": 0.3796, "step": 3438 }, { "epoch": 1.7149268617021276, "grad_norm": 0.606357753276825, "learning_rate": 4.621664527263686e-06, "loss": 0.4429, "step": 3439 }, { "epoch": 1.7154255319148937, "grad_norm": 0.4910435378551483, "learning_rate": 4.618771014446206e-06, "loss": 0.339, "step": 3440 }, { "epoch": 1.7159242021276597, "grad_norm": 0.5078506469726562, "learning_rate": 4.615877630041905e-06, "loss": 0.3888, "step": 3441 }, { "epoch": 1.7164228723404256, "grad_norm": 0.48758357763290405, "learning_rate": 4.612984375025388e-06, "loss": 0.3665, "step": 3442 }, { "epoch": 1.7169215425531914, "grad_norm": 0.5201480984687805, "learning_rate": 4.610091250371221e-06, "loss": 0.4049, "step": 3443 }, { "epoch": 1.7174202127659575, "grad_norm": 0.495848685503006, "learning_rate": 4.6071982570539234e-06, "loss": 0.3896, "step": 3444 }, { "epoch": 1.7179188829787235, "grad_norm": 0.4643031656742096, "learning_rate": 4.604305396047971e-06, "loss": 0.3783, "step": 3445 }, { "epoch": 1.7184175531914894, "grad_norm": 0.5217675566673279, "learning_rate": 4.6014126683277944e-06, "loss": 0.3569, "step": 3446 }, { "epoch": 1.7189162234042552, "grad_norm": 0.5673322081565857, "learning_rate": 4.598520074867781e-06, "loss": 0.4039, "step": 3447 }, { "epoch": 1.7194148936170213, "grad_norm": 0.4826374650001526, "learning_rate": 4.595627616642273e-06, "loss": 0.3527, "step": 3448 }, { "epoch": 1.7199135638297873, "grad_norm": 0.5083222985267639, "learning_rate": 4.592735294625563e-06, "loss": 0.3771, "step": 3449 }, { "epoch": 1.7204122340425532, "grad_norm": 0.44195762276649475, "learning_rate": 4.589843109791905e-06, "loss": 0.3963, "step": 3450 }, { "epoch": 1.720910904255319, "grad_norm": 0.5176145434379578, "learning_rate": 4.5869510631154975e-06, "loss": 0.325, "step": 3451 }, { "epoch": 1.721409574468085, "grad_norm": 0.5358773469924927, "learning_rate": 4.584059155570504e-06, "loss": 0.3791, "step": 3452 }, { "epoch": 1.7219082446808511, "grad_norm": 0.5008425116539001, "learning_rate": 4.5811673881310285e-06, "loss": 0.3967, "step": 3453 }, { "epoch": 1.722406914893617, "grad_norm": 0.43570512533187866, "learning_rate": 4.578275761771138e-06, "loss": 0.3285, "step": 3454 }, { "epoch": 1.722905585106383, "grad_norm": 0.4816908836364746, "learning_rate": 4.575384277464847e-06, "loss": 0.372, "step": 3455 }, { "epoch": 1.7234042553191489, "grad_norm": 0.4827550947666168, "learning_rate": 4.572492936186121e-06, "loss": 0.3942, "step": 3456 }, { "epoch": 1.723902925531915, "grad_norm": 0.4354590177536011, "learning_rate": 4.569601738908883e-06, "loss": 0.3777, "step": 3457 }, { "epoch": 1.724401595744681, "grad_norm": 0.46903321146965027, "learning_rate": 4.566710686607e-06, "loss": 0.3628, "step": 3458 }, { "epoch": 1.7249002659574468, "grad_norm": 0.4897761940956116, "learning_rate": 4.563819780254298e-06, "loss": 0.3978, "step": 3459 }, { "epoch": 1.7253989361702127, "grad_norm": 0.41311880946159363, "learning_rate": 4.560929020824546e-06, "loss": 0.3068, "step": 3460 }, { "epoch": 1.7258976063829787, "grad_norm": 0.46172410249710083, "learning_rate": 4.558038409291472e-06, "loss": 0.3913, "step": 3461 }, { "epoch": 1.7263962765957448, "grad_norm": 0.4638041853904724, "learning_rate": 4.555147946628745e-06, "loss": 0.359, "step": 3462 }, { "epoch": 1.7268949468085106, "grad_norm": 0.4562024772167206, "learning_rate": 4.552257633809993e-06, "loss": 0.347, "step": 3463 }, { "epoch": 1.7273936170212765, "grad_norm": 0.45823895931243896, "learning_rate": 4.549367471808785e-06, "loss": 0.338, "step": 3464 }, { "epoch": 1.7278922872340425, "grad_norm": 0.5221142768859863, "learning_rate": 4.546477461598644e-06, "loss": 0.3906, "step": 3465 }, { "epoch": 1.7283909574468086, "grad_norm": 0.4459966719150543, "learning_rate": 4.543587604153044e-06, "loss": 0.37, "step": 3466 }, { "epoch": 1.7288896276595744, "grad_norm": 0.5348662734031677, "learning_rate": 4.5406979004454e-06, "loss": 0.3908, "step": 3467 }, { "epoch": 1.7293882978723403, "grad_norm": 0.48537158966064453, "learning_rate": 4.537808351449085e-06, "loss": 0.3535, "step": 3468 }, { "epoch": 1.7298869680851063, "grad_norm": 0.4840271770954132, "learning_rate": 4.534918958137411e-06, "loss": 0.3373, "step": 3469 }, { "epoch": 1.7303856382978724, "grad_norm": 0.47321751713752747, "learning_rate": 4.532029721483644e-06, "loss": 0.3726, "step": 3470 }, { "epoch": 1.7308843085106385, "grad_norm": 0.49070122838020325, "learning_rate": 4.529140642460991e-06, "loss": 0.4366, "step": 3471 }, { "epoch": 1.7313829787234043, "grad_norm": 0.4491713047027588, "learning_rate": 4.5262517220426136e-06, "loss": 0.326, "step": 3472 }, { "epoch": 1.7318816489361701, "grad_norm": 0.4866490960121155, "learning_rate": 4.52336296120161e-06, "loss": 0.3935, "step": 3473 }, { "epoch": 1.7323803191489362, "grad_norm": 0.44349366426467896, "learning_rate": 4.520474360911038e-06, "loss": 0.3961, "step": 3474 }, { "epoch": 1.7328789893617023, "grad_norm": 0.4655727744102478, "learning_rate": 4.51758592214389e-06, "loss": 0.3465, "step": 3475 }, { "epoch": 1.733377659574468, "grad_norm": 0.47292882204055786, "learning_rate": 4.514697645873104e-06, "loss": 0.365, "step": 3476 }, { "epoch": 1.733876329787234, "grad_norm": 0.4987860918045044, "learning_rate": 4.511809533071574e-06, "loss": 0.4124, "step": 3477 }, { "epoch": 1.734375, "grad_norm": 0.5480135679244995, "learning_rate": 4.508921584712126e-06, "loss": 0.408, "step": 3478 }, { "epoch": 1.734873670212766, "grad_norm": 0.4225640892982483, "learning_rate": 4.506033801767541e-06, "loss": 0.3215, "step": 3479 }, { "epoch": 1.735372340425532, "grad_norm": 0.49273058772087097, "learning_rate": 4.503146185210536e-06, "loss": 0.3718, "step": 3480 }, { "epoch": 1.7358710106382977, "grad_norm": 0.4728802442550659, "learning_rate": 4.500258736013779e-06, "loss": 0.3969, "step": 3481 }, { "epoch": 1.7363696808510638, "grad_norm": 0.465822696685791, "learning_rate": 4.497371455149877e-06, "loss": 0.3802, "step": 3482 }, { "epoch": 1.7368683510638299, "grad_norm": 0.45870253443717957, "learning_rate": 4.494484343591382e-06, "loss": 0.3622, "step": 3483 }, { "epoch": 1.7373670212765957, "grad_norm": 0.456760972738266, "learning_rate": 4.491597402310789e-06, "loss": 0.32, "step": 3484 }, { "epoch": 1.7378656914893615, "grad_norm": 0.5153799057006836, "learning_rate": 4.488710632280535e-06, "loss": 0.4419, "step": 3485 }, { "epoch": 1.7383643617021276, "grad_norm": 0.485949844121933, "learning_rate": 4.485824034473001e-06, "loss": 0.3622, "step": 3486 }, { "epoch": 1.7388630319148937, "grad_norm": 0.40867769718170166, "learning_rate": 4.482937609860504e-06, "loss": 0.356, "step": 3487 }, { "epoch": 1.7393617021276597, "grad_norm": 0.46310755610466003, "learning_rate": 4.480051359415314e-06, "loss": 0.385, "step": 3488 }, { "epoch": 1.7398603723404256, "grad_norm": 0.4592621326446533, "learning_rate": 4.47716528410963e-06, "loss": 0.3423, "step": 3489 }, { "epoch": 1.7403590425531914, "grad_norm": 0.49413079023361206, "learning_rate": 4.474279384915602e-06, "loss": 0.3926, "step": 3490 }, { "epoch": 1.7408577127659575, "grad_norm": 0.5396145582199097, "learning_rate": 4.471393662805312e-06, "loss": 0.4405, "step": 3491 }, { "epoch": 1.7413563829787235, "grad_norm": 0.4983934462070465, "learning_rate": 4.46850811875079e-06, "loss": 0.3768, "step": 3492 }, { "epoch": 1.7418550531914894, "grad_norm": 0.5469294190406799, "learning_rate": 4.465622753724001e-06, "loss": 0.433, "step": 3493 }, { "epoch": 1.7423537234042552, "grad_norm": 0.44617605209350586, "learning_rate": 4.462737568696854e-06, "loss": 0.3291, "step": 3494 }, { "epoch": 1.7428523936170213, "grad_norm": 0.4823695719242096, "learning_rate": 4.459852564641191e-06, "loss": 0.3731, "step": 3495 }, { "epoch": 1.7433510638297873, "grad_norm": 0.46929997205734253, "learning_rate": 4.4569677425288e-06, "loss": 0.4242, "step": 3496 }, { "epoch": 1.7438497340425532, "grad_norm": 0.4312736988067627, "learning_rate": 4.454083103331402e-06, "loss": 0.3107, "step": 3497 }, { "epoch": 1.744348404255319, "grad_norm": 0.4684121012687683, "learning_rate": 4.451198648020661e-06, "loss": 0.4067, "step": 3498 }, { "epoch": 1.744847074468085, "grad_norm": 0.44607803225517273, "learning_rate": 4.4483143775681755e-06, "loss": 0.3797, "step": 3499 }, { "epoch": 1.7453457446808511, "grad_norm": 0.4690649211406708, "learning_rate": 4.445430292945484e-06, "loss": 0.362, "step": 3500 }, { "epoch": 1.745844414893617, "grad_norm": 0.47404244542121887, "learning_rate": 4.442546395124061e-06, "loss": 0.3512, "step": 3501 }, { "epoch": 1.746343085106383, "grad_norm": 0.5650905966758728, "learning_rate": 4.439662685075319e-06, "loss": 0.4214, "step": 3502 }, { "epoch": 1.7468417553191489, "grad_norm": 0.5080811381340027, "learning_rate": 4.436779163770607e-06, "loss": 0.3898, "step": 3503 }, { "epoch": 1.747340425531915, "grad_norm": 0.453750878572464, "learning_rate": 4.43389583218121e-06, "loss": 0.334, "step": 3504 }, { "epoch": 1.747839095744681, "grad_norm": 0.4604502320289612, "learning_rate": 4.431012691278351e-06, "loss": 0.3881, "step": 3505 }, { "epoch": 1.7483377659574468, "grad_norm": 0.47541072964668274, "learning_rate": 4.428129742033185e-06, "loss": 0.4171, "step": 3506 }, { "epoch": 1.7488364361702127, "grad_norm": 0.49034586548805237, "learning_rate": 4.425246985416806e-06, "loss": 0.3479, "step": 3507 }, { "epoch": 1.7493351063829787, "grad_norm": 0.4513320028781891, "learning_rate": 4.4223644224002435e-06, "loss": 0.402, "step": 3508 }, { "epoch": 1.7498337765957448, "grad_norm": 0.4171520471572876, "learning_rate": 4.419482053954455e-06, "loss": 0.3294, "step": 3509 }, { "epoch": 1.7503324468085106, "grad_norm": 0.5094834566116333, "learning_rate": 4.416599881050343e-06, "loss": 0.4549, "step": 3510 }, { "epoch": 1.7508311170212765, "grad_norm": 0.46503081917762756, "learning_rate": 4.4137179046587345e-06, "loss": 0.356, "step": 3511 }, { "epoch": 1.7513297872340425, "grad_norm": 0.466846227645874, "learning_rate": 4.410836125750397e-06, "loss": 0.3723, "step": 3512 }, { "epoch": 1.7518284574468086, "grad_norm": 0.44882887601852417, "learning_rate": 4.407954545296028e-06, "loss": 0.3605, "step": 3513 }, { "epoch": 1.7523271276595744, "grad_norm": 0.48211440443992615, "learning_rate": 4.40507316426626e-06, "loss": 0.3929, "step": 3514 }, { "epoch": 1.7528257978723403, "grad_norm": 0.46860966086387634, "learning_rate": 4.402191983631655e-06, "loss": 0.4418, "step": 3515 }, { "epoch": 1.7533244680851063, "grad_norm": 0.37329182028770447, "learning_rate": 4.399311004362713e-06, "loss": 0.2796, "step": 3516 }, { "epoch": 1.7538231382978724, "grad_norm": 0.6090427041053772, "learning_rate": 4.396430227429861e-06, "loss": 0.4066, "step": 3517 }, { "epoch": 1.7543218085106385, "grad_norm": 0.5082696676254272, "learning_rate": 4.393549653803462e-06, "loss": 0.4191, "step": 3518 }, { "epoch": 1.7548204787234043, "grad_norm": 0.4705599248409271, "learning_rate": 4.3906692844538065e-06, "loss": 0.3567, "step": 3519 }, { "epoch": 1.7553191489361701, "grad_norm": 0.4593847990036011, "learning_rate": 4.387789120351118e-06, "loss": 0.3774, "step": 3520 }, { "epoch": 1.7558178191489362, "grad_norm": 0.47470298409461975, "learning_rate": 4.384909162465553e-06, "loss": 0.3541, "step": 3521 }, { "epoch": 1.7563164893617023, "grad_norm": 0.48278701305389404, "learning_rate": 4.382029411767194e-06, "loss": 0.3352, "step": 3522 }, { "epoch": 1.756815159574468, "grad_norm": 0.4704849123954773, "learning_rate": 4.37914986922606e-06, "loss": 0.4181, "step": 3523 }, { "epoch": 1.757313829787234, "grad_norm": 0.5110522508621216, "learning_rate": 4.3762705358120914e-06, "loss": 0.4053, "step": 3524 }, { "epoch": 1.7578125, "grad_norm": 0.4340633153915405, "learning_rate": 4.373391412495168e-06, "loss": 0.3746, "step": 3525 }, { "epoch": 1.758311170212766, "grad_norm": 0.515813410282135, "learning_rate": 4.370512500245088e-06, "loss": 0.3919, "step": 3526 }, { "epoch": 1.758809840425532, "grad_norm": 0.4482283592224121, "learning_rate": 4.36763380003159e-06, "loss": 0.3493, "step": 3527 }, { "epoch": 1.7593085106382977, "grad_norm": 0.43950435519218445, "learning_rate": 4.3647553128243306e-06, "loss": 0.3624, "step": 3528 }, { "epoch": 1.7598071808510638, "grad_norm": 0.4592224657535553, "learning_rate": 4.3618770395929035e-06, "loss": 0.3753, "step": 3529 }, { "epoch": 1.7603058510638299, "grad_norm": 0.4676740765571594, "learning_rate": 4.358998981306825e-06, "loss": 0.391, "step": 3530 }, { "epoch": 1.7608045212765957, "grad_norm": 0.4395081698894501, "learning_rate": 4.356121138935538e-06, "loss": 0.3853, "step": 3531 }, { "epoch": 1.7613031914893615, "grad_norm": 0.46152883768081665, "learning_rate": 4.353243513448417e-06, "loss": 0.3649, "step": 3532 }, { "epoch": 1.7618018617021276, "grad_norm": 0.4348742663860321, "learning_rate": 4.35036610581476e-06, "loss": 0.3828, "step": 3533 }, { "epoch": 1.7623005319148937, "grad_norm": 0.48043736815452576, "learning_rate": 4.347488917003797e-06, "loss": 0.3552, "step": 3534 }, { "epoch": 1.7627992021276597, "grad_norm": 0.4678178131580353, "learning_rate": 4.344611947984675e-06, "loss": 0.372, "step": 3535 }, { "epoch": 1.7632978723404256, "grad_norm": 0.5089685320854187, "learning_rate": 4.341735199726477e-06, "loss": 0.4155, "step": 3536 }, { "epoch": 1.7637965425531914, "grad_norm": 0.4415546655654907, "learning_rate": 4.338858673198202e-06, "loss": 0.3524, "step": 3537 }, { "epoch": 1.7642952127659575, "grad_norm": 0.44899293780326843, "learning_rate": 4.335982369368783e-06, "loss": 0.3724, "step": 3538 }, { "epoch": 1.7647938829787235, "grad_norm": 0.5144495964050293, "learning_rate": 4.333106289207075e-06, "loss": 0.4031, "step": 3539 }, { "epoch": 1.7652925531914894, "grad_norm": 0.42437946796417236, "learning_rate": 4.3302304336818524e-06, "loss": 0.2937, "step": 3540 }, { "epoch": 1.7657912234042552, "grad_norm": 0.48802146315574646, "learning_rate": 4.327354803761823e-06, "loss": 0.3967, "step": 3541 }, { "epoch": 1.7662898936170213, "grad_norm": 0.40442657470703125, "learning_rate": 4.324479400415611e-06, "loss": 0.3035, "step": 3542 }, { "epoch": 1.7667885638297873, "grad_norm": 0.5283809304237366, "learning_rate": 4.32160422461177e-06, "loss": 0.4577, "step": 3543 }, { "epoch": 1.7672872340425532, "grad_norm": 0.4346812963485718, "learning_rate": 4.318729277318769e-06, "loss": 0.3502, "step": 3544 }, { "epoch": 1.767785904255319, "grad_norm": 0.5077353119850159, "learning_rate": 4.315854559505013e-06, "loss": 0.3941, "step": 3545 }, { "epoch": 1.768284574468085, "grad_norm": 0.43173980712890625, "learning_rate": 4.312980072138816e-06, "loss": 0.3728, "step": 3546 }, { "epoch": 1.7687832446808511, "grad_norm": 0.5066200494766235, "learning_rate": 4.310105816188424e-06, "loss": 0.3624, "step": 3547 }, { "epoch": 1.769281914893617, "grad_norm": 0.4673221707344055, "learning_rate": 4.307231792621997e-06, "loss": 0.3742, "step": 3548 }, { "epoch": 1.769780585106383, "grad_norm": 0.47654691338539124, "learning_rate": 4.304358002407626e-06, "loss": 0.4251, "step": 3549 }, { "epoch": 1.7702792553191489, "grad_norm": 0.4564894437789917, "learning_rate": 4.301484446513317e-06, "loss": 0.2946, "step": 3550 }, { "epoch": 1.770777925531915, "grad_norm": 0.4923954904079437, "learning_rate": 4.2986111259069955e-06, "loss": 0.4029, "step": 3551 }, { "epoch": 1.771276595744681, "grad_norm": 0.4795021712779999, "learning_rate": 4.295738041556516e-06, "loss": 0.3751, "step": 3552 }, { "epoch": 1.7717752659574468, "grad_norm": 0.4330500662326813, "learning_rate": 4.292865194429644e-06, "loss": 0.2903, "step": 3553 }, { "epoch": 1.7722739361702127, "grad_norm": 0.5276164412498474, "learning_rate": 4.289992585494071e-06, "loss": 0.4509, "step": 3554 }, { "epoch": 1.7727726063829787, "grad_norm": 0.4392486810684204, "learning_rate": 4.2871202157174074e-06, "loss": 0.353, "step": 3555 }, { "epoch": 1.7732712765957448, "grad_norm": 0.3930220305919647, "learning_rate": 4.284248086067181e-06, "loss": 0.3307, "step": 3556 }, { "epoch": 1.7737699468085106, "grad_norm": 0.45290690660476685, "learning_rate": 4.2813761975108415e-06, "loss": 0.4754, "step": 3557 }, { "epoch": 1.7742686170212765, "grad_norm": 0.45703163743019104, "learning_rate": 4.278504551015754e-06, "loss": 0.3717, "step": 3558 }, { "epoch": 1.7747672872340425, "grad_norm": 0.4885738492012024, "learning_rate": 4.2756331475492046e-06, "loss": 0.3626, "step": 3559 }, { "epoch": 1.7752659574468086, "grad_norm": 0.42423155903816223, "learning_rate": 4.2727619880783986e-06, "loss": 0.3612, "step": 3560 }, { "epoch": 1.7757646276595744, "grad_norm": 0.45014920830726624, "learning_rate": 4.269891073570457e-06, "loss": 0.4067, "step": 3561 }, { "epoch": 1.7762632978723403, "grad_norm": 0.43079718947410583, "learning_rate": 4.267020404992415e-06, "loss": 0.3772, "step": 3562 }, { "epoch": 1.7767619680851063, "grad_norm": 0.4653445780277252, "learning_rate": 4.264149983311233e-06, "loss": 0.3669, "step": 3563 }, { "epoch": 1.7772606382978724, "grad_norm": 0.49862295389175415, "learning_rate": 4.2612798094937805e-06, "loss": 0.3787, "step": 3564 }, { "epoch": 1.7777593085106385, "grad_norm": 0.4814307987689972, "learning_rate": 4.258409884506851e-06, "loss": 0.3861, "step": 3565 }, { "epoch": 1.7782579787234043, "grad_norm": 0.4582345187664032, "learning_rate": 4.255540209317146e-06, "loss": 0.3967, "step": 3566 }, { "epoch": 1.7787566489361701, "grad_norm": 0.4788144826889038, "learning_rate": 4.2526707848912894e-06, "loss": 0.4014, "step": 3567 }, { "epoch": 1.7792553191489362, "grad_norm": 0.4914558529853821, "learning_rate": 4.249801612195816e-06, "loss": 0.3738, "step": 3568 }, { "epoch": 1.7797539893617023, "grad_norm": 0.47590455412864685, "learning_rate": 4.246932692197182e-06, "loss": 0.3412, "step": 3569 }, { "epoch": 1.780252659574468, "grad_norm": 0.4955964684486389, "learning_rate": 4.2440640258617484e-06, "loss": 0.3712, "step": 3570 }, { "epoch": 1.780751329787234, "grad_norm": 0.43959546089172363, "learning_rate": 4.241195614155804e-06, "loss": 0.3442, "step": 3571 }, { "epoch": 1.78125, "grad_norm": 0.4777582585811615, "learning_rate": 4.238327458045539e-06, "loss": 0.4029, "step": 3572 }, { "epoch": 1.781748670212766, "grad_norm": 0.5588141083717346, "learning_rate": 4.235459558497065e-06, "loss": 0.3499, "step": 3573 }, { "epoch": 1.782247340425532, "grad_norm": 0.46649521589279175, "learning_rate": 4.232591916476405e-06, "loss": 0.3537, "step": 3574 }, { "epoch": 1.7827460106382977, "grad_norm": 0.38987988233566284, "learning_rate": 4.229724532949497e-06, "loss": 0.3236, "step": 3575 }, { "epoch": 1.7832446808510638, "grad_norm": 0.4725291430950165, "learning_rate": 4.226857408882189e-06, "loss": 0.4118, "step": 3576 }, { "epoch": 1.7837433510638299, "grad_norm": 0.5219936966896057, "learning_rate": 4.223990545240242e-06, "loss": 0.4497, "step": 3577 }, { "epoch": 1.7842420212765957, "grad_norm": 0.4787946045398712, "learning_rate": 4.2211239429893344e-06, "loss": 0.3293, "step": 3578 }, { "epoch": 1.7847406914893615, "grad_norm": 0.46489420533180237, "learning_rate": 4.2182576030950485e-06, "loss": 0.3504, "step": 3579 }, { "epoch": 1.7852393617021276, "grad_norm": 0.5246760249137878, "learning_rate": 4.215391526522886e-06, "loss": 0.4567, "step": 3580 }, { "epoch": 1.7857380319148937, "grad_norm": 0.44735538959503174, "learning_rate": 4.212525714238251e-06, "loss": 0.3948, "step": 3581 }, { "epoch": 1.7862367021276597, "grad_norm": 0.40922874212265015, "learning_rate": 4.20966016720647e-06, "loss": 0.358, "step": 3582 }, { "epoch": 1.7867353723404256, "grad_norm": 0.46475160121917725, "learning_rate": 4.20679488639277e-06, "loss": 0.3819, "step": 3583 }, { "epoch": 1.7872340425531914, "grad_norm": 0.48961198329925537, "learning_rate": 4.203929872762291e-06, "loss": 0.3943, "step": 3584 }, { "epoch": 1.7877327127659575, "grad_norm": 0.5174358487129211, "learning_rate": 4.201065127280088e-06, "loss": 0.406, "step": 3585 }, { "epoch": 1.7882313829787235, "grad_norm": 0.44609829783439636, "learning_rate": 4.198200650911119e-06, "loss": 0.3608, "step": 3586 }, { "epoch": 1.7887300531914894, "grad_norm": 0.4908263087272644, "learning_rate": 4.195336444620257e-06, "loss": 0.36, "step": 3587 }, { "epoch": 1.7892287234042552, "grad_norm": 0.43344902992248535, "learning_rate": 4.192472509372277e-06, "loss": 0.3275, "step": 3588 }, { "epoch": 1.7897273936170213, "grad_norm": 0.527533769607544, "learning_rate": 4.189608846131873e-06, "loss": 0.4075, "step": 3589 }, { "epoch": 1.7902260638297873, "grad_norm": 0.532687783241272, "learning_rate": 4.186745455863636e-06, "loss": 0.3998, "step": 3590 }, { "epoch": 1.7907247340425532, "grad_norm": 0.43987905979156494, "learning_rate": 4.183882339532073e-06, "loss": 0.3456, "step": 3591 }, { "epoch": 1.791223404255319, "grad_norm": 0.4853019118309021, "learning_rate": 4.1810194981015945e-06, "loss": 0.4301, "step": 3592 }, { "epoch": 1.791722074468085, "grad_norm": 0.4053802788257599, "learning_rate": 4.178156932536523e-06, "loss": 0.3589, "step": 3593 }, { "epoch": 1.7922207446808511, "grad_norm": 0.4539749026298523, "learning_rate": 4.175294643801083e-06, "loss": 0.3706, "step": 3594 }, { "epoch": 1.792719414893617, "grad_norm": 0.461627721786499, "learning_rate": 4.172432632859406e-06, "loss": 0.2923, "step": 3595 }, { "epoch": 1.793218085106383, "grad_norm": 0.5595436096191406, "learning_rate": 4.169570900675537e-06, "loss": 0.4428, "step": 3596 }, { "epoch": 1.7937167553191489, "grad_norm": 0.44622302055358887, "learning_rate": 4.1667094482134165e-06, "loss": 0.3174, "step": 3597 }, { "epoch": 1.794215425531915, "grad_norm": 0.5057721734046936, "learning_rate": 4.1638482764369005e-06, "loss": 0.3929, "step": 3598 }, { "epoch": 1.794714095744681, "grad_norm": 0.47695738077163696, "learning_rate": 4.160987386309742e-06, "loss": 0.3472, "step": 3599 }, { "epoch": 1.7952127659574468, "grad_norm": 0.4564208388328552, "learning_rate": 4.158126778795608e-06, "loss": 0.3865, "step": 3600 }, { "epoch": 1.7957114361702127, "grad_norm": 0.4832131862640381, "learning_rate": 4.155266454858062e-06, "loss": 0.3605, "step": 3601 }, { "epoch": 1.7962101063829787, "grad_norm": 0.5251771807670593, "learning_rate": 4.152406415460577e-06, "loss": 0.425, "step": 3602 }, { "epoch": 1.7967087765957448, "grad_norm": 0.4886148273944855, "learning_rate": 4.1495466615665266e-06, "loss": 0.3796, "step": 3603 }, { "epoch": 1.7972074468085106, "grad_norm": 0.4801042079925537, "learning_rate": 4.146687194139195e-06, "loss": 0.3752, "step": 3604 }, { "epoch": 1.7977061170212765, "grad_norm": 0.4805653989315033, "learning_rate": 4.143828014141762e-06, "loss": 0.3685, "step": 3605 }, { "epoch": 1.7982047872340425, "grad_norm": 0.4721483588218689, "learning_rate": 4.140969122537313e-06, "loss": 0.4029, "step": 3606 }, { "epoch": 1.7987034574468086, "grad_norm": 0.4430314898490906, "learning_rate": 4.1381105202888385e-06, "loss": 0.3597, "step": 3607 }, { "epoch": 1.7992021276595744, "grad_norm": 0.5042772889137268, "learning_rate": 4.1352522083592295e-06, "loss": 0.4374, "step": 3608 }, { "epoch": 1.7997007978723403, "grad_norm": 0.42736029624938965, "learning_rate": 4.1323941877112805e-06, "loss": 0.3791, "step": 3609 }, { "epoch": 1.8001994680851063, "grad_norm": 0.46311989426612854, "learning_rate": 4.1295364593076875e-06, "loss": 0.3663, "step": 3610 }, { "epoch": 1.8006981382978724, "grad_norm": 0.4440084993839264, "learning_rate": 4.126679024111047e-06, "loss": 0.3536, "step": 3611 }, { "epoch": 1.8011968085106385, "grad_norm": 0.46849408745765686, "learning_rate": 4.123821883083858e-06, "loss": 0.3758, "step": 3612 }, { "epoch": 1.8016954787234043, "grad_norm": 0.48706090450286865, "learning_rate": 4.120965037188519e-06, "loss": 0.3895, "step": 3613 }, { "epoch": 1.8021941489361701, "grad_norm": 0.4476536512374878, "learning_rate": 4.1181084873873325e-06, "loss": 0.3667, "step": 3614 }, { "epoch": 1.8026928191489362, "grad_norm": 0.42954951524734497, "learning_rate": 4.115252234642494e-06, "loss": 0.3222, "step": 3615 }, { "epoch": 1.8031914893617023, "grad_norm": 0.5192371606826782, "learning_rate": 4.1123962799161106e-06, "loss": 0.4528, "step": 3616 }, { "epoch": 1.803690159574468, "grad_norm": 0.4930797815322876, "learning_rate": 4.109540624170175e-06, "loss": 0.3945, "step": 3617 }, { "epoch": 1.804188829787234, "grad_norm": 0.4238075315952301, "learning_rate": 4.106685268366591e-06, "loss": 0.3489, "step": 3618 }, { "epoch": 1.8046875, "grad_norm": 0.5165034532546997, "learning_rate": 4.1038302134671546e-06, "loss": 0.4356, "step": 3619 }, { "epoch": 1.805186170212766, "grad_norm": 0.4174390733242035, "learning_rate": 4.100975460433565e-06, "loss": 0.3236, "step": 3620 }, { "epoch": 1.805684840425532, "grad_norm": 0.46677613258361816, "learning_rate": 4.098121010227413e-06, "loss": 0.3894, "step": 3621 }, { "epoch": 1.8061835106382977, "grad_norm": 0.4970894157886505, "learning_rate": 4.0952668638101965e-06, "loss": 0.4042, "step": 3622 }, { "epoch": 1.8066821808510638, "grad_norm": 0.4757213890552521, "learning_rate": 4.092413022143301e-06, "loss": 0.3548, "step": 3623 }, { "epoch": 1.8071808510638299, "grad_norm": 0.4480784237384796, "learning_rate": 4.0895594861880205e-06, "loss": 0.3775, "step": 3624 }, { "epoch": 1.8076795212765957, "grad_norm": 0.45150673389434814, "learning_rate": 4.086706256905536e-06, "loss": 0.388, "step": 3625 }, { "epoch": 1.8081781914893615, "grad_norm": 0.44655564427375793, "learning_rate": 4.0838533352569295e-06, "loss": 0.4083, "step": 3626 }, { "epoch": 1.8086768617021276, "grad_norm": 0.3999004065990448, "learning_rate": 4.081000722203181e-06, "loss": 0.3589, "step": 3627 }, { "epoch": 1.8091755319148937, "grad_norm": 0.47485047578811646, "learning_rate": 4.078148418705163e-06, "loss": 0.4106, "step": 3628 }, { "epoch": 1.8096742021276597, "grad_norm": 0.4496597349643707, "learning_rate": 4.075296425723646e-06, "loss": 0.3473, "step": 3629 }, { "epoch": 1.8101728723404256, "grad_norm": 0.5019146800041199, "learning_rate": 4.072444744219296e-06, "loss": 0.3417, "step": 3630 }, { "epoch": 1.8106715425531914, "grad_norm": 0.46512243151664734, "learning_rate": 4.069593375152673e-06, "loss": 0.3772, "step": 3631 }, { "epoch": 1.8111702127659575, "grad_norm": 0.4134522080421448, "learning_rate": 4.066742319484231e-06, "loss": 0.3338, "step": 3632 }, { "epoch": 1.8116688829787235, "grad_norm": 0.46847987174987793, "learning_rate": 4.063891578174323e-06, "loss": 0.3918, "step": 3633 }, { "epoch": 1.8121675531914894, "grad_norm": 0.5046776533126831, "learning_rate": 4.061041152183189e-06, "loss": 0.3909, "step": 3634 }, { "epoch": 1.8126662234042552, "grad_norm": 0.4264944791793823, "learning_rate": 4.058191042470969e-06, "loss": 0.3949, "step": 3635 }, { "epoch": 1.8131648936170213, "grad_norm": 0.43556439876556396, "learning_rate": 4.055341249997694e-06, "loss": 0.3527, "step": 3636 }, { "epoch": 1.8136635638297873, "grad_norm": 0.5481680631637573, "learning_rate": 4.052491775723285e-06, "loss": 0.4164, "step": 3637 }, { "epoch": 1.8141622340425532, "grad_norm": 0.45985838770866394, "learning_rate": 4.0496426206075625e-06, "loss": 0.3521, "step": 3638 }, { "epoch": 1.814660904255319, "grad_norm": 0.4927937686443329, "learning_rate": 4.046793785610234e-06, "loss": 0.3845, "step": 3639 }, { "epoch": 1.815159574468085, "grad_norm": 0.43394336104393005, "learning_rate": 4.043945271690903e-06, "loss": 0.3109, "step": 3640 }, { "epoch": 1.8156582446808511, "grad_norm": 0.5997902750968933, "learning_rate": 4.04109707980906e-06, "loss": 0.3948, "step": 3641 }, { "epoch": 1.816156914893617, "grad_norm": 0.4731028079986572, "learning_rate": 4.038249210924094e-06, "loss": 0.3693, "step": 3642 }, { "epoch": 1.816655585106383, "grad_norm": 0.4605710208415985, "learning_rate": 4.035401665995276e-06, "loss": 0.3437, "step": 3643 }, { "epoch": 1.8171542553191489, "grad_norm": 0.47690704464912415, "learning_rate": 4.032554445981779e-06, "loss": 0.3706, "step": 3644 }, { "epoch": 1.817652925531915, "grad_norm": 0.4766117334365845, "learning_rate": 4.029707551842655e-06, "loss": 0.3806, "step": 3645 }, { "epoch": 1.818151595744681, "grad_norm": 0.5459650158882141, "learning_rate": 4.0268609845368565e-06, "loss": 0.4383, "step": 3646 }, { "epoch": 1.8186502659574468, "grad_norm": 0.5110543966293335, "learning_rate": 4.024014745023219e-06, "loss": 0.4165, "step": 3647 }, { "epoch": 1.8191489361702127, "grad_norm": 0.445468008518219, "learning_rate": 4.0211688342604685e-06, "loss": 0.3355, "step": 3648 }, { "epoch": 1.8196476063829787, "grad_norm": 0.464287668466568, "learning_rate": 4.0183232532072254e-06, "loss": 0.4114, "step": 3649 }, { "epoch": 1.8201462765957448, "grad_norm": 0.4355267286300659, "learning_rate": 4.015478002821992e-06, "loss": 0.3367, "step": 3650 }, { "epoch": 1.8206449468085106, "grad_norm": 0.6133314967155457, "learning_rate": 4.012633084063165e-06, "loss": 0.446, "step": 3651 }, { "epoch": 1.8211436170212765, "grad_norm": 0.45812422037124634, "learning_rate": 4.009788497889023e-06, "loss": 0.3858, "step": 3652 }, { "epoch": 1.8216422872340425, "grad_norm": 0.4332517385482788, "learning_rate": 4.006944245257742e-06, "loss": 0.358, "step": 3653 }, { "epoch": 1.8221409574468086, "grad_norm": 0.5205478072166443, "learning_rate": 4.004100327127375e-06, "loss": 0.4247, "step": 3654 }, { "epoch": 1.8226396276595744, "grad_norm": 0.44265997409820557, "learning_rate": 4.0012567444558705e-06, "loss": 0.3494, "step": 3655 }, { "epoch": 1.8231382978723403, "grad_norm": 0.4448230266571045, "learning_rate": 3.9984134982010595e-06, "loss": 0.3719, "step": 3656 }, { "epoch": 1.8236369680851063, "grad_norm": 0.44355544447898865, "learning_rate": 3.995570589320662e-06, "loss": 0.3912, "step": 3657 }, { "epoch": 1.8241356382978724, "grad_norm": 0.4826110005378723, "learning_rate": 3.992728018772284e-06, "loss": 0.4059, "step": 3658 }, { "epoch": 1.8246343085106385, "grad_norm": 0.47872015833854675, "learning_rate": 3.989885787513414e-06, "loss": 0.3744, "step": 3659 }, { "epoch": 1.8251329787234043, "grad_norm": 0.5133484601974487, "learning_rate": 3.987043896501433e-06, "loss": 0.3736, "step": 3660 }, { "epoch": 1.8256316489361701, "grad_norm": 0.4242737889289856, "learning_rate": 3.9842023466935994e-06, "loss": 0.379, "step": 3661 }, { "epoch": 1.8261303191489362, "grad_norm": 0.4824219048023224, "learning_rate": 3.9813611390470665e-06, "loss": 0.359, "step": 3662 }, { "epoch": 1.8266289893617023, "grad_norm": 0.508557915687561, "learning_rate": 3.978520274518861e-06, "loss": 0.4105, "step": 3663 }, { "epoch": 1.827127659574468, "grad_norm": 0.4451873302459717, "learning_rate": 3.975679754065903e-06, "loss": 0.3482, "step": 3664 }, { "epoch": 1.827626329787234, "grad_norm": 0.45324552059173584, "learning_rate": 3.972839578644993e-06, "loss": 0.3615, "step": 3665 }, { "epoch": 1.828125, "grad_norm": 0.5177964568138123, "learning_rate": 3.969999749212815e-06, "loss": 0.4179, "step": 3666 }, { "epoch": 1.828623670212766, "grad_norm": 0.4641793966293335, "learning_rate": 3.967160266725939e-06, "loss": 0.3857, "step": 3667 }, { "epoch": 1.829122340425532, "grad_norm": 0.4342480003833771, "learning_rate": 3.9643211321408145e-06, "loss": 0.3714, "step": 3668 }, { "epoch": 1.8296210106382977, "grad_norm": 0.4439590573310852, "learning_rate": 3.9614823464137775e-06, "loss": 0.4068, "step": 3669 }, { "epoch": 1.8301196808510638, "grad_norm": 0.5100258588790894, "learning_rate": 3.95864391050104e-06, "loss": 0.3927, "step": 3670 }, { "epoch": 1.8306183510638299, "grad_norm": 0.5162768363952637, "learning_rate": 3.955805825358707e-06, "loss": 0.4263, "step": 3671 }, { "epoch": 1.8311170212765957, "grad_norm": 0.4448370039463043, "learning_rate": 3.952968091942755e-06, "loss": 0.3201, "step": 3672 }, { "epoch": 1.8316156914893615, "grad_norm": 0.5593090653419495, "learning_rate": 3.950130711209048e-06, "loss": 0.3396, "step": 3673 }, { "epoch": 1.8321143617021276, "grad_norm": 0.4966203272342682, "learning_rate": 3.947293684113328e-06, "loss": 0.3422, "step": 3674 }, { "epoch": 1.8326130319148937, "grad_norm": 0.4365728497505188, "learning_rate": 3.944457011611222e-06, "loss": 0.3373, "step": 3675 }, { "epoch": 1.8331117021276597, "grad_norm": 0.46767961978912354, "learning_rate": 3.94162069465823e-06, "loss": 0.3303, "step": 3676 }, { "epoch": 1.8336103723404256, "grad_norm": 0.4907091557979584, "learning_rate": 3.938784734209742e-06, "loss": 0.3887, "step": 3677 }, { "epoch": 1.8341090425531914, "grad_norm": 0.5242593288421631, "learning_rate": 3.935949131221019e-06, "loss": 0.4145, "step": 3678 }, { "epoch": 1.8346077127659575, "grad_norm": 0.48099878430366516, "learning_rate": 3.93311388664721e-06, "loss": 0.3867, "step": 3679 }, { "epoch": 1.8351063829787235, "grad_norm": 0.4550720453262329, "learning_rate": 3.930279001443336e-06, "loss": 0.3388, "step": 3680 }, { "epoch": 1.8356050531914894, "grad_norm": 0.49222245812416077, "learning_rate": 3.9274444765642974e-06, "loss": 0.4009, "step": 3681 }, { "epoch": 1.8361037234042552, "grad_norm": 0.4557308554649353, "learning_rate": 3.92461031296488e-06, "loss": 0.3728, "step": 3682 }, { "epoch": 1.8366023936170213, "grad_norm": 0.48098140954971313, "learning_rate": 3.921776511599741e-06, "loss": 0.3857, "step": 3683 }, { "epoch": 1.8371010638297873, "grad_norm": 0.5132206678390503, "learning_rate": 3.918943073423419e-06, "loss": 0.3598, "step": 3684 }, { "epoch": 1.8375997340425532, "grad_norm": 0.4182107150554657, "learning_rate": 3.91610999939033e-06, "loss": 0.3095, "step": 3685 }, { "epoch": 1.838098404255319, "grad_norm": 0.4693849980831146, "learning_rate": 3.913277290454765e-06, "loss": 0.4193, "step": 3686 }, { "epoch": 1.838597074468085, "grad_norm": 0.43381258845329285, "learning_rate": 3.910444947570894e-06, "loss": 0.3495, "step": 3687 }, { "epoch": 1.8390957446808511, "grad_norm": 0.48729217052459717, "learning_rate": 3.907612971692766e-06, "loss": 0.4162, "step": 3688 }, { "epoch": 1.839594414893617, "grad_norm": 0.502397358417511, "learning_rate": 3.9047813637743e-06, "loss": 0.4191, "step": 3689 }, { "epoch": 1.840093085106383, "grad_norm": 0.41842639446258545, "learning_rate": 3.901950124769298e-06, "loss": 0.3339, "step": 3690 }, { "epoch": 1.8405917553191489, "grad_norm": 0.44604071974754333, "learning_rate": 3.899119255631435e-06, "loss": 0.3985, "step": 3691 }, { "epoch": 1.841090425531915, "grad_norm": 0.4731960892677307, "learning_rate": 3.896288757314255e-06, "loss": 0.4042, "step": 3692 }, { "epoch": 1.841589095744681, "grad_norm": 0.45102056860923767, "learning_rate": 3.893458630771191e-06, "loss": 0.349, "step": 3693 }, { "epoch": 1.8420877659574468, "grad_norm": 0.47637903690338135, "learning_rate": 3.890628876955537e-06, "loss": 0.3907, "step": 3694 }, { "epoch": 1.8425864361702127, "grad_norm": 0.45366623997688293, "learning_rate": 3.887799496820471e-06, "loss": 0.3621, "step": 3695 }, { "epoch": 1.8430851063829787, "grad_norm": 0.405644029378891, "learning_rate": 3.884970491319038e-06, "loss": 0.3489, "step": 3696 }, { "epoch": 1.8435837765957448, "grad_norm": 0.5014949440956116, "learning_rate": 3.882141861404165e-06, "loss": 0.3646, "step": 3697 }, { "epoch": 1.8440824468085106, "grad_norm": 0.49652397632598877, "learning_rate": 3.879313608028643e-06, "loss": 0.3634, "step": 3698 }, { "epoch": 1.8445811170212765, "grad_norm": 0.5812403559684753, "learning_rate": 3.876485732145145e-06, "loss": 0.412, "step": 3699 }, { "epoch": 1.8450797872340425, "grad_norm": 0.43480706214904785, "learning_rate": 3.873658234706209e-06, "loss": 0.3876, "step": 3700 }, { "epoch": 1.8455784574468086, "grad_norm": 0.44951194524765015, "learning_rate": 3.870831116664251e-06, "loss": 0.3572, "step": 3701 }, { "epoch": 1.8460771276595744, "grad_norm": 0.4675796926021576, "learning_rate": 3.868004378971558e-06, "loss": 0.402, "step": 3702 }, { "epoch": 1.8465757978723403, "grad_norm": 0.4609743356704712, "learning_rate": 3.865178022580286e-06, "loss": 0.3737, "step": 3703 }, { "epoch": 1.8470744680851063, "grad_norm": 0.48719197511672974, "learning_rate": 3.862352048442471e-06, "loss": 0.3932, "step": 3704 }, { "epoch": 1.8475731382978724, "grad_norm": 0.4113907217979431, "learning_rate": 3.859526457510006e-06, "loss": 0.3447, "step": 3705 }, { "epoch": 1.8480718085106385, "grad_norm": 0.45677024126052856, "learning_rate": 3.856701250734669e-06, "loss": 0.4008, "step": 3706 }, { "epoch": 1.8485704787234043, "grad_norm": 0.4573383331298828, "learning_rate": 3.8538764290680985e-06, "loss": 0.394, "step": 3707 }, { "epoch": 1.8490691489361701, "grad_norm": 0.5097141861915588, "learning_rate": 3.851051993461813e-06, "loss": 0.3962, "step": 3708 }, { "epoch": 1.8495678191489362, "grad_norm": 0.47786378860473633, "learning_rate": 3.848227944867189e-06, "loss": 0.3308, "step": 3709 }, { "epoch": 1.8500664893617023, "grad_norm": 0.45407167077064514, "learning_rate": 3.845404284235485e-06, "loss": 0.3711, "step": 3710 }, { "epoch": 1.850565159574468, "grad_norm": 0.5287081599235535, "learning_rate": 3.84258101251782e-06, "loss": 0.4049, "step": 3711 }, { "epoch": 1.851063829787234, "grad_norm": 0.4372175633907318, "learning_rate": 3.839758130665183e-06, "loss": 0.3299, "step": 3712 }, { "epoch": 1.8515625, "grad_norm": 0.548180103302002, "learning_rate": 3.836935639628437e-06, "loss": 0.3888, "step": 3713 }, { "epoch": 1.852061170212766, "grad_norm": 0.4817148745059967, "learning_rate": 3.834113540358307e-06, "loss": 0.3966, "step": 3714 }, { "epoch": 1.852559840425532, "grad_norm": 0.4223966896533966, "learning_rate": 3.831291833805392e-06, "loss": 0.3734, "step": 3715 }, { "epoch": 1.8530585106382977, "grad_norm": 0.44581103324890137, "learning_rate": 3.828470520920153e-06, "loss": 0.4254, "step": 3716 }, { "epoch": 1.8535571808510638, "grad_norm": 0.4688778817653656, "learning_rate": 3.825649602652923e-06, "loss": 0.3456, "step": 3717 }, { "epoch": 1.8540558510638299, "grad_norm": 0.541031002998352, "learning_rate": 3.822829079953898e-06, "loss": 0.4237, "step": 3718 }, { "epoch": 1.8545545212765957, "grad_norm": 0.41041988134384155, "learning_rate": 3.820008953773147e-06, "loss": 0.3629, "step": 3719 }, { "epoch": 1.8550531914893615, "grad_norm": 0.45774829387664795, "learning_rate": 3.817189225060596e-06, "loss": 0.438, "step": 3720 }, { "epoch": 1.8555518617021276, "grad_norm": 0.45678916573524475, "learning_rate": 3.8143698947660457e-06, "loss": 0.3234, "step": 3721 }, { "epoch": 1.8560505319148937, "grad_norm": 0.5151733756065369, "learning_rate": 3.81155096383916e-06, "loss": 0.3888, "step": 3722 }, { "epoch": 1.8565492021276597, "grad_norm": 0.5169184803962708, "learning_rate": 3.808732433229464e-06, "loss": 0.3923, "step": 3723 }, { "epoch": 1.8570478723404256, "grad_norm": 0.4310520589351654, "learning_rate": 3.8059143038863566e-06, "loss": 0.3736, "step": 3724 }, { "epoch": 1.8575465425531914, "grad_norm": 0.5153303742408752, "learning_rate": 3.8030965767590922e-06, "loss": 0.3259, "step": 3725 }, { "epoch": 1.8580452127659575, "grad_norm": 0.5071038603782654, "learning_rate": 3.800279252796797e-06, "loss": 0.3999, "step": 3726 }, { "epoch": 1.8585438829787235, "grad_norm": 0.4693155884742737, "learning_rate": 3.7974623329484556e-06, "loss": 0.323, "step": 3727 }, { "epoch": 1.8590425531914894, "grad_norm": 0.4325677454471588, "learning_rate": 3.7946458181629226e-06, "loss": 0.3288, "step": 3728 }, { "epoch": 1.8595412234042552, "grad_norm": 0.4979296028614044, "learning_rate": 3.7918297093889096e-06, "loss": 0.4248, "step": 3729 }, { "epoch": 1.8600398936170213, "grad_norm": 0.5148947834968567, "learning_rate": 3.7890140075749983e-06, "loss": 0.4327, "step": 3730 }, { "epoch": 1.8605385638297873, "grad_norm": 0.48490414023399353, "learning_rate": 3.7861987136696254e-06, "loss": 0.359, "step": 3731 }, { "epoch": 1.8610372340425532, "grad_norm": 0.5283387303352356, "learning_rate": 3.7833838286210995e-06, "loss": 0.4084, "step": 3732 }, { "epoch": 1.861535904255319, "grad_norm": 0.5323678255081177, "learning_rate": 3.7805693533775835e-06, "loss": 0.3877, "step": 3733 }, { "epoch": 1.862034574468085, "grad_norm": 0.43138498067855835, "learning_rate": 3.777755288887104e-06, "loss": 0.3645, "step": 3734 }, { "epoch": 1.8625332446808511, "grad_norm": 0.4134886562824249, "learning_rate": 3.7749416360975544e-06, "loss": 0.3778, "step": 3735 }, { "epoch": 1.863031914893617, "grad_norm": 0.4492696523666382, "learning_rate": 3.7721283959566806e-06, "loss": 0.3568, "step": 3736 }, { "epoch": 1.863530585106383, "grad_norm": 0.5907719135284424, "learning_rate": 3.7693155694120986e-06, "loss": 0.4573, "step": 3737 }, { "epoch": 1.8640292553191489, "grad_norm": 0.4193694591522217, "learning_rate": 3.7665031574112804e-06, "loss": 0.3378, "step": 3738 }, { "epoch": 1.864527925531915, "grad_norm": 0.4660148322582245, "learning_rate": 3.7636911609015575e-06, "loss": 0.3958, "step": 3739 }, { "epoch": 1.865026595744681, "grad_norm": 0.4688831865787506, "learning_rate": 3.7608795808301235e-06, "loss": 0.4136, "step": 3740 }, { "epoch": 1.8655252659574468, "grad_norm": 0.4740435779094696, "learning_rate": 3.758068418144031e-06, "loss": 0.3502, "step": 3741 }, { "epoch": 1.8660239361702127, "grad_norm": 0.3732820749282837, "learning_rate": 3.7552576737901915e-06, "loss": 0.3061, "step": 3742 }, { "epoch": 1.8665226063829787, "grad_norm": 0.46298134326934814, "learning_rate": 3.752447348715378e-06, "loss": 0.4581, "step": 3743 }, { "epoch": 1.8670212765957448, "grad_norm": 0.5301076173782349, "learning_rate": 3.7496374438662196e-06, "loss": 0.4016, "step": 3744 }, { "epoch": 1.8675199468085106, "grad_norm": 0.49257099628448486, "learning_rate": 3.7468279601892027e-06, "loss": 0.3152, "step": 3745 }, { "epoch": 1.8680186170212765, "grad_norm": 0.4270334541797638, "learning_rate": 3.744018898630677e-06, "loss": 0.3615, "step": 3746 }, { "epoch": 1.8685172872340425, "grad_norm": 0.4492776393890381, "learning_rate": 3.741210260136844e-06, "loss": 0.3528, "step": 3747 }, { "epoch": 1.8690159574468086, "grad_norm": 0.4222666323184967, "learning_rate": 3.7384020456537677e-06, "loss": 0.3795, "step": 3748 }, { "epoch": 1.8695146276595744, "grad_norm": 0.4536590874195099, "learning_rate": 3.735594256127365e-06, "loss": 0.3679, "step": 3749 }, { "epoch": 1.8700132978723403, "grad_norm": 0.4485304057598114, "learning_rate": 3.732786892503415e-06, "loss": 0.4013, "step": 3750 }, { "epoch": 1.8705119680851063, "grad_norm": 0.47620493173599243, "learning_rate": 3.7299799557275463e-06, "loss": 0.3834, "step": 3751 }, { "epoch": 1.8710106382978724, "grad_norm": 0.49708986282348633, "learning_rate": 3.72717344674525e-06, "loss": 0.3955, "step": 3752 }, { "epoch": 1.8715093085106385, "grad_norm": 0.45926669239997864, "learning_rate": 3.72436736650187e-06, "loss": 0.3591, "step": 3753 }, { "epoch": 1.8720079787234043, "grad_norm": 0.4326065480709076, "learning_rate": 3.721561715942607e-06, "loss": 0.3525, "step": 3754 }, { "epoch": 1.8725066489361701, "grad_norm": 0.4794120192527771, "learning_rate": 3.718756496012515e-06, "loss": 0.4044, "step": 3755 }, { "epoch": 1.8730053191489362, "grad_norm": 0.4515107572078705, "learning_rate": 3.7159517076565055e-06, "loss": 0.3499, "step": 3756 }, { "epoch": 1.8735039893617023, "grad_norm": 0.46270638704299927, "learning_rate": 3.713147351819343e-06, "loss": 0.3868, "step": 3757 }, { "epoch": 1.874002659574468, "grad_norm": 0.4188578724861145, "learning_rate": 3.7103434294456462e-06, "loss": 0.3866, "step": 3758 }, { "epoch": 1.874501329787234, "grad_norm": 0.4413064420223236, "learning_rate": 3.7075399414798895e-06, "loss": 0.3435, "step": 3759 }, { "epoch": 1.875, "grad_norm": 0.5072236657142639, "learning_rate": 3.704736888866398e-06, "loss": 0.4037, "step": 3760 }, { "epoch": 1.875498670212766, "grad_norm": 0.4276333451271057, "learning_rate": 3.701934272549355e-06, "loss": 0.3739, "step": 3761 }, { "epoch": 1.875997340425532, "grad_norm": 0.49101513624191284, "learning_rate": 3.6991320934727902e-06, "loss": 0.3807, "step": 3762 }, { "epoch": 1.8764960106382977, "grad_norm": 0.43563079833984375, "learning_rate": 3.6963303525805946e-06, "loss": 0.3733, "step": 3763 }, { "epoch": 1.8769946808510638, "grad_norm": 0.4509319067001343, "learning_rate": 3.6935290508165004e-06, "loss": 0.4136, "step": 3764 }, { "epoch": 1.8774933510638299, "grad_norm": 0.47115570306777954, "learning_rate": 3.690728189124104e-06, "loss": 0.3631, "step": 3765 }, { "epoch": 1.8779920212765957, "grad_norm": 0.4651314318180084, "learning_rate": 3.6879277684468453e-06, "loss": 0.3865, "step": 3766 }, { "epoch": 1.8784906914893615, "grad_norm": 0.40742161870002747, "learning_rate": 3.6851277897280146e-06, "loss": 0.3446, "step": 3767 }, { "epoch": 1.8789893617021276, "grad_norm": 0.455545574426651, "learning_rate": 3.682328253910763e-06, "loss": 0.3926, "step": 3768 }, { "epoch": 1.8794880319148937, "grad_norm": 0.48203280568122864, "learning_rate": 3.67952916193808e-06, "loss": 0.3681, "step": 3769 }, { "epoch": 1.8799867021276597, "grad_norm": 0.4711107313632965, "learning_rate": 3.6767305147528186e-06, "loss": 0.3806, "step": 3770 }, { "epoch": 1.8804853723404256, "grad_norm": 0.4027429223060608, "learning_rate": 3.673932313297668e-06, "loss": 0.3666, "step": 3771 }, { "epoch": 1.8809840425531914, "grad_norm": 0.44484832882881165, "learning_rate": 3.6711345585151808e-06, "loss": 0.4396, "step": 3772 }, { "epoch": 1.8814827127659575, "grad_norm": 0.49674445390701294, "learning_rate": 3.668337251347748e-06, "loss": 0.4062, "step": 3773 }, { "epoch": 1.8819813829787235, "grad_norm": 0.40708163380622864, "learning_rate": 3.6655403927376178e-06, "loss": 0.3703, "step": 3774 }, { "epoch": 1.8824800531914894, "grad_norm": 0.4344126880168915, "learning_rate": 3.662743983626882e-06, "loss": 0.3785, "step": 3775 }, { "epoch": 1.8829787234042552, "grad_norm": 0.42597493529319763, "learning_rate": 3.6599480249574827e-06, "loss": 0.3353, "step": 3776 }, { "epoch": 1.8834773936170213, "grad_norm": 0.4866349399089813, "learning_rate": 3.6571525176712134e-06, "loss": 0.3728, "step": 3777 }, { "epoch": 1.8839760638297873, "grad_norm": 0.44266629219055176, "learning_rate": 3.65435746270971e-06, "loss": 0.3538, "step": 3778 }, { "epoch": 1.8844747340425532, "grad_norm": 0.520503580570221, "learning_rate": 3.6515628610144616e-06, "loss": 0.3707, "step": 3779 }, { "epoch": 1.884973404255319, "grad_norm": 0.5142095685005188, "learning_rate": 3.6487687135267986e-06, "loss": 0.3585, "step": 3780 }, { "epoch": 1.885472074468085, "grad_norm": 0.4451873302459717, "learning_rate": 3.645975021187905e-06, "loss": 0.3873, "step": 3781 }, { "epoch": 1.8859707446808511, "grad_norm": 0.5143522620201111, "learning_rate": 3.6431817849388053e-06, "loss": 0.3456, "step": 3782 }, { "epoch": 1.886469414893617, "grad_norm": 0.5417961478233337, "learning_rate": 3.6403890057203763e-06, "loss": 0.3976, "step": 3783 }, { "epoch": 1.886968085106383, "grad_norm": 0.47005918622016907, "learning_rate": 3.637596684473334e-06, "loss": 0.3621, "step": 3784 }, { "epoch": 1.8874667553191489, "grad_norm": 0.43264663219451904, "learning_rate": 3.6348048221382483e-06, "loss": 0.3634, "step": 3785 }, { "epoch": 1.887965425531915, "grad_norm": 0.456860214471817, "learning_rate": 3.6320134196555275e-06, "loss": 0.4467, "step": 3786 }, { "epoch": 1.888464095744681, "grad_norm": 0.44453439116477966, "learning_rate": 3.629222477965426e-06, "loss": 0.3527, "step": 3787 }, { "epoch": 1.8889627659574468, "grad_norm": 0.408048540353775, "learning_rate": 3.626431998008049e-06, "loss": 0.3583, "step": 3788 }, { "epoch": 1.8894614361702127, "grad_norm": 0.453630656003952, "learning_rate": 3.623641980723337e-06, "loss": 0.358, "step": 3789 }, { "epoch": 1.8899601063829787, "grad_norm": 0.4661887288093567, "learning_rate": 3.6208524270510835e-06, "loss": 0.3966, "step": 3790 }, { "epoch": 1.8904587765957448, "grad_norm": 0.3963204324245453, "learning_rate": 3.6180633379309193e-06, "loss": 0.3185, "step": 3791 }, { "epoch": 1.8909574468085106, "grad_norm": 0.4476238489151001, "learning_rate": 3.6152747143023225e-06, "loss": 0.3804, "step": 3792 }, { "epoch": 1.8914561170212765, "grad_norm": 0.448811411857605, "learning_rate": 3.612486557104613e-06, "loss": 0.3901, "step": 3793 }, { "epoch": 1.8919547872340425, "grad_norm": 0.4211987853050232, "learning_rate": 3.609698867276953e-06, "loss": 0.3672, "step": 3794 }, { "epoch": 1.8924534574468086, "grad_norm": 0.46999919414520264, "learning_rate": 3.6069116457583494e-06, "loss": 0.3735, "step": 3795 }, { "epoch": 1.8929521276595744, "grad_norm": 0.4719306528568268, "learning_rate": 3.6041248934876482e-06, "loss": 0.3726, "step": 3796 }, { "epoch": 1.8934507978723403, "grad_norm": 0.5041028261184692, "learning_rate": 3.601338611403541e-06, "loss": 0.4299, "step": 3797 }, { "epoch": 1.8939494680851063, "grad_norm": 0.4553736746311188, "learning_rate": 3.5985528004445564e-06, "loss": 0.3817, "step": 3798 }, { "epoch": 1.8944481382978724, "grad_norm": 0.47602778673171997, "learning_rate": 3.59576746154907e-06, "loss": 0.3697, "step": 3799 }, { "epoch": 1.8949468085106385, "grad_norm": 0.4517029821872711, "learning_rate": 3.592982595655293e-06, "loss": 0.355, "step": 3800 }, { "epoch": 1.8954454787234043, "grad_norm": 0.4202047288417816, "learning_rate": 3.5901982037012818e-06, "loss": 0.3472, "step": 3801 }, { "epoch": 1.8959441489361701, "grad_norm": 0.3885253667831421, "learning_rate": 3.5874142866249283e-06, "loss": 0.3415, "step": 3802 }, { "epoch": 1.8964428191489362, "grad_norm": 0.5405235886573792, "learning_rate": 3.58463084536397e-06, "loss": 0.4289, "step": 3803 }, { "epoch": 1.8969414893617023, "grad_norm": 0.42899957299232483, "learning_rate": 3.5818478808559787e-06, "loss": 0.3433, "step": 3804 }, { "epoch": 1.897440159574468, "grad_norm": 0.45889389514923096, "learning_rate": 3.5790653940383717e-06, "loss": 0.3791, "step": 3805 }, { "epoch": 1.897938829787234, "grad_norm": 0.44802966713905334, "learning_rate": 3.576283385848398e-06, "loss": 0.3679, "step": 3806 }, { "epoch": 1.8984375, "grad_norm": 0.48302263021469116, "learning_rate": 3.573501857223153e-06, "loss": 0.3609, "step": 3807 }, { "epoch": 1.898936170212766, "grad_norm": 0.4834303557872772, "learning_rate": 3.570720809099563e-06, "loss": 0.3502, "step": 3808 }, { "epoch": 1.899434840425532, "grad_norm": 0.5128223896026611, "learning_rate": 3.567940242414399e-06, "loss": 0.3953, "step": 3809 }, { "epoch": 1.8999335106382977, "grad_norm": 0.4325203597545624, "learning_rate": 3.5651601581042662e-06, "loss": 0.3457, "step": 3810 }, { "epoch": 1.9004321808510638, "grad_norm": 0.4597184360027313, "learning_rate": 3.562380557105608e-06, "loss": 0.4061, "step": 3811 }, { "epoch": 1.9009308510638299, "grad_norm": 0.4012354016304016, "learning_rate": 3.5596014403547064e-06, "loss": 0.3412, "step": 3812 }, { "epoch": 1.9014295212765957, "grad_norm": 0.47753098607063293, "learning_rate": 3.5568228087876777e-06, "loss": 0.3855, "step": 3813 }, { "epoch": 1.9019281914893615, "grad_norm": 0.5107449293136597, "learning_rate": 3.554044663340477e-06, "loss": 0.411, "step": 3814 }, { "epoch": 1.9024268617021276, "grad_norm": 0.405380517244339, "learning_rate": 3.5512670049488933e-06, "loss": 0.3384, "step": 3815 }, { "epoch": 1.9029255319148937, "grad_norm": 0.513059139251709, "learning_rate": 3.5484898345485573e-06, "loss": 0.4401, "step": 3816 }, { "epoch": 1.9034242021276597, "grad_norm": 0.4710571765899658, "learning_rate": 3.5457131530749255e-06, "loss": 0.3335, "step": 3817 }, { "epoch": 1.9039228723404256, "grad_norm": 0.48099905252456665, "learning_rate": 3.5429369614633013e-06, "loss": 0.3603, "step": 3818 }, { "epoch": 1.9044215425531914, "grad_norm": 0.3874185085296631, "learning_rate": 3.5401612606488133e-06, "loss": 0.363, "step": 3819 }, { "epoch": 1.9049202127659575, "grad_norm": 0.4430481195449829, "learning_rate": 3.537386051566427e-06, "loss": 0.3625, "step": 3820 }, { "epoch": 1.9054188829787235, "grad_norm": 0.4587818682193756, "learning_rate": 3.534611335150948e-06, "loss": 0.3794, "step": 3821 }, { "epoch": 1.9059175531914894, "grad_norm": 0.4433348774909973, "learning_rate": 3.5318371123370087e-06, "loss": 0.3721, "step": 3822 }, { "epoch": 1.9064162234042552, "grad_norm": 0.4108625650405884, "learning_rate": 3.5290633840590814e-06, "loss": 0.3292, "step": 3823 }, { "epoch": 1.9069148936170213, "grad_norm": 0.4463042616844177, "learning_rate": 3.5262901512514646e-06, "loss": 0.3847, "step": 3824 }, { "epoch": 1.9074135638297873, "grad_norm": 0.5040532350540161, "learning_rate": 3.5235174148482993e-06, "loss": 0.4062, "step": 3825 }, { "epoch": 1.9079122340425532, "grad_norm": 0.509014368057251, "learning_rate": 3.5207451757835488e-06, "loss": 0.3622, "step": 3826 }, { "epoch": 1.908410904255319, "grad_norm": 0.4709191918373108, "learning_rate": 3.517973434991019e-06, "loss": 0.313, "step": 3827 }, { "epoch": 1.908909574468085, "grad_norm": 0.5973730087280273, "learning_rate": 3.515202193404338e-06, "loss": 0.3691, "step": 3828 }, { "epoch": 1.9094082446808511, "grad_norm": 0.48279109597206116, "learning_rate": 3.5124314519569756e-06, "loss": 0.3475, "step": 3829 }, { "epoch": 1.909906914893617, "grad_norm": 0.5241132974624634, "learning_rate": 3.509661211582226e-06, "loss": 0.3924, "step": 3830 }, { "epoch": 1.910405585106383, "grad_norm": 0.44357335567474365, "learning_rate": 3.5068914732132157e-06, "loss": 0.3345, "step": 3831 }, { "epoch": 1.9109042553191489, "grad_norm": 0.508755087852478, "learning_rate": 3.5041222377829076e-06, "loss": 0.3841, "step": 3832 }, { "epoch": 1.911402925531915, "grad_norm": 0.51004958152771, "learning_rate": 3.5013535062240865e-06, "loss": 0.3245, "step": 3833 }, { "epoch": 1.911901595744681, "grad_norm": 0.5373508930206299, "learning_rate": 3.4985852794693764e-06, "loss": 0.4486, "step": 3834 }, { "epoch": 1.9124002659574468, "grad_norm": 0.49075543880462646, "learning_rate": 3.4958175584512217e-06, "loss": 0.365, "step": 3835 }, { "epoch": 1.9128989361702127, "grad_norm": 0.46190163493156433, "learning_rate": 3.493050344101907e-06, "loss": 0.361, "step": 3836 }, { "epoch": 1.9133976063829787, "grad_norm": 0.5239810347557068, "learning_rate": 3.4902836373535356e-06, "loss": 0.3449, "step": 3837 }, { "epoch": 1.9138962765957448, "grad_norm": 0.4992389678955078, "learning_rate": 3.4875174391380493e-06, "loss": 0.354, "step": 3838 }, { "epoch": 1.9143949468085106, "grad_norm": 0.46709051728248596, "learning_rate": 3.4847517503872115e-06, "loss": 0.3473, "step": 3839 }, { "epoch": 1.9148936170212765, "grad_norm": 0.548531711101532, "learning_rate": 3.4819865720326195e-06, "loss": 0.3984, "step": 3840 }, { "epoch": 1.9153922872340425, "grad_norm": 0.4931449890136719, "learning_rate": 3.479221905005695e-06, "loss": 0.3456, "step": 3841 }, { "epoch": 1.9158909574468086, "grad_norm": 0.4677758812904358, "learning_rate": 3.476457750237685e-06, "loss": 0.3641, "step": 3842 }, { "epoch": 1.9163896276595744, "grad_norm": 0.4812643826007843, "learning_rate": 3.473694108659673e-06, "loss": 0.3936, "step": 3843 }, { "epoch": 1.9168882978723403, "grad_norm": 0.514256477355957, "learning_rate": 3.4709309812025604e-06, "loss": 0.4042, "step": 3844 }, { "epoch": 1.9173869680851063, "grad_norm": 0.4462766647338867, "learning_rate": 3.4681683687970814e-06, "loss": 0.3581, "step": 3845 }, { "epoch": 1.9178856382978724, "grad_norm": 0.5016101598739624, "learning_rate": 3.4654062723737908e-06, "loss": 0.2992, "step": 3846 }, { "epoch": 1.9183843085106385, "grad_norm": 0.5104228854179382, "learning_rate": 3.4626446928630775e-06, "loss": 0.3628, "step": 3847 }, { "epoch": 1.9188829787234043, "grad_norm": 0.490497887134552, "learning_rate": 3.459883631195149e-06, "loss": 0.3547, "step": 3848 }, { "epoch": 1.9193816489361701, "grad_norm": 0.4632130563259125, "learning_rate": 3.4571230883000434e-06, "loss": 0.3858, "step": 3849 }, { "epoch": 1.9198803191489362, "grad_norm": 0.419454425573349, "learning_rate": 3.454363065107622e-06, "loss": 0.3064, "step": 3850 }, { "epoch": 1.9203789893617023, "grad_norm": 0.46839597821235657, "learning_rate": 3.4516035625475675e-06, "loss": 0.3852, "step": 3851 }, { "epoch": 1.920877659574468, "grad_norm": 0.47547242045402527, "learning_rate": 3.448844581549396e-06, "loss": 0.3414, "step": 3852 }, { "epoch": 1.921376329787234, "grad_norm": 0.4219913184642792, "learning_rate": 3.446086123042438e-06, "loss": 0.3523, "step": 3853 }, { "epoch": 1.921875, "grad_norm": 0.419171541929245, "learning_rate": 3.4433281879558568e-06, "loss": 0.3563, "step": 3854 }, { "epoch": 1.922373670212766, "grad_norm": 0.4903222322463989, "learning_rate": 3.440570777218631e-06, "loss": 0.4488, "step": 3855 }, { "epoch": 1.922872340425532, "grad_norm": 0.47956550121307373, "learning_rate": 3.4378138917595717e-06, "loss": 0.4022, "step": 3856 }, { "epoch": 1.9233710106382977, "grad_norm": 0.44157177209854126, "learning_rate": 3.4350575325073033e-06, "loss": 0.3453, "step": 3857 }, { "epoch": 1.9238696808510638, "grad_norm": 0.47561559081077576, "learning_rate": 3.4323017003902827e-06, "loss": 0.3354, "step": 3858 }, { "epoch": 1.9243683510638299, "grad_norm": 0.4151248335838318, "learning_rate": 3.429546396336779e-06, "loss": 0.3537, "step": 3859 }, { "epoch": 1.9248670212765957, "grad_norm": 0.4795345366001129, "learning_rate": 3.426791621274895e-06, "loss": 0.4215, "step": 3860 }, { "epoch": 1.9253656914893615, "grad_norm": 0.4535539150238037, "learning_rate": 3.4240373761325465e-06, "loss": 0.4057, "step": 3861 }, { "epoch": 1.9258643617021276, "grad_norm": 0.4067715108394623, "learning_rate": 3.42128366183747e-06, "loss": 0.3371, "step": 3862 }, { "epoch": 1.9263630319148937, "grad_norm": 0.5162743330001831, "learning_rate": 3.4185304793172326e-06, "loss": 0.3935, "step": 3863 }, { "epoch": 1.9268617021276597, "grad_norm": 0.4352348744869232, "learning_rate": 3.415777829499211e-06, "loss": 0.3423, "step": 3864 }, { "epoch": 1.9273603723404256, "grad_norm": 0.46105608344078064, "learning_rate": 3.4130257133106114e-06, "loss": 0.4125, "step": 3865 }, { "epoch": 1.9278590425531914, "grad_norm": 0.5011154413223267, "learning_rate": 3.4102741316784552e-06, "loss": 0.352, "step": 3866 }, { "epoch": 1.9283577127659575, "grad_norm": 0.5017003417015076, "learning_rate": 3.407523085529586e-06, "loss": 0.3937, "step": 3867 }, { "epoch": 1.9288563829787235, "grad_norm": 0.49036023020744324, "learning_rate": 3.4047725757906656e-06, "loss": 0.4694, "step": 3868 }, { "epoch": 1.9293550531914894, "grad_norm": 0.44811883568763733, "learning_rate": 3.4020226033881763e-06, "loss": 0.3652, "step": 3869 }, { "epoch": 1.9298537234042552, "grad_norm": 0.45202139019966125, "learning_rate": 3.399273169248417e-06, "loss": 0.3466, "step": 3870 }, { "epoch": 1.9303523936170213, "grad_norm": 0.503516674041748, "learning_rate": 3.396524274297511e-06, "loss": 0.4419, "step": 3871 }, { "epoch": 1.9308510638297873, "grad_norm": 0.4463275671005249, "learning_rate": 3.393775919461394e-06, "loss": 0.3679, "step": 3872 }, { "epoch": 1.9313497340425532, "grad_norm": 0.46175214648246765, "learning_rate": 3.391028105665819e-06, "loss": 0.3299, "step": 3873 }, { "epoch": 1.931848404255319, "grad_norm": 0.48987647891044617, "learning_rate": 3.388280833836364e-06, "loss": 0.3831, "step": 3874 }, { "epoch": 1.932347074468085, "grad_norm": 0.5331224203109741, "learning_rate": 3.385534104898417e-06, "loss": 0.3743, "step": 3875 }, { "epoch": 1.9328457446808511, "grad_norm": 0.460164874792099, "learning_rate": 3.3827879197771895e-06, "loss": 0.3793, "step": 3876 }, { "epoch": 1.933344414893617, "grad_norm": 0.4550994038581848, "learning_rate": 3.3800422793977023e-06, "loss": 0.3696, "step": 3877 }, { "epoch": 1.933843085106383, "grad_norm": 0.45996272563934326, "learning_rate": 3.3772971846848014e-06, "loss": 0.3821, "step": 3878 }, { "epoch": 1.9343417553191489, "grad_norm": 0.4539780616760254, "learning_rate": 3.3745526365631405e-06, "loss": 0.3738, "step": 3879 }, { "epoch": 1.934840425531915, "grad_norm": 0.4313676655292511, "learning_rate": 3.3718086359571974e-06, "loss": 0.3569, "step": 3880 }, { "epoch": 1.935339095744681, "grad_norm": 0.5326755046844482, "learning_rate": 3.3690651837912563e-06, "loss": 0.4362, "step": 3881 }, { "epoch": 1.9358377659574468, "grad_norm": 0.4328010678291321, "learning_rate": 3.366322280989427e-06, "loss": 0.3908, "step": 3882 }, { "epoch": 1.9363364361702127, "grad_norm": 0.4388115406036377, "learning_rate": 3.3635799284756254e-06, "loss": 0.3669, "step": 3883 }, { "epoch": 1.9368351063829787, "grad_norm": 0.4744568467140198, "learning_rate": 3.360838127173586e-06, "loss": 0.3905, "step": 3884 }, { "epoch": 1.9373337765957448, "grad_norm": 0.4365004301071167, "learning_rate": 3.358096878006858e-06, "loss": 0.3237, "step": 3885 }, { "epoch": 1.9378324468085106, "grad_norm": 0.5625734925270081, "learning_rate": 3.3553561818988035e-06, "loss": 0.3892, "step": 3886 }, { "epoch": 1.9383311170212765, "grad_norm": 0.464995801448822, "learning_rate": 3.3526160397725984e-06, "loss": 0.3925, "step": 3887 }, { "epoch": 1.9388297872340425, "grad_norm": 0.4867073595523834, "learning_rate": 3.3498764525512317e-06, "loss": 0.4069, "step": 3888 }, { "epoch": 1.9393284574468086, "grad_norm": 0.45019009709358215, "learning_rate": 3.347137421157508e-06, "loss": 0.3241, "step": 3889 }, { "epoch": 1.9398271276595744, "grad_norm": 0.5346182584762573, "learning_rate": 3.3443989465140405e-06, "loss": 0.4167, "step": 3890 }, { "epoch": 1.9403257978723403, "grad_norm": 0.4929727017879486, "learning_rate": 3.3416610295432595e-06, "loss": 0.3754, "step": 3891 }, { "epoch": 1.9408244680851063, "grad_norm": 0.44171208143234253, "learning_rate": 3.338923671167401e-06, "loss": 0.3634, "step": 3892 }, { "epoch": 1.9413231382978724, "grad_norm": 0.48850440979003906, "learning_rate": 3.336186872308522e-06, "loss": 0.4065, "step": 3893 }, { "epoch": 1.9418218085106385, "grad_norm": 0.45806556940078735, "learning_rate": 3.3334506338884832e-06, "loss": 0.3412, "step": 3894 }, { "epoch": 1.9423204787234043, "grad_norm": 0.531964123249054, "learning_rate": 3.330714956828957e-06, "loss": 0.3887, "step": 3895 }, { "epoch": 1.9428191489361701, "grad_norm": 0.5007511973381042, "learning_rate": 3.327979842051433e-06, "loss": 0.403, "step": 3896 }, { "epoch": 1.9433178191489362, "grad_norm": 0.4297804832458496, "learning_rate": 3.3252452904772037e-06, "loss": 0.3398, "step": 3897 }, { "epoch": 1.9438164893617023, "grad_norm": 0.6205043792724609, "learning_rate": 3.3225113030273786e-06, "loss": 0.4238, "step": 3898 }, { "epoch": 1.944315159574468, "grad_norm": 0.46692752838134766, "learning_rate": 3.319777880622871e-06, "loss": 0.3377, "step": 3899 }, { "epoch": 1.944813829787234, "grad_norm": 0.49087128043174744, "learning_rate": 3.3170450241844105e-06, "loss": 0.3771, "step": 3900 }, { "epoch": 1.9453125, "grad_norm": 0.4487667381763458, "learning_rate": 3.3143127346325286e-06, "loss": 0.3651, "step": 3901 }, { "epoch": 1.945811170212766, "grad_norm": 0.4141943156719208, "learning_rate": 3.3115810128875736e-06, "loss": 0.3837, "step": 3902 }, { "epoch": 1.946309840425532, "grad_norm": 0.444807231426239, "learning_rate": 3.308849859869695e-06, "loss": 0.38, "step": 3903 }, { "epoch": 1.9468085106382977, "grad_norm": 0.5265575647354126, "learning_rate": 3.306119276498857e-06, "loss": 0.4063, "step": 3904 }, { "epoch": 1.9473071808510638, "grad_norm": 0.47857627272605896, "learning_rate": 3.30338926369483e-06, "loss": 0.3628, "step": 3905 }, { "epoch": 1.9478058510638299, "grad_norm": 0.45147356390953064, "learning_rate": 3.300659822377188e-06, "loss": 0.3424, "step": 3906 }, { "epoch": 1.9483045212765957, "grad_norm": 0.433038592338562, "learning_rate": 3.2979309534653203e-06, "loss": 0.3626, "step": 3907 }, { "epoch": 1.9488031914893615, "grad_norm": 0.4877112805843353, "learning_rate": 3.295202657878415e-06, "loss": 0.3848, "step": 3908 }, { "epoch": 1.9493018617021276, "grad_norm": 0.44427135586738586, "learning_rate": 3.292474936535476e-06, "loss": 0.3834, "step": 3909 }, { "epoch": 1.9498005319148937, "grad_norm": 0.461820513010025, "learning_rate": 3.289747790355303e-06, "loss": 0.4016, "step": 3910 }, { "epoch": 1.9502992021276597, "grad_norm": 0.525769054889679, "learning_rate": 3.2870212202565145e-06, "loss": 0.4019, "step": 3911 }, { "epoch": 1.9507978723404256, "grad_norm": 0.4335799813270569, "learning_rate": 3.2842952271575225e-06, "loss": 0.323, "step": 3912 }, { "epoch": 1.9512965425531914, "grad_norm": 0.4432854652404785, "learning_rate": 3.281569811976555e-06, "loss": 0.3382, "step": 3913 }, { "epoch": 1.9517952127659575, "grad_norm": 0.499689519405365, "learning_rate": 3.2788449756316376e-06, "loss": 0.4252, "step": 3914 }, { "epoch": 1.9522938829787235, "grad_norm": 0.4372518062591553, "learning_rate": 3.2761207190406073e-06, "loss": 0.3121, "step": 3915 }, { "epoch": 1.9527925531914894, "grad_norm": 0.45546063780784607, "learning_rate": 3.2733970431211014e-06, "loss": 0.3521, "step": 3916 }, { "epoch": 1.9532912234042552, "grad_norm": 0.48119619488716125, "learning_rate": 3.2706739487905615e-06, "loss": 0.3724, "step": 3917 }, { "epoch": 1.9537898936170213, "grad_norm": 0.47261926531791687, "learning_rate": 3.2679514369662366e-06, "loss": 0.4093, "step": 3918 }, { "epoch": 1.9542885638297873, "grad_norm": 0.4130690097808838, "learning_rate": 3.2652295085651764e-06, "loss": 0.3497, "step": 3919 }, { "epoch": 1.9547872340425532, "grad_norm": 0.47956064343452454, "learning_rate": 3.2625081645042354e-06, "loss": 0.3933, "step": 3920 }, { "epoch": 1.955285904255319, "grad_norm": 0.5339139103889465, "learning_rate": 3.2597874057000732e-06, "loss": 0.4275, "step": 3921 }, { "epoch": 1.955784574468085, "grad_norm": 0.5207850933074951, "learning_rate": 3.2570672330691477e-06, "loss": 0.3772, "step": 3922 }, { "epoch": 1.9562832446808511, "grad_norm": 0.46722412109375, "learning_rate": 3.2543476475277236e-06, "loss": 0.3853, "step": 3923 }, { "epoch": 1.956781914893617, "grad_norm": 0.4211895167827606, "learning_rate": 3.251628649991865e-06, "loss": 0.3517, "step": 3924 }, { "epoch": 1.957280585106383, "grad_norm": 0.4802846908569336, "learning_rate": 3.2489102413774397e-06, "loss": 0.3686, "step": 3925 }, { "epoch": 1.9577792553191489, "grad_norm": 0.45018672943115234, "learning_rate": 3.246192422600116e-06, "loss": 0.3285, "step": 3926 }, { "epoch": 1.958277925531915, "grad_norm": 0.5165684819221497, "learning_rate": 3.243475194575365e-06, "loss": 0.3989, "step": 3927 }, { "epoch": 1.958776595744681, "grad_norm": 0.4293542206287384, "learning_rate": 3.2407585582184555e-06, "loss": 0.3587, "step": 3928 }, { "epoch": 1.9592752659574468, "grad_norm": 0.45075130462646484, "learning_rate": 3.238042514444463e-06, "loss": 0.3986, "step": 3929 }, { "epoch": 1.9597739361702127, "grad_norm": 0.43908753991127014, "learning_rate": 3.235327064168256e-06, "loss": 0.3394, "step": 3930 }, { "epoch": 1.9602726063829787, "grad_norm": 0.44391489028930664, "learning_rate": 3.232612208304511e-06, "loss": 0.3672, "step": 3931 }, { "epoch": 1.9607712765957448, "grad_norm": 0.44441652297973633, "learning_rate": 3.2298979477676944e-06, "loss": 0.3763, "step": 3932 }, { "epoch": 1.9612699468085106, "grad_norm": 0.4241260290145874, "learning_rate": 3.2271842834720833e-06, "loss": 0.4383, "step": 3933 }, { "epoch": 1.9617686170212765, "grad_norm": 0.4560188353061676, "learning_rate": 3.2244712163317447e-06, "loss": 0.3707, "step": 3934 }, { "epoch": 1.9622672872340425, "grad_norm": 0.4212411046028137, "learning_rate": 3.2217587472605518e-06, "loss": 0.3391, "step": 3935 }, { "epoch": 1.9627659574468086, "grad_norm": 0.4886115491390228, "learning_rate": 3.2190468771721685e-06, "loss": 0.4164, "step": 3936 }, { "epoch": 1.9632646276595744, "grad_norm": 0.5183068513870239, "learning_rate": 3.216335606980064e-06, "loss": 0.402, "step": 3937 }, { "epoch": 1.9637632978723403, "grad_norm": 0.4659089148044586, "learning_rate": 3.2136249375975014e-06, "loss": 0.374, "step": 3938 }, { "epoch": 1.9642619680851063, "grad_norm": 0.48630785942077637, "learning_rate": 3.2109148699375427e-06, "loss": 0.3268, "step": 3939 }, { "epoch": 1.9647606382978724, "grad_norm": 0.46704530715942383, "learning_rate": 3.208205404913048e-06, "loss": 0.3763, "step": 3940 }, { "epoch": 1.9652593085106385, "grad_norm": 0.4034091532230377, "learning_rate": 3.2054965434366725e-06, "loss": 0.3506, "step": 3941 }, { "epoch": 1.9657579787234043, "grad_norm": 0.4459627568721771, "learning_rate": 3.2027882864208705e-06, "loss": 0.4026, "step": 3942 }, { "epoch": 1.9662566489361701, "grad_norm": 0.46621793508529663, "learning_rate": 3.2000806347778883e-06, "loss": 0.3555, "step": 3943 }, { "epoch": 1.9667553191489362, "grad_norm": 0.48519137501716614, "learning_rate": 3.197373589419776e-06, "loss": 0.3534, "step": 3944 }, { "epoch": 1.9672539893617023, "grad_norm": 0.40385907888412476, "learning_rate": 3.1946671512583705e-06, "loss": 0.3804, "step": 3945 }, { "epoch": 1.967752659574468, "grad_norm": 0.45706427097320557, "learning_rate": 3.1919613212053118e-06, "loss": 0.3725, "step": 3946 }, { "epoch": 1.968251329787234, "grad_norm": 0.44647395610809326, "learning_rate": 3.18925610017203e-06, "loss": 0.4267, "step": 3947 }, { "epoch": 1.96875, "grad_norm": 0.4301709234714508, "learning_rate": 3.186551489069751e-06, "loss": 0.3218, "step": 3948 }, { "epoch": 1.969248670212766, "grad_norm": 0.47386643290519714, "learning_rate": 3.183847488809498e-06, "loss": 0.3745, "step": 3949 }, { "epoch": 1.969747340425532, "grad_norm": 0.4589731693267822, "learning_rate": 3.181144100302084e-06, "loss": 0.3758, "step": 3950 }, { "epoch": 1.9702460106382977, "grad_norm": 0.44358113408088684, "learning_rate": 3.1784413244581215e-06, "loss": 0.3887, "step": 3951 }, { "epoch": 1.9707446808510638, "grad_norm": 0.48973989486694336, "learning_rate": 3.1757391621880107e-06, "loss": 0.4116, "step": 3952 }, { "epoch": 1.9712433510638299, "grad_norm": 0.43771031498908997, "learning_rate": 3.1730376144019504e-06, "loss": 0.3202, "step": 3953 }, { "epoch": 1.9717420212765957, "grad_norm": 0.48581448197364807, "learning_rate": 3.170336682009928e-06, "loss": 0.3922, "step": 3954 }, { "epoch": 1.9722406914893615, "grad_norm": 0.4478948712348938, "learning_rate": 3.1676363659217273e-06, "loss": 0.4079, "step": 3955 }, { "epoch": 1.9727393617021276, "grad_norm": 0.4420483708381653, "learning_rate": 3.1649366670469206e-06, "loss": 0.3782, "step": 3956 }, { "epoch": 1.9732380319148937, "grad_norm": 0.4020446240901947, "learning_rate": 3.1622375862948777e-06, "loss": 0.385, "step": 3957 }, { "epoch": 1.9737367021276597, "grad_norm": 0.51207435131073, "learning_rate": 3.1595391245747547e-06, "loss": 0.3615, "step": 3958 }, { "epoch": 1.9742353723404256, "grad_norm": 0.4708404541015625, "learning_rate": 3.1568412827955008e-06, "loss": 0.4203, "step": 3959 }, { "epoch": 1.9747340425531914, "grad_norm": 0.4121657609939575, "learning_rate": 3.1541440618658604e-06, "loss": 0.3449, "step": 3960 }, { "epoch": 1.9752327127659575, "grad_norm": 0.49026817083358765, "learning_rate": 3.1514474626943604e-06, "loss": 0.4009, "step": 3961 }, { "epoch": 1.9757313829787235, "grad_norm": 0.4465181231498718, "learning_rate": 3.148751486189329e-06, "loss": 0.3391, "step": 3962 }, { "epoch": 1.9762300531914894, "grad_norm": 0.4505116939544678, "learning_rate": 3.146056133258874e-06, "loss": 0.355, "step": 3963 }, { "epoch": 1.9767287234042552, "grad_norm": 0.4312651455402374, "learning_rate": 3.1433614048109028e-06, "loss": 0.3606, "step": 3964 }, { "epoch": 1.9772273936170213, "grad_norm": 0.47814980149269104, "learning_rate": 3.1406673017531026e-06, "loss": 0.4438, "step": 3965 }, { "epoch": 1.9777260638297873, "grad_norm": 0.41055142879486084, "learning_rate": 3.1379738249929604e-06, "loss": 0.3812, "step": 3966 }, { "epoch": 1.9782247340425532, "grad_norm": 0.4621718227863312, "learning_rate": 3.1352809754377415e-06, "loss": 0.3996, "step": 3967 }, { "epoch": 1.978723404255319, "grad_norm": 0.4095640480518341, "learning_rate": 3.132588753994511e-06, "loss": 0.3835, "step": 3968 }, { "epoch": 1.979222074468085, "grad_norm": 0.46091902256011963, "learning_rate": 3.1298971615701134e-06, "loss": 0.3805, "step": 3969 }, { "epoch": 1.9797207446808511, "grad_norm": 0.42469221353530884, "learning_rate": 3.1272061990711843e-06, "loss": 0.3533, "step": 3970 }, { "epoch": 1.980219414893617, "grad_norm": 0.46898019313812256, "learning_rate": 3.12451586740415e-06, "loss": 0.4511, "step": 3971 }, { "epoch": 1.980718085106383, "grad_norm": 0.41590195894241333, "learning_rate": 3.1218261674752186e-06, "loss": 0.3825, "step": 3972 }, { "epoch": 1.9812167553191489, "grad_norm": 0.40719354152679443, "learning_rate": 3.1191371001903925e-06, "loss": 0.3875, "step": 3973 }, { "epoch": 1.981715425531915, "grad_norm": 0.4666401445865631, "learning_rate": 3.116448666455454e-06, "loss": 0.3965, "step": 3974 }, { "epoch": 1.982214095744681, "grad_norm": 0.4984409213066101, "learning_rate": 3.113760867175976e-06, "loss": 0.4117, "step": 3975 }, { "epoch": 1.9827127659574468, "grad_norm": 0.4865085780620575, "learning_rate": 3.1110737032573178e-06, "loss": 0.3099, "step": 3976 }, { "epoch": 1.9832114361702127, "grad_norm": 0.48120880126953125, "learning_rate": 3.1083871756046235e-06, "loss": 0.3848, "step": 3977 }, { "epoch": 1.9837101063829787, "grad_norm": 0.4518281817436218, "learning_rate": 3.105701285122823e-06, "loss": 0.3345, "step": 3978 }, { "epoch": 1.9842087765957448, "grad_norm": 0.44199517369270325, "learning_rate": 3.1030160327166313e-06, "loss": 0.3565, "step": 3979 }, { "epoch": 1.9847074468085106, "grad_norm": 0.4524848461151123, "learning_rate": 3.10033141929055e-06, "loss": 0.3829, "step": 3980 }, { "epoch": 1.9852061170212765, "grad_norm": 0.4999317526817322, "learning_rate": 3.0976474457488612e-06, "loss": 0.3936, "step": 3981 }, { "epoch": 1.9857047872340425, "grad_norm": 0.4185335636138916, "learning_rate": 3.094964112995639e-06, "loss": 0.3146, "step": 3982 }, { "epoch": 1.9862034574468086, "grad_norm": 0.5211443305015564, "learning_rate": 3.092281421934733e-06, "loss": 0.4445, "step": 3983 }, { "epoch": 1.9867021276595744, "grad_norm": 0.45424139499664307, "learning_rate": 3.089599373469785e-06, "loss": 0.3027, "step": 3984 }, { "epoch": 1.9872007978723403, "grad_norm": 0.4982874095439911, "learning_rate": 3.0869179685042116e-06, "loss": 0.3728, "step": 3985 }, { "epoch": 1.9876994680851063, "grad_norm": 0.42323318123817444, "learning_rate": 3.0842372079412218e-06, "loss": 0.4164, "step": 3986 }, { "epoch": 1.9881981382978724, "grad_norm": 0.4153226315975189, "learning_rate": 3.0815570926838e-06, "loss": 0.4078, "step": 3987 }, { "epoch": 1.9886968085106385, "grad_norm": 0.4892559051513672, "learning_rate": 3.0788776236347183e-06, "loss": 0.3824, "step": 3988 }, { "epoch": 1.9891954787234043, "grad_norm": 0.4838097393512726, "learning_rate": 3.076198801696527e-06, "loss": 0.3969, "step": 3989 }, { "epoch": 1.9896941489361701, "grad_norm": 0.45815300941467285, "learning_rate": 3.0735206277715634e-06, "loss": 0.3749, "step": 3990 }, { "epoch": 1.9901928191489362, "grad_norm": 0.4628193974494934, "learning_rate": 3.0708431027619423e-06, "loss": 0.3869, "step": 3991 }, { "epoch": 1.9906914893617023, "grad_norm": 0.4380626678466797, "learning_rate": 3.068166227569558e-06, "loss": 0.3919, "step": 3992 }, { "epoch": 1.991190159574468, "grad_norm": 0.4273108243942261, "learning_rate": 3.065490003096094e-06, "loss": 0.3829, "step": 3993 }, { "epoch": 1.991688829787234, "grad_norm": 0.4283585250377655, "learning_rate": 3.0628144302430076e-06, "loss": 0.3592, "step": 3994 }, { "epoch": 1.9921875, "grad_norm": 0.4557812809944153, "learning_rate": 3.060139509911539e-06, "loss": 0.3867, "step": 3995 }, { "epoch": 1.992686170212766, "grad_norm": 0.47466573119163513, "learning_rate": 3.0574652430027086e-06, "loss": 0.3799, "step": 3996 }, { "epoch": 1.993184840425532, "grad_norm": 0.48478612303733826, "learning_rate": 3.0547916304173156e-06, "loss": 0.4182, "step": 3997 }, { "epoch": 1.9936835106382977, "grad_norm": 0.44406619668006897, "learning_rate": 3.0521186730559404e-06, "loss": 0.3241, "step": 3998 }, { "epoch": 1.9941821808510638, "grad_norm": 0.43927183747291565, "learning_rate": 3.049446371818944e-06, "loss": 0.3926, "step": 3999 }, { "epoch": 1.9946808510638299, "grad_norm": 0.4371697008609772, "learning_rate": 3.0467747276064617e-06, "loss": 0.3356, "step": 4000 }, { "epoch": 1.9951795212765957, "grad_norm": 0.48549169301986694, "learning_rate": 3.04410374131841e-06, "loss": 0.4832, "step": 4001 }, { "epoch": 1.9956781914893615, "grad_norm": 0.43378058075904846, "learning_rate": 3.0414334138544866e-06, "loss": 0.4048, "step": 4002 }, { "epoch": 1.9961768617021276, "grad_norm": 0.4992956817150116, "learning_rate": 3.0387637461141606e-06, "loss": 0.3755, "step": 4003 }, { "epoch": 1.9966755319148937, "grad_norm": 0.4508220851421356, "learning_rate": 3.036094738996688e-06, "loss": 0.3482, "step": 4004 }, { "epoch": 1.9971742021276597, "grad_norm": 0.4571388065814972, "learning_rate": 3.033426393401091e-06, "loss": 0.3843, "step": 4005 }, { "epoch": 1.9976728723404256, "grad_norm": 0.44321224093437195, "learning_rate": 3.030758710226182e-06, "loss": 0.3838, "step": 4006 }, { "epoch": 1.9981715425531914, "grad_norm": 0.4206823706626892, "learning_rate": 3.0280916903705375e-06, "loss": 0.3589, "step": 4007 }, { "epoch": 1.9986702127659575, "grad_norm": 0.4471704363822937, "learning_rate": 3.0254253347325214e-06, "loss": 0.3777, "step": 4008 }, { "epoch": 1.9991688829787235, "grad_norm": 0.4933423399925232, "learning_rate": 3.0227596442102647e-06, "loss": 0.3895, "step": 4009 }, { "epoch": 1.9996675531914894, "grad_norm": 0.4366466999053955, "learning_rate": 3.0200946197016827e-06, "loss": 0.3779, "step": 4010 }, { "epoch": 2.000166223404255, "grad_norm": 1.1386834383010864, "learning_rate": 3.0174302621044595e-06, "loss": 0.5018, "step": 4011 }, { "epoch": 2.0006648936170213, "grad_norm": 0.46889933943748474, "learning_rate": 3.0147665723160574e-06, "loss": 0.3755, "step": 4012 }, { "epoch": 2.0011635638297873, "grad_norm": 0.46176406741142273, "learning_rate": 3.012103551233715e-06, "loss": 0.3231, "step": 4013 }, { "epoch": 2.0016622340425534, "grad_norm": 0.4137144386768341, "learning_rate": 3.009441199754442e-06, "loss": 0.3877, "step": 4014 }, { "epoch": 2.002160904255319, "grad_norm": 0.42991843819618225, "learning_rate": 3.0067795187750294e-06, "loss": 0.326, "step": 4015 }, { "epoch": 2.002659574468085, "grad_norm": 0.44340720772743225, "learning_rate": 3.004118509192032e-06, "loss": 0.3346, "step": 4016 }, { "epoch": 2.003158244680851, "grad_norm": 0.4438060224056244, "learning_rate": 3.001458171901791e-06, "loss": 0.3741, "step": 4017 }, { "epoch": 2.003656914893617, "grad_norm": 0.4065329134464264, "learning_rate": 2.9987985078004078e-06, "loss": 0.3311, "step": 4018 }, { "epoch": 2.004155585106383, "grad_norm": 0.43356409668922424, "learning_rate": 2.9961395177837694e-06, "loss": 0.3954, "step": 4019 }, { "epoch": 2.004654255319149, "grad_norm": 0.4584113657474518, "learning_rate": 2.993481202747525e-06, "loss": 0.3431, "step": 4020 }, { "epoch": 2.005152925531915, "grad_norm": 0.4528484046459198, "learning_rate": 2.9908235635871063e-06, "loss": 0.3324, "step": 4021 }, { "epoch": 2.005651595744681, "grad_norm": 0.4515042304992676, "learning_rate": 2.9881666011977097e-06, "loss": 0.3146, "step": 4022 }, { "epoch": 2.0061502659574466, "grad_norm": 0.42528262734413147, "learning_rate": 2.9855103164743048e-06, "loss": 0.3148, "step": 4023 }, { "epoch": 2.0066489361702127, "grad_norm": 0.47979018092155457, "learning_rate": 2.982854710311639e-06, "loss": 0.4015, "step": 4024 }, { "epoch": 2.0071476063829787, "grad_norm": 0.4139260947704315, "learning_rate": 2.980199783604223e-06, "loss": 0.3315, "step": 4025 }, { "epoch": 2.007646276595745, "grad_norm": 0.3894333839416504, "learning_rate": 2.9775455372463445e-06, "loss": 0.3266, "step": 4026 }, { "epoch": 2.008144946808511, "grad_norm": 0.475291907787323, "learning_rate": 2.974891972132058e-06, "loss": 0.3831, "step": 4027 }, { "epoch": 2.0086436170212765, "grad_norm": 0.4824581444263458, "learning_rate": 2.9722390891551933e-06, "loss": 0.3856, "step": 4028 }, { "epoch": 2.0091422872340425, "grad_norm": 0.44162726402282715, "learning_rate": 2.969586889209344e-06, "loss": 0.2951, "step": 4029 }, { "epoch": 2.0096409574468086, "grad_norm": 0.546813428401947, "learning_rate": 2.96693537318788e-06, "loss": 0.3691, "step": 4030 }, { "epoch": 2.0101396276595747, "grad_norm": 0.4077206254005432, "learning_rate": 2.964284541983936e-06, "loss": 0.3245, "step": 4031 }, { "epoch": 2.0106382978723403, "grad_norm": 0.3849606215953827, "learning_rate": 2.9616343964904194e-06, "loss": 0.3327, "step": 4032 }, { "epoch": 2.0111369680851063, "grad_norm": 0.4326135218143463, "learning_rate": 2.9589849376000062e-06, "loss": 0.3921, "step": 4033 }, { "epoch": 2.0116356382978724, "grad_norm": 0.4382765591144562, "learning_rate": 2.956336166205137e-06, "loss": 0.3147, "step": 4034 }, { "epoch": 2.0121343085106385, "grad_norm": 0.44101718068122864, "learning_rate": 2.9536880831980273e-06, "loss": 0.3318, "step": 4035 }, { "epoch": 2.012632978723404, "grad_norm": 0.43787360191345215, "learning_rate": 2.9510406894706546e-06, "loss": 0.2712, "step": 4036 }, { "epoch": 2.01313164893617, "grad_norm": 0.47856488823890686, "learning_rate": 2.948393985914769e-06, "loss": 0.4194, "step": 4037 }, { "epoch": 2.013630319148936, "grad_norm": 0.49233272671699524, "learning_rate": 2.9457479734218854e-06, "loss": 0.3591, "step": 4038 }, { "epoch": 2.0141289893617023, "grad_norm": 0.47044292092323303, "learning_rate": 2.9431026528832874e-06, "loss": 0.3604, "step": 4039 }, { "epoch": 2.014627659574468, "grad_norm": 0.412157267332077, "learning_rate": 2.9404580251900227e-06, "loss": 0.2948, "step": 4040 }, { "epoch": 2.015126329787234, "grad_norm": 0.41864287853240967, "learning_rate": 2.9378140912329112e-06, "loss": 0.3585, "step": 4041 }, { "epoch": 2.015625, "grad_norm": 0.39932966232299805, "learning_rate": 2.935170851902532e-06, "loss": 0.2721, "step": 4042 }, { "epoch": 2.016123670212766, "grad_norm": 0.45460301637649536, "learning_rate": 2.9325283080892375e-06, "loss": 0.3411, "step": 4043 }, { "epoch": 2.016622340425532, "grad_norm": 0.5510486960411072, "learning_rate": 2.9298864606831402e-06, "loss": 0.3835, "step": 4044 }, { "epoch": 2.0171210106382977, "grad_norm": 0.44924601912498474, "learning_rate": 2.9272453105741184e-06, "loss": 0.3406, "step": 4045 }, { "epoch": 2.017619680851064, "grad_norm": 0.4490600824356079, "learning_rate": 2.9246048586518206e-06, "loss": 0.3261, "step": 4046 }, { "epoch": 2.01811835106383, "grad_norm": 0.4479586184024811, "learning_rate": 2.921965105805653e-06, "loss": 0.3518, "step": 4047 }, { "epoch": 2.018617021276596, "grad_norm": 0.44005125761032104, "learning_rate": 2.9193260529247925e-06, "loss": 0.3485, "step": 4048 }, { "epoch": 2.0191156914893615, "grad_norm": 0.4185522496700287, "learning_rate": 2.9166877008981775e-06, "loss": 0.2913, "step": 4049 }, { "epoch": 2.0196143617021276, "grad_norm": 0.47455355525016785, "learning_rate": 2.9140500506145114e-06, "loss": 0.332, "step": 4050 }, { "epoch": 2.0201130319148937, "grad_norm": 0.42195427417755127, "learning_rate": 2.911413102962257e-06, "loss": 0.3039, "step": 4051 }, { "epoch": 2.0206117021276597, "grad_norm": 0.4582855999469757, "learning_rate": 2.908776858829648e-06, "loss": 0.3418, "step": 4052 }, { "epoch": 2.0211103723404253, "grad_norm": 0.4555753171443939, "learning_rate": 2.9061413191046726e-06, "loss": 0.378, "step": 4053 }, { "epoch": 2.0216090425531914, "grad_norm": 0.48843085765838623, "learning_rate": 2.9035064846750904e-06, "loss": 0.3095, "step": 4054 }, { "epoch": 2.0221077127659575, "grad_norm": 0.40765807032585144, "learning_rate": 2.9008723564284154e-06, "loss": 0.359, "step": 4055 }, { "epoch": 2.0226063829787235, "grad_norm": 0.4414226710796356, "learning_rate": 2.8982389352519304e-06, "loss": 0.3474, "step": 4056 }, { "epoch": 2.023105053191489, "grad_norm": 0.49452707171440125, "learning_rate": 2.895606222032674e-06, "loss": 0.3456, "step": 4057 }, { "epoch": 2.023603723404255, "grad_norm": 0.40308648347854614, "learning_rate": 2.8929742176574526e-06, "loss": 0.2888, "step": 4058 }, { "epoch": 2.0241023936170213, "grad_norm": 0.402471661567688, "learning_rate": 2.890342923012827e-06, "loss": 0.3403, "step": 4059 }, { "epoch": 2.0246010638297873, "grad_norm": 0.4739495515823364, "learning_rate": 2.887712338985127e-06, "loss": 0.3651, "step": 4060 }, { "epoch": 2.0250997340425534, "grad_norm": 0.4481804072856903, "learning_rate": 2.885082466460434e-06, "loss": 0.3597, "step": 4061 }, { "epoch": 2.025598404255319, "grad_norm": 0.39458444714546204, "learning_rate": 2.882453306324595e-06, "loss": 0.2901, "step": 4062 }, { "epoch": 2.026097074468085, "grad_norm": 0.4238298535346985, "learning_rate": 2.879824859463221e-06, "loss": 0.3486, "step": 4063 }, { "epoch": 2.026595744680851, "grad_norm": 0.4135909974575043, "learning_rate": 2.877197126761673e-06, "loss": 0.3859, "step": 4064 }, { "epoch": 2.027094414893617, "grad_norm": 0.4036499559879303, "learning_rate": 2.8745701091050794e-06, "loss": 0.2647, "step": 4065 }, { "epoch": 2.027593085106383, "grad_norm": 0.43845197558403015, "learning_rate": 2.871943807378324e-06, "loss": 0.3503, "step": 4066 }, { "epoch": 2.028091755319149, "grad_norm": 0.44022223353385925, "learning_rate": 2.869318222466048e-06, "loss": 0.3154, "step": 4067 }, { "epoch": 2.028590425531915, "grad_norm": 0.5093199014663696, "learning_rate": 2.866693355252658e-06, "loss": 0.3886, "step": 4068 }, { "epoch": 2.029089095744681, "grad_norm": 0.4244902431964874, "learning_rate": 2.8640692066223097e-06, "loss": 0.3537, "step": 4069 }, { "epoch": 2.0295877659574466, "grad_norm": 0.4233805239200592, "learning_rate": 2.8614457774589256e-06, "loss": 0.3193, "step": 4070 }, { "epoch": 2.0300864361702127, "grad_norm": 0.4426143169403076, "learning_rate": 2.858823068646177e-06, "loss": 0.3746, "step": 4071 }, { "epoch": 2.0305851063829787, "grad_norm": 0.3856036365032196, "learning_rate": 2.8562010810675025e-06, "loss": 0.36, "step": 4072 }, { "epoch": 2.031083776595745, "grad_norm": 0.38416358828544617, "learning_rate": 2.8535798156060866e-06, "loss": 0.3773, "step": 4073 }, { "epoch": 2.031582446808511, "grad_norm": 0.42154330015182495, "learning_rate": 2.8509592731448827e-06, "loss": 0.3577, "step": 4074 }, { "epoch": 2.0320811170212765, "grad_norm": 0.43457263708114624, "learning_rate": 2.8483394545665906e-06, "loss": 0.3327, "step": 4075 }, { "epoch": 2.0325797872340425, "grad_norm": 0.42967209219932556, "learning_rate": 2.8457203607536686e-06, "loss": 0.3241, "step": 4076 }, { "epoch": 2.0330784574468086, "grad_norm": 0.45895183086395264, "learning_rate": 2.8431019925883363e-06, "loss": 0.3188, "step": 4077 }, { "epoch": 2.0335771276595747, "grad_norm": 0.4607016444206238, "learning_rate": 2.8404843509525605e-06, "loss": 0.3755, "step": 4078 }, { "epoch": 2.0340757978723403, "grad_norm": 0.4423665404319763, "learning_rate": 2.8378674367280723e-06, "loss": 0.3245, "step": 4079 }, { "epoch": 2.0345744680851063, "grad_norm": 0.47331297397613525, "learning_rate": 2.8352512507963488e-06, "loss": 0.3339, "step": 4080 }, { "epoch": 2.0350731382978724, "grad_norm": 0.40828952193260193, "learning_rate": 2.8326357940386296e-06, "loss": 0.3554, "step": 4081 }, { "epoch": 2.0355718085106385, "grad_norm": 0.46144458651542664, "learning_rate": 2.8300210673359022e-06, "loss": 0.3806, "step": 4082 }, { "epoch": 2.036070478723404, "grad_norm": 0.41099295020103455, "learning_rate": 2.827407071568915e-06, "loss": 0.3197, "step": 4083 }, { "epoch": 2.03656914893617, "grad_norm": 0.4208088517189026, "learning_rate": 2.8247938076181614e-06, "loss": 0.3325, "step": 4084 }, { "epoch": 2.037067819148936, "grad_norm": 0.4675513803958893, "learning_rate": 2.822181276363898e-06, "loss": 0.3567, "step": 4085 }, { "epoch": 2.0375664893617023, "grad_norm": 0.4323137402534485, "learning_rate": 2.819569478686128e-06, "loss": 0.3327, "step": 4086 }, { "epoch": 2.038065159574468, "grad_norm": 0.46098437905311584, "learning_rate": 2.816958415464608e-06, "loss": 0.3281, "step": 4087 }, { "epoch": 2.038563829787234, "grad_norm": 0.4422532320022583, "learning_rate": 2.814348087578851e-06, "loss": 0.3571, "step": 4088 }, { "epoch": 2.0390625, "grad_norm": 0.438358873128891, "learning_rate": 2.8117384959081164e-06, "loss": 0.3715, "step": 4089 }, { "epoch": 2.039561170212766, "grad_norm": 0.421798974275589, "learning_rate": 2.809129641331424e-06, "loss": 0.2929, "step": 4090 }, { "epoch": 2.040059840425532, "grad_norm": 0.45438072085380554, "learning_rate": 2.806521524727535e-06, "loss": 0.3842, "step": 4091 }, { "epoch": 2.0405585106382977, "grad_norm": 0.3874478340148926, "learning_rate": 2.8039141469749733e-06, "loss": 0.2583, "step": 4092 }, { "epoch": 2.041057180851064, "grad_norm": 0.47128069400787354, "learning_rate": 2.8013075089520014e-06, "loss": 0.3683, "step": 4093 }, { "epoch": 2.04155585106383, "grad_norm": 0.4251592457294464, "learning_rate": 2.798701611536644e-06, "loss": 0.3699, "step": 4094 }, { "epoch": 2.042054521276596, "grad_norm": 0.4689197540283203, "learning_rate": 2.7960964556066723e-06, "loss": 0.3353, "step": 4095 }, { "epoch": 2.0425531914893615, "grad_norm": 0.3996008634567261, "learning_rate": 2.7934920420396028e-06, "loss": 0.3674, "step": 4096 }, { "epoch": 2.0430518617021276, "grad_norm": 0.4319206774234772, "learning_rate": 2.790888371712711e-06, "loss": 0.3976, "step": 4097 }, { "epoch": 2.0435505319148937, "grad_norm": 0.4150974452495575, "learning_rate": 2.7882854455030136e-06, "loss": 0.3218, "step": 4098 }, { "epoch": 2.0440492021276597, "grad_norm": 0.44509443640708923, "learning_rate": 2.785683264287283e-06, "loss": 0.2743, "step": 4099 }, { "epoch": 2.0445478723404253, "grad_norm": 0.4238615334033966, "learning_rate": 2.7830818289420343e-06, "loss": 0.3811, "step": 4100 }, { "epoch": 2.0450465425531914, "grad_norm": 0.399382621049881, "learning_rate": 2.7804811403435393e-06, "loss": 0.3186, "step": 4101 }, { "epoch": 2.0455452127659575, "grad_norm": 0.4306763708591461, "learning_rate": 2.7778811993678103e-06, "loss": 0.3382, "step": 4102 }, { "epoch": 2.0460438829787235, "grad_norm": 0.40267491340637207, "learning_rate": 2.775282006890615e-06, "loss": 0.3479, "step": 4103 }, { "epoch": 2.046542553191489, "grad_norm": 0.4109043776988983, "learning_rate": 2.7726835637874615e-06, "loss": 0.3346, "step": 4104 }, { "epoch": 2.047041223404255, "grad_norm": 0.44893673062324524, "learning_rate": 2.7700858709336143e-06, "loss": 0.3615, "step": 4105 }, { "epoch": 2.0475398936170213, "grad_norm": 0.4270349442958832, "learning_rate": 2.7674889292040745e-06, "loss": 0.3074, "step": 4106 }, { "epoch": 2.0480385638297873, "grad_norm": 0.4633520245552063, "learning_rate": 2.7648927394736015e-06, "loss": 0.3407, "step": 4107 }, { "epoch": 2.0485372340425534, "grad_norm": 0.5762377381324768, "learning_rate": 2.7622973026166933e-06, "loss": 0.3467, "step": 4108 }, { "epoch": 2.049035904255319, "grad_norm": 0.46060043573379517, "learning_rate": 2.759702619507595e-06, "loss": 0.3053, "step": 4109 }, { "epoch": 2.049534574468085, "grad_norm": 0.4493368864059448, "learning_rate": 2.757108691020304e-06, "loss": 0.3653, "step": 4110 }, { "epoch": 2.050033244680851, "grad_norm": 0.47001081705093384, "learning_rate": 2.7545155180285537e-06, "loss": 0.3591, "step": 4111 }, { "epoch": 2.050531914893617, "grad_norm": 0.42628365755081177, "learning_rate": 2.7519231014058344e-06, "loss": 0.325, "step": 4112 }, { "epoch": 2.051030585106383, "grad_norm": 0.43404096364974976, "learning_rate": 2.749331442025371e-06, "loss": 0.3315, "step": 4113 }, { "epoch": 2.051529255319149, "grad_norm": 0.44153639674186707, "learning_rate": 2.7467405407601426e-06, "loss": 0.3629, "step": 4114 }, { "epoch": 2.052027925531915, "grad_norm": 0.408420592546463, "learning_rate": 2.7441503984828636e-06, "loss": 0.2644, "step": 4115 }, { "epoch": 2.052526595744681, "grad_norm": 0.4777335822582245, "learning_rate": 2.7415610160660023e-06, "loss": 0.3305, "step": 4116 }, { "epoch": 2.0530252659574466, "grad_norm": 0.4719204604625702, "learning_rate": 2.7389723943817615e-06, "loss": 0.3351, "step": 4117 }, { "epoch": 2.0535239361702127, "grad_norm": 0.4356273114681244, "learning_rate": 2.736384534302098e-06, "loss": 0.3059, "step": 4118 }, { "epoch": 2.0540226063829787, "grad_norm": 0.4685983657836914, "learning_rate": 2.7337974366987027e-06, "loss": 0.3212, "step": 4119 }, { "epoch": 2.054521276595745, "grad_norm": 0.431197851896286, "learning_rate": 2.7312111024430127e-06, "loss": 0.3258, "step": 4120 }, { "epoch": 2.055019946808511, "grad_norm": 0.4753769040107727, "learning_rate": 2.7286255324062127e-06, "loss": 0.3376, "step": 4121 }, { "epoch": 2.0555186170212765, "grad_norm": 0.4233231246471405, "learning_rate": 2.7260407274592226e-06, "loss": 0.327, "step": 4122 }, { "epoch": 2.0560172872340425, "grad_norm": 0.46914851665496826, "learning_rate": 2.7234566884727107e-06, "loss": 0.3728, "step": 4123 }, { "epoch": 2.0565159574468086, "grad_norm": 0.49615392088890076, "learning_rate": 2.720873416317083e-06, "loss": 0.3401, "step": 4124 }, { "epoch": 2.0570146276595747, "grad_norm": 0.4575982093811035, "learning_rate": 2.7182909118624913e-06, "loss": 0.2956, "step": 4125 }, { "epoch": 2.0575132978723403, "grad_norm": 0.4393768608570099, "learning_rate": 2.7157091759788224e-06, "loss": 0.3314, "step": 4126 }, { "epoch": 2.0580119680851063, "grad_norm": 0.4905119240283966, "learning_rate": 2.7131282095357137e-06, "loss": 0.3481, "step": 4127 }, { "epoch": 2.0585106382978724, "grad_norm": 0.4703672230243683, "learning_rate": 2.710548013402532e-06, "loss": 0.3436, "step": 4128 }, { "epoch": 2.0590093085106385, "grad_norm": 0.5397093296051025, "learning_rate": 2.7079685884483957e-06, "loss": 0.3071, "step": 4129 }, { "epoch": 2.059507978723404, "grad_norm": 0.5126345753669739, "learning_rate": 2.7053899355421544e-06, "loss": 0.3724, "step": 4130 }, { "epoch": 2.06000664893617, "grad_norm": 0.431681752204895, "learning_rate": 2.7028120555524052e-06, "loss": 0.3235, "step": 4131 }, { "epoch": 2.060505319148936, "grad_norm": 0.42472678422927856, "learning_rate": 2.700234949347478e-06, "loss": 0.3569, "step": 4132 }, { "epoch": 2.0610039893617023, "grad_norm": 0.4484044015407562, "learning_rate": 2.697658617795448e-06, "loss": 0.3574, "step": 4133 }, { "epoch": 2.061502659574468, "grad_norm": 0.428227037191391, "learning_rate": 2.6950830617641238e-06, "loss": 0.3085, "step": 4134 }, { "epoch": 2.062001329787234, "grad_norm": 0.3765662908554077, "learning_rate": 2.6925082821210578e-06, "loss": 0.2607, "step": 4135 }, { "epoch": 2.0625, "grad_norm": 0.5891051888465881, "learning_rate": 2.6899342797335396e-06, "loss": 0.4725, "step": 4136 }, { "epoch": 2.062998670212766, "grad_norm": 0.40098172426223755, "learning_rate": 2.6873610554685936e-06, "loss": 0.3096, "step": 4137 }, { "epoch": 2.063497340425532, "grad_norm": 0.44930514693260193, "learning_rate": 2.684788610192988e-06, "loss": 0.3546, "step": 4138 }, { "epoch": 2.0639960106382977, "grad_norm": 0.4279641807079315, "learning_rate": 2.6822169447732216e-06, "loss": 0.3146, "step": 4139 }, { "epoch": 2.064494680851064, "grad_norm": 0.4372367858886719, "learning_rate": 2.679646060075538e-06, "loss": 0.3272, "step": 4140 }, { "epoch": 2.06499335106383, "grad_norm": 0.42562124133110046, "learning_rate": 2.677075956965912e-06, "loss": 0.3504, "step": 4141 }, { "epoch": 2.065492021276596, "grad_norm": 0.4282964766025543, "learning_rate": 2.674506636310056e-06, "loss": 0.3133, "step": 4142 }, { "epoch": 2.0659906914893615, "grad_norm": 0.44337841868400574, "learning_rate": 2.6719380989734222e-06, "loss": 0.3853, "step": 4143 }, { "epoch": 2.0664893617021276, "grad_norm": 0.41779395937919617, "learning_rate": 2.669370345821195e-06, "loss": 0.3216, "step": 4144 }, { "epoch": 2.0669880319148937, "grad_norm": 0.48933425545692444, "learning_rate": 2.6668033777182982e-06, "loss": 0.4086, "step": 4145 }, { "epoch": 2.0674867021276597, "grad_norm": 0.4054243564605713, "learning_rate": 2.6642371955293865e-06, "loss": 0.2901, "step": 4146 }, { "epoch": 2.0679853723404253, "grad_norm": 0.41999033093452454, "learning_rate": 2.661671800118858e-06, "loss": 0.3466, "step": 4147 }, { "epoch": 2.0684840425531914, "grad_norm": 0.4263971149921417, "learning_rate": 2.6591071923508338e-06, "loss": 0.3246, "step": 4148 }, { "epoch": 2.0689827127659575, "grad_norm": 0.4509006440639496, "learning_rate": 2.656543373089182e-06, "loss": 0.3309, "step": 4149 }, { "epoch": 2.0694813829787235, "grad_norm": 0.4279864430427551, "learning_rate": 2.653980343197498e-06, "loss": 0.3523, "step": 4150 }, { "epoch": 2.0699800531914896, "grad_norm": 0.45799121260643005, "learning_rate": 2.65141810353911e-06, "loss": 0.3592, "step": 4151 }, { "epoch": 2.070478723404255, "grad_norm": 0.41752108931541443, "learning_rate": 2.648856654977087e-06, "loss": 0.3267, "step": 4152 }, { "epoch": 2.0709773936170213, "grad_norm": 0.4454391598701477, "learning_rate": 2.646295998374223e-06, "loss": 0.3741, "step": 4153 }, { "epoch": 2.0714760638297873, "grad_norm": 0.4359075725078583, "learning_rate": 2.6437361345930546e-06, "loss": 0.3237, "step": 4154 }, { "epoch": 2.0719747340425534, "grad_norm": 0.42169442772865295, "learning_rate": 2.6411770644958417e-06, "loss": 0.3425, "step": 4155 }, { "epoch": 2.072473404255319, "grad_norm": 0.38385990262031555, "learning_rate": 2.6386187889445857e-06, "loss": 0.3332, "step": 4156 }, { "epoch": 2.072972074468085, "grad_norm": 0.4719814360141754, "learning_rate": 2.636061308801012e-06, "loss": 0.3433, "step": 4157 }, { "epoch": 2.073470744680851, "grad_norm": 0.45068782567977905, "learning_rate": 2.633504624926586e-06, "loss": 0.3318, "step": 4158 }, { "epoch": 2.073969414893617, "grad_norm": 0.4057500958442688, "learning_rate": 2.6309487381824975e-06, "loss": 0.3219, "step": 4159 }, { "epoch": 2.074468085106383, "grad_norm": 0.440489262342453, "learning_rate": 2.628393649429676e-06, "loss": 0.4203, "step": 4160 }, { "epoch": 2.074966755319149, "grad_norm": 0.3984268307685852, "learning_rate": 2.6258393595287744e-06, "loss": 0.308, "step": 4161 }, { "epoch": 2.075465425531915, "grad_norm": 0.4301084280014038, "learning_rate": 2.6232858693401784e-06, "loss": 0.376, "step": 4162 }, { "epoch": 2.075964095744681, "grad_norm": 0.38088804483413696, "learning_rate": 2.62073317972401e-06, "loss": 0.3236, "step": 4163 }, { "epoch": 2.0764627659574466, "grad_norm": 0.433200865983963, "learning_rate": 2.618181291540113e-06, "loss": 0.3543, "step": 4164 }, { "epoch": 2.0769614361702127, "grad_norm": 0.5148292183876038, "learning_rate": 2.6156302056480685e-06, "loss": 0.3497, "step": 4165 }, { "epoch": 2.0774601063829787, "grad_norm": 0.42599308490753174, "learning_rate": 2.6130799229071814e-06, "loss": 0.3136, "step": 4166 }, { "epoch": 2.077958776595745, "grad_norm": 0.3904781639575958, "learning_rate": 2.6105304441764913e-06, "loss": 0.3284, "step": 4167 }, { "epoch": 2.0784574468085104, "grad_norm": 0.4601826071739197, "learning_rate": 2.607981770314766e-06, "loss": 0.409, "step": 4168 }, { "epoch": 2.0789561170212765, "grad_norm": 0.4040687680244446, "learning_rate": 2.6054339021804964e-06, "loss": 0.2513, "step": 4169 }, { "epoch": 2.0794547872340425, "grad_norm": 0.4636107087135315, "learning_rate": 2.602886840631911e-06, "loss": 0.3526, "step": 4170 }, { "epoch": 2.0799534574468086, "grad_norm": 0.3701099753379822, "learning_rate": 2.6003405865269582e-06, "loss": 0.3099, "step": 4171 }, { "epoch": 2.0804521276595747, "grad_norm": 0.39532920718193054, "learning_rate": 2.5977951407233213e-06, "loss": 0.3084, "step": 4172 }, { "epoch": 2.0809507978723403, "grad_norm": 0.4191504120826721, "learning_rate": 2.5952505040784064e-06, "loss": 0.3442, "step": 4173 }, { "epoch": 2.0814494680851063, "grad_norm": 0.4363122582435608, "learning_rate": 2.5927066774493507e-06, "loss": 0.3678, "step": 4174 }, { "epoch": 2.0819481382978724, "grad_norm": 0.40957528352737427, "learning_rate": 2.590163661693015e-06, "loss": 0.3333, "step": 4175 }, { "epoch": 2.0824468085106385, "grad_norm": 0.4100664556026459, "learning_rate": 2.5876214576659896e-06, "loss": 0.3418, "step": 4176 }, { "epoch": 2.082945478723404, "grad_norm": 0.4938824474811554, "learning_rate": 2.58508006622459e-06, "loss": 0.4123, "step": 4177 }, { "epoch": 2.08344414893617, "grad_norm": 0.4010179042816162, "learning_rate": 2.58253948822486e-06, "loss": 0.2876, "step": 4178 }, { "epoch": 2.083942819148936, "grad_norm": 0.5195074677467346, "learning_rate": 2.5799997245225657e-06, "loss": 0.3538, "step": 4179 }, { "epoch": 2.0844414893617023, "grad_norm": 0.45879343152046204, "learning_rate": 2.577460775973204e-06, "loss": 0.2946, "step": 4180 }, { "epoch": 2.084940159574468, "grad_norm": 0.4722862243652344, "learning_rate": 2.574922643431991e-06, "loss": 0.4011, "step": 4181 }, { "epoch": 2.085438829787234, "grad_norm": 0.44623786211013794, "learning_rate": 2.5723853277538745e-06, "loss": 0.3042, "step": 4182 }, { "epoch": 2.0859375, "grad_norm": 0.49725547432899475, "learning_rate": 2.5698488297935224e-06, "loss": 0.3848, "step": 4183 }, { "epoch": 2.086436170212766, "grad_norm": 0.3977830708026886, "learning_rate": 2.5673131504053273e-06, "loss": 0.3017, "step": 4184 }, { "epoch": 2.086934840425532, "grad_norm": 0.45368656516075134, "learning_rate": 2.5647782904434105e-06, "loss": 0.3662, "step": 4185 }, { "epoch": 2.0874335106382977, "grad_norm": 0.4591296911239624, "learning_rate": 2.5622442507616114e-06, "loss": 0.332, "step": 4186 }, { "epoch": 2.087932180851064, "grad_norm": 0.4736798405647278, "learning_rate": 2.559711032213499e-06, "loss": 0.3703, "step": 4187 }, { "epoch": 2.08843085106383, "grad_norm": 0.3910393714904785, "learning_rate": 2.557178635652359e-06, "loss": 0.3091, "step": 4188 }, { "epoch": 2.088929521276596, "grad_norm": 0.4780357778072357, "learning_rate": 2.554647061931208e-06, "loss": 0.3145, "step": 4189 }, { "epoch": 2.0894281914893615, "grad_norm": 0.42473074793815613, "learning_rate": 2.552116311902777e-06, "loss": 0.3023, "step": 4190 }, { "epoch": 2.0899268617021276, "grad_norm": 0.4256925582885742, "learning_rate": 2.549586386419528e-06, "loss": 0.3653, "step": 4191 }, { "epoch": 2.0904255319148937, "grad_norm": 0.48633021116256714, "learning_rate": 2.5470572863336374e-06, "loss": 0.3949, "step": 4192 }, { "epoch": 2.0909242021276597, "grad_norm": 0.4727499186992645, "learning_rate": 2.544529012497011e-06, "loss": 0.3966, "step": 4193 }, { "epoch": 2.0914228723404253, "grad_norm": 0.3663601279258728, "learning_rate": 2.5420015657612705e-06, "loss": 0.2522, "step": 4194 }, { "epoch": 2.0919215425531914, "grad_norm": 0.43614962697029114, "learning_rate": 2.53947494697776e-06, "loss": 0.3481, "step": 4195 }, { "epoch": 2.0924202127659575, "grad_norm": 0.44035571813583374, "learning_rate": 2.5369491569975492e-06, "loss": 0.3145, "step": 4196 }, { "epoch": 2.0929188829787235, "grad_norm": 0.46680939197540283, "learning_rate": 2.5344241966714213e-06, "loss": 0.3377, "step": 4197 }, { "epoch": 2.0934175531914896, "grad_norm": 0.45562589168548584, "learning_rate": 2.5319000668498873e-06, "loss": 0.3466, "step": 4198 }, { "epoch": 2.093916223404255, "grad_norm": 0.4308147728443146, "learning_rate": 2.5293767683831717e-06, "loss": 0.3531, "step": 4199 }, { "epoch": 2.0944148936170213, "grad_norm": 0.4653606414794922, "learning_rate": 2.5268543021212264e-06, "loss": 0.3155, "step": 4200 }, { "epoch": 2.0949135638297873, "grad_norm": 0.4060400724411011, "learning_rate": 2.524332668913716e-06, "loss": 0.3147, "step": 4201 }, { "epoch": 2.0954122340425534, "grad_norm": 0.470231831073761, "learning_rate": 2.5218118696100292e-06, "loss": 0.3509, "step": 4202 }, { "epoch": 2.095910904255319, "grad_norm": 0.47403010725975037, "learning_rate": 2.51929190505927e-06, "loss": 0.3617, "step": 4203 }, { "epoch": 2.096409574468085, "grad_norm": 0.45942577719688416, "learning_rate": 2.516772776110267e-06, "loss": 0.3241, "step": 4204 }, { "epoch": 2.096908244680851, "grad_norm": 0.44825422763824463, "learning_rate": 2.5142544836115577e-06, "loss": 0.3284, "step": 4205 }, { "epoch": 2.097406914893617, "grad_norm": 0.4779915511608124, "learning_rate": 2.5117370284114082e-06, "loss": 0.3885, "step": 4206 }, { "epoch": 2.097905585106383, "grad_norm": 0.41180500388145447, "learning_rate": 2.5092204113578e-06, "loss": 0.2797, "step": 4207 }, { "epoch": 2.098404255319149, "grad_norm": 0.46342378854751587, "learning_rate": 2.5067046332984248e-06, "loss": 0.4382, "step": 4208 }, { "epoch": 2.098902925531915, "grad_norm": 0.4214823544025421, "learning_rate": 2.504189695080703e-06, "loss": 0.2964, "step": 4209 }, { "epoch": 2.099401595744681, "grad_norm": 0.4614854156970978, "learning_rate": 2.501675597551761e-06, "loss": 0.3789, "step": 4210 }, { "epoch": 2.0999002659574466, "grad_norm": 0.4533722996711731, "learning_rate": 2.499162341558452e-06, "loss": 0.3474, "step": 4211 }, { "epoch": 2.1003989361702127, "grad_norm": 0.4387909770011902, "learning_rate": 2.4966499279473376e-06, "loss": 0.3152, "step": 4212 }, { "epoch": 2.1008976063829787, "grad_norm": 0.3965546786785126, "learning_rate": 2.4941383575647034e-06, "loss": 0.2843, "step": 4213 }, { "epoch": 2.101396276595745, "grad_norm": 0.4417296350002289, "learning_rate": 2.4916276312565417e-06, "loss": 0.3096, "step": 4214 }, { "epoch": 2.1018949468085104, "grad_norm": 0.49729123711586, "learning_rate": 2.489117749868571e-06, "loss": 0.3566, "step": 4215 }, { "epoch": 2.1023936170212765, "grad_norm": 0.39966246485710144, "learning_rate": 2.4866087142462165e-06, "loss": 0.3436, "step": 4216 }, { "epoch": 2.1028922872340425, "grad_norm": 0.45553678274154663, "learning_rate": 2.4841005252346203e-06, "loss": 0.3151, "step": 4217 }, { "epoch": 2.1033909574468086, "grad_norm": 0.5128469467163086, "learning_rate": 2.481593183678645e-06, "loss": 0.3533, "step": 4218 }, { "epoch": 2.1038896276595747, "grad_norm": 0.4254779517650604, "learning_rate": 2.4790866904228595e-06, "loss": 0.3162, "step": 4219 }, { "epoch": 2.1043882978723403, "grad_norm": 0.46225622296333313, "learning_rate": 2.4765810463115546e-06, "loss": 0.4052, "step": 4220 }, { "epoch": 2.1048869680851063, "grad_norm": 0.41395512223243713, "learning_rate": 2.474076252188728e-06, "loss": 0.3422, "step": 4221 }, { "epoch": 2.1053856382978724, "grad_norm": 0.44821301102638245, "learning_rate": 2.4715723088980986e-06, "loss": 0.3454, "step": 4222 }, { "epoch": 2.1058843085106385, "grad_norm": 0.4290139079093933, "learning_rate": 2.4690692172830905e-06, "loss": 0.3575, "step": 4223 }, { "epoch": 2.106382978723404, "grad_norm": 0.39855408668518066, "learning_rate": 2.4665669781868488e-06, "loss": 0.3345, "step": 4224 }, { "epoch": 2.10688164893617, "grad_norm": 0.5021311640739441, "learning_rate": 2.4640655924522266e-06, "loss": 0.3898, "step": 4225 }, { "epoch": 2.107380319148936, "grad_norm": 0.413887083530426, "learning_rate": 2.4615650609217884e-06, "loss": 0.3032, "step": 4226 }, { "epoch": 2.1078789893617023, "grad_norm": 0.43488261103630066, "learning_rate": 2.4590653844378166e-06, "loss": 0.3387, "step": 4227 }, { "epoch": 2.108377659574468, "grad_norm": 0.49369966983795166, "learning_rate": 2.4565665638422987e-06, "loss": 0.363, "step": 4228 }, { "epoch": 2.108876329787234, "grad_norm": 0.4003843665122986, "learning_rate": 2.454068599976942e-06, "loss": 0.284, "step": 4229 }, { "epoch": 2.109375, "grad_norm": 0.48096078634262085, "learning_rate": 2.451571493683156e-06, "loss": 0.356, "step": 4230 }, { "epoch": 2.109873670212766, "grad_norm": 0.49327975511550903, "learning_rate": 2.4490752458020706e-06, "loss": 0.3633, "step": 4231 }, { "epoch": 2.110372340425532, "grad_norm": 0.4766596853733063, "learning_rate": 2.446579857174517e-06, "loss": 0.3724, "step": 4232 }, { "epoch": 2.1108710106382977, "grad_norm": 0.47969257831573486, "learning_rate": 2.444085328641047e-06, "loss": 0.3823, "step": 4233 }, { "epoch": 2.111369680851064, "grad_norm": 0.3811202943325043, "learning_rate": 2.441591661041913e-06, "loss": 0.2725, "step": 4234 }, { "epoch": 2.11186835106383, "grad_norm": 0.4790569543838501, "learning_rate": 2.4390988552170863e-06, "loss": 0.3681, "step": 4235 }, { "epoch": 2.112367021276596, "grad_norm": 0.4385063350200653, "learning_rate": 2.436606912006242e-06, "loss": 0.3196, "step": 4236 }, { "epoch": 2.1128656914893615, "grad_norm": 0.4403355121612549, "learning_rate": 2.4341158322487634e-06, "loss": 0.3459, "step": 4237 }, { "epoch": 2.1133643617021276, "grad_norm": 0.45451733469963074, "learning_rate": 2.4316256167837506e-06, "loss": 0.336, "step": 4238 }, { "epoch": 2.1138630319148937, "grad_norm": 0.4176180362701416, "learning_rate": 2.4291362664500034e-06, "loss": 0.3207, "step": 4239 }, { "epoch": 2.1143617021276597, "grad_norm": 0.4010412096977234, "learning_rate": 2.4266477820860366e-06, "loss": 0.3485, "step": 4240 }, { "epoch": 2.1148603723404253, "grad_norm": 0.44226253032684326, "learning_rate": 2.424160164530074e-06, "loss": 0.3908, "step": 4241 }, { "epoch": 2.1153590425531914, "grad_norm": 0.4782584011554718, "learning_rate": 2.42167341462004e-06, "loss": 0.3392, "step": 4242 }, { "epoch": 2.1158577127659575, "grad_norm": 0.4446498453617096, "learning_rate": 2.4191875331935755e-06, "loss": 0.3272, "step": 4243 }, { "epoch": 2.1163563829787235, "grad_norm": 0.4095752239227295, "learning_rate": 2.416702521088021e-06, "loss": 0.3363, "step": 4244 }, { "epoch": 2.1168550531914896, "grad_norm": 0.43678298592567444, "learning_rate": 2.41421837914043e-06, "loss": 0.3841, "step": 4245 }, { "epoch": 2.117353723404255, "grad_norm": 0.4067336320877075, "learning_rate": 2.4117351081875628e-06, "loss": 0.3223, "step": 4246 }, { "epoch": 2.1178523936170213, "grad_norm": 0.40416377782821655, "learning_rate": 2.409252709065883e-06, "loss": 0.2965, "step": 4247 }, { "epoch": 2.1183510638297873, "grad_norm": 0.49074798822402954, "learning_rate": 2.4067711826115593e-06, "loss": 0.3908, "step": 4248 }, { "epoch": 2.1188497340425534, "grad_norm": 0.4118264615535736, "learning_rate": 2.4042905296604734e-06, "loss": 0.2933, "step": 4249 }, { "epoch": 2.119348404255319, "grad_norm": 0.4297908544540405, "learning_rate": 2.401810751048205e-06, "loss": 0.3247, "step": 4250 }, { "epoch": 2.119847074468085, "grad_norm": 0.482593297958374, "learning_rate": 2.399331847610045e-06, "loss": 0.3754, "step": 4251 }, { "epoch": 2.120345744680851, "grad_norm": 0.4525277316570282, "learning_rate": 2.3968538201809856e-06, "loss": 0.3635, "step": 4252 }, { "epoch": 2.120844414893617, "grad_norm": 0.3976767957210541, "learning_rate": 2.394376669595728e-06, "loss": 0.3188, "step": 4253 }, { "epoch": 2.121343085106383, "grad_norm": 0.4377324879169464, "learning_rate": 2.3919003966886718e-06, "loss": 0.3882, "step": 4254 }, { "epoch": 2.121841755319149, "grad_norm": 0.4095824360847473, "learning_rate": 2.389425002293929e-06, "loss": 0.2882, "step": 4255 }, { "epoch": 2.122340425531915, "grad_norm": 0.365953654050827, "learning_rate": 2.386950487245308e-06, "loss": 0.3298, "step": 4256 }, { "epoch": 2.122839095744681, "grad_norm": 0.48359447717666626, "learning_rate": 2.3844768523763284e-06, "loss": 0.2881, "step": 4257 }, { "epoch": 2.1233377659574466, "grad_norm": 0.46228986978530884, "learning_rate": 2.3820040985202066e-06, "loss": 0.3539, "step": 4258 }, { "epoch": 2.1238364361702127, "grad_norm": 0.4020352065563202, "learning_rate": 2.3795322265098635e-06, "loss": 0.3579, "step": 4259 }, { "epoch": 2.1243351063829787, "grad_norm": 0.4123663306236267, "learning_rate": 2.377061237177929e-06, "loss": 0.386, "step": 4260 }, { "epoch": 2.124833776595745, "grad_norm": 0.3943619132041931, "learning_rate": 2.3745911313567265e-06, "loss": 0.3062, "step": 4261 }, { "epoch": 2.1253324468085104, "grad_norm": 0.4090651273727417, "learning_rate": 2.3721219098782905e-06, "loss": 0.3323, "step": 4262 }, { "epoch": 2.1258311170212765, "grad_norm": 0.45636892318725586, "learning_rate": 2.36965357357435e-06, "loss": 0.3688, "step": 4263 }, { "epoch": 2.1263297872340425, "grad_norm": 0.4372006356716156, "learning_rate": 2.3671861232763434e-06, "loss": 0.3023, "step": 4264 }, { "epoch": 2.1268284574468086, "grad_norm": 0.4279671013355255, "learning_rate": 2.364719559815402e-06, "loss": 0.3182, "step": 4265 }, { "epoch": 2.1273271276595747, "grad_norm": 0.4855438768863678, "learning_rate": 2.3622538840223687e-06, "loss": 0.3746, "step": 4266 }, { "epoch": 2.1278257978723403, "grad_norm": 0.4554178714752197, "learning_rate": 2.359789096727776e-06, "loss": 0.3671, "step": 4267 }, { "epoch": 2.1283244680851063, "grad_norm": 0.4431704580783844, "learning_rate": 2.357325198761867e-06, "loss": 0.3455, "step": 4268 }, { "epoch": 2.1288231382978724, "grad_norm": 0.4461771845817566, "learning_rate": 2.3548621909545803e-06, "loss": 0.2904, "step": 4269 }, { "epoch": 2.1293218085106385, "grad_norm": 0.46119311451911926, "learning_rate": 2.3524000741355534e-06, "loss": 0.3785, "step": 4270 }, { "epoch": 2.129820478723404, "grad_norm": 0.4784943461418152, "learning_rate": 2.349938849134129e-06, "loss": 0.3297, "step": 4271 }, { "epoch": 2.13031914893617, "grad_norm": 0.3986727297306061, "learning_rate": 2.3474785167793423e-06, "loss": 0.3091, "step": 4272 }, { "epoch": 2.130817819148936, "grad_norm": 0.421191930770874, "learning_rate": 2.345019077899936e-06, "loss": 0.4044, "step": 4273 }, { "epoch": 2.1313164893617023, "grad_norm": 0.4352326989173889, "learning_rate": 2.3425605333243434e-06, "loss": 0.358, "step": 4274 }, { "epoch": 2.131815159574468, "grad_norm": 0.4112214148044586, "learning_rate": 2.3401028838807044e-06, "loss": 0.2938, "step": 4275 }, { "epoch": 2.132313829787234, "grad_norm": 0.4473170340061188, "learning_rate": 2.33764613039685e-06, "loss": 0.3376, "step": 4276 }, { "epoch": 2.1328125, "grad_norm": 0.4827459156513214, "learning_rate": 2.335190273700317e-06, "loss": 0.3409, "step": 4277 }, { "epoch": 2.133311170212766, "grad_norm": 0.3639219403266907, "learning_rate": 2.3327353146183325e-06, "loss": 0.2566, "step": 4278 }, { "epoch": 2.133809840425532, "grad_norm": 0.482585608959198, "learning_rate": 2.3302812539778263e-06, "loss": 0.3982, "step": 4279 }, { "epoch": 2.1343085106382977, "grad_norm": 0.3879324495792389, "learning_rate": 2.3278280926054268e-06, "loss": 0.3476, "step": 4280 }, { "epoch": 2.134807180851064, "grad_norm": 0.41777271032333374, "learning_rate": 2.3253758313274527e-06, "loss": 0.3659, "step": 4281 }, { "epoch": 2.13530585106383, "grad_norm": 0.42823317646980286, "learning_rate": 2.322924470969928e-06, "loss": 0.3178, "step": 4282 }, { "epoch": 2.135804521276596, "grad_norm": 0.43251070380210876, "learning_rate": 2.3204740123585645e-06, "loss": 0.3246, "step": 4283 }, { "epoch": 2.1363031914893615, "grad_norm": 0.3845297694206238, "learning_rate": 2.3180244563187786e-06, "loss": 0.2895, "step": 4284 }, { "epoch": 2.1368018617021276, "grad_norm": 0.43302828073501587, "learning_rate": 2.315575803675676e-06, "loss": 0.3327, "step": 4285 }, { "epoch": 2.1373005319148937, "grad_norm": 0.41370245814323425, "learning_rate": 2.313128055254063e-06, "loss": 0.3612, "step": 4286 }, { "epoch": 2.1377992021276597, "grad_norm": 0.4234575927257538, "learning_rate": 2.310681211878437e-06, "loss": 0.3417, "step": 4287 }, { "epoch": 2.1382978723404253, "grad_norm": 0.4327595829963684, "learning_rate": 2.3082352743729957e-06, "loss": 0.3694, "step": 4288 }, { "epoch": 2.1387965425531914, "grad_norm": 0.4134533703327179, "learning_rate": 2.305790243561625e-06, "loss": 0.3253, "step": 4289 }, { "epoch": 2.1392952127659575, "grad_norm": 0.42380791902542114, "learning_rate": 2.303346120267913e-06, "loss": 0.3667, "step": 4290 }, { "epoch": 2.1397938829787235, "grad_norm": 0.4327552616596222, "learning_rate": 2.3009029053151366e-06, "loss": 0.3634, "step": 4291 }, { "epoch": 2.1402925531914896, "grad_norm": 0.4556273818016052, "learning_rate": 2.2984605995262667e-06, "loss": 0.3852, "step": 4292 }, { "epoch": 2.140791223404255, "grad_norm": 0.3934900760650635, "learning_rate": 2.296019203723972e-06, "loss": 0.3248, "step": 4293 }, { "epoch": 2.1412898936170213, "grad_norm": 0.40938249230384827, "learning_rate": 2.2935787187306104e-06, "loss": 0.3372, "step": 4294 }, { "epoch": 2.1417885638297873, "grad_norm": 0.3956049978733063, "learning_rate": 2.291139145368237e-06, "loss": 0.3734, "step": 4295 }, { "epoch": 2.1422872340425534, "grad_norm": 0.37088170647621155, "learning_rate": 2.2887004844585953e-06, "loss": 0.3023, "step": 4296 }, { "epoch": 2.142785904255319, "grad_norm": 0.42604270577430725, "learning_rate": 2.2862627368231267e-06, "loss": 0.3624, "step": 4297 }, { "epoch": 2.143284574468085, "grad_norm": 0.4564799964427948, "learning_rate": 2.2838259032829587e-06, "loss": 0.3528, "step": 4298 }, { "epoch": 2.143783244680851, "grad_norm": 0.451243132352829, "learning_rate": 2.2813899846589183e-06, "loss": 0.322, "step": 4299 }, { "epoch": 2.144281914893617, "grad_norm": 0.3973666727542877, "learning_rate": 2.278954981771518e-06, "loss": 0.3249, "step": 4300 }, { "epoch": 2.144780585106383, "grad_norm": 0.39528340101242065, "learning_rate": 2.276520895440963e-06, "loss": 0.2912, "step": 4301 }, { "epoch": 2.145279255319149, "grad_norm": 0.4570300877094269, "learning_rate": 2.274087726487154e-06, "loss": 0.3696, "step": 4302 }, { "epoch": 2.145777925531915, "grad_norm": 0.43028947710990906, "learning_rate": 2.271655475729677e-06, "loss": 0.3643, "step": 4303 }, { "epoch": 2.146276595744681, "grad_norm": 0.4101642370223999, "learning_rate": 2.2692241439878143e-06, "loss": 0.348, "step": 4304 }, { "epoch": 2.1467752659574466, "grad_norm": 0.3852508068084717, "learning_rate": 2.266793732080532e-06, "loss": 0.3189, "step": 4305 }, { "epoch": 2.1472739361702127, "grad_norm": 0.43341147899627686, "learning_rate": 2.264364240826494e-06, "loss": 0.3549, "step": 4306 }, { "epoch": 2.1477726063829787, "grad_norm": 0.40052953362464905, "learning_rate": 2.261935671044046e-06, "loss": 0.2982, "step": 4307 }, { "epoch": 2.148271276595745, "grad_norm": 0.385576069355011, "learning_rate": 2.2595080235512322e-06, "loss": 0.3807, "step": 4308 }, { "epoch": 2.1487699468085104, "grad_norm": 0.41617709398269653, "learning_rate": 2.257081299165777e-06, "loss": 0.3219, "step": 4309 }, { "epoch": 2.1492686170212765, "grad_norm": 0.43982791900634766, "learning_rate": 2.254655498705102e-06, "loss": 0.3687, "step": 4310 }, { "epoch": 2.1497672872340425, "grad_norm": 0.40657636523246765, "learning_rate": 2.25223062298631e-06, "loss": 0.3151, "step": 4311 }, { "epoch": 2.1502659574468086, "grad_norm": 0.4794269800186157, "learning_rate": 2.2498066728262003e-06, "loss": 0.3532, "step": 4312 }, { "epoch": 2.1507646276595747, "grad_norm": 0.4042162299156189, "learning_rate": 2.2473836490412528e-06, "loss": 0.2989, "step": 4313 }, { "epoch": 2.1512632978723403, "grad_norm": 0.45536452531814575, "learning_rate": 2.2449615524476416e-06, "loss": 0.3743, "step": 4314 }, { "epoch": 2.1517619680851063, "grad_norm": 0.47446757555007935, "learning_rate": 2.242540383861223e-06, "loss": 0.3861, "step": 4315 }, { "epoch": 2.1522606382978724, "grad_norm": 0.41286394000053406, "learning_rate": 2.2401201440975466e-06, "loss": 0.3008, "step": 4316 }, { "epoch": 2.1527593085106385, "grad_norm": 0.426445871591568, "learning_rate": 2.237700833971843e-06, "loss": 0.2848, "step": 4317 }, { "epoch": 2.153257978723404, "grad_norm": 0.41221776604652405, "learning_rate": 2.2352824542990336e-06, "loss": 0.3992, "step": 4318 }, { "epoch": 2.15375664893617, "grad_norm": 0.4795021712779999, "learning_rate": 2.232865005893728e-06, "loss": 0.3891, "step": 4319 }, { "epoch": 2.154255319148936, "grad_norm": 0.45316383242607117, "learning_rate": 2.2304484895702157e-06, "loss": 0.3333, "step": 4320 }, { "epoch": 2.1547539893617023, "grad_norm": 0.38889777660369873, "learning_rate": 2.2280329061424792e-06, "loss": 0.3252, "step": 4321 }, { "epoch": 2.155252659574468, "grad_norm": 0.4310622215270996, "learning_rate": 2.2256182564241827e-06, "loss": 0.3527, "step": 4322 }, { "epoch": 2.155751329787234, "grad_norm": 0.442238450050354, "learning_rate": 2.223204541228675e-06, "loss": 0.3506, "step": 4323 }, { "epoch": 2.15625, "grad_norm": 0.4495842158794403, "learning_rate": 2.220791761368995e-06, "loss": 0.3554, "step": 4324 }, { "epoch": 2.156748670212766, "grad_norm": 0.4253484606742859, "learning_rate": 2.2183799176578597e-06, "loss": 0.3428, "step": 4325 }, { "epoch": 2.157247340425532, "grad_norm": 0.4500124752521515, "learning_rate": 2.215969010907679e-06, "loss": 0.3858, "step": 4326 }, { "epoch": 2.1577460106382977, "grad_norm": 0.43157196044921875, "learning_rate": 2.213559041930539e-06, "loss": 0.3277, "step": 4327 }, { "epoch": 2.158244680851064, "grad_norm": 0.4175984561443329, "learning_rate": 2.2111500115382163e-06, "loss": 0.2928, "step": 4328 }, { "epoch": 2.15874335106383, "grad_norm": 0.45183610916137695, "learning_rate": 2.208741920542166e-06, "loss": 0.3915, "step": 4329 }, { "epoch": 2.159242021276596, "grad_norm": 0.4457416236400604, "learning_rate": 2.206334769753534e-06, "loss": 0.3415, "step": 4330 }, { "epoch": 2.1597406914893615, "grad_norm": 0.4630952477455139, "learning_rate": 2.2039285599831394e-06, "loss": 0.3752, "step": 4331 }, { "epoch": 2.1602393617021276, "grad_norm": 0.394555002450943, "learning_rate": 2.2015232920414947e-06, "loss": 0.316, "step": 4332 }, { "epoch": 2.1607380319148937, "grad_norm": 0.38934871554374695, "learning_rate": 2.1991189667387887e-06, "loss": 0.2836, "step": 4333 }, { "epoch": 2.1612367021276597, "grad_norm": 0.4507996439933777, "learning_rate": 2.196715584884892e-06, "loss": 0.3732, "step": 4334 }, { "epoch": 2.1617353723404253, "grad_norm": 0.42680224776268005, "learning_rate": 2.194313147289364e-06, "loss": 0.3488, "step": 4335 }, { "epoch": 2.1622340425531914, "grad_norm": 0.4458331763744354, "learning_rate": 2.1919116547614376e-06, "loss": 0.3486, "step": 4336 }, { "epoch": 2.1627327127659575, "grad_norm": 0.4284341335296631, "learning_rate": 2.1895111081100364e-06, "loss": 0.372, "step": 4337 }, { "epoch": 2.1632313829787235, "grad_norm": 0.37739038467407227, "learning_rate": 2.187111508143755e-06, "loss": 0.2886, "step": 4338 }, { "epoch": 2.1637300531914896, "grad_norm": 0.4749111235141754, "learning_rate": 2.1847128556708795e-06, "loss": 0.339, "step": 4339 }, { "epoch": 2.164228723404255, "grad_norm": 0.4165570139884949, "learning_rate": 2.182315151499369e-06, "loss": 0.2974, "step": 4340 }, { "epoch": 2.1647273936170213, "grad_norm": 0.4284687340259552, "learning_rate": 2.1799183964368686e-06, "loss": 0.3227, "step": 4341 }, { "epoch": 2.1652260638297873, "grad_norm": 0.4222649037837982, "learning_rate": 2.177522591290698e-06, "loss": 0.2955, "step": 4342 }, { "epoch": 2.1657247340425534, "grad_norm": 0.4690038561820984, "learning_rate": 2.175127736867865e-06, "loss": 0.3753, "step": 4343 }, { "epoch": 2.166223404255319, "grad_norm": 0.3969247043132782, "learning_rate": 2.1727338339750486e-06, "loss": 0.3094, "step": 4344 }, { "epoch": 2.166722074468085, "grad_norm": 0.5070837140083313, "learning_rate": 2.170340883418611e-06, "loss": 0.4088, "step": 4345 }, { "epoch": 2.167220744680851, "grad_norm": 0.4116975963115692, "learning_rate": 2.167948886004596e-06, "loss": 0.3645, "step": 4346 }, { "epoch": 2.167719414893617, "grad_norm": 0.39470767974853516, "learning_rate": 2.165557842538722e-06, "loss": 0.3507, "step": 4347 }, { "epoch": 2.168218085106383, "grad_norm": 0.41963696479797363, "learning_rate": 2.16316775382639e-06, "loss": 0.3564, "step": 4348 }, { "epoch": 2.168716755319149, "grad_norm": 0.4076034128665924, "learning_rate": 2.160778620672675e-06, "loss": 0.3309, "step": 4349 }, { "epoch": 2.169215425531915, "grad_norm": 0.43494486808776855, "learning_rate": 2.1583904438823344e-06, "loss": 0.3413, "step": 4350 }, { "epoch": 2.169714095744681, "grad_norm": 0.45481976866722107, "learning_rate": 2.1560032242598032e-06, "loss": 0.3651, "step": 4351 }, { "epoch": 2.1702127659574466, "grad_norm": 0.4131263196468353, "learning_rate": 2.153616962609189e-06, "loss": 0.2816, "step": 4352 }, { "epoch": 2.1707114361702127, "grad_norm": 0.43828341364860535, "learning_rate": 2.1512316597342836e-06, "loss": 0.3656, "step": 4353 }, { "epoch": 2.1712101063829787, "grad_norm": 0.42392876744270325, "learning_rate": 2.1488473164385494e-06, "loss": 0.2686, "step": 4354 }, { "epoch": 2.171708776595745, "grad_norm": 0.46693155169487, "learning_rate": 2.1464639335251312e-06, "loss": 0.3252, "step": 4355 }, { "epoch": 2.1722074468085104, "grad_norm": 0.4699467122554779, "learning_rate": 2.1440815117968457e-06, "loss": 0.346, "step": 4356 }, { "epoch": 2.1727061170212765, "grad_norm": 0.4088408052921295, "learning_rate": 2.1417000520561904e-06, "loss": 0.3427, "step": 4357 }, { "epoch": 2.1732047872340425, "grad_norm": 0.4404662251472473, "learning_rate": 2.1393195551053327e-06, "loss": 0.3766, "step": 4358 }, { "epoch": 2.1737034574468086, "grad_norm": 0.38724973797798157, "learning_rate": 2.136940021746123e-06, "loss": 0.2633, "step": 4359 }, { "epoch": 2.1742021276595747, "grad_norm": 0.4383286237716675, "learning_rate": 2.1345614527800794e-06, "loss": 0.3588, "step": 4360 }, { "epoch": 2.1747007978723403, "grad_norm": 0.4681608974933624, "learning_rate": 2.1321838490084033e-06, "loss": 0.3284, "step": 4361 }, { "epoch": 2.1751994680851063, "grad_norm": 0.4571946859359741, "learning_rate": 2.129807211231963e-06, "loss": 0.3759, "step": 4362 }, { "epoch": 2.1756981382978724, "grad_norm": 0.4377949833869934, "learning_rate": 2.1274315402513085e-06, "loss": 0.3729, "step": 4363 }, { "epoch": 2.1761968085106385, "grad_norm": 0.4068440794944763, "learning_rate": 2.125056836866658e-06, "loss": 0.2582, "step": 4364 }, { "epoch": 2.176695478723404, "grad_norm": 0.45293664932250977, "learning_rate": 2.1226831018779094e-06, "loss": 0.4013, "step": 4365 }, { "epoch": 2.17719414893617, "grad_norm": 0.42767783999443054, "learning_rate": 2.12031033608463e-06, "loss": 0.3478, "step": 4366 }, { "epoch": 2.177692819148936, "grad_norm": 0.4087601602077484, "learning_rate": 2.117938540286061e-06, "loss": 0.3363, "step": 4367 }, { "epoch": 2.1781914893617023, "grad_norm": 0.461725652217865, "learning_rate": 2.115567715281121e-06, "loss": 0.3552, "step": 4368 }, { "epoch": 2.178690159574468, "grad_norm": 0.4288575053215027, "learning_rate": 2.1131978618683952e-06, "loss": 0.3382, "step": 4369 }, { "epoch": 2.179188829787234, "grad_norm": 0.405370831489563, "learning_rate": 2.1108289808461498e-06, "loss": 0.378, "step": 4370 }, { "epoch": 2.1796875, "grad_norm": 0.389682799577713, "learning_rate": 2.1084610730123133e-06, "loss": 0.3481, "step": 4371 }, { "epoch": 2.180186170212766, "grad_norm": 0.40909528732299805, "learning_rate": 2.106094139164497e-06, "loss": 0.318, "step": 4372 }, { "epoch": 2.180684840425532, "grad_norm": 0.4434879720211029, "learning_rate": 2.103728180099974e-06, "loss": 0.3621, "step": 4373 }, { "epoch": 2.1811835106382977, "grad_norm": 0.4609620273113251, "learning_rate": 2.1013631966156996e-06, "loss": 0.3631, "step": 4374 }, { "epoch": 2.181682180851064, "grad_norm": 0.41806432604789734, "learning_rate": 2.098999189508289e-06, "loss": 0.3422, "step": 4375 }, { "epoch": 2.18218085106383, "grad_norm": 0.4344795346260071, "learning_rate": 2.0966361595740393e-06, "loss": 0.3872, "step": 4376 }, { "epoch": 2.182679521276596, "grad_norm": 0.4181789457798004, "learning_rate": 2.0942741076089106e-06, "loss": 0.3365, "step": 4377 }, { "epoch": 2.1831781914893615, "grad_norm": 0.4175093173980713, "learning_rate": 2.0919130344085366e-06, "loss": 0.336, "step": 4378 }, { "epoch": 2.1836768617021276, "grad_norm": 0.41721421480178833, "learning_rate": 2.089552940768223e-06, "loss": 0.3499, "step": 4379 }, { "epoch": 2.1841755319148937, "grad_norm": 0.38774359226226807, "learning_rate": 2.087193827482941e-06, "loss": 0.3256, "step": 4380 }, { "epoch": 2.1846742021276597, "grad_norm": 0.4403991401195526, "learning_rate": 2.0848356953473385e-06, "loss": 0.3714, "step": 4381 }, { "epoch": 2.1851728723404253, "grad_norm": 0.38390547037124634, "learning_rate": 2.082478545155724e-06, "loss": 0.3588, "step": 4382 }, { "epoch": 2.1856715425531914, "grad_norm": 0.41092073917388916, "learning_rate": 2.0801223777020844e-06, "loss": 0.356, "step": 4383 }, { "epoch": 2.1861702127659575, "grad_norm": 0.4140447676181793, "learning_rate": 2.0777671937800675e-06, "loss": 0.2826, "step": 4384 }, { "epoch": 2.1866688829787235, "grad_norm": 0.46575894951820374, "learning_rate": 2.0754129941829964e-06, "loss": 0.3694, "step": 4385 }, { "epoch": 2.1871675531914896, "grad_norm": 0.44080883264541626, "learning_rate": 2.0730597797038573e-06, "loss": 0.3172, "step": 4386 }, { "epoch": 2.187666223404255, "grad_norm": 0.3841540217399597, "learning_rate": 2.070707551135309e-06, "loss": 0.2968, "step": 4387 }, { "epoch": 2.1881648936170213, "grad_norm": 0.42863932251930237, "learning_rate": 2.0683563092696747e-06, "loss": 0.3568, "step": 4388 }, { "epoch": 2.1886635638297873, "grad_norm": 0.4044308364391327, "learning_rate": 2.066006054898947e-06, "loss": 0.366, "step": 4389 }, { "epoch": 2.1891622340425534, "grad_norm": 0.4065200686454773, "learning_rate": 2.0636567888147874e-06, "loss": 0.3374, "step": 4390 }, { "epoch": 2.189660904255319, "grad_norm": 0.4096095561981201, "learning_rate": 2.06130851180852e-06, "loss": 0.3131, "step": 4391 }, { "epoch": 2.190159574468085, "grad_norm": 0.4444645345211029, "learning_rate": 2.0589612246711415e-06, "loss": 0.3461, "step": 4392 }, { "epoch": 2.190658244680851, "grad_norm": 0.4513739049434662, "learning_rate": 2.056614928193309e-06, "loss": 0.2658, "step": 4393 }, { "epoch": 2.191156914893617, "grad_norm": 0.4805681109428406, "learning_rate": 2.054269623165352e-06, "loss": 0.39, "step": 4394 }, { "epoch": 2.191655585106383, "grad_norm": 0.42129144072532654, "learning_rate": 2.0519253103772595e-06, "loss": 0.3575, "step": 4395 }, { "epoch": 2.192154255319149, "grad_norm": 0.45109543204307556, "learning_rate": 2.0495819906186944e-06, "loss": 0.3889, "step": 4396 }, { "epoch": 2.192652925531915, "grad_norm": 0.40953755378723145, "learning_rate": 2.0472396646789778e-06, "loss": 0.2965, "step": 4397 }, { "epoch": 2.193151595744681, "grad_norm": 0.5118685960769653, "learning_rate": 2.0448983333470975e-06, "loss": 0.3929, "step": 4398 }, { "epoch": 2.1936502659574466, "grad_norm": 0.3875616788864136, "learning_rate": 2.0425579974117107e-06, "loss": 0.3219, "step": 4399 }, { "epoch": 2.1941489361702127, "grad_norm": 0.4308790862560272, "learning_rate": 2.0402186576611327e-06, "loss": 0.3316, "step": 4400 }, { "epoch": 2.1946476063829787, "grad_norm": 0.42732030153274536, "learning_rate": 2.0378803148833507e-06, "loss": 0.3682, "step": 4401 }, { "epoch": 2.195146276595745, "grad_norm": 0.39692994952201843, "learning_rate": 2.0355429698660083e-06, "loss": 0.311, "step": 4402 }, { "epoch": 2.1956449468085104, "grad_norm": 0.41131827235221863, "learning_rate": 2.033206623396421e-06, "loss": 0.3632, "step": 4403 }, { "epoch": 2.1961436170212765, "grad_norm": 0.4258839786052704, "learning_rate": 2.030871276261559e-06, "loss": 0.3408, "step": 4404 }, { "epoch": 2.1966422872340425, "grad_norm": 0.45439469814300537, "learning_rate": 2.0285369292480656e-06, "loss": 0.3521, "step": 4405 }, { "epoch": 2.1971409574468086, "grad_norm": 0.4268215000629425, "learning_rate": 2.0262035831422373e-06, "loss": 0.3805, "step": 4406 }, { "epoch": 2.1976396276595747, "grad_norm": 0.4493885338306427, "learning_rate": 2.023871238730043e-06, "loss": 0.2983, "step": 4407 }, { "epoch": 2.1981382978723403, "grad_norm": 0.46090468764305115, "learning_rate": 2.021539896797108e-06, "loss": 0.3712, "step": 4408 }, { "epoch": 2.1986369680851063, "grad_norm": 0.4366537034511566, "learning_rate": 2.019209558128719e-06, "loss": 0.3501, "step": 4409 }, { "epoch": 2.1991356382978724, "grad_norm": 0.4379933178424835, "learning_rate": 2.0168802235098313e-06, "loss": 0.3072, "step": 4410 }, { "epoch": 2.1996343085106385, "grad_norm": 0.4452645778656006, "learning_rate": 2.0145518937250543e-06, "loss": 0.3483, "step": 4411 }, { "epoch": 2.200132978723404, "grad_norm": 0.4637349545955658, "learning_rate": 2.012224569558666e-06, "loss": 0.3459, "step": 4412 }, { "epoch": 2.20063164893617, "grad_norm": 0.46350738406181335, "learning_rate": 2.0098982517945996e-06, "loss": 0.3406, "step": 4413 }, { "epoch": 2.201130319148936, "grad_norm": 0.42493873834609985, "learning_rate": 2.0075729412164547e-06, "loss": 0.3784, "step": 4414 }, { "epoch": 2.2016289893617023, "grad_norm": 0.33409786224365234, "learning_rate": 2.0052486386074856e-06, "loss": 0.3066, "step": 4415 }, { "epoch": 2.202127659574468, "grad_norm": 0.4117850661277771, "learning_rate": 2.0029253447506143e-06, "loss": 0.3257, "step": 4416 }, { "epoch": 2.202626329787234, "grad_norm": 0.4044877588748932, "learning_rate": 2.0006030604284153e-06, "loss": 0.3289, "step": 4417 }, { "epoch": 2.203125, "grad_norm": 0.4185197949409485, "learning_rate": 1.99828178642313e-06, "loss": 0.3457, "step": 4418 }, { "epoch": 2.203623670212766, "grad_norm": 0.38849082589149475, "learning_rate": 1.995961523516656e-06, "loss": 0.3198, "step": 4419 }, { "epoch": 2.204122340425532, "grad_norm": 0.46492552757263184, "learning_rate": 1.9936422724905476e-06, "loss": 0.3592, "step": 4420 }, { "epoch": 2.2046210106382977, "grad_norm": 0.4386153519153595, "learning_rate": 1.9913240341260254e-06, "loss": 0.3996, "step": 4421 }, { "epoch": 2.205119680851064, "grad_norm": 0.39339497685432434, "learning_rate": 1.9890068092039613e-06, "loss": 0.2615, "step": 4422 }, { "epoch": 2.20561835106383, "grad_norm": 0.5060541033744812, "learning_rate": 1.9866905985048905e-06, "loss": 0.3989, "step": 4423 }, { "epoch": 2.206117021276596, "grad_norm": 0.39201879501342773, "learning_rate": 1.9843754028090083e-06, "loss": 0.2858, "step": 4424 }, { "epoch": 2.2066156914893615, "grad_norm": 0.4178335964679718, "learning_rate": 1.982061222896161e-06, "loss": 0.3424, "step": 4425 }, { "epoch": 2.2071143617021276, "grad_norm": 0.3866746127605438, "learning_rate": 1.9797480595458613e-06, "loss": 0.3253, "step": 4426 }, { "epoch": 2.2076130319148937, "grad_norm": 0.40797343850135803, "learning_rate": 1.9774359135372705e-06, "loss": 0.3569, "step": 4427 }, { "epoch": 2.2081117021276597, "grad_norm": 0.4094391465187073, "learning_rate": 1.9751247856492147e-06, "loss": 0.3482, "step": 4428 }, { "epoch": 2.2086103723404253, "grad_norm": 0.41927996277809143, "learning_rate": 1.972814676660175e-06, "loss": 0.3436, "step": 4429 }, { "epoch": 2.2091090425531914, "grad_norm": 0.4099690020084381, "learning_rate": 1.9705055873482877e-06, "loss": 0.3205, "step": 4430 }, { "epoch": 2.2096077127659575, "grad_norm": 0.40244823694229126, "learning_rate": 1.9681975184913445e-06, "loss": 0.3058, "step": 4431 }, { "epoch": 2.2101063829787235, "grad_norm": 0.43951451778411865, "learning_rate": 1.965890470866798e-06, "loss": 0.3727, "step": 4432 }, { "epoch": 2.2106050531914896, "grad_norm": 0.4203910231590271, "learning_rate": 1.963584445251752e-06, "loss": 0.292, "step": 4433 }, { "epoch": 2.211103723404255, "grad_norm": 0.4540725350379944, "learning_rate": 1.961279442422971e-06, "loss": 0.371, "step": 4434 }, { "epoch": 2.2116023936170213, "grad_norm": 0.43658244609832764, "learning_rate": 1.958975463156869e-06, "loss": 0.3597, "step": 4435 }, { "epoch": 2.2121010638297873, "grad_norm": 0.4342116713523865, "learning_rate": 1.9566725082295214e-06, "loss": 0.3147, "step": 4436 }, { "epoch": 2.2125997340425534, "grad_norm": 0.4792589247226715, "learning_rate": 1.954370578416652e-06, "loss": 0.3351, "step": 4437 }, { "epoch": 2.213098404255319, "grad_norm": 0.39763879776000977, "learning_rate": 1.952069674493648e-06, "loss": 0.2939, "step": 4438 }, { "epoch": 2.213597074468085, "grad_norm": 0.438777357339859, "learning_rate": 1.94976979723554e-06, "loss": 0.3447, "step": 4439 }, { "epoch": 2.214095744680851, "grad_norm": 0.4931119680404663, "learning_rate": 1.947470947417024e-06, "loss": 0.3352, "step": 4440 }, { "epoch": 2.214594414893617, "grad_norm": 0.3977721333503723, "learning_rate": 1.9451731258124417e-06, "loss": 0.2709, "step": 4441 }, { "epoch": 2.215093085106383, "grad_norm": 0.5104916095733643, "learning_rate": 1.9428763331957905e-06, "loss": 0.3982, "step": 4442 }, { "epoch": 2.215591755319149, "grad_norm": 0.41065555810928345, "learning_rate": 1.940580570340725e-06, "loss": 0.3006, "step": 4443 }, { "epoch": 2.216090425531915, "grad_norm": 0.45233237743377686, "learning_rate": 1.9382858380205467e-06, "loss": 0.3039, "step": 4444 }, { "epoch": 2.216589095744681, "grad_norm": 0.4550899565219879, "learning_rate": 1.935992137008217e-06, "loss": 0.411, "step": 4445 }, { "epoch": 2.2170877659574466, "grad_norm": 0.409765362739563, "learning_rate": 1.9336994680763422e-06, "loss": 0.2676, "step": 4446 }, { "epoch": 2.2175864361702127, "grad_norm": 0.45356571674346924, "learning_rate": 1.9314078319971886e-06, "loss": 0.3369, "step": 4447 }, { "epoch": 2.2180851063829787, "grad_norm": 0.44939279556274414, "learning_rate": 1.929117229542668e-06, "loss": 0.2805, "step": 4448 }, { "epoch": 2.218583776595745, "grad_norm": 0.4109923839569092, "learning_rate": 1.9268276614843493e-06, "loss": 0.3046, "step": 4449 }, { "epoch": 2.2190824468085104, "grad_norm": 0.4089254140853882, "learning_rate": 1.9245391285934482e-06, "loss": 0.3127, "step": 4450 }, { "epoch": 2.2195811170212765, "grad_norm": 0.4872036278247833, "learning_rate": 1.9222516316408374e-06, "loss": 0.3435, "step": 4451 }, { "epoch": 2.2200797872340425, "grad_norm": 0.42618536949157715, "learning_rate": 1.919965171397035e-06, "loss": 0.321, "step": 4452 }, { "epoch": 2.2205784574468086, "grad_norm": 0.42638063430786133, "learning_rate": 1.917679748632212e-06, "loss": 0.3202, "step": 4453 }, { "epoch": 2.2210771276595747, "grad_norm": 0.4229138493537903, "learning_rate": 1.915395364116192e-06, "loss": 0.3855, "step": 4454 }, { "epoch": 2.2215757978723403, "grad_norm": 0.44827646017074585, "learning_rate": 1.9131120186184447e-06, "loss": 0.3791, "step": 4455 }, { "epoch": 2.2220744680851063, "grad_norm": 0.41283807158470154, "learning_rate": 1.9108297129080953e-06, "loss": 0.3321, "step": 4456 }, { "epoch": 2.2225731382978724, "grad_norm": 0.43510088324546814, "learning_rate": 1.908548447753913e-06, "loss": 0.2901, "step": 4457 }, { "epoch": 2.2230718085106385, "grad_norm": 0.4509607255458832, "learning_rate": 1.9062682239243218e-06, "loss": 0.3208, "step": 4458 }, { "epoch": 2.223570478723404, "grad_norm": 0.4533460736274719, "learning_rate": 1.9039890421873886e-06, "loss": 0.3713, "step": 4459 }, { "epoch": 2.22406914893617, "grad_norm": 0.3321383595466614, "learning_rate": 1.9017109033108378e-06, "loss": 0.2708, "step": 4460 }, { "epoch": 2.224567819148936, "grad_norm": 0.3934299349784851, "learning_rate": 1.8994338080620329e-06, "loss": 0.3725, "step": 4461 }, { "epoch": 2.2250664893617023, "grad_norm": 0.409408837556839, "learning_rate": 1.897157757207993e-06, "loss": 0.3233, "step": 4462 }, { "epoch": 2.225565159574468, "grad_norm": 0.47569650411605835, "learning_rate": 1.894882751515385e-06, "loss": 0.3823, "step": 4463 }, { "epoch": 2.226063829787234, "grad_norm": 0.4310730993747711, "learning_rate": 1.8926087917505181e-06, "loss": 0.3299, "step": 4464 }, { "epoch": 2.2265625, "grad_norm": 0.4252654016017914, "learning_rate": 1.8903358786793563e-06, "loss": 0.2835, "step": 4465 }, { "epoch": 2.227061170212766, "grad_norm": 0.43047037720680237, "learning_rate": 1.888064013067505e-06, "loss": 0.3825, "step": 4466 }, { "epoch": 2.227559840425532, "grad_norm": 0.3970855474472046, "learning_rate": 1.8857931956802217e-06, "loss": 0.3011, "step": 4467 }, { "epoch": 2.2280585106382977, "grad_norm": 0.4889185428619385, "learning_rate": 1.883523427282406e-06, "loss": 0.3916, "step": 4468 }, { "epoch": 2.228557180851064, "grad_norm": 0.4002976715564728, "learning_rate": 1.8812547086386102e-06, "loss": 0.2548, "step": 4469 }, { "epoch": 2.22905585106383, "grad_norm": 0.47212183475494385, "learning_rate": 1.8789870405130255e-06, "loss": 0.3869, "step": 4470 }, { "epoch": 2.229554521276596, "grad_norm": 0.443175733089447, "learning_rate": 1.8767204236694974e-06, "loss": 0.2612, "step": 4471 }, { "epoch": 2.2300531914893615, "grad_norm": 0.47883012890815735, "learning_rate": 1.8744548588715107e-06, "loss": 0.3846, "step": 4472 }, { "epoch": 2.2305518617021276, "grad_norm": 0.41679829359054565, "learning_rate": 1.8721903468821973e-06, "loss": 0.3035, "step": 4473 }, { "epoch": 2.2310505319148937, "grad_norm": 0.43749985098838806, "learning_rate": 1.869926888464338e-06, "loss": 0.3322, "step": 4474 }, { "epoch": 2.2315492021276597, "grad_norm": 0.429892361164093, "learning_rate": 1.8676644843803532e-06, "loss": 0.3556, "step": 4475 }, { "epoch": 2.2320478723404253, "grad_norm": 0.4894966185092926, "learning_rate": 1.8654031353923152e-06, "loss": 0.3169, "step": 4476 }, { "epoch": 2.2325465425531914, "grad_norm": 0.45771336555480957, "learning_rate": 1.8631428422619324e-06, "loss": 0.3738, "step": 4477 }, { "epoch": 2.2330452127659575, "grad_norm": 0.45620888471603394, "learning_rate": 1.8608836057505658e-06, "loss": 0.4293, "step": 4478 }, { "epoch": 2.2335438829787235, "grad_norm": 0.37815243005752563, "learning_rate": 1.8586254266192133e-06, "loss": 0.2587, "step": 4479 }, { "epoch": 2.2340425531914896, "grad_norm": 0.468959242105484, "learning_rate": 1.856368305628523e-06, "loss": 0.4022, "step": 4480 }, { "epoch": 2.234541223404255, "grad_norm": 0.4431917369365692, "learning_rate": 1.854112243538781e-06, "loss": 0.3547, "step": 4481 }, { "epoch": 2.2350398936170213, "grad_norm": 0.4153378903865814, "learning_rate": 1.8518572411099222e-06, "loss": 0.3059, "step": 4482 }, { "epoch": 2.2355385638297873, "grad_norm": 0.3866026699542999, "learning_rate": 1.8496032991015195e-06, "loss": 0.3116, "step": 4483 }, { "epoch": 2.2360372340425534, "grad_norm": 0.45535218715667725, "learning_rate": 1.8473504182727897e-06, "loss": 0.3491, "step": 4484 }, { "epoch": 2.236535904255319, "grad_norm": 0.4041760265827179, "learning_rate": 1.8450985993825965e-06, "loss": 0.3341, "step": 4485 }, { "epoch": 2.237034574468085, "grad_norm": 0.3899959623813629, "learning_rate": 1.8428478431894393e-06, "loss": 0.3567, "step": 4486 }, { "epoch": 2.237533244680851, "grad_norm": 0.3993665277957916, "learning_rate": 1.8405981504514659e-06, "loss": 0.3116, "step": 4487 }, { "epoch": 2.238031914893617, "grad_norm": 0.41483479738235474, "learning_rate": 1.8383495219264597e-06, "loss": 0.3376, "step": 4488 }, { "epoch": 2.238530585106383, "grad_norm": 0.4009610712528229, "learning_rate": 1.8361019583718521e-06, "loss": 0.3008, "step": 4489 }, { "epoch": 2.239029255319149, "grad_norm": 0.5118954181671143, "learning_rate": 1.833855460544709e-06, "loss": 0.3645, "step": 4490 }, { "epoch": 2.239527925531915, "grad_norm": 0.4122650921344757, "learning_rate": 1.8316100292017442e-06, "loss": 0.3187, "step": 4491 }, { "epoch": 2.240026595744681, "grad_norm": 0.4030008316040039, "learning_rate": 1.8293656650993052e-06, "loss": 0.361, "step": 4492 }, { "epoch": 2.2405252659574466, "grad_norm": 0.4256528913974762, "learning_rate": 1.8271223689933876e-06, "loss": 0.36, "step": 4493 }, { "epoch": 2.2410239361702127, "grad_norm": 0.37976428866386414, "learning_rate": 1.8248801416396195e-06, "loss": 0.3599, "step": 4494 }, { "epoch": 2.2415226063829787, "grad_norm": 0.3807145655155182, "learning_rate": 1.8226389837932762e-06, "loss": 0.3566, "step": 4495 }, { "epoch": 2.242021276595745, "grad_norm": 0.4167250096797943, "learning_rate": 1.820398896209266e-06, "loss": 0.3645, "step": 4496 }, { "epoch": 2.2425199468085104, "grad_norm": 0.36498063802719116, "learning_rate": 1.818159879642144e-06, "loss": 0.3577, "step": 4497 }, { "epoch": 2.2430186170212765, "grad_norm": 0.4082206189632416, "learning_rate": 1.8159219348460961e-06, "loss": 0.3648, "step": 4498 }, { "epoch": 2.2435172872340425, "grad_norm": 0.3943209946155548, "learning_rate": 1.8136850625749553e-06, "loss": 0.2872, "step": 4499 }, { "epoch": 2.2440159574468086, "grad_norm": 0.4623855948448181, "learning_rate": 1.8114492635821867e-06, "loss": 0.2823, "step": 4500 }, { "epoch": 2.2445146276595747, "grad_norm": 0.46256327629089355, "learning_rate": 1.8092145386208986e-06, "loss": 0.3627, "step": 4501 }, { "epoch": 2.2450132978723403, "grad_norm": 0.4447336792945862, "learning_rate": 1.8069808884438378e-06, "loss": 0.3284, "step": 4502 }, { "epoch": 2.2455119680851063, "grad_norm": 0.41104739904403687, "learning_rate": 1.8047483138033822e-06, "loss": 0.3817, "step": 4503 }, { "epoch": 2.2460106382978724, "grad_norm": 0.38551875948905945, "learning_rate": 1.8025168154515577e-06, "loss": 0.3108, "step": 4504 }, { "epoch": 2.2465093085106385, "grad_norm": 0.4237922430038452, "learning_rate": 1.800286394140019e-06, "loss": 0.3657, "step": 4505 }, { "epoch": 2.247007978723404, "grad_norm": 0.3692452013492584, "learning_rate": 1.7980570506200601e-06, "loss": 0.2867, "step": 4506 }, { "epoch": 2.24750664893617, "grad_norm": 0.4065890908241272, "learning_rate": 1.7958287856426159e-06, "loss": 0.3546, "step": 4507 }, { "epoch": 2.248005319148936, "grad_norm": 0.43990787863731384, "learning_rate": 1.7936015999582535e-06, "loss": 0.3569, "step": 4508 }, { "epoch": 2.2485039893617023, "grad_norm": 0.4604511559009552, "learning_rate": 1.7913754943171797e-06, "loss": 0.3361, "step": 4509 }, { "epoch": 2.249002659574468, "grad_norm": 0.40096646547317505, "learning_rate": 1.7891504694692336e-06, "loss": 0.3624, "step": 4510 }, { "epoch": 2.249501329787234, "grad_norm": 0.35676270723342896, "learning_rate": 1.7869265261638952e-06, "loss": 0.3147, "step": 4511 }, { "epoch": 2.25, "grad_norm": 0.4137072265148163, "learning_rate": 1.7847036651502752e-06, "loss": 0.3864, "step": 4512 }, { "epoch": 2.250498670212766, "grad_norm": 0.39376169443130493, "learning_rate": 1.7824818871771249e-06, "loss": 0.2847, "step": 4513 }, { "epoch": 2.250997340425532, "grad_norm": 0.4047314524650574, "learning_rate": 1.7802611929928249e-06, "loss": 0.2481, "step": 4514 }, { "epoch": 2.2514960106382977, "grad_norm": 0.441168874502182, "learning_rate": 1.7780415833453969e-06, "loss": 0.355, "step": 4515 }, { "epoch": 2.251994680851064, "grad_norm": 0.44451749324798584, "learning_rate": 1.7758230589824926e-06, "loss": 0.4065, "step": 4516 }, { "epoch": 2.25249335106383, "grad_norm": 0.39015859365463257, "learning_rate": 1.7736056206513985e-06, "loss": 0.2855, "step": 4517 }, { "epoch": 2.252992021276596, "grad_norm": 0.49626028537750244, "learning_rate": 1.7713892690990398e-06, "loss": 0.3886, "step": 4518 }, { "epoch": 2.2534906914893615, "grad_norm": 0.43043938279151917, "learning_rate": 1.7691740050719692e-06, "loss": 0.3176, "step": 4519 }, { "epoch": 2.2539893617021276, "grad_norm": 0.48905760049819946, "learning_rate": 1.7669598293163787e-06, "loss": 0.3501, "step": 4520 }, { "epoch": 2.2544880319148937, "grad_norm": 0.4665457010269165, "learning_rate": 1.764746742578089e-06, "loss": 0.3587, "step": 4521 }, { "epoch": 2.2549867021276597, "grad_norm": 0.42940714955329895, "learning_rate": 1.7625347456025593e-06, "loss": 0.2951, "step": 4522 }, { "epoch": 2.2554853723404253, "grad_norm": 0.4128459095954895, "learning_rate": 1.7603238391348748e-06, "loss": 0.3123, "step": 4523 }, { "epoch": 2.2559840425531914, "grad_norm": 0.42761993408203125, "learning_rate": 1.7581140239197614e-06, "loss": 0.3721, "step": 4524 }, { "epoch": 2.2564827127659575, "grad_norm": 0.37199705839157104, "learning_rate": 1.7559053007015698e-06, "loss": 0.3018, "step": 4525 }, { "epoch": 2.2569813829787235, "grad_norm": 0.4308476448059082, "learning_rate": 1.7536976702242897e-06, "loss": 0.3468, "step": 4526 }, { "epoch": 2.2574800531914896, "grad_norm": 0.3878461420536041, "learning_rate": 1.751491133231537e-06, "loss": 0.3242, "step": 4527 }, { "epoch": 2.257978723404255, "grad_norm": 0.3967863917350769, "learning_rate": 1.7492856904665607e-06, "loss": 0.3044, "step": 4528 }, { "epoch": 2.2584773936170213, "grad_norm": 0.45059776306152344, "learning_rate": 1.7470813426722454e-06, "loss": 0.3645, "step": 4529 }, { "epoch": 2.2589760638297873, "grad_norm": 0.3757149577140808, "learning_rate": 1.7448780905911006e-06, "loss": 0.303, "step": 4530 }, { "epoch": 2.259474734042553, "grad_norm": 0.40555256605148315, "learning_rate": 1.7426759349652722e-06, "loss": 0.3516, "step": 4531 }, { "epoch": 2.259973404255319, "grad_norm": 0.4401825964450836, "learning_rate": 1.740474876536532e-06, "loss": 0.3866, "step": 4532 }, { "epoch": 2.260472074468085, "grad_norm": 0.44132712483406067, "learning_rate": 1.738274916046286e-06, "loss": 0.3429, "step": 4533 }, { "epoch": 2.260970744680851, "grad_norm": 0.39297130703926086, "learning_rate": 1.7360760542355698e-06, "loss": 0.3129, "step": 4534 }, { "epoch": 2.261469414893617, "grad_norm": 0.41182810068130493, "learning_rate": 1.7338782918450453e-06, "loss": 0.3624, "step": 4535 }, { "epoch": 2.261968085106383, "grad_norm": 0.4128105938434601, "learning_rate": 1.7316816296150108e-06, "loss": 0.3416, "step": 4536 }, { "epoch": 2.262466755319149, "grad_norm": 0.47598376870155334, "learning_rate": 1.7294860682853853e-06, "loss": 0.336, "step": 4537 }, { "epoch": 2.262965425531915, "grad_norm": 0.39434874057769775, "learning_rate": 1.7272916085957263e-06, "loss": 0.3601, "step": 4538 }, { "epoch": 2.263464095744681, "grad_norm": 0.3929523825645447, "learning_rate": 1.7250982512852116e-06, "loss": 0.3648, "step": 4539 }, { "epoch": 2.263962765957447, "grad_norm": 0.39410144090652466, "learning_rate": 1.7229059970926553e-06, "loss": 0.3454, "step": 4540 }, { "epoch": 2.2644614361702127, "grad_norm": 0.4431554079055786, "learning_rate": 1.7207148467564933e-06, "loss": 0.3747, "step": 4541 }, { "epoch": 2.2649601063829787, "grad_norm": 0.411900132894516, "learning_rate": 1.7185248010147953e-06, "loss": 0.3584, "step": 4542 }, { "epoch": 2.265458776595745, "grad_norm": 0.4118770658969879, "learning_rate": 1.7163358606052532e-06, "loss": 0.3711, "step": 4543 }, { "epoch": 2.2659574468085104, "grad_norm": 0.43077367544174194, "learning_rate": 1.714148026265194e-06, "loss": 0.3393, "step": 4544 }, { "epoch": 2.2664561170212765, "grad_norm": 0.3869442343711853, "learning_rate": 1.7119612987315632e-06, "loss": 0.3253, "step": 4545 }, { "epoch": 2.2669547872340425, "grad_norm": 0.39843693375587463, "learning_rate": 1.709775678740942e-06, "loss": 0.3182, "step": 4546 }, { "epoch": 2.2674534574468086, "grad_norm": 0.45141667127609253, "learning_rate": 1.707591167029533e-06, "loss": 0.3758, "step": 4547 }, { "epoch": 2.2679521276595747, "grad_norm": 0.41604119539260864, "learning_rate": 1.7054077643331662e-06, "loss": 0.3046, "step": 4548 }, { "epoch": 2.2684507978723403, "grad_norm": 0.46693506836891174, "learning_rate": 1.7032254713873014e-06, "loss": 0.3731, "step": 4549 }, { "epoch": 2.2689494680851063, "grad_norm": 0.4394146502017975, "learning_rate": 1.701044288927019e-06, "loss": 0.3338, "step": 4550 }, { "epoch": 2.2694481382978724, "grad_norm": 0.4473007917404175, "learning_rate": 1.6988642176870319e-06, "loss": 0.3424, "step": 4551 }, { "epoch": 2.2699468085106385, "grad_norm": 0.4471880793571472, "learning_rate": 1.6966852584016724e-06, "loss": 0.3414, "step": 4552 }, { "epoch": 2.270445478723404, "grad_norm": 0.3965458273887634, "learning_rate": 1.6945074118049038e-06, "loss": 0.3169, "step": 4553 }, { "epoch": 2.27094414893617, "grad_norm": 0.4556237757205963, "learning_rate": 1.6923306786303096e-06, "loss": 0.3395, "step": 4554 }, { "epoch": 2.271442819148936, "grad_norm": 0.4206327199935913, "learning_rate": 1.6901550596111032e-06, "loss": 0.3958, "step": 4555 }, { "epoch": 2.2719414893617023, "grad_norm": 0.4047166109085083, "learning_rate": 1.6879805554801172e-06, "loss": 0.358, "step": 4556 }, { "epoch": 2.272440159574468, "grad_norm": 0.3921888470649719, "learning_rate": 1.6858071669698146e-06, "loss": 0.3637, "step": 4557 }, { "epoch": 2.272938829787234, "grad_norm": 0.43393459916114807, "learning_rate": 1.6836348948122777e-06, "loss": 0.3462, "step": 4558 }, { "epoch": 2.2734375, "grad_norm": 0.4746190011501312, "learning_rate": 1.681463739739214e-06, "loss": 0.3156, "step": 4559 }, { "epoch": 2.273936170212766, "grad_norm": 0.42186009883880615, "learning_rate": 1.6792937024819583e-06, "loss": 0.3532, "step": 4560 }, { "epoch": 2.274434840425532, "grad_norm": 0.3946254849433899, "learning_rate": 1.6771247837714622e-06, "loss": 0.3341, "step": 4561 }, { "epoch": 2.2749335106382977, "grad_norm": 0.4131389856338501, "learning_rate": 1.6749569843383084e-06, "loss": 0.3323, "step": 4562 }, { "epoch": 2.275432180851064, "grad_norm": 0.44116392731666565, "learning_rate": 1.6727903049126948e-06, "loss": 0.3473, "step": 4563 }, { "epoch": 2.27593085106383, "grad_norm": 0.42212846875190735, "learning_rate": 1.6706247462244484e-06, "loss": 0.359, "step": 4564 }, { "epoch": 2.276429521276596, "grad_norm": 0.40971753001213074, "learning_rate": 1.6684603090030139e-06, "loss": 0.3481, "step": 4565 }, { "epoch": 2.2769281914893615, "grad_norm": 0.38760989904403687, "learning_rate": 1.6662969939774631e-06, "loss": 0.3244, "step": 4566 }, { "epoch": 2.2774268617021276, "grad_norm": 0.45889946818351746, "learning_rate": 1.6641348018764837e-06, "loss": 0.3961, "step": 4567 }, { "epoch": 2.2779255319148937, "grad_norm": 0.43479663133621216, "learning_rate": 1.661973733428392e-06, "loss": 0.3475, "step": 4568 }, { "epoch": 2.2784242021276597, "grad_norm": 0.40265220403671265, "learning_rate": 1.6598137893611182e-06, "loss": 0.3683, "step": 4569 }, { "epoch": 2.2789228723404253, "grad_norm": 0.3666144907474518, "learning_rate": 1.6576549704022226e-06, "loss": 0.323, "step": 4570 }, { "epoch": 2.2794215425531914, "grad_norm": 0.47905856370925903, "learning_rate": 1.655497277278878e-06, "loss": 0.3126, "step": 4571 }, { "epoch": 2.2799202127659575, "grad_norm": 0.47323715686798096, "learning_rate": 1.6533407107178829e-06, "loss": 0.3649, "step": 4572 }, { "epoch": 2.2804188829787235, "grad_norm": 0.40980592370033264, "learning_rate": 1.6511852714456573e-06, "loss": 0.3129, "step": 4573 }, { "epoch": 2.2809175531914896, "grad_norm": 0.47023630142211914, "learning_rate": 1.649030960188236e-06, "loss": 0.3608, "step": 4574 }, { "epoch": 2.281416223404255, "grad_norm": 0.4014923870563507, "learning_rate": 1.6468777776712802e-06, "loss": 0.3198, "step": 4575 }, { "epoch": 2.2819148936170213, "grad_norm": 0.4098421335220337, "learning_rate": 1.6447257246200654e-06, "loss": 0.315, "step": 4576 }, { "epoch": 2.2824135638297873, "grad_norm": 0.41763752698898315, "learning_rate": 1.6425748017594922e-06, "loss": 0.3186, "step": 4577 }, { "epoch": 2.282912234042553, "grad_norm": 0.48628556728363037, "learning_rate": 1.640425009814074e-06, "loss": 0.3433, "step": 4578 }, { "epoch": 2.283410904255319, "grad_norm": 0.45226049423217773, "learning_rate": 1.6382763495079501e-06, "loss": 0.3732, "step": 4579 }, { "epoch": 2.283909574468085, "grad_norm": 0.41630685329437256, "learning_rate": 1.6361288215648735e-06, "loss": 0.3389, "step": 4580 }, { "epoch": 2.284408244680851, "grad_norm": 0.4679237902164459, "learning_rate": 1.6339824267082167e-06, "loss": 0.332, "step": 4581 }, { "epoch": 2.284906914893617, "grad_norm": 0.4314337372779846, "learning_rate": 1.6318371656609733e-06, "loss": 0.3406, "step": 4582 }, { "epoch": 2.285405585106383, "grad_norm": 0.4020390808582306, "learning_rate": 1.6296930391457505e-06, "loss": 0.3272, "step": 4583 }, { "epoch": 2.285904255319149, "grad_norm": 0.41693976521492004, "learning_rate": 1.6275500478847795e-06, "loss": 0.3692, "step": 4584 }, { "epoch": 2.286402925531915, "grad_norm": 0.41598761081695557, "learning_rate": 1.6254081925999016e-06, "loss": 0.3418, "step": 4585 }, { "epoch": 2.286901595744681, "grad_norm": 0.4006984233856201, "learning_rate": 1.623267474012583e-06, "loss": 0.3438, "step": 4586 }, { "epoch": 2.287400265957447, "grad_norm": 0.4375319182872772, "learning_rate": 1.6211278928439006e-06, "loss": 0.3412, "step": 4587 }, { "epoch": 2.2878989361702127, "grad_norm": 0.4819337725639343, "learning_rate": 1.618989449814553e-06, "loss": 0.364, "step": 4588 }, { "epoch": 2.2883976063829787, "grad_norm": 0.4327966570854187, "learning_rate": 1.6168521456448511e-06, "loss": 0.3447, "step": 4589 }, { "epoch": 2.288896276595745, "grad_norm": 0.41153907775878906, "learning_rate": 1.6147159810547275e-06, "loss": 0.3367, "step": 4590 }, { "epoch": 2.2893949468085104, "grad_norm": 0.4326966106891632, "learning_rate": 1.6125809567637262e-06, "loss": 0.2792, "step": 4591 }, { "epoch": 2.2898936170212765, "grad_norm": 0.4334745705127716, "learning_rate": 1.610447073491007e-06, "loss": 0.3449, "step": 4592 }, { "epoch": 2.2903922872340425, "grad_norm": 0.4989224970340729, "learning_rate": 1.6083143319553512e-06, "loss": 0.313, "step": 4593 }, { "epoch": 2.2908909574468086, "grad_norm": 0.40012863278388977, "learning_rate": 1.6061827328751473e-06, "loss": 0.2932, "step": 4594 }, { "epoch": 2.2913896276595747, "grad_norm": 0.417005717754364, "learning_rate": 1.604052276968407e-06, "loss": 0.3462, "step": 4595 }, { "epoch": 2.2918882978723403, "grad_norm": 0.453519344329834, "learning_rate": 1.6019229649527496e-06, "loss": 0.3638, "step": 4596 }, { "epoch": 2.2923869680851063, "grad_norm": 0.42913320660591125, "learning_rate": 1.599794797545416e-06, "loss": 0.3264, "step": 4597 }, { "epoch": 2.2928856382978724, "grad_norm": 0.4294193387031555, "learning_rate": 1.597667775463254e-06, "loss": 0.3079, "step": 4598 }, { "epoch": 2.2933843085106385, "grad_norm": 0.4281362295150757, "learning_rate": 1.5955418994227345e-06, "loss": 0.4125, "step": 4599 }, { "epoch": 2.293882978723404, "grad_norm": 0.3873682916164398, "learning_rate": 1.5934171701399337e-06, "loss": 0.3149, "step": 4600 }, { "epoch": 2.29438164893617, "grad_norm": 0.42091402411460876, "learning_rate": 1.5912935883305486e-06, "loss": 0.2956, "step": 4601 }, { "epoch": 2.294880319148936, "grad_norm": 0.4521392583847046, "learning_rate": 1.589171154709885e-06, "loss": 0.3469, "step": 4602 }, { "epoch": 2.2953789893617023, "grad_norm": 0.41527920961380005, "learning_rate": 1.5870498699928615e-06, "loss": 0.3558, "step": 4603 }, { "epoch": 2.295877659574468, "grad_norm": 0.42319613695144653, "learning_rate": 1.5849297348940157e-06, "loss": 0.357, "step": 4604 }, { "epoch": 2.296376329787234, "grad_norm": 0.39259815216064453, "learning_rate": 1.5828107501274898e-06, "loss": 0.3256, "step": 4605 }, { "epoch": 2.296875, "grad_norm": 0.44079136848449707, "learning_rate": 1.580692916407045e-06, "loss": 0.3377, "step": 4606 }, { "epoch": 2.297373670212766, "grad_norm": 0.38100171089172363, "learning_rate": 1.5785762344460537e-06, "loss": 0.2734, "step": 4607 }, { "epoch": 2.297872340425532, "grad_norm": 0.42140135169029236, "learning_rate": 1.5764607049574958e-06, "loss": 0.4499, "step": 4608 }, { "epoch": 2.2983710106382977, "grad_norm": 0.368158757686615, "learning_rate": 1.57434632865397e-06, "loss": 0.3554, "step": 4609 }, { "epoch": 2.298869680851064, "grad_norm": 0.4275476634502411, "learning_rate": 1.5722331062476788e-06, "loss": 0.3639, "step": 4610 }, { "epoch": 2.29936835106383, "grad_norm": 0.3994699716567993, "learning_rate": 1.570121038450444e-06, "loss": 0.3403, "step": 4611 }, { "epoch": 2.299867021276596, "grad_norm": 0.4100092649459839, "learning_rate": 1.5680101259736913e-06, "loss": 0.3299, "step": 4612 }, { "epoch": 2.3003656914893615, "grad_norm": 0.4478920102119446, "learning_rate": 1.565900369528463e-06, "loss": 0.3145, "step": 4613 }, { "epoch": 2.3008643617021276, "grad_norm": 0.42498868703842163, "learning_rate": 1.5637917698254074e-06, "loss": 0.3786, "step": 4614 }, { "epoch": 2.3013630319148937, "grad_norm": 0.431672602891922, "learning_rate": 1.5616843275747873e-06, "loss": 0.3524, "step": 4615 }, { "epoch": 2.3018617021276597, "grad_norm": 0.45713838934898376, "learning_rate": 1.559578043486471e-06, "loss": 0.3463, "step": 4616 }, { "epoch": 2.3023603723404253, "grad_norm": 0.4335775673389435, "learning_rate": 1.5574729182699428e-06, "loss": 0.4109, "step": 4617 }, { "epoch": 2.3028590425531914, "grad_norm": 0.3722214102745056, "learning_rate": 1.5553689526342892e-06, "loss": 0.2854, "step": 4618 }, { "epoch": 2.3033577127659575, "grad_norm": 0.4585992693901062, "learning_rate": 1.5532661472882133e-06, "loss": 0.3907, "step": 4619 }, { "epoch": 2.3038563829787235, "grad_norm": 0.4290821850299835, "learning_rate": 1.5511645029400213e-06, "loss": 0.3028, "step": 4620 }, { "epoch": 2.3043550531914896, "grad_norm": 0.4237549602985382, "learning_rate": 1.5490640202976343e-06, "loss": 0.3638, "step": 4621 }, { "epoch": 2.304853723404255, "grad_norm": 0.4167039692401886, "learning_rate": 1.5469647000685773e-06, "loss": 0.3702, "step": 4622 }, { "epoch": 2.3053523936170213, "grad_norm": 0.4119398593902588, "learning_rate": 1.5448665429599829e-06, "loss": 0.3192, "step": 4623 }, { "epoch": 2.3058510638297873, "grad_norm": 0.4088422656059265, "learning_rate": 1.5427695496785988e-06, "loss": 0.3218, "step": 4624 }, { "epoch": 2.306349734042553, "grad_norm": 0.42592647671699524, "learning_rate": 1.540673720930772e-06, "loss": 0.3679, "step": 4625 }, { "epoch": 2.306848404255319, "grad_norm": 0.4179607927799225, "learning_rate": 1.5385790574224656e-06, "loss": 0.3352, "step": 4626 }, { "epoch": 2.307347074468085, "grad_norm": 0.4046172499656677, "learning_rate": 1.5364855598592426e-06, "loss": 0.324, "step": 4627 }, { "epoch": 2.307845744680851, "grad_norm": 0.3840457797050476, "learning_rate": 1.5343932289462799e-06, "loss": 0.3241, "step": 4628 }, { "epoch": 2.308344414893617, "grad_norm": 0.43520528078079224, "learning_rate": 1.5323020653883548e-06, "loss": 0.3357, "step": 4629 }, { "epoch": 2.308843085106383, "grad_norm": 0.44575321674346924, "learning_rate": 1.5302120698898581e-06, "loss": 0.3115, "step": 4630 }, { "epoch": 2.309341755319149, "grad_norm": 0.4227120578289032, "learning_rate": 1.528123243154781e-06, "loss": 0.3262, "step": 4631 }, { "epoch": 2.309840425531915, "grad_norm": 0.44955939054489136, "learning_rate": 1.5260355858867271e-06, "loss": 0.3123, "step": 4632 }, { "epoch": 2.310339095744681, "grad_norm": 0.4562029540538788, "learning_rate": 1.523949098788901e-06, "loss": 0.3819, "step": 4633 }, { "epoch": 2.310837765957447, "grad_norm": 0.3904390037059784, "learning_rate": 1.521863782564114e-06, "loss": 0.3181, "step": 4634 }, { "epoch": 2.3113364361702127, "grad_norm": 0.4293251931667328, "learning_rate": 1.5197796379147873e-06, "loss": 0.3637, "step": 4635 }, { "epoch": 2.3118351063829787, "grad_norm": 0.3812035620212555, "learning_rate": 1.5176966655429398e-06, "loss": 0.322, "step": 4636 }, { "epoch": 2.312333776595745, "grad_norm": 0.3842618763446808, "learning_rate": 1.5156148661502046e-06, "loss": 0.3351, "step": 4637 }, { "epoch": 2.3128324468085104, "grad_norm": 0.422040194272995, "learning_rate": 1.5135342404378111e-06, "loss": 0.3053, "step": 4638 }, { "epoch": 2.3133311170212765, "grad_norm": 0.43934711813926697, "learning_rate": 1.5114547891066006e-06, "loss": 0.3016, "step": 4639 }, { "epoch": 2.3138297872340425, "grad_norm": 0.43509674072265625, "learning_rate": 1.509376512857012e-06, "loss": 0.3732, "step": 4640 }, { "epoch": 2.3143284574468086, "grad_norm": 0.4379585385322571, "learning_rate": 1.507299412389096e-06, "loss": 0.3324, "step": 4641 }, { "epoch": 2.3148271276595747, "grad_norm": 0.4737410545349121, "learning_rate": 1.5052234884024991e-06, "loss": 0.3485, "step": 4642 }, { "epoch": 2.3153257978723403, "grad_norm": 0.40729641914367676, "learning_rate": 1.503148741596479e-06, "loss": 0.3363, "step": 4643 }, { "epoch": 2.3158244680851063, "grad_norm": 0.4585707485675812, "learning_rate": 1.5010751726698909e-06, "loss": 0.4127, "step": 4644 }, { "epoch": 2.3163231382978724, "grad_norm": 0.44733282923698425, "learning_rate": 1.499002782321196e-06, "loss": 0.3158, "step": 4645 }, { "epoch": 2.3168218085106385, "grad_norm": 0.4286123514175415, "learning_rate": 1.4969315712484617e-06, "loss": 0.3161, "step": 4646 }, { "epoch": 2.317320478723404, "grad_norm": 0.42869019508361816, "learning_rate": 1.49486154014935e-06, "loss": 0.3324, "step": 4647 }, { "epoch": 2.31781914893617, "grad_norm": 0.4148261845111847, "learning_rate": 1.4927926897211338e-06, "loss": 0.3129, "step": 4648 }, { "epoch": 2.318317819148936, "grad_norm": 0.41832852363586426, "learning_rate": 1.4907250206606822e-06, "loss": 0.3294, "step": 4649 }, { "epoch": 2.3188164893617023, "grad_norm": 0.46199876070022583, "learning_rate": 1.488658533664471e-06, "loss": 0.4014, "step": 4650 }, { "epoch": 2.319315159574468, "grad_norm": 0.44448164105415344, "learning_rate": 1.4865932294285735e-06, "loss": 0.2968, "step": 4651 }, { "epoch": 2.319813829787234, "grad_norm": 0.44287675619125366, "learning_rate": 1.4845291086486695e-06, "loss": 0.2938, "step": 4652 }, { "epoch": 2.3203125, "grad_norm": 0.38062816858291626, "learning_rate": 1.4824661720200345e-06, "loss": 0.3209, "step": 4653 }, { "epoch": 2.320811170212766, "grad_norm": 0.4515875577926636, "learning_rate": 1.4804044202375512e-06, "loss": 0.3986, "step": 4654 }, { "epoch": 2.321309840425532, "grad_norm": 0.38894060254096985, "learning_rate": 1.4783438539956979e-06, "loss": 0.2917, "step": 4655 }, { "epoch": 2.3218085106382977, "grad_norm": 0.451008677482605, "learning_rate": 1.4762844739885557e-06, "loss": 0.361, "step": 4656 }, { "epoch": 2.322307180851064, "grad_norm": 0.46281078457832336, "learning_rate": 1.4742262809098079e-06, "loss": 0.3237, "step": 4657 }, { "epoch": 2.32280585106383, "grad_norm": 0.4450860619544983, "learning_rate": 1.4721692754527333e-06, "loss": 0.3491, "step": 4658 }, { "epoch": 2.323304521276596, "grad_norm": 0.4395565390586853, "learning_rate": 1.4701134583102178e-06, "loss": 0.3474, "step": 4659 }, { "epoch": 2.3238031914893615, "grad_norm": 0.400605708360672, "learning_rate": 1.468058830174739e-06, "loss": 0.3004, "step": 4660 }, { "epoch": 2.3243018617021276, "grad_norm": 0.4630294740200043, "learning_rate": 1.466005391738381e-06, "loss": 0.3784, "step": 4661 }, { "epoch": 2.3248005319148937, "grad_norm": 0.3955363631248474, "learning_rate": 1.4639531436928211e-06, "loss": 0.3084, "step": 4662 }, { "epoch": 2.3252992021276597, "grad_norm": 0.39330407977104187, "learning_rate": 1.4619020867293416e-06, "loss": 0.33, "step": 4663 }, { "epoch": 2.3257978723404253, "grad_norm": 0.4357622265815735, "learning_rate": 1.4598522215388171e-06, "loss": 0.3356, "step": 4664 }, { "epoch": 2.3262965425531914, "grad_norm": 0.4609587490558624, "learning_rate": 1.4578035488117282e-06, "loss": 0.3382, "step": 4665 }, { "epoch": 2.3267952127659575, "grad_norm": 0.4538816213607788, "learning_rate": 1.4557560692381479e-06, "loss": 0.3032, "step": 4666 }, { "epoch": 2.3272938829787235, "grad_norm": 0.37600767612457275, "learning_rate": 1.4537097835077469e-06, "loss": 0.345, "step": 4667 }, { "epoch": 2.3277925531914896, "grad_norm": 0.4562889635562897, "learning_rate": 1.4516646923098004e-06, "loss": 0.3628, "step": 4668 }, { "epoch": 2.328291223404255, "grad_norm": 0.404079407453537, "learning_rate": 1.4496207963331731e-06, "loss": 0.3031, "step": 4669 }, { "epoch": 2.3287898936170213, "grad_norm": 0.4569692611694336, "learning_rate": 1.4475780962663343e-06, "loss": 0.3614, "step": 4670 }, { "epoch": 2.3292885638297873, "grad_norm": 0.4719158113002777, "learning_rate": 1.4455365927973442e-06, "loss": 0.338, "step": 4671 }, { "epoch": 2.329787234042553, "grad_norm": 0.4524783194065094, "learning_rate": 1.4434962866138658e-06, "loss": 0.3309, "step": 4672 }, { "epoch": 2.330285904255319, "grad_norm": 0.413446843624115, "learning_rate": 1.4414571784031527e-06, "loss": 0.3761, "step": 4673 }, { "epoch": 2.330784574468085, "grad_norm": 0.41072317957878113, "learning_rate": 1.4394192688520613e-06, "loss": 0.2966, "step": 4674 }, { "epoch": 2.331283244680851, "grad_norm": 0.4029808044433594, "learning_rate": 1.437382558647038e-06, "loss": 0.341, "step": 4675 }, { "epoch": 2.331781914893617, "grad_norm": 0.40059712529182434, "learning_rate": 1.4353470484741312e-06, "loss": 0.3034, "step": 4676 }, { "epoch": 2.332280585106383, "grad_norm": 0.45940101146698, "learning_rate": 1.4333127390189793e-06, "loss": 0.3826, "step": 4677 }, { "epoch": 2.332779255319149, "grad_norm": 0.42621368169784546, "learning_rate": 1.4312796309668226e-06, "loss": 0.3057, "step": 4678 }, { "epoch": 2.333277925531915, "grad_norm": 0.4453943967819214, "learning_rate": 1.4292477250024888e-06, "loss": 0.2979, "step": 4679 }, { "epoch": 2.333776595744681, "grad_norm": 0.47744259238243103, "learning_rate": 1.4272170218104103e-06, "loss": 0.374, "step": 4680 }, { "epoch": 2.334275265957447, "grad_norm": 0.40184175968170166, "learning_rate": 1.425187522074604e-06, "loss": 0.3014, "step": 4681 }, { "epoch": 2.3347739361702127, "grad_norm": 0.4302511215209961, "learning_rate": 1.4231592264786914e-06, "loss": 0.332, "step": 4682 }, { "epoch": 2.3352726063829787, "grad_norm": 0.45629122853279114, "learning_rate": 1.4211321357058794e-06, "loss": 0.3672, "step": 4683 }, { "epoch": 2.335771276595745, "grad_norm": 0.3796422779560089, "learning_rate": 1.4191062504389758e-06, "loss": 0.2664, "step": 4684 }, { "epoch": 2.3362699468085104, "grad_norm": 0.40381696820259094, "learning_rate": 1.4170815713603802e-06, "loss": 0.3127, "step": 4685 }, { "epoch": 2.3367686170212765, "grad_norm": 0.4527059495449066, "learning_rate": 1.4150580991520851e-06, "loss": 0.3615, "step": 4686 }, { "epoch": 2.3372672872340425, "grad_norm": 0.51701819896698, "learning_rate": 1.4130358344956746e-06, "loss": 0.3079, "step": 4687 }, { "epoch": 2.3377659574468086, "grad_norm": 0.3674989342689514, "learning_rate": 1.4110147780723315e-06, "loss": 0.3306, "step": 4688 }, { "epoch": 2.3382646276595747, "grad_norm": 0.453670471906662, "learning_rate": 1.4089949305628259e-06, "loss": 0.3845, "step": 4689 }, { "epoch": 2.3387632978723403, "grad_norm": 0.421041339635849, "learning_rate": 1.4069762926475262e-06, "loss": 0.3577, "step": 4690 }, { "epoch": 2.3392619680851063, "grad_norm": 0.4198717772960663, "learning_rate": 1.4049588650063867e-06, "loss": 0.3436, "step": 4691 }, { "epoch": 2.3397606382978724, "grad_norm": 0.43094608187675476, "learning_rate": 1.4029426483189618e-06, "loss": 0.3153, "step": 4692 }, { "epoch": 2.3402593085106385, "grad_norm": 0.45516541600227356, "learning_rate": 1.4009276432643903e-06, "loss": 0.3427, "step": 4693 }, { "epoch": 2.340757978723404, "grad_norm": 0.367805153131485, "learning_rate": 1.3989138505214095e-06, "loss": 0.2847, "step": 4694 }, { "epoch": 2.34125664893617, "grad_norm": 0.4271205961704254, "learning_rate": 1.3969012707683432e-06, "loss": 0.3946, "step": 4695 }, { "epoch": 2.341755319148936, "grad_norm": 0.42437493801116943, "learning_rate": 1.3948899046831105e-06, "loss": 0.3225, "step": 4696 }, { "epoch": 2.3422539893617023, "grad_norm": 0.472021222114563, "learning_rate": 1.3928797529432197e-06, "loss": 0.3942, "step": 4697 }, { "epoch": 2.342752659574468, "grad_norm": 0.39978426694869995, "learning_rate": 1.3908708162257679e-06, "loss": 0.3466, "step": 4698 }, { "epoch": 2.343251329787234, "grad_norm": 0.40677040815353394, "learning_rate": 1.388863095207449e-06, "loss": 0.313, "step": 4699 }, { "epoch": 2.34375, "grad_norm": 0.43540653586387634, "learning_rate": 1.3868565905645403e-06, "loss": 0.3426, "step": 4700 }, { "epoch": 2.344248670212766, "grad_norm": 0.4414442479610443, "learning_rate": 1.3848513029729167e-06, "loss": 0.3268, "step": 4701 }, { "epoch": 2.344747340425532, "grad_norm": 0.4415828585624695, "learning_rate": 1.382847233108035e-06, "loss": 0.3096, "step": 4702 }, { "epoch": 2.3452460106382977, "grad_norm": 0.4085775315761566, "learning_rate": 1.38084438164495e-06, "loss": 0.3666, "step": 4703 }, { "epoch": 2.345744680851064, "grad_norm": 0.3684706687927246, "learning_rate": 1.3788427492582995e-06, "loss": 0.2772, "step": 4704 }, { "epoch": 2.34624335106383, "grad_norm": 0.422232061624527, "learning_rate": 1.3768423366223155e-06, "loss": 0.3888, "step": 4705 }, { "epoch": 2.346742021276596, "grad_norm": 0.4126427471637726, "learning_rate": 1.3748431444108145e-06, "loss": 0.3535, "step": 4706 }, { "epoch": 2.3472406914893615, "grad_norm": 0.4517859220504761, "learning_rate": 1.3728451732972076e-06, "loss": 0.3743, "step": 4707 }, { "epoch": 2.3477393617021276, "grad_norm": 0.4034726619720459, "learning_rate": 1.37084842395449e-06, "loss": 0.3201, "step": 4708 }, { "epoch": 2.3482380319148937, "grad_norm": 0.4301390051841736, "learning_rate": 1.3688528970552444e-06, "loss": 0.3559, "step": 4709 }, { "epoch": 2.3487367021276597, "grad_norm": 0.4154106378555298, "learning_rate": 1.3668585932716484e-06, "loss": 0.3353, "step": 4710 }, { "epoch": 2.3492353723404253, "grad_norm": 0.480331689119339, "learning_rate": 1.364865513275459e-06, "loss": 0.3815, "step": 4711 }, { "epoch": 2.3497340425531914, "grad_norm": 0.4095054268836975, "learning_rate": 1.362873657738029e-06, "loss": 0.2949, "step": 4712 }, { "epoch": 2.3502327127659575, "grad_norm": 0.4290769696235657, "learning_rate": 1.3608830273302926e-06, "loss": 0.3573, "step": 4713 }, { "epoch": 2.3507313829787235, "grad_norm": 0.4484621286392212, "learning_rate": 1.3588936227227755e-06, "loss": 0.3718, "step": 4714 }, { "epoch": 2.3512300531914896, "grad_norm": 0.4139614701271057, "learning_rate": 1.3569054445855866e-06, "loss": 0.3379, "step": 4715 }, { "epoch": 2.351728723404255, "grad_norm": 0.45951610803604126, "learning_rate": 1.3549184935884257e-06, "loss": 0.3483, "step": 4716 }, { "epoch": 2.3522273936170213, "grad_norm": 0.3798485994338989, "learning_rate": 1.3529327704005778e-06, "loss": 0.2776, "step": 4717 }, { "epoch": 2.3527260638297873, "grad_norm": 0.3966914117336273, "learning_rate": 1.3509482756909114e-06, "loss": 0.3401, "step": 4718 }, { "epoch": 2.353224734042553, "grad_norm": 0.3844318687915802, "learning_rate": 1.3489650101278872e-06, "loss": 0.3257, "step": 4719 }, { "epoch": 2.353723404255319, "grad_norm": 0.43323206901550293, "learning_rate": 1.3469829743795453e-06, "loss": 0.3507, "step": 4720 }, { "epoch": 2.354222074468085, "grad_norm": 0.39698687195777893, "learning_rate": 1.3450021691135174e-06, "loss": 0.3549, "step": 4721 }, { "epoch": 2.354720744680851, "grad_norm": 0.4253648817539215, "learning_rate": 1.3430225949970144e-06, "loss": 0.3564, "step": 4722 }, { "epoch": 2.355219414893617, "grad_norm": 0.45131945610046387, "learning_rate": 1.3410442526968404e-06, "loss": 0.3486, "step": 4723 }, { "epoch": 2.355718085106383, "grad_norm": 0.38208720088005066, "learning_rate": 1.339067142879376e-06, "loss": 0.3053, "step": 4724 }, { "epoch": 2.356216755319149, "grad_norm": 0.3957991302013397, "learning_rate": 1.3370912662105946e-06, "loss": 0.3369, "step": 4725 }, { "epoch": 2.356715425531915, "grad_norm": 0.44708067178726196, "learning_rate": 1.3351166233560475e-06, "loss": 0.3443, "step": 4726 }, { "epoch": 2.357214095744681, "grad_norm": 0.4001621901988983, "learning_rate": 1.3331432149808758e-06, "loss": 0.3591, "step": 4727 }, { "epoch": 2.357712765957447, "grad_norm": 0.42992863059043884, "learning_rate": 1.331171041749801e-06, "loss": 0.3273, "step": 4728 }, { "epoch": 2.3582114361702127, "grad_norm": 0.4048927426338196, "learning_rate": 1.3292001043271308e-06, "loss": 0.366, "step": 4729 }, { "epoch": 2.3587101063829787, "grad_norm": 0.4208419919013977, "learning_rate": 1.3272304033767558e-06, "loss": 0.3804, "step": 4730 }, { "epoch": 2.359208776595745, "grad_norm": 0.4637756645679474, "learning_rate": 1.325261939562148e-06, "loss": 0.3273, "step": 4731 }, { "epoch": 2.3597074468085104, "grad_norm": 0.4091758131980896, "learning_rate": 1.3232947135463675e-06, "loss": 0.3609, "step": 4732 }, { "epoch": 2.3602061170212765, "grad_norm": 0.42846816778182983, "learning_rate": 1.3213287259920527e-06, "loss": 0.3866, "step": 4733 }, { "epoch": 2.3607047872340425, "grad_norm": 0.37015533447265625, "learning_rate": 1.3193639775614292e-06, "loss": 0.3373, "step": 4734 }, { "epoch": 2.3612034574468086, "grad_norm": 0.43797677755355835, "learning_rate": 1.3174004689163e-06, "loss": 0.3369, "step": 4735 }, { "epoch": 2.3617021276595747, "grad_norm": 0.41366657614707947, "learning_rate": 1.3154382007180565e-06, "loss": 0.2991, "step": 4736 }, { "epoch": 2.3622007978723403, "grad_norm": 0.4450483024120331, "learning_rate": 1.3134771736276663e-06, "loss": 0.3327, "step": 4737 }, { "epoch": 2.3626994680851063, "grad_norm": 0.4188437759876251, "learning_rate": 1.3115173883056843e-06, "loss": 0.3564, "step": 4738 }, { "epoch": 2.3631981382978724, "grad_norm": 0.4274151921272278, "learning_rate": 1.309558845412242e-06, "loss": 0.3291, "step": 4739 }, { "epoch": 2.3636968085106385, "grad_norm": 0.40464136004447937, "learning_rate": 1.307601545607058e-06, "loss": 0.3521, "step": 4740 }, { "epoch": 2.364195478723404, "grad_norm": 0.4821837246417999, "learning_rate": 1.3056454895494275e-06, "loss": 0.3233, "step": 4741 }, { "epoch": 2.36469414893617, "grad_norm": 0.4260486364364624, "learning_rate": 1.3036906778982273e-06, "loss": 0.3555, "step": 4742 }, { "epoch": 2.365192819148936, "grad_norm": 0.3811884820461273, "learning_rate": 1.3017371113119186e-06, "loss": 0.3314, "step": 4743 }, { "epoch": 2.3656914893617023, "grad_norm": 0.4055250287055969, "learning_rate": 1.2997847904485384e-06, "loss": 0.3815, "step": 4744 }, { "epoch": 2.366190159574468, "grad_norm": 0.3988044857978821, "learning_rate": 1.297833715965709e-06, "loss": 0.3694, "step": 4745 }, { "epoch": 2.366688829787234, "grad_norm": 0.5471863746643066, "learning_rate": 1.2958838885206277e-06, "loss": 0.3362, "step": 4746 }, { "epoch": 2.3671875, "grad_norm": 0.41119584441185, "learning_rate": 1.2939353087700774e-06, "loss": 0.3089, "step": 4747 }, { "epoch": 2.367686170212766, "grad_norm": 0.41801488399505615, "learning_rate": 1.291987977370414e-06, "loss": 0.3359, "step": 4748 }, { "epoch": 2.368184840425532, "grad_norm": 0.4040954113006592, "learning_rate": 1.2900418949775811e-06, "loss": 0.3335, "step": 4749 }, { "epoch": 2.3686835106382977, "grad_norm": 0.408543199300766, "learning_rate": 1.2880970622470928e-06, "loss": 0.3985, "step": 4750 }, { "epoch": 2.369182180851064, "grad_norm": 0.49340519309043884, "learning_rate": 1.2861534798340509e-06, "loss": 0.3947, "step": 4751 }, { "epoch": 2.36968085106383, "grad_norm": 0.41685950756073, "learning_rate": 1.284211148393127e-06, "loss": 0.3099, "step": 4752 }, { "epoch": 2.370179521276596, "grad_norm": 0.40875375270843506, "learning_rate": 1.2822700685785805e-06, "loss": 0.3094, "step": 4753 }, { "epoch": 2.3706781914893615, "grad_norm": 0.49725663661956787, "learning_rate": 1.280330241044241e-06, "loss": 0.3811, "step": 4754 }, { "epoch": 2.3711768617021276, "grad_norm": 0.538062572479248, "learning_rate": 1.278391666443522e-06, "loss": 0.4018, "step": 4755 }, { "epoch": 2.3716755319148937, "grad_norm": 0.4506058096885681, "learning_rate": 1.2764543454294138e-06, "loss": 0.321, "step": 4756 }, { "epoch": 2.3721742021276597, "grad_norm": 0.37927234172821045, "learning_rate": 1.2745182786544812e-06, "loss": 0.2868, "step": 4757 }, { "epoch": 2.3726728723404253, "grad_norm": 0.4363057315349579, "learning_rate": 1.2725834667708725e-06, "loss": 0.3797, "step": 4758 }, { "epoch": 2.3731715425531914, "grad_norm": 0.45230403542518616, "learning_rate": 1.2706499104303056e-06, "loss": 0.3616, "step": 4759 }, { "epoch": 2.3736702127659575, "grad_norm": 0.4068848192691803, "learning_rate": 1.2687176102840832e-06, "loss": 0.394, "step": 4760 }, { "epoch": 2.3741688829787235, "grad_norm": 0.37688034772872925, "learning_rate": 1.2667865669830798e-06, "loss": 0.295, "step": 4761 }, { "epoch": 2.3746675531914896, "grad_norm": 0.41719868779182434, "learning_rate": 1.2648567811777469e-06, "loss": 0.3421, "step": 4762 }, { "epoch": 2.375166223404255, "grad_norm": 0.397979736328125, "learning_rate": 1.262928253518116e-06, "loss": 0.319, "step": 4763 }, { "epoch": 2.3756648936170213, "grad_norm": 0.4134211540222168, "learning_rate": 1.2610009846537896e-06, "loss": 0.2739, "step": 4764 }, { "epoch": 2.3761635638297873, "grad_norm": 0.47447946667671204, "learning_rate": 1.2590749752339515e-06, "loss": 0.4074, "step": 4765 }, { "epoch": 2.376662234042553, "grad_norm": 0.3852970600128174, "learning_rate": 1.2571502259073571e-06, "loss": 0.2742, "step": 4766 }, { "epoch": 2.377160904255319, "grad_norm": 0.3798247277736664, "learning_rate": 1.255226737322341e-06, "loss": 0.3376, "step": 4767 }, { "epoch": 2.377659574468085, "grad_norm": 0.4171146750450134, "learning_rate": 1.2533045101268076e-06, "loss": 0.3575, "step": 4768 }, { "epoch": 2.378158244680851, "grad_norm": 0.43339475989341736, "learning_rate": 1.2513835449682442e-06, "loss": 0.3417, "step": 4769 }, { "epoch": 2.378656914893617, "grad_norm": 0.4141106903553009, "learning_rate": 1.249463842493705e-06, "loss": 0.2888, "step": 4770 }, { "epoch": 2.379155585106383, "grad_norm": 0.4323447048664093, "learning_rate": 1.2475454033498258e-06, "loss": 0.3588, "step": 4771 }, { "epoch": 2.379654255319149, "grad_norm": 0.43670278787612915, "learning_rate": 1.2456282281828118e-06, "loss": 0.3834, "step": 4772 }, { "epoch": 2.380152925531915, "grad_norm": 0.403980553150177, "learning_rate": 1.243712317638443e-06, "loss": 0.3155, "step": 4773 }, { "epoch": 2.380651595744681, "grad_norm": 0.4308607578277588, "learning_rate": 1.2417976723620784e-06, "loss": 0.3155, "step": 4774 }, { "epoch": 2.381150265957447, "grad_norm": 0.4613769054412842, "learning_rate": 1.2398842929986431e-06, "loss": 0.3828, "step": 4775 }, { "epoch": 2.3816489361702127, "grad_norm": 0.42183536291122437, "learning_rate": 1.2379721801926442e-06, "loss": 0.3362, "step": 4776 }, { "epoch": 2.3821476063829787, "grad_norm": 0.4082029461860657, "learning_rate": 1.2360613345881535e-06, "loss": 0.2769, "step": 4777 }, { "epoch": 2.382646276595745, "grad_norm": 0.42014381289482117, "learning_rate": 1.2341517568288235e-06, "loss": 0.3633, "step": 4778 }, { "epoch": 2.3831449468085104, "grad_norm": 0.39959490299224854, "learning_rate": 1.2322434475578737e-06, "loss": 0.3289, "step": 4779 }, { "epoch": 2.3836436170212765, "grad_norm": 0.4382026195526123, "learning_rate": 1.2303364074181017e-06, "loss": 0.3482, "step": 4780 }, { "epoch": 2.3841422872340425, "grad_norm": 0.44110700488090515, "learning_rate": 1.2284306370518728e-06, "loss": 0.3852, "step": 4781 }, { "epoch": 2.3846409574468086, "grad_norm": 0.3800439238548279, "learning_rate": 1.2265261371011284e-06, "loss": 0.2892, "step": 4782 }, { "epoch": 2.3851396276595747, "grad_norm": 0.4074316918849945, "learning_rate": 1.2246229082073801e-06, "loss": 0.3237, "step": 4783 }, { "epoch": 2.3856382978723403, "grad_norm": 0.411122590303421, "learning_rate": 1.2227209510117094e-06, "loss": 0.3555, "step": 4784 }, { "epoch": 2.3861369680851063, "grad_norm": 0.3830230236053467, "learning_rate": 1.2208202661547742e-06, "loss": 0.3257, "step": 4785 }, { "epoch": 2.3866356382978724, "grad_norm": 0.4293520450592041, "learning_rate": 1.2189208542767989e-06, "loss": 0.3973, "step": 4786 }, { "epoch": 2.3871343085106385, "grad_norm": 0.4314119219779968, "learning_rate": 1.217022716017584e-06, "loss": 0.3594, "step": 4787 }, { "epoch": 2.387632978723404, "grad_norm": 0.37784793972969055, "learning_rate": 1.215125852016496e-06, "loss": 0.2876, "step": 4788 }, { "epoch": 2.38813164893617, "grad_norm": 0.42106083035469055, "learning_rate": 1.2132302629124753e-06, "loss": 0.4161, "step": 4789 }, { "epoch": 2.388630319148936, "grad_norm": 0.35097038745880127, "learning_rate": 1.211335949344034e-06, "loss": 0.3146, "step": 4790 }, { "epoch": 2.3891289893617023, "grad_norm": 0.4177091717720032, "learning_rate": 1.20944291194925e-06, "loss": 0.3252, "step": 4791 }, { "epoch": 2.389627659574468, "grad_norm": 0.4260154962539673, "learning_rate": 1.2075511513657768e-06, "loss": 0.373, "step": 4792 }, { "epoch": 2.390126329787234, "grad_norm": 0.362454891204834, "learning_rate": 1.2056606682308326e-06, "loss": 0.3291, "step": 4793 }, { "epoch": 2.390625, "grad_norm": 0.4262488782405853, "learning_rate": 1.203771463181209e-06, "loss": 0.3169, "step": 4794 }, { "epoch": 2.391123670212766, "grad_norm": 0.41771307587623596, "learning_rate": 1.201883536853265e-06, "loss": 0.3326, "step": 4795 }, { "epoch": 2.391622340425532, "grad_norm": 0.40677106380462646, "learning_rate": 1.1999968898829317e-06, "loss": 0.3758, "step": 4796 }, { "epoch": 2.3921210106382977, "grad_norm": 0.43436938524246216, "learning_rate": 1.1981115229057045e-06, "loss": 0.3339, "step": 4797 }, { "epoch": 2.392619680851064, "grad_norm": 0.3815542459487915, "learning_rate": 1.1962274365566534e-06, "loss": 0.3301, "step": 4798 }, { "epoch": 2.39311835106383, "grad_norm": 0.41153374314308167, "learning_rate": 1.194344631470411e-06, "loss": 0.3523, "step": 4799 }, { "epoch": 2.393617021276596, "grad_norm": 0.4271272122859955, "learning_rate": 1.1924631082811845e-06, "loss": 0.3174, "step": 4800 }, { "epoch": 2.3941156914893615, "grad_norm": 0.4356432855129242, "learning_rate": 1.1905828676227432e-06, "loss": 0.3599, "step": 4801 }, { "epoch": 2.3946143617021276, "grad_norm": 0.4468269944190979, "learning_rate": 1.1887039101284298e-06, "loss": 0.3498, "step": 4802 }, { "epoch": 2.3951130319148937, "grad_norm": 0.443447470664978, "learning_rate": 1.1868262364311507e-06, "loss": 0.3945, "step": 4803 }, { "epoch": 2.3956117021276597, "grad_norm": 0.3742583096027374, "learning_rate": 1.184949847163383e-06, "loss": 0.3094, "step": 4804 }, { "epoch": 2.3961103723404253, "grad_norm": 0.4288971722126007, "learning_rate": 1.1830747429571693e-06, "loss": 0.3884, "step": 4805 }, { "epoch": 2.3966090425531914, "grad_norm": 0.37040722370147705, "learning_rate": 1.181200924444118e-06, "loss": 0.2802, "step": 4806 }, { "epoch": 2.3971077127659575, "grad_norm": 0.4381345510482788, "learning_rate": 1.1793283922554088e-06, "loss": 0.2963, "step": 4807 }, { "epoch": 2.3976063829787235, "grad_norm": 0.4955851435661316, "learning_rate": 1.1774571470217832e-06, "loss": 0.3694, "step": 4808 }, { "epoch": 2.3981050531914896, "grad_norm": 0.4304719865322113, "learning_rate": 1.1755871893735538e-06, "loss": 0.3717, "step": 4809 }, { "epoch": 2.398603723404255, "grad_norm": 0.4188970923423767, "learning_rate": 1.1737185199405943e-06, "loss": 0.3355, "step": 4810 }, { "epoch": 2.3991023936170213, "grad_norm": 0.41271746158599854, "learning_rate": 1.17185113935235e-06, "loss": 0.3596, "step": 4811 }, { "epoch": 2.3996010638297873, "grad_norm": 0.3815654218196869, "learning_rate": 1.1699850482378272e-06, "loss": 0.3081, "step": 4812 }, { "epoch": 2.400099734042553, "grad_norm": 0.39367467164993286, "learning_rate": 1.1681202472256025e-06, "loss": 0.3313, "step": 4813 }, { "epoch": 2.400598404255319, "grad_norm": 0.41685622930526733, "learning_rate": 1.1662567369438127e-06, "loss": 0.3513, "step": 4814 }, { "epoch": 2.401097074468085, "grad_norm": 0.38045042753219604, "learning_rate": 1.1643945180201654e-06, "loss": 0.3224, "step": 4815 }, { "epoch": 2.401595744680851, "grad_norm": 0.42668384313583374, "learning_rate": 1.1625335910819291e-06, "loss": 0.3529, "step": 4816 }, { "epoch": 2.402094414893617, "grad_norm": 0.4049879014492035, "learning_rate": 1.1606739567559372e-06, "loss": 0.2851, "step": 4817 }, { "epoch": 2.402593085106383, "grad_norm": 0.41446036100387573, "learning_rate": 1.158815615668592e-06, "loss": 0.3528, "step": 4818 }, { "epoch": 2.403091755319149, "grad_norm": 0.4519102871417999, "learning_rate": 1.1569585684458533e-06, "loss": 0.2849, "step": 4819 }, { "epoch": 2.403590425531915, "grad_norm": 0.4627401828765869, "learning_rate": 1.1551028157132533e-06, "loss": 0.372, "step": 4820 }, { "epoch": 2.404089095744681, "grad_norm": 0.40552279353141785, "learning_rate": 1.1532483580958792e-06, "loss": 0.3407, "step": 4821 }, { "epoch": 2.404587765957447, "grad_norm": 0.4202900528907776, "learning_rate": 1.1513951962183912e-06, "loss": 0.3461, "step": 4822 }, { "epoch": 2.4050864361702127, "grad_norm": 0.4637872278690338, "learning_rate": 1.1495433307050035e-06, "loss": 0.3184, "step": 4823 }, { "epoch": 2.4055851063829787, "grad_norm": 0.47273722290992737, "learning_rate": 1.147692762179503e-06, "loss": 0.3419, "step": 4824 }, { "epoch": 2.406083776595745, "grad_norm": 0.39524292945861816, "learning_rate": 1.1458434912652323e-06, "loss": 0.2975, "step": 4825 }, { "epoch": 2.4065824468085104, "grad_norm": 0.44725343585014343, "learning_rate": 1.1439955185851016e-06, "loss": 0.3588, "step": 4826 }, { "epoch": 2.4070811170212765, "grad_norm": 0.3957047760486603, "learning_rate": 1.1421488447615798e-06, "loss": 0.2958, "step": 4827 }, { "epoch": 2.4075797872340425, "grad_norm": 0.43862906098365784, "learning_rate": 1.140303470416702e-06, "loss": 0.3913, "step": 4828 }, { "epoch": 2.4080784574468086, "grad_norm": 0.4316008985042572, "learning_rate": 1.1384593961720653e-06, "loss": 0.3653, "step": 4829 }, { "epoch": 2.4085771276595747, "grad_norm": 0.3656449615955353, "learning_rate": 1.1366166226488258e-06, "loss": 0.259, "step": 4830 }, { "epoch": 2.4090757978723403, "grad_norm": 0.43735748529434204, "learning_rate": 1.1347751504677047e-06, "loss": 0.3546, "step": 4831 }, { "epoch": 2.4095744680851063, "grad_norm": 0.37945690751075745, "learning_rate": 1.132934980248982e-06, "loss": 0.3063, "step": 4832 }, { "epoch": 2.4100731382978724, "grad_norm": 0.40800774097442627, "learning_rate": 1.1310961126125026e-06, "loss": 0.3663, "step": 4833 }, { "epoch": 2.4105718085106385, "grad_norm": 0.4296668767929077, "learning_rate": 1.1292585481776685e-06, "loss": 0.3549, "step": 4834 }, { "epoch": 2.411070478723404, "grad_norm": 0.45969951152801514, "learning_rate": 1.1274222875634472e-06, "loss": 0.3936, "step": 4835 }, { "epoch": 2.41156914893617, "grad_norm": 0.442556232213974, "learning_rate": 1.1255873313883637e-06, "loss": 0.2564, "step": 4836 }, { "epoch": 2.412067819148936, "grad_norm": 0.4034481942653656, "learning_rate": 1.1237536802705034e-06, "loss": 0.331, "step": 4837 }, { "epoch": 2.4125664893617023, "grad_norm": 0.4548307955265045, "learning_rate": 1.121921334827516e-06, "loss": 0.3909, "step": 4838 }, { "epoch": 2.413065159574468, "grad_norm": 0.43499329686164856, "learning_rate": 1.1200902956766058e-06, "loss": 0.3273, "step": 4839 }, { "epoch": 2.413563829787234, "grad_norm": 0.4275858998298645, "learning_rate": 1.1182605634345429e-06, "loss": 0.3494, "step": 4840 }, { "epoch": 2.4140625, "grad_norm": 0.4141581058502197, "learning_rate": 1.116432138717652e-06, "loss": 0.329, "step": 4841 }, { "epoch": 2.414561170212766, "grad_norm": 0.4311729669570923, "learning_rate": 1.1146050221418214e-06, "loss": 0.3349, "step": 4842 }, { "epoch": 2.415059840425532, "grad_norm": 0.4350718855857849, "learning_rate": 1.1127792143224952e-06, "loss": 0.3888, "step": 4843 }, { "epoch": 2.4155585106382977, "grad_norm": 0.4092673063278198, "learning_rate": 1.110954715874682e-06, "loss": 0.298, "step": 4844 }, { "epoch": 2.416057180851064, "grad_norm": 0.41169703006744385, "learning_rate": 1.1091315274129417e-06, "loss": 0.3195, "step": 4845 }, { "epoch": 2.41655585106383, "grad_norm": 0.4299285113811493, "learning_rate": 1.1073096495514002e-06, "loss": 0.3712, "step": 4846 }, { "epoch": 2.417054521276596, "grad_norm": 0.407307505607605, "learning_rate": 1.1054890829037378e-06, "loss": 0.3429, "step": 4847 }, { "epoch": 2.4175531914893615, "grad_norm": 0.44742023944854736, "learning_rate": 1.1036698280831932e-06, "loss": 0.3656, "step": 4848 }, { "epoch": 2.4180518617021276, "grad_norm": 0.39052218198776245, "learning_rate": 1.1018518857025662e-06, "loss": 0.3332, "step": 4849 }, { "epoch": 2.4185505319148937, "grad_norm": 0.4349275231361389, "learning_rate": 1.10003525637421e-06, "loss": 0.3813, "step": 4850 }, { "epoch": 2.4190492021276597, "grad_norm": 0.429035484790802, "learning_rate": 1.0982199407100409e-06, "loss": 0.4051, "step": 4851 }, { "epoch": 2.4195478723404253, "grad_norm": 0.4246768653392792, "learning_rate": 1.0964059393215276e-06, "loss": 0.3382, "step": 4852 }, { "epoch": 2.4200465425531914, "grad_norm": 0.4096680283546448, "learning_rate": 1.0945932528197e-06, "loss": 0.291, "step": 4853 }, { "epoch": 2.4205452127659575, "grad_norm": 0.42026421427726746, "learning_rate": 1.092781881815142e-06, "loss": 0.3706, "step": 4854 }, { "epoch": 2.4210438829787235, "grad_norm": 0.45444968342781067, "learning_rate": 1.0909718269179975e-06, "loss": 0.3294, "step": 4855 }, { "epoch": 2.4215425531914896, "grad_norm": 0.3933725655078888, "learning_rate": 1.0891630887379633e-06, "loss": 0.3215, "step": 4856 }, { "epoch": 2.422041223404255, "grad_norm": 0.4240787625312805, "learning_rate": 1.0873556678842973e-06, "loss": 0.3351, "step": 4857 }, { "epoch": 2.4225398936170213, "grad_norm": 0.40938249230384827, "learning_rate": 1.0855495649658092e-06, "loss": 0.3734, "step": 4858 }, { "epoch": 2.4230385638297873, "grad_norm": 0.43904680013656616, "learning_rate": 1.0837447805908668e-06, "loss": 0.3199, "step": 4859 }, { "epoch": 2.423537234042553, "grad_norm": 0.4075852930545807, "learning_rate": 1.0819413153673942e-06, "loss": 0.3806, "step": 4860 }, { "epoch": 2.424035904255319, "grad_norm": 0.3842877447605133, "learning_rate": 1.0801391699028718e-06, "loss": 0.3153, "step": 4861 }, { "epoch": 2.424534574468085, "grad_norm": 0.4235413074493408, "learning_rate": 1.0783383448043322e-06, "loss": 0.3763, "step": 4862 }, { "epoch": 2.425033244680851, "grad_norm": 0.4126816391944885, "learning_rate": 1.0765388406783671e-06, "loss": 0.351, "step": 4863 }, { "epoch": 2.425531914893617, "grad_norm": 0.4540381133556366, "learning_rate": 1.0747406581311204e-06, "loss": 0.423, "step": 4864 }, { "epoch": 2.426030585106383, "grad_norm": 0.3715782165527344, "learning_rate": 1.0729437977682926e-06, "loss": 0.3108, "step": 4865 }, { "epoch": 2.426529255319149, "grad_norm": 0.35162588953971863, "learning_rate": 1.0711482601951373e-06, "loss": 0.3285, "step": 4866 }, { "epoch": 2.427027925531915, "grad_norm": 0.43035903573036194, "learning_rate": 1.0693540460164636e-06, "loss": 0.3829, "step": 4867 }, { "epoch": 2.427526595744681, "grad_norm": 0.3659517467021942, "learning_rate": 1.0675611558366366e-06, "loss": 0.3167, "step": 4868 }, { "epoch": 2.428025265957447, "grad_norm": 0.3972339332103729, "learning_rate": 1.0657695902595722e-06, "loss": 0.3586, "step": 4869 }, { "epoch": 2.4285239361702127, "grad_norm": 0.35645216703414917, "learning_rate": 1.063979349888739e-06, "loss": 0.3597, "step": 4870 }, { "epoch": 2.4290226063829787, "grad_norm": 0.43477678298950195, "learning_rate": 1.0621904353271655e-06, "loss": 0.3545, "step": 4871 }, { "epoch": 2.429521276595745, "grad_norm": 0.4065863788127899, "learning_rate": 1.0604028471774264e-06, "loss": 0.2693, "step": 4872 }, { "epoch": 2.4300199468085104, "grad_norm": 0.397876501083374, "learning_rate": 1.0586165860416557e-06, "loss": 0.3391, "step": 4873 }, { "epoch": 2.4305186170212765, "grad_norm": 0.42527544498443604, "learning_rate": 1.056831652521535e-06, "loss": 0.3353, "step": 4874 }, { "epoch": 2.4310172872340425, "grad_norm": 0.4363025426864624, "learning_rate": 1.0550480472183039e-06, "loss": 0.3212, "step": 4875 }, { "epoch": 2.4315159574468086, "grad_norm": 0.410292387008667, "learning_rate": 1.0532657707327498e-06, "loss": 0.3141, "step": 4876 }, { "epoch": 2.4320146276595747, "grad_norm": 0.4538896977901459, "learning_rate": 1.0514848236652164e-06, "loss": 0.3188, "step": 4877 }, { "epoch": 2.4325132978723403, "grad_norm": 0.4495841860771179, "learning_rate": 1.0497052066155966e-06, "loss": 0.3876, "step": 4878 }, { "epoch": 2.4330119680851063, "grad_norm": 0.42729973793029785, "learning_rate": 1.0479269201833381e-06, "loss": 0.3687, "step": 4879 }, { "epoch": 2.4335106382978724, "grad_norm": 0.4260372221469879, "learning_rate": 1.0461499649674383e-06, "loss": 0.3272, "step": 4880 }, { "epoch": 2.4340093085106385, "grad_norm": 0.39861124753952026, "learning_rate": 1.044374341566446e-06, "loss": 0.3204, "step": 4881 }, { "epoch": 2.434507978723404, "grad_norm": 0.4231519103050232, "learning_rate": 1.0426000505784633e-06, "loss": 0.3384, "step": 4882 }, { "epoch": 2.43500664893617, "grad_norm": 0.44437479972839355, "learning_rate": 1.040827092601141e-06, "loss": 0.3745, "step": 4883 }, { "epoch": 2.435505319148936, "grad_norm": 0.40129825472831726, "learning_rate": 1.0390554682316844e-06, "loss": 0.3421, "step": 4884 }, { "epoch": 2.4360039893617023, "grad_norm": 0.42828574776649475, "learning_rate": 1.0372851780668453e-06, "loss": 0.3617, "step": 4885 }, { "epoch": 2.436502659574468, "grad_norm": 0.44119924306869507, "learning_rate": 1.0355162227029309e-06, "loss": 0.3274, "step": 4886 }, { "epoch": 2.437001329787234, "grad_norm": 0.43073707818984985, "learning_rate": 1.0337486027357924e-06, "loss": 0.3175, "step": 4887 }, { "epoch": 2.4375, "grad_norm": 0.4121411442756653, "learning_rate": 1.0319823187608391e-06, "loss": 0.304, "step": 4888 }, { "epoch": 2.437998670212766, "grad_norm": 0.41272857785224915, "learning_rate": 1.0302173713730224e-06, "loss": 0.3538, "step": 4889 }, { "epoch": 2.438497340425532, "grad_norm": 0.404481440782547, "learning_rate": 1.0284537611668505e-06, "loss": 0.3749, "step": 4890 }, { "epoch": 2.4389960106382977, "grad_norm": 0.44457077980041504, "learning_rate": 1.0266914887363767e-06, "loss": 0.3529, "step": 4891 }, { "epoch": 2.439494680851064, "grad_norm": 0.4042561948299408, "learning_rate": 1.0249305546752026e-06, "loss": 0.3043, "step": 4892 }, { "epoch": 2.43999335106383, "grad_norm": 0.41435566544532776, "learning_rate": 1.0231709595764843e-06, "loss": 0.3298, "step": 4893 }, { "epoch": 2.440492021276596, "grad_norm": 0.460421621799469, "learning_rate": 1.0214127040329218e-06, "loss": 0.3588, "step": 4894 }, { "epoch": 2.4409906914893615, "grad_norm": 0.4816964268684387, "learning_rate": 1.0196557886367681e-06, "loss": 0.3906, "step": 4895 }, { "epoch": 2.4414893617021276, "grad_norm": 0.43298834562301636, "learning_rate": 1.01790021397982e-06, "loss": 0.296, "step": 4896 }, { "epoch": 2.4419880319148937, "grad_norm": 0.4095512628555298, "learning_rate": 1.016145980653428e-06, "loss": 0.2707, "step": 4897 }, { "epoch": 2.4424867021276597, "grad_norm": 0.4785206913948059, "learning_rate": 1.014393089248485e-06, "loss": 0.3704, "step": 4898 }, { "epoch": 2.4429853723404253, "grad_norm": 0.3813346028327942, "learning_rate": 1.0126415403554385e-06, "loss": 0.3091, "step": 4899 }, { "epoch": 2.4434840425531914, "grad_norm": 0.3852085471153259, "learning_rate": 1.0108913345642769e-06, "loss": 0.3628, "step": 4900 }, { "epoch": 2.4439827127659575, "grad_norm": 0.3954811096191406, "learning_rate": 1.0091424724645405e-06, "loss": 0.3513, "step": 4901 }, { "epoch": 2.4444813829787235, "grad_norm": 0.4661111533641815, "learning_rate": 1.0073949546453188e-06, "loss": 0.3248, "step": 4902 }, { "epoch": 2.4449800531914896, "grad_norm": 0.41646018624305725, "learning_rate": 1.005648781695242e-06, "loss": 0.3627, "step": 4903 }, { "epoch": 2.445478723404255, "grad_norm": 0.40390628576278687, "learning_rate": 1.0039039542024936e-06, "loss": 0.2951, "step": 4904 }, { "epoch": 2.4459773936170213, "grad_norm": 0.46972376108169556, "learning_rate": 1.002160472754799e-06, "loss": 0.3525, "step": 4905 }, { "epoch": 2.4464760638297873, "grad_norm": 0.39830482006073, "learning_rate": 1.0004183379394355e-06, "loss": 0.2904, "step": 4906 }, { "epoch": 2.446974734042553, "grad_norm": 0.45328855514526367, "learning_rate": 9.986775503432207e-07, "loss": 0.3596, "step": 4907 }, { "epoch": 2.447473404255319, "grad_norm": 0.4453526735305786, "learning_rate": 9.96938110552524e-07, "loss": 0.2971, "step": 4908 }, { "epoch": 2.447972074468085, "grad_norm": 0.4125150144100189, "learning_rate": 9.952000191532562e-07, "loss": 0.312, "step": 4909 }, { "epoch": 2.448470744680851, "grad_norm": 0.4160078465938568, "learning_rate": 9.934632767308783e-07, "loss": 0.3358, "step": 4910 }, { "epoch": 2.448969414893617, "grad_norm": 0.4313279092311859, "learning_rate": 9.917278838703937e-07, "loss": 0.3303, "step": 4911 }, { "epoch": 2.449468085106383, "grad_norm": 0.4015246331691742, "learning_rate": 9.899938411563498e-07, "loss": 0.3486, "step": 4912 }, { "epoch": 2.449966755319149, "grad_norm": 0.3902813792228699, "learning_rate": 9.882611491728455e-07, "loss": 0.315, "step": 4913 }, { "epoch": 2.450465425531915, "grad_norm": 0.3929996192455292, "learning_rate": 9.86529808503518e-07, "loss": 0.3549, "step": 4914 }, { "epoch": 2.450964095744681, "grad_norm": 0.40547212958335876, "learning_rate": 9.847998197315546e-07, "loss": 0.3377, "step": 4915 }, { "epoch": 2.451462765957447, "grad_norm": 0.3998861312866211, "learning_rate": 9.830711834396812e-07, "loss": 0.3415, "step": 4916 }, { "epoch": 2.4519614361702127, "grad_norm": 0.36838382482528687, "learning_rate": 9.813439002101754e-07, "loss": 0.3474, "step": 4917 }, { "epoch": 2.4524601063829787, "grad_norm": 0.3999634087085724, "learning_rate": 9.79617970624852e-07, "loss": 0.3348, "step": 4918 }, { "epoch": 2.452958776595745, "grad_norm": 0.428193062543869, "learning_rate": 9.778933952650765e-07, "loss": 0.3034, "step": 4919 }, { "epoch": 2.4534574468085104, "grad_norm": 0.4473617672920227, "learning_rate": 9.761701747117513e-07, "loss": 0.3515, "step": 4920 }, { "epoch": 2.4539561170212765, "grad_norm": 0.41138067841529846, "learning_rate": 9.7444830954533e-07, "loss": 0.3061, "step": 4921 }, { "epoch": 2.4544547872340425, "grad_norm": 0.4100894033908844, "learning_rate": 9.72727800345803e-07, "loss": 0.3257, "step": 4922 }, { "epoch": 2.4549534574468086, "grad_norm": 0.4402252733707428, "learning_rate": 9.710086476927061e-07, "loss": 0.3879, "step": 4923 }, { "epoch": 2.4554521276595747, "grad_norm": 0.3790316581726074, "learning_rate": 9.692908521651213e-07, "loss": 0.289, "step": 4924 }, { "epoch": 2.4559507978723403, "grad_norm": 0.39526695013046265, "learning_rate": 9.675744143416683e-07, "loss": 0.3564, "step": 4925 }, { "epoch": 2.4564494680851063, "grad_norm": 0.4428338408470154, "learning_rate": 9.658593348005141e-07, "loss": 0.3688, "step": 4926 }, { "epoch": 2.4569481382978724, "grad_norm": 0.41833582520484924, "learning_rate": 9.64145614119364e-07, "loss": 0.3581, "step": 4927 }, { "epoch": 2.4574468085106385, "grad_norm": 0.3904329538345337, "learning_rate": 9.624332528754698e-07, "loss": 0.3538, "step": 4928 }, { "epoch": 2.457945478723404, "grad_norm": 0.4188867509365082, "learning_rate": 9.607222516456215e-07, "loss": 0.2735, "step": 4929 }, { "epoch": 2.45844414893617, "grad_norm": 0.42610663175582886, "learning_rate": 9.59012611006155e-07, "loss": 0.3815, "step": 4930 }, { "epoch": 2.458942819148936, "grad_norm": 0.4341682493686676, "learning_rate": 9.573043315329428e-07, "loss": 0.3671, "step": 4931 }, { "epoch": 2.4594414893617023, "grad_norm": 0.4063419699668884, "learning_rate": 9.555974138014046e-07, "loss": 0.3258, "step": 4932 }, { "epoch": 2.459940159574468, "grad_norm": 0.38188743591308594, "learning_rate": 9.538918583864965e-07, "loss": 0.333, "step": 4933 }, { "epoch": 2.460438829787234, "grad_norm": 0.4016352891921997, "learning_rate": 9.521876658627194e-07, "loss": 0.3678, "step": 4934 }, { "epoch": 2.4609375, "grad_norm": 0.3893921673297882, "learning_rate": 9.50484836804112e-07, "loss": 0.3078, "step": 4935 }, { "epoch": 2.461436170212766, "grad_norm": 0.42613500356674194, "learning_rate": 9.487833717842565e-07, "loss": 0.3481, "step": 4936 }, { "epoch": 2.461934840425532, "grad_norm": 0.38711005449295044, "learning_rate": 9.470832713762734e-07, "loss": 0.3125, "step": 4937 }, { "epoch": 2.4624335106382977, "grad_norm": 0.4031774699687958, "learning_rate": 9.453845361528263e-07, "loss": 0.3217, "step": 4938 }, { "epoch": 2.462932180851064, "grad_norm": 0.3840700089931488, "learning_rate": 9.436871666861141e-07, "loss": 0.3444, "step": 4939 }, { "epoch": 2.46343085106383, "grad_norm": 0.43896761536598206, "learning_rate": 9.419911635478812e-07, "loss": 0.3313, "step": 4940 }, { "epoch": 2.463929521276596, "grad_norm": 0.47798269987106323, "learning_rate": 9.4029652730941e-07, "loss": 0.3782, "step": 4941 }, { "epoch": 2.4644281914893615, "grad_norm": 0.39631375670433044, "learning_rate": 9.386032585415195e-07, "loss": 0.2412, "step": 4942 }, { "epoch": 2.4649268617021276, "grad_norm": 0.42101559042930603, "learning_rate": 9.369113578145722e-07, "loss": 0.341, "step": 4943 }, { "epoch": 2.4654255319148937, "grad_norm": 0.37772199511528015, "learning_rate": 9.352208256984674e-07, "loss": 0.2999, "step": 4944 }, { "epoch": 2.4659242021276597, "grad_norm": 0.37025177478790283, "learning_rate": 9.335316627626423e-07, "loss": 0.3281, "step": 4945 }, { "epoch": 2.4664228723404253, "grad_norm": 0.3870023488998413, "learning_rate": 9.318438695760778e-07, "loss": 0.3566, "step": 4946 }, { "epoch": 2.4669215425531914, "grad_norm": 0.4286319315433502, "learning_rate": 9.301574467072871e-07, "loss": 0.3765, "step": 4947 }, { "epoch": 2.4674202127659575, "grad_norm": 0.43094402551651, "learning_rate": 9.284723947243274e-07, "loss": 0.3133, "step": 4948 }, { "epoch": 2.4679188829787235, "grad_norm": 0.42888695001602173, "learning_rate": 9.26788714194789e-07, "loss": 0.3559, "step": 4949 }, { "epoch": 2.4684175531914896, "grad_norm": 0.39583873748779297, "learning_rate": 9.251064056858056e-07, "loss": 0.309, "step": 4950 }, { "epoch": 2.468916223404255, "grad_norm": 0.3810369670391083, "learning_rate": 9.234254697640438e-07, "loss": 0.3339, "step": 4951 }, { "epoch": 2.4694148936170213, "grad_norm": 0.42391538619995117, "learning_rate": 9.217459069957119e-07, "loss": 0.3306, "step": 4952 }, { "epoch": 2.4699135638297873, "grad_norm": 0.4338097870349884, "learning_rate": 9.200677179465522e-07, "loss": 0.3853, "step": 4953 }, { "epoch": 2.470412234042553, "grad_norm": 0.4249626398086548, "learning_rate": 9.183909031818478e-07, "loss": 0.3607, "step": 4954 }, { "epoch": 2.470910904255319, "grad_norm": 0.4389422535896301, "learning_rate": 9.16715463266416e-07, "loss": 0.3261, "step": 4955 }, { "epoch": 2.471409574468085, "grad_norm": 0.4040607511997223, "learning_rate": 9.150413987646107e-07, "loss": 0.3277, "step": 4956 }, { "epoch": 2.471908244680851, "grad_norm": 0.4250637888908386, "learning_rate": 9.13368710240326e-07, "loss": 0.3432, "step": 4957 }, { "epoch": 2.472406914893617, "grad_norm": 0.42801526188850403, "learning_rate": 9.116973982569882e-07, "loss": 0.3052, "step": 4958 }, { "epoch": 2.472905585106383, "grad_norm": 0.4372919499874115, "learning_rate": 9.100274633775652e-07, "loss": 0.3174, "step": 4959 }, { "epoch": 2.473404255319149, "grad_norm": 0.4434429407119751, "learning_rate": 9.083589061645537e-07, "loss": 0.3783, "step": 4960 }, { "epoch": 2.473902925531915, "grad_norm": 0.400494247674942, "learning_rate": 9.066917271799946e-07, "loss": 0.3665, "step": 4961 }, { "epoch": 2.474401595744681, "grad_norm": 0.3832798898220062, "learning_rate": 9.050259269854572e-07, "loss": 0.3612, "step": 4962 }, { "epoch": 2.474900265957447, "grad_norm": 0.3941778540611267, "learning_rate": 9.033615061420525e-07, "loss": 0.3627, "step": 4963 }, { "epoch": 2.4753989361702127, "grad_norm": 0.380984902381897, "learning_rate": 9.016984652104216e-07, "loss": 0.3296, "step": 4964 }, { "epoch": 2.4758976063829787, "grad_norm": 0.44891858100891113, "learning_rate": 9.000368047507452e-07, "loss": 0.3572, "step": 4965 }, { "epoch": 2.476396276595745, "grad_norm": 0.4266890585422516, "learning_rate": 8.983765253227366e-07, "loss": 0.247, "step": 4966 }, { "epoch": 2.4768949468085104, "grad_norm": 0.4922742545604706, "learning_rate": 8.967176274856427e-07, "loss": 0.39, "step": 4967 }, { "epoch": 2.4773936170212765, "grad_norm": 0.4705757200717926, "learning_rate": 8.950601117982482e-07, "loss": 0.3677, "step": 4968 }, { "epoch": 2.4778922872340425, "grad_norm": 0.3752896785736084, "learning_rate": 8.934039788188698e-07, "loss": 0.2826, "step": 4969 }, { "epoch": 2.4783909574468086, "grad_norm": 0.4012126624584198, "learning_rate": 8.917492291053614e-07, "loss": 0.3303, "step": 4970 }, { "epoch": 2.4788896276595747, "grad_norm": 0.4936978220939636, "learning_rate": 8.900958632151058e-07, "loss": 0.3712, "step": 4971 }, { "epoch": 2.4793882978723403, "grad_norm": 0.37025943398475647, "learning_rate": 8.884438817050245e-07, "loss": 0.2775, "step": 4972 }, { "epoch": 2.4798869680851063, "grad_norm": 0.4280363619327545, "learning_rate": 8.867932851315725e-07, "loss": 0.3494, "step": 4973 }, { "epoch": 2.4803856382978724, "grad_norm": 0.4341171979904175, "learning_rate": 8.851440740507334e-07, "loss": 0.3287, "step": 4974 }, { "epoch": 2.4808843085106385, "grad_norm": 0.4218170642852783, "learning_rate": 8.83496249018031e-07, "loss": 0.3614, "step": 4975 }, { "epoch": 2.481382978723404, "grad_norm": 0.39012905955314636, "learning_rate": 8.81849810588516e-07, "loss": 0.3518, "step": 4976 }, { "epoch": 2.48188164893617, "grad_norm": 0.37841030955314636, "learning_rate": 8.802047593167762e-07, "loss": 0.3081, "step": 4977 }, { "epoch": 2.482380319148936, "grad_norm": 0.4525388777256012, "learning_rate": 8.785610957569296e-07, "loss": 0.3585, "step": 4978 }, { "epoch": 2.4828789893617023, "grad_norm": 0.4620401859283447, "learning_rate": 8.769188204626295e-07, "loss": 0.3904, "step": 4979 }, { "epoch": 2.483377659574468, "grad_norm": 0.424099862575531, "learning_rate": 8.752779339870577e-07, "loss": 0.3161, "step": 4980 }, { "epoch": 2.483876329787234, "grad_norm": 0.4395163059234619, "learning_rate": 8.736384368829331e-07, "loss": 0.3401, "step": 4981 }, { "epoch": 2.484375, "grad_norm": 0.47416791319847107, "learning_rate": 8.72000329702501e-07, "loss": 0.3757, "step": 4982 }, { "epoch": 2.484873670212766, "grad_norm": 0.37134578824043274, "learning_rate": 8.70363612997544e-07, "loss": 0.3171, "step": 4983 }, { "epoch": 2.485372340425532, "grad_norm": 0.35191959142684937, "learning_rate": 8.687282873193719e-07, "loss": 0.3103, "step": 4984 }, { "epoch": 2.4858710106382977, "grad_norm": 0.4767099916934967, "learning_rate": 8.670943532188298e-07, "loss": 0.3546, "step": 4985 }, { "epoch": 2.486369680851064, "grad_norm": 0.4195844531059265, "learning_rate": 8.654618112462909e-07, "loss": 0.3322, "step": 4986 }, { "epoch": 2.48686835106383, "grad_norm": 0.47937822341918945, "learning_rate": 8.638306619516596e-07, "loss": 0.3648, "step": 4987 }, { "epoch": 2.487367021276596, "grad_norm": 0.370620459318161, "learning_rate": 8.622009058843745e-07, "loss": 0.3077, "step": 4988 }, { "epoch": 2.4878656914893615, "grad_norm": 0.48954635858535767, "learning_rate": 8.605725435934004e-07, "loss": 0.3622, "step": 4989 }, { "epoch": 2.4883643617021276, "grad_norm": 0.422769159078598, "learning_rate": 8.589455756272369e-07, "loss": 0.2839, "step": 4990 }, { "epoch": 2.4888630319148937, "grad_norm": 0.4360896944999695, "learning_rate": 8.573200025339101e-07, "loss": 0.3734, "step": 4991 }, { "epoch": 2.4893617021276597, "grad_norm": 0.4107685983181, "learning_rate": 8.556958248609798e-07, "loss": 0.3424, "step": 4992 }, { "epoch": 2.4898603723404253, "grad_norm": 0.44925785064697266, "learning_rate": 8.540730431555322e-07, "loss": 0.3615, "step": 4993 }, { "epoch": 2.4903590425531914, "grad_norm": 0.439975768327713, "learning_rate": 8.524516579641873e-07, "loss": 0.3312, "step": 4994 }, { "epoch": 2.4908577127659575, "grad_norm": 0.45085954666137695, "learning_rate": 8.508316698330904e-07, "loss": 0.3299, "step": 4995 }, { "epoch": 2.4913563829787235, "grad_norm": 0.458018958568573, "learning_rate": 8.492130793079206e-07, "loss": 0.3064, "step": 4996 }, { "epoch": 2.4918550531914896, "grad_norm": 0.42192742228507996, "learning_rate": 8.475958869338818e-07, "loss": 0.3506, "step": 4997 }, { "epoch": 2.492353723404255, "grad_norm": 0.4004508852958679, "learning_rate": 8.459800932557094e-07, "loss": 0.2917, "step": 4998 }, { "epoch": 2.4928523936170213, "grad_norm": 0.4419281780719757, "learning_rate": 8.443656988176691e-07, "loss": 0.3484, "step": 4999 }, { "epoch": 2.4933510638297873, "grad_norm": 0.40917837619781494, "learning_rate": 8.427527041635508e-07, "loss": 0.3385, "step": 5000 }, { "epoch": 2.493849734042553, "grad_norm": 0.3738405406475067, "learning_rate": 8.41141109836679e-07, "loss": 0.3913, "step": 5001 }, { "epoch": 2.494348404255319, "grad_norm": 0.40353909134864807, "learning_rate": 8.395309163798998e-07, "loss": 0.2872, "step": 5002 }, { "epoch": 2.494847074468085, "grad_norm": 0.43724167346954346, "learning_rate": 8.379221243355933e-07, "loss": 0.3469, "step": 5003 }, { "epoch": 2.495345744680851, "grad_norm": 0.37733975052833557, "learning_rate": 8.363147342456629e-07, "loss": 0.2914, "step": 5004 }, { "epoch": 2.495844414893617, "grad_norm": 0.4382196366786957, "learning_rate": 8.347087466515446e-07, "loss": 0.3654, "step": 5005 }, { "epoch": 2.496343085106383, "grad_norm": 0.3612316846847534, "learning_rate": 8.331041620941965e-07, "loss": 0.3797, "step": 5006 }, { "epoch": 2.496841755319149, "grad_norm": 0.3690952956676483, "learning_rate": 8.315009811141095e-07, "loss": 0.3515, "step": 5007 }, { "epoch": 2.497340425531915, "grad_norm": 0.44150304794311523, "learning_rate": 8.29899204251296e-07, "loss": 0.394, "step": 5008 }, { "epoch": 2.497839095744681, "grad_norm": 0.4247155785560608, "learning_rate": 8.282988320453017e-07, "loss": 0.3138, "step": 5009 }, { "epoch": 2.498337765957447, "grad_norm": 0.41506093740463257, "learning_rate": 8.266998650351937e-07, "loss": 0.3284, "step": 5010 }, { "epoch": 2.4988364361702127, "grad_norm": 0.4418303668498993, "learning_rate": 8.25102303759569e-07, "loss": 0.3904, "step": 5011 }, { "epoch": 2.4993351063829787, "grad_norm": 0.40535908937454224, "learning_rate": 8.23506148756551e-07, "loss": 0.3359, "step": 5012 }, { "epoch": 2.499833776595745, "grad_norm": 0.42911049723625183, "learning_rate": 8.219114005637868e-07, "loss": 0.3241, "step": 5013 }, { "epoch": 2.5003324468085104, "grad_norm": 0.39453205466270447, "learning_rate": 8.203180597184534e-07, "loss": 0.3427, "step": 5014 }, { "epoch": 2.5008311170212765, "grad_norm": 0.4150414764881134, "learning_rate": 8.187261267572494e-07, "loss": 0.3499, "step": 5015 }, { "epoch": 2.5013297872340425, "grad_norm": 0.3676816523075104, "learning_rate": 8.17135602216404e-07, "loss": 0.2913, "step": 5016 }, { "epoch": 2.5018284574468086, "grad_norm": 0.4193357825279236, "learning_rate": 8.155464866316675e-07, "loss": 0.4041, "step": 5017 }, { "epoch": 2.5023271276595747, "grad_norm": 0.44842228293418884, "learning_rate": 8.139587805383192e-07, "loss": 0.3848, "step": 5018 }, { "epoch": 2.5028257978723403, "grad_norm": 0.4180735647678375, "learning_rate": 8.123724844711611e-07, "loss": 0.3434, "step": 5019 }, { "epoch": 2.5033244680851063, "grad_norm": 0.35630717873573303, "learning_rate": 8.107875989645209e-07, "loss": 0.3157, "step": 5020 }, { "epoch": 2.5038231382978724, "grad_norm": 0.43120840191841125, "learning_rate": 8.092041245522525e-07, "loss": 0.3632, "step": 5021 }, { "epoch": 2.5043218085106385, "grad_norm": 0.4426720440387726, "learning_rate": 8.076220617677321e-07, "loss": 0.3658, "step": 5022 }, { "epoch": 2.5048204787234045, "grad_norm": 0.3844355046749115, "learning_rate": 8.060414111438635e-07, "loss": 0.3133, "step": 5023 }, { "epoch": 2.50531914893617, "grad_norm": 0.40534934401512146, "learning_rate": 8.044621732130708e-07, "loss": 0.3464, "step": 5024 }, { "epoch": 2.505817819148936, "grad_norm": 0.39703091979026794, "learning_rate": 8.028843485073074e-07, "loss": 0.3537, "step": 5025 }, { "epoch": 2.5063164893617023, "grad_norm": 0.38259878754615784, "learning_rate": 8.013079375580446e-07, "loss": 0.3064, "step": 5026 }, { "epoch": 2.506815159574468, "grad_norm": 0.4175355136394501, "learning_rate": 7.99732940896284e-07, "loss": 0.3269, "step": 5027 }, { "epoch": 2.507313829787234, "grad_norm": 0.43860161304473877, "learning_rate": 7.981593590525444e-07, "loss": 0.3382, "step": 5028 }, { "epoch": 2.5078125, "grad_norm": 0.41170844435691833, "learning_rate": 7.965871925568736e-07, "loss": 0.321, "step": 5029 }, { "epoch": 2.508311170212766, "grad_norm": 0.3833220601081848, "learning_rate": 7.950164419388395e-07, "loss": 0.3474, "step": 5030 }, { "epoch": 2.508809840425532, "grad_norm": 0.43570709228515625, "learning_rate": 7.934471077275318e-07, "loss": 0.3105, "step": 5031 }, { "epoch": 2.5093085106382977, "grad_norm": 0.4391835033893585, "learning_rate": 7.918791904515683e-07, "loss": 0.3457, "step": 5032 }, { "epoch": 2.509807180851064, "grad_norm": 0.4494100511074066, "learning_rate": 7.903126906390829e-07, "loss": 0.3387, "step": 5033 }, { "epoch": 2.51030585106383, "grad_norm": 0.42506206035614014, "learning_rate": 7.887476088177387e-07, "loss": 0.3732, "step": 5034 }, { "epoch": 2.5108045212765955, "grad_norm": 0.380880743265152, "learning_rate": 7.871839455147156e-07, "loss": 0.2786, "step": 5035 }, { "epoch": 2.5113031914893615, "grad_norm": 0.4144347310066223, "learning_rate": 7.856217012567191e-07, "loss": 0.3569, "step": 5036 }, { "epoch": 2.5118018617021276, "grad_norm": 0.40645521879196167, "learning_rate": 7.840608765699747e-07, "loss": 0.3654, "step": 5037 }, { "epoch": 2.5123005319148937, "grad_norm": 0.4773932993412018, "learning_rate": 7.825014719802321e-07, "loss": 0.3462, "step": 5038 }, { "epoch": 2.5127992021276597, "grad_norm": 0.41341322660446167, "learning_rate": 7.809434880127586e-07, "loss": 0.3485, "step": 5039 }, { "epoch": 2.5132978723404253, "grad_norm": 0.3960746228694916, "learning_rate": 7.793869251923486e-07, "loss": 0.3074, "step": 5040 }, { "epoch": 2.5137965425531914, "grad_norm": 0.3947775661945343, "learning_rate": 7.778317840433136e-07, "loss": 0.3624, "step": 5041 }, { "epoch": 2.5142952127659575, "grad_norm": 0.41797173023223877, "learning_rate": 7.762780650894858e-07, "loss": 0.3224, "step": 5042 }, { "epoch": 2.5147938829787235, "grad_norm": 0.4234639108181, "learning_rate": 7.74725768854222e-07, "loss": 0.3572, "step": 5043 }, { "epoch": 2.5152925531914896, "grad_norm": 0.4503370523452759, "learning_rate": 7.731748958603957e-07, "loss": 0.3303, "step": 5044 }, { "epoch": 2.515791223404255, "grad_norm": 0.41973069310188293, "learning_rate": 7.716254466304046e-07, "loss": 0.3303, "step": 5045 }, { "epoch": 2.5162898936170213, "grad_norm": 0.4771246016025543, "learning_rate": 7.700774216861656e-07, "loss": 0.3965, "step": 5046 }, { "epoch": 2.5167885638297873, "grad_norm": 0.36797401309013367, "learning_rate": 7.685308215491133e-07, "loss": 0.2398, "step": 5047 }, { "epoch": 2.517287234042553, "grad_norm": 0.4674247205257416, "learning_rate": 7.66985646740207e-07, "loss": 0.3539, "step": 5048 }, { "epoch": 2.517785904255319, "grad_norm": 0.4118252992630005, "learning_rate": 7.654418977799211e-07, "loss": 0.3118, "step": 5049 }, { "epoch": 2.518284574468085, "grad_norm": 0.41508033871650696, "learning_rate": 7.638995751882533e-07, "loss": 0.3363, "step": 5050 }, { "epoch": 2.518783244680851, "grad_norm": 0.4076232612133026, "learning_rate": 7.623586794847205e-07, "loss": 0.3131, "step": 5051 }, { "epoch": 2.519281914893617, "grad_norm": 0.4679321348667145, "learning_rate": 7.608192111883566e-07, "loss": 0.3786, "step": 5052 }, { "epoch": 2.519780585106383, "grad_norm": 0.392647922039032, "learning_rate": 7.592811708177156e-07, "loss": 0.3176, "step": 5053 }, { "epoch": 2.520279255319149, "grad_norm": 0.39879095554351807, "learning_rate": 7.577445588908728e-07, "loss": 0.3195, "step": 5054 }, { "epoch": 2.520777925531915, "grad_norm": 0.36492693424224854, "learning_rate": 7.56209375925419e-07, "loss": 0.3034, "step": 5055 }, { "epoch": 2.521276595744681, "grad_norm": 0.4043058454990387, "learning_rate": 7.546756224384666e-07, "loss": 0.361, "step": 5056 }, { "epoch": 2.521775265957447, "grad_norm": 0.4280066192150116, "learning_rate": 7.531432989466436e-07, "loss": 0.3568, "step": 5057 }, { "epoch": 2.5222739361702127, "grad_norm": 0.45753854513168335, "learning_rate": 7.516124059661001e-07, "loss": 0.36, "step": 5058 }, { "epoch": 2.5227726063829787, "grad_norm": 0.44438400864601135, "learning_rate": 7.50082944012499e-07, "loss": 0.3554, "step": 5059 }, { "epoch": 2.523271276595745, "grad_norm": 0.3877485394477844, "learning_rate": 7.485549136010278e-07, "loss": 0.3925, "step": 5060 }, { "epoch": 2.5237699468085104, "grad_norm": 0.361763060092926, "learning_rate": 7.470283152463853e-07, "loss": 0.2763, "step": 5061 }, { "epoch": 2.5242686170212765, "grad_norm": 0.4299820065498352, "learning_rate": 7.455031494627935e-07, "loss": 0.3639, "step": 5062 }, { "epoch": 2.5247672872340425, "grad_norm": 0.3891012370586395, "learning_rate": 7.439794167639875e-07, "loss": 0.344, "step": 5063 }, { "epoch": 2.5252659574468086, "grad_norm": 0.45603126287460327, "learning_rate": 7.424571176632206e-07, "loss": 0.3382, "step": 5064 }, { "epoch": 2.5257646276595747, "grad_norm": 0.4362563490867615, "learning_rate": 7.409362526732672e-07, "loss": 0.3823, "step": 5065 }, { "epoch": 2.5262632978723403, "grad_norm": 0.40440472960472107, "learning_rate": 7.394168223064118e-07, "loss": 0.3336, "step": 5066 }, { "epoch": 2.5267619680851063, "grad_norm": 0.4042242467403412, "learning_rate": 7.378988270744625e-07, "loss": 0.3277, "step": 5067 }, { "epoch": 2.5272606382978724, "grad_norm": 0.42799073457717896, "learning_rate": 7.363822674887383e-07, "loss": 0.3868, "step": 5068 }, { "epoch": 2.5277593085106385, "grad_norm": 0.4008268713951111, "learning_rate": 7.348671440600796e-07, "loss": 0.3535, "step": 5069 }, { "epoch": 2.5282579787234045, "grad_norm": 0.39666128158569336, "learning_rate": 7.333534572988382e-07, "loss": 0.3515, "step": 5070 }, { "epoch": 2.52875664893617, "grad_norm": 0.3888581693172455, "learning_rate": 7.318412077148867e-07, "loss": 0.2903, "step": 5071 }, { "epoch": 2.529255319148936, "grad_norm": 0.41787800192832947, "learning_rate": 7.3033039581761e-07, "loss": 0.2841, "step": 5072 }, { "epoch": 2.5297539893617023, "grad_norm": 0.4626651704311371, "learning_rate": 7.288210221159092e-07, "loss": 0.4303, "step": 5073 }, { "epoch": 2.530252659574468, "grad_norm": 0.3752155900001526, "learning_rate": 7.273130871182038e-07, "loss": 0.2958, "step": 5074 }, { "epoch": 2.530751329787234, "grad_norm": 0.3939574360847473, "learning_rate": 7.258065913324241e-07, "loss": 0.3184, "step": 5075 }, { "epoch": 2.53125, "grad_norm": 0.41606810688972473, "learning_rate": 7.24301535266021e-07, "loss": 0.3592, "step": 5076 }, { "epoch": 2.531748670212766, "grad_norm": 0.4315279424190521, "learning_rate": 7.227979194259549e-07, "loss": 0.3134, "step": 5077 }, { "epoch": 2.532247340425532, "grad_norm": 0.4233711063861847, "learning_rate": 7.212957443187063e-07, "loss": 0.3402, "step": 5078 }, { "epoch": 2.5327460106382977, "grad_norm": 0.40739357471466064, "learning_rate": 7.197950104502654e-07, "loss": 0.421, "step": 5079 }, { "epoch": 2.533244680851064, "grad_norm": 0.37765857577323914, "learning_rate": 7.182957183261419e-07, "loss": 0.376, "step": 5080 }, { "epoch": 2.53374335106383, "grad_norm": 0.4095718562602997, "learning_rate": 7.167978684513549e-07, "loss": 0.3302, "step": 5081 }, { "epoch": 2.5342420212765955, "grad_norm": 0.36845487356185913, "learning_rate": 7.153014613304432e-07, "loss": 0.2867, "step": 5082 }, { "epoch": 2.5347406914893615, "grad_norm": 0.41854584217071533, "learning_rate": 7.13806497467453e-07, "loss": 0.3129, "step": 5083 }, { "epoch": 2.5352393617021276, "grad_norm": 0.40103837847709656, "learning_rate": 7.123129773659499e-07, "loss": 0.3214, "step": 5084 }, { "epoch": 2.5357380319148937, "grad_norm": 0.3907422721385956, "learning_rate": 7.108209015290124e-07, "loss": 0.3393, "step": 5085 }, { "epoch": 2.5362367021276597, "grad_norm": 0.4170506000518799, "learning_rate": 7.093302704592287e-07, "loss": 0.3683, "step": 5086 }, { "epoch": 2.5367353723404253, "grad_norm": 0.3990945816040039, "learning_rate": 7.078410846587058e-07, "loss": 0.3713, "step": 5087 }, { "epoch": 2.5372340425531914, "grad_norm": 0.43342649936676025, "learning_rate": 7.063533446290582e-07, "loss": 0.3446, "step": 5088 }, { "epoch": 2.5377327127659575, "grad_norm": 0.39322152733802795, "learning_rate": 7.048670508714195e-07, "loss": 0.319, "step": 5089 }, { "epoch": 2.5382313829787235, "grad_norm": 0.3841460645198822, "learning_rate": 7.033822038864291e-07, "loss": 0.3401, "step": 5090 }, { "epoch": 2.5387300531914896, "grad_norm": 0.4040398597717285, "learning_rate": 7.018988041742463e-07, "loss": 0.3749, "step": 5091 }, { "epoch": 2.539228723404255, "grad_norm": 0.43722274899482727, "learning_rate": 7.004168522345372e-07, "loss": 0.3336, "step": 5092 }, { "epoch": 2.5397273936170213, "grad_norm": 0.40340283513069153, "learning_rate": 6.989363485664846e-07, "loss": 0.3509, "step": 5093 }, { "epoch": 2.5402260638297873, "grad_norm": 0.40495869517326355, "learning_rate": 6.974572936687801e-07, "loss": 0.313, "step": 5094 }, { "epoch": 2.540724734042553, "grad_norm": 0.4189288914203644, "learning_rate": 6.959796880396275e-07, "loss": 0.3385, "step": 5095 }, { "epoch": 2.541223404255319, "grad_norm": 0.4086788594722748, "learning_rate": 6.945035321767457e-07, "loss": 0.3615, "step": 5096 }, { "epoch": 2.541722074468085, "grad_norm": 0.4340481162071228, "learning_rate": 6.93028826577361e-07, "loss": 0.3017, "step": 5097 }, { "epoch": 2.542220744680851, "grad_norm": 0.45534390211105347, "learning_rate": 6.91555571738215e-07, "loss": 0.3145, "step": 5098 }, { "epoch": 2.542719414893617, "grad_norm": 0.45214715600013733, "learning_rate": 6.900837681555578e-07, "loss": 0.3241, "step": 5099 }, { "epoch": 2.543218085106383, "grad_norm": 0.3919798731803894, "learning_rate": 6.886134163251524e-07, "loss": 0.3773, "step": 5100 }, { "epoch": 2.543716755319149, "grad_norm": 0.3999141454696655, "learning_rate": 6.871445167422714e-07, "loss": 0.3431, "step": 5101 }, { "epoch": 2.544215425531915, "grad_norm": 0.3964936137199402, "learning_rate": 6.856770699017001e-07, "loss": 0.2833, "step": 5102 }, { "epoch": 2.544714095744681, "grad_norm": 0.4328470528125763, "learning_rate": 6.842110762977317e-07, "loss": 0.4333, "step": 5103 }, { "epoch": 2.545212765957447, "grad_norm": 0.38751861453056335, "learning_rate": 6.827465364241736e-07, "loss": 0.3189, "step": 5104 }, { "epoch": 2.5457114361702127, "grad_norm": 0.4344808757305145, "learning_rate": 6.812834507743399e-07, "loss": 0.3743, "step": 5105 }, { "epoch": 2.5462101063829787, "grad_norm": 0.32644882798194885, "learning_rate": 6.798218198410561e-07, "loss": 0.2455, "step": 5106 }, { "epoch": 2.546708776595745, "grad_norm": 0.4438695013523102, "learning_rate": 6.78361644116659e-07, "loss": 0.3808, "step": 5107 }, { "epoch": 2.5472074468085104, "grad_norm": 0.4270351827144623, "learning_rate": 6.769029240929937e-07, "loss": 0.3432, "step": 5108 }, { "epoch": 2.5477061170212765, "grad_norm": 0.3645988404750824, "learning_rate": 6.754456602614162e-07, "loss": 0.2791, "step": 5109 }, { "epoch": 2.5482047872340425, "grad_norm": 0.37813717126846313, "learning_rate": 6.739898531127897e-07, "loss": 0.3085, "step": 5110 }, { "epoch": 2.5487034574468086, "grad_norm": 0.45494115352630615, "learning_rate": 6.725355031374902e-07, "loss": 0.359, "step": 5111 }, { "epoch": 2.5492021276595747, "grad_norm": 0.4077233076095581, "learning_rate": 6.710826108253992e-07, "loss": 0.2833, "step": 5112 }, { "epoch": 2.5497007978723403, "grad_norm": 0.4037989675998688, "learning_rate": 6.696311766659109e-07, "loss": 0.3701, "step": 5113 }, { "epoch": 2.5501994680851063, "grad_norm": 0.4364321231842041, "learning_rate": 6.681812011479244e-07, "loss": 0.3105, "step": 5114 }, { "epoch": 2.5506981382978724, "grad_norm": 0.40912479162216187, "learning_rate": 6.66732684759851e-07, "loss": 0.3315, "step": 5115 }, { "epoch": 2.5511968085106385, "grad_norm": 0.3932953476905823, "learning_rate": 6.65285627989608e-07, "loss": 0.3519, "step": 5116 }, { "epoch": 2.5516954787234045, "grad_norm": 0.38217461109161377, "learning_rate": 6.638400313246229e-07, "loss": 0.3708, "step": 5117 }, { "epoch": 2.55219414893617, "grad_norm": 0.4539680778980255, "learning_rate": 6.623958952518295e-07, "loss": 0.3127, "step": 5118 }, { "epoch": 2.552692819148936, "grad_norm": 0.4063948094844818, "learning_rate": 6.609532202576718e-07, "loss": 0.3183, "step": 5119 }, { "epoch": 2.5531914893617023, "grad_norm": 0.46593329310417175, "learning_rate": 6.595120068280991e-07, "loss": 0.3645, "step": 5120 }, { "epoch": 2.553690159574468, "grad_norm": 0.3927479386329651, "learning_rate": 6.580722554485713e-07, "loss": 0.3568, "step": 5121 }, { "epoch": 2.554188829787234, "grad_norm": 0.415546178817749, "learning_rate": 6.566339666040522e-07, "loss": 0.315, "step": 5122 }, { "epoch": 2.5546875, "grad_norm": 0.4221090078353882, "learning_rate": 6.551971407790164e-07, "loss": 0.3582, "step": 5123 }, { "epoch": 2.555186170212766, "grad_norm": 0.42543336749076843, "learning_rate": 6.537617784574457e-07, "loss": 0.3952, "step": 5124 }, { "epoch": 2.555684840425532, "grad_norm": 0.40682557225227356, "learning_rate": 6.523278801228244e-07, "loss": 0.3205, "step": 5125 }, { "epoch": 2.5561835106382977, "grad_norm": 0.44364359974861145, "learning_rate": 6.508954462581501e-07, "loss": 0.33, "step": 5126 }, { "epoch": 2.556682180851064, "grad_norm": 0.4145762622356415, "learning_rate": 6.494644773459219e-07, "loss": 0.4042, "step": 5127 }, { "epoch": 2.55718085106383, "grad_norm": 0.35073599219322205, "learning_rate": 6.480349738681474e-07, "loss": 0.2886, "step": 5128 }, { "epoch": 2.5576795212765955, "grad_norm": 0.40052366256713867, "learning_rate": 6.46606936306341e-07, "loss": 0.3872, "step": 5129 }, { "epoch": 2.5581781914893615, "grad_norm": 0.4460281729698181, "learning_rate": 6.451803651415223e-07, "loss": 0.3904, "step": 5130 }, { "epoch": 2.5586768617021276, "grad_norm": 0.3950686752796173, "learning_rate": 6.437552608542192e-07, "loss": 0.337, "step": 5131 }, { "epoch": 2.5591755319148937, "grad_norm": 0.4314110279083252, "learning_rate": 6.423316239244615e-07, "loss": 0.3125, "step": 5132 }, { "epoch": 2.5596742021276597, "grad_norm": 0.4041016399860382, "learning_rate": 6.409094548317896e-07, "loss": 0.3741, "step": 5133 }, { "epoch": 2.5601728723404253, "grad_norm": 0.40919628739356995, "learning_rate": 6.394887540552447e-07, "loss": 0.3485, "step": 5134 }, { "epoch": 2.5606715425531914, "grad_norm": 0.40650224685668945, "learning_rate": 6.380695220733774e-07, "loss": 0.3143, "step": 5135 }, { "epoch": 2.5611702127659575, "grad_norm": 0.44922903180122375, "learning_rate": 6.366517593642402e-07, "loss": 0.3358, "step": 5136 }, { "epoch": 2.5616688829787235, "grad_norm": 0.4182490110397339, "learning_rate": 6.352354664053939e-07, "loss": 0.3197, "step": 5137 }, { "epoch": 2.5621675531914896, "grad_norm": 0.42139726877212524, "learning_rate": 6.33820643673902e-07, "loss": 0.3693, "step": 5138 }, { "epoch": 2.562666223404255, "grad_norm": 0.4519859850406647, "learning_rate": 6.324072916463326e-07, "loss": 0.3417, "step": 5139 }, { "epoch": 2.5631648936170213, "grad_norm": 0.42672523856163025, "learning_rate": 6.309954107987609e-07, "loss": 0.3206, "step": 5140 }, { "epoch": 2.5636635638297873, "grad_norm": 0.41885146498680115, "learning_rate": 6.295850016067628e-07, "loss": 0.3411, "step": 5141 }, { "epoch": 2.564162234042553, "grad_norm": 0.40293967723846436, "learning_rate": 6.281760645454227e-07, "loss": 0.3654, "step": 5142 }, { "epoch": 2.564660904255319, "grad_norm": 0.39634397625923157, "learning_rate": 6.267686000893252e-07, "loss": 0.3075, "step": 5143 }, { "epoch": 2.565159574468085, "grad_norm": 0.4441419839859009, "learning_rate": 6.25362608712562e-07, "loss": 0.3829, "step": 5144 }, { "epoch": 2.565658244680851, "grad_norm": 0.4016419053077698, "learning_rate": 6.23958090888726e-07, "loss": 0.2912, "step": 5145 }, { "epoch": 2.566156914893617, "grad_norm": 0.4092303216457367, "learning_rate": 6.22555047090917e-07, "loss": 0.3377, "step": 5146 }, { "epoch": 2.566655585106383, "grad_norm": 0.40493735671043396, "learning_rate": 6.211534777917344e-07, "loss": 0.3407, "step": 5147 }, { "epoch": 2.567154255319149, "grad_norm": 0.431973397731781, "learning_rate": 6.197533834632824e-07, "loss": 0.3506, "step": 5148 }, { "epoch": 2.567652925531915, "grad_norm": 0.41420069336891174, "learning_rate": 6.183547645771709e-07, "loss": 0.3403, "step": 5149 }, { "epoch": 2.568151595744681, "grad_norm": 0.4259856641292572, "learning_rate": 6.169576216045087e-07, "loss": 0.3388, "step": 5150 }, { "epoch": 2.568650265957447, "grad_norm": 0.4156022369861603, "learning_rate": 6.155619550159114e-07, "loss": 0.3531, "step": 5151 }, { "epoch": 2.5691489361702127, "grad_norm": 0.43525755405426025, "learning_rate": 6.141677652814931e-07, "loss": 0.3297, "step": 5152 }, { "epoch": 2.5696476063829787, "grad_norm": 0.3787674903869629, "learning_rate": 6.127750528708753e-07, "loss": 0.382, "step": 5153 }, { "epoch": 2.570146276595745, "grad_norm": 0.41540399193763733, "learning_rate": 6.113838182531767e-07, "loss": 0.3367, "step": 5154 }, { "epoch": 2.5706449468085104, "grad_norm": 0.38251957297325134, "learning_rate": 6.099940618970218e-07, "loss": 0.2994, "step": 5155 }, { "epoch": 2.5711436170212765, "grad_norm": 0.3705880343914032, "learning_rate": 6.086057842705372e-07, "loss": 0.3422, "step": 5156 }, { "epoch": 2.5716422872340425, "grad_norm": 0.4551851749420166, "learning_rate": 6.072189858413485e-07, "loss": 0.3809, "step": 5157 }, { "epoch": 2.5721409574468086, "grad_norm": 0.38061901926994324, "learning_rate": 6.058336670765874e-07, "loss": 0.2912, "step": 5158 }, { "epoch": 2.5726396276595747, "grad_norm": 0.4535444974899292, "learning_rate": 6.044498284428818e-07, "loss": 0.3486, "step": 5159 }, { "epoch": 2.5731382978723403, "grad_norm": 0.4237176477909088, "learning_rate": 6.030674704063666e-07, "loss": 0.2827, "step": 5160 }, { "epoch": 2.5736369680851063, "grad_norm": 0.4019792079925537, "learning_rate": 6.016865934326726e-07, "loss": 0.3453, "step": 5161 }, { "epoch": 2.5741356382978724, "grad_norm": 0.39475080370903015, "learning_rate": 6.003071979869368e-07, "loss": 0.2963, "step": 5162 }, { "epoch": 2.5746343085106385, "grad_norm": 0.4381581246852875, "learning_rate": 5.989292845337935e-07, "loss": 0.4229, "step": 5163 }, { "epoch": 2.5751329787234045, "grad_norm": 0.3603830933570862, "learning_rate": 5.975528535373798e-07, "loss": 0.3203, "step": 5164 }, { "epoch": 2.57563164893617, "grad_norm": 0.3879035413265228, "learning_rate": 5.961779054613315e-07, "loss": 0.3411, "step": 5165 }, { "epoch": 2.576130319148936, "grad_norm": 0.42577263712882996, "learning_rate": 5.948044407687881e-07, "loss": 0.3422, "step": 5166 }, { "epoch": 2.5766289893617023, "grad_norm": 0.4024268388748169, "learning_rate": 5.934324599223851e-07, "loss": 0.3659, "step": 5167 }, { "epoch": 2.577127659574468, "grad_norm": 0.43329864740371704, "learning_rate": 5.920619633842628e-07, "loss": 0.3853, "step": 5168 }, { "epoch": 2.577626329787234, "grad_norm": 0.39691364765167236, "learning_rate": 5.906929516160587e-07, "loss": 0.3603, "step": 5169 }, { "epoch": 2.578125, "grad_norm": 0.47148534655570984, "learning_rate": 5.893254250789088e-07, "loss": 0.3735, "step": 5170 }, { "epoch": 2.578623670212766, "grad_norm": 0.3863942623138428, "learning_rate": 5.879593842334536e-07, "loss": 0.3021, "step": 5171 }, { "epoch": 2.579122340425532, "grad_norm": 0.459734708070755, "learning_rate": 5.865948295398278e-07, "loss": 0.3702, "step": 5172 }, { "epoch": 2.5796210106382977, "grad_norm": 0.4170764088630676, "learning_rate": 5.852317614576703e-07, "loss": 0.3772, "step": 5173 }, { "epoch": 2.580119680851064, "grad_norm": 0.3860659897327423, "learning_rate": 5.838701804461144e-07, "loss": 0.3266, "step": 5174 }, { "epoch": 2.58061835106383, "grad_norm": 0.3732469975948334, "learning_rate": 5.825100869637984e-07, "loss": 0.3114, "step": 5175 }, { "epoch": 2.5811170212765955, "grad_norm": 0.4082172214984894, "learning_rate": 5.81151481468853e-07, "loss": 0.3511, "step": 5176 }, { "epoch": 2.5816156914893615, "grad_norm": 0.4330865144729614, "learning_rate": 5.79794364418913e-07, "loss": 0.3667, "step": 5177 }, { "epoch": 2.5821143617021276, "grad_norm": 0.3812054693698883, "learning_rate": 5.784387362711086e-07, "loss": 0.2999, "step": 5178 }, { "epoch": 2.5826130319148937, "grad_norm": 0.3943544328212738, "learning_rate": 5.770845974820705e-07, "loss": 0.3554, "step": 5179 }, { "epoch": 2.5831117021276597, "grad_norm": 0.402310848236084, "learning_rate": 5.757319485079266e-07, "loss": 0.305, "step": 5180 }, { "epoch": 2.5836103723404253, "grad_norm": 0.4442414343357086, "learning_rate": 5.743807898043019e-07, "loss": 0.3632, "step": 5181 }, { "epoch": 2.5841090425531914, "grad_norm": 0.38917243480682373, "learning_rate": 5.73031121826323e-07, "loss": 0.301, "step": 5182 }, { "epoch": 2.5846077127659575, "grad_norm": 0.3987419009208679, "learning_rate": 5.716829450286093e-07, "loss": 0.3191, "step": 5183 }, { "epoch": 2.5851063829787235, "grad_norm": 0.41415223479270935, "learning_rate": 5.703362598652834e-07, "loss": 0.3533, "step": 5184 }, { "epoch": 2.5856050531914896, "grad_norm": 0.3962191045284271, "learning_rate": 5.689910667899607e-07, "loss": 0.3535, "step": 5185 }, { "epoch": 2.586103723404255, "grad_norm": 0.3653096854686737, "learning_rate": 5.67647366255758e-07, "loss": 0.3159, "step": 5186 }, { "epoch": 2.5866023936170213, "grad_norm": 0.40281084179878235, "learning_rate": 5.663051587152852e-07, "loss": 0.3485, "step": 5187 }, { "epoch": 2.5871010638297873, "grad_norm": 0.416720449924469, "learning_rate": 5.649644446206537e-07, "loss": 0.3346, "step": 5188 }, { "epoch": 2.587599734042553, "grad_norm": 0.418984055519104, "learning_rate": 5.63625224423468e-07, "loss": 0.4024, "step": 5189 }, { "epoch": 2.588098404255319, "grad_norm": 0.42683061957359314, "learning_rate": 5.622874985748322e-07, "loss": 0.3169, "step": 5190 }, { "epoch": 2.588597074468085, "grad_norm": 0.39704614877700806, "learning_rate": 5.609512675253443e-07, "loss": 0.3413, "step": 5191 }, { "epoch": 2.589095744680851, "grad_norm": 0.4248439073562622, "learning_rate": 5.596165317251029e-07, "loss": 0.3852, "step": 5192 }, { "epoch": 2.589594414893617, "grad_norm": 0.38589635491371155, "learning_rate": 5.582832916236974e-07, "loss": 0.3403, "step": 5193 }, { "epoch": 2.590093085106383, "grad_norm": 0.44302892684936523, "learning_rate": 5.569515476702187e-07, "loss": 0.3509, "step": 5194 }, { "epoch": 2.590591755319149, "grad_norm": 0.4324144124984741, "learning_rate": 5.55621300313251e-07, "loss": 0.3271, "step": 5195 }, { "epoch": 2.591090425531915, "grad_norm": 0.4750141501426697, "learning_rate": 5.542925500008739e-07, "loss": 0.3198, "step": 5196 }, { "epoch": 2.591589095744681, "grad_norm": 0.46262943744659424, "learning_rate": 5.529652971806654e-07, "loss": 0.3953, "step": 5197 }, { "epoch": 2.592087765957447, "grad_norm": 0.4226190149784088, "learning_rate": 5.516395422996956e-07, "loss": 0.3024, "step": 5198 }, { "epoch": 2.5925864361702127, "grad_norm": 0.4099268913269043, "learning_rate": 5.503152858045335e-07, "loss": 0.3219, "step": 5199 }, { "epoch": 2.5930851063829787, "grad_norm": 0.41744840145111084, "learning_rate": 5.489925281412395e-07, "loss": 0.4103, "step": 5200 }, { "epoch": 2.593583776595745, "grad_norm": 0.3608529567718506, "learning_rate": 5.476712697553732e-07, "loss": 0.3016, "step": 5201 }, { "epoch": 2.5940824468085104, "grad_norm": 0.4052233099937439, "learning_rate": 5.46351511091987e-07, "loss": 0.3819, "step": 5202 }, { "epoch": 2.5945811170212765, "grad_norm": 0.4306817054748535, "learning_rate": 5.45033252595627e-07, "loss": 0.3275, "step": 5203 }, { "epoch": 2.5950797872340425, "grad_norm": 0.39761629700660706, "learning_rate": 5.437164947103374e-07, "loss": 0.3246, "step": 5204 }, { "epoch": 2.5955784574468086, "grad_norm": 0.42756369709968567, "learning_rate": 5.424012378796528e-07, "loss": 0.3436, "step": 5205 }, { "epoch": 2.5960771276595747, "grad_norm": 0.42338991165161133, "learning_rate": 5.410874825466073e-07, "loss": 0.3428, "step": 5206 }, { "epoch": 2.5965757978723403, "grad_norm": 0.4252713620662689, "learning_rate": 5.397752291537229e-07, "loss": 0.3568, "step": 5207 }, { "epoch": 2.5970744680851063, "grad_norm": 0.42364516854286194, "learning_rate": 5.384644781430215e-07, "loss": 0.3231, "step": 5208 }, { "epoch": 2.5975731382978724, "grad_norm": 0.3741852343082428, "learning_rate": 5.371552299560146e-07, "loss": 0.3057, "step": 5209 }, { "epoch": 2.5980718085106385, "grad_norm": 0.3778824210166931, "learning_rate": 5.35847485033712e-07, "loss": 0.2993, "step": 5210 }, { "epoch": 2.5985704787234045, "grad_norm": 0.4052342474460602, "learning_rate": 5.345412438166115e-07, "loss": 0.3212, "step": 5211 }, { "epoch": 2.59906914893617, "grad_norm": 0.4110845923423767, "learning_rate": 5.332365067447098e-07, "loss": 0.3805, "step": 5212 }, { "epoch": 2.599567819148936, "grad_norm": 0.394595205783844, "learning_rate": 5.319332742574939e-07, "loss": 0.3058, "step": 5213 }, { "epoch": 2.6000664893617023, "grad_norm": 0.36770448088645935, "learning_rate": 5.306315467939433e-07, "loss": 0.304, "step": 5214 }, { "epoch": 2.600565159574468, "grad_norm": 0.3994077444076538, "learning_rate": 5.293313247925341e-07, "loss": 0.3149, "step": 5215 }, { "epoch": 2.601063829787234, "grad_norm": 0.46223771572113037, "learning_rate": 5.280326086912307e-07, "loss": 0.3585, "step": 5216 }, { "epoch": 2.6015625, "grad_norm": 0.371895432472229, "learning_rate": 5.267353989274948e-07, "loss": 0.2704, "step": 5217 }, { "epoch": 2.602061170212766, "grad_norm": 0.47036486864089966, "learning_rate": 5.254396959382768e-07, "loss": 0.3676, "step": 5218 }, { "epoch": 2.602559840425532, "grad_norm": 0.40552040934562683, "learning_rate": 5.241455001600232e-07, "loss": 0.3511, "step": 5219 }, { "epoch": 2.6030585106382977, "grad_norm": 0.39633533358573914, "learning_rate": 5.228528120286691e-07, "loss": 0.3594, "step": 5220 }, { "epoch": 2.603557180851064, "grad_norm": 0.3765134811401367, "learning_rate": 5.215616319796451e-07, "loss": 0.34, "step": 5221 }, { "epoch": 2.60405585106383, "grad_norm": 0.38644537329673767, "learning_rate": 5.202719604478718e-07, "loss": 0.3307, "step": 5222 }, { "epoch": 2.6045545212765955, "grad_norm": 0.47119036316871643, "learning_rate": 5.189837978677609e-07, "loss": 0.3518, "step": 5223 }, { "epoch": 2.6050531914893615, "grad_norm": 0.4412660300731659, "learning_rate": 5.176971446732193e-07, "loss": 0.2919, "step": 5224 }, { "epoch": 2.6055518617021276, "grad_norm": 0.4078649878501892, "learning_rate": 5.164120012976415e-07, "loss": 0.3521, "step": 5225 }, { "epoch": 2.6060505319148937, "grad_norm": 0.4241638481616974, "learning_rate": 5.151283681739161e-07, "loss": 0.3538, "step": 5226 }, { "epoch": 2.6065492021276597, "grad_norm": 0.3874402344226837, "learning_rate": 5.138462457344218e-07, "loss": 0.2938, "step": 5227 }, { "epoch": 2.6070478723404253, "grad_norm": 0.43319711089134216, "learning_rate": 5.125656344110281e-07, "loss": 0.3501, "step": 5228 }, { "epoch": 2.6075465425531914, "grad_norm": 0.4113040566444397, "learning_rate": 5.112865346350982e-07, "loss": 0.3093, "step": 5229 }, { "epoch": 2.6080452127659575, "grad_norm": 0.442979097366333, "learning_rate": 5.100089468374813e-07, "loss": 0.425, "step": 5230 }, { "epoch": 2.6085438829787235, "grad_norm": 0.37820178270339966, "learning_rate": 5.087328714485234e-07, "loss": 0.301, "step": 5231 }, { "epoch": 2.6090425531914896, "grad_norm": 0.4227283298969269, "learning_rate": 5.074583088980545e-07, "loss": 0.3676, "step": 5232 }, { "epoch": 2.609541223404255, "grad_norm": 0.40494245290756226, "learning_rate": 5.061852596154005e-07, "loss": 0.3233, "step": 5233 }, { "epoch": 2.6100398936170213, "grad_norm": 0.3789323568344116, "learning_rate": 5.049137240293739e-07, "loss": 0.3585, "step": 5234 }, { "epoch": 2.6105385638297873, "grad_norm": 0.3959592580795288, "learning_rate": 5.0364370256828e-07, "loss": 0.3271, "step": 5235 }, { "epoch": 2.611037234042553, "grad_norm": 0.4214790463447571, "learning_rate": 5.023751956599116e-07, "loss": 0.3418, "step": 5236 }, { "epoch": 2.611535904255319, "grad_norm": 0.4386727809906006, "learning_rate": 5.01108203731554e-07, "loss": 0.3551, "step": 5237 }, { "epoch": 2.612034574468085, "grad_norm": 0.3689984977245331, "learning_rate": 4.998427272099788e-07, "loss": 0.2643, "step": 5238 }, { "epoch": 2.612533244680851, "grad_norm": 0.44174593687057495, "learning_rate": 4.985787665214514e-07, "loss": 0.3616, "step": 5239 }, { "epoch": 2.613031914893617, "grad_norm": 0.4130210876464844, "learning_rate": 4.973163220917227e-07, "loss": 0.2914, "step": 5240 }, { "epoch": 2.613530585106383, "grad_norm": 0.4525352120399475, "learning_rate": 4.960553943460356e-07, "loss": 0.3516, "step": 5241 }, { "epoch": 2.614029255319149, "grad_norm": 0.4197164475917816, "learning_rate": 4.947959837091198e-07, "loss": 0.3133, "step": 5242 }, { "epoch": 2.614527925531915, "grad_norm": 0.45972591638565063, "learning_rate": 4.935380906051968e-07, "loss": 0.2864, "step": 5243 }, { "epoch": 2.615026595744681, "grad_norm": 0.41641953587532043, "learning_rate": 4.922817154579745e-07, "loss": 0.3297, "step": 5244 }, { "epoch": 2.615525265957447, "grad_norm": 0.454158753156662, "learning_rate": 4.910268586906491e-07, "loss": 0.3834, "step": 5245 }, { "epoch": 2.6160239361702127, "grad_norm": 0.44187989830970764, "learning_rate": 4.897735207259091e-07, "loss": 0.3597, "step": 5246 }, { "epoch": 2.6165226063829787, "grad_norm": 0.4432563781738281, "learning_rate": 4.885217019859261e-07, "loss": 0.3678, "step": 5247 }, { "epoch": 2.617021276595745, "grad_norm": 0.38753876090049744, "learning_rate": 4.872714028923653e-07, "loss": 0.3657, "step": 5248 }, { "epoch": 2.6175199468085104, "grad_norm": 0.3721773624420166, "learning_rate": 4.860226238663751e-07, "loss": 0.3661, "step": 5249 }, { "epoch": 2.6180186170212765, "grad_norm": 0.35399654507637024, "learning_rate": 4.847753653285969e-07, "loss": 0.3252, "step": 5250 }, { "epoch": 2.6185172872340425, "grad_norm": 0.3490290641784668, "learning_rate": 4.835296276991547e-07, "loss": 0.3355, "step": 5251 }, { "epoch": 2.6190159574468086, "grad_norm": 0.4350147545337677, "learning_rate": 4.822854113976644e-07, "loss": 0.3614, "step": 5252 }, { "epoch": 2.6195146276595747, "grad_norm": 0.40082648396492004, "learning_rate": 4.810427168432269e-07, "loss": 0.3532, "step": 5253 }, { "epoch": 2.6200132978723403, "grad_norm": 0.4188576340675354, "learning_rate": 4.798015444544329e-07, "loss": 0.3577, "step": 5254 }, { "epoch": 2.6205119680851063, "grad_norm": 0.4314863383769989, "learning_rate": 4.785618946493581e-07, "loss": 0.2952, "step": 5255 }, { "epoch": 2.6210106382978724, "grad_norm": 0.44225940108299255, "learning_rate": 4.773237678455644e-07, "loss": 0.3762, "step": 5256 }, { "epoch": 2.6215093085106385, "grad_norm": 0.37801593542099, "learning_rate": 4.760871644601056e-07, "loss": 0.3278, "step": 5257 }, { "epoch": 2.6220079787234045, "grad_norm": 0.395940899848938, "learning_rate": 4.7485208490951675e-07, "loss": 0.3644, "step": 5258 }, { "epoch": 2.62250664893617, "grad_norm": 0.3673909306526184, "learning_rate": 4.736185296098239e-07, "loss": 0.3361, "step": 5259 }, { "epoch": 2.623005319148936, "grad_norm": 0.47306156158447266, "learning_rate": 4.723864989765359e-07, "loss": 0.3733, "step": 5260 }, { "epoch": 2.6235039893617023, "grad_norm": 0.3917618989944458, "learning_rate": 4.711559934246518e-07, "loss": 0.3188, "step": 5261 }, { "epoch": 2.624002659574468, "grad_norm": 0.40424275398254395, "learning_rate": 4.6992701336865345e-07, "loss": 0.3262, "step": 5262 }, { "epoch": 2.624501329787234, "grad_norm": 0.4321897029876709, "learning_rate": 4.6869955922251185e-07, "loss": 0.3533, "step": 5263 }, { "epoch": 2.625, "grad_norm": 0.43326276540756226, "learning_rate": 4.6747363139968195e-07, "loss": 0.3515, "step": 5264 }, { "epoch": 2.625498670212766, "grad_norm": 0.43404823541641235, "learning_rate": 4.662492303131061e-07, "loss": 0.3386, "step": 5265 }, { "epoch": 2.625997340425532, "grad_norm": 0.3932395279407501, "learning_rate": 4.6502635637520997e-07, "loss": 0.3232, "step": 5266 }, { "epoch": 2.6264960106382977, "grad_norm": 0.4217990040779114, "learning_rate": 4.63805009997908e-07, "loss": 0.3292, "step": 5267 }, { "epoch": 2.626994680851064, "grad_norm": 0.40822848677635193, "learning_rate": 4.6258519159259827e-07, "loss": 0.3412, "step": 5268 }, { "epoch": 2.62749335106383, "grad_norm": 0.4132561981678009, "learning_rate": 4.6136690157016386e-07, "loss": 0.3634, "step": 5269 }, { "epoch": 2.6279920212765955, "grad_norm": 0.3847229778766632, "learning_rate": 4.6015014034097415e-07, "loss": 0.3345, "step": 5270 }, { "epoch": 2.6284906914893615, "grad_norm": 0.4476775825023651, "learning_rate": 4.589349083148825e-07, "loss": 0.4013, "step": 5271 }, { "epoch": 2.6289893617021276, "grad_norm": 0.3918900787830353, "learning_rate": 4.577212059012287e-07, "loss": 0.3329, "step": 5272 }, { "epoch": 2.6294880319148937, "grad_norm": 0.3770771324634552, "learning_rate": 4.565090335088346e-07, "loss": 0.3543, "step": 5273 }, { "epoch": 2.6299867021276597, "grad_norm": 0.3874654769897461, "learning_rate": 4.552983915460102e-07, "loss": 0.3139, "step": 5274 }, { "epoch": 2.6304853723404253, "grad_norm": 0.44989049434661865, "learning_rate": 4.5408928042054603e-07, "loss": 0.3925, "step": 5275 }, { "epoch": 2.6309840425531914, "grad_norm": 0.43580082058906555, "learning_rate": 4.5288170053972123e-07, "loss": 0.3513, "step": 5276 }, { "epoch": 2.6314827127659575, "grad_norm": 0.4523051083087921, "learning_rate": 4.5167565231029663e-07, "loss": 0.3465, "step": 5277 }, { "epoch": 2.6319813829787235, "grad_norm": 0.41967278718948364, "learning_rate": 4.504711361385161e-07, "loss": 0.3233, "step": 5278 }, { "epoch": 2.6324800531914896, "grad_norm": 0.4059939980506897, "learning_rate": 4.492681524301101e-07, "loss": 0.3679, "step": 5279 }, { "epoch": 2.632978723404255, "grad_norm": 0.3926205337047577, "learning_rate": 4.480667015902912e-07, "loss": 0.3011, "step": 5280 }, { "epoch": 2.6334773936170213, "grad_norm": 0.4150402247905731, "learning_rate": 4.468667840237567e-07, "loss": 0.3187, "step": 5281 }, { "epoch": 2.6339760638297873, "grad_norm": 0.4140858054161072, "learning_rate": 4.4566840013468614e-07, "loss": 0.3635, "step": 5282 }, { "epoch": 2.634474734042553, "grad_norm": 0.46251916885375977, "learning_rate": 4.444715503267438e-07, "loss": 0.3038, "step": 5283 }, { "epoch": 2.634973404255319, "grad_norm": 0.4292391836643219, "learning_rate": 4.4327623500307614e-07, "loss": 0.3688, "step": 5284 }, { "epoch": 2.635472074468085, "grad_norm": 0.4121970236301422, "learning_rate": 4.4208245456631327e-07, "loss": 0.3241, "step": 5285 }, { "epoch": 2.635970744680851, "grad_norm": 0.44913187623023987, "learning_rate": 4.40890209418568e-07, "loss": 0.3537, "step": 5286 }, { "epoch": 2.636469414893617, "grad_norm": 0.37717336416244507, "learning_rate": 4.3969949996143733e-07, "loss": 0.3649, "step": 5287 }, { "epoch": 2.636968085106383, "grad_norm": 0.40526077151298523, "learning_rate": 4.385103265959989e-07, "loss": 0.3386, "step": 5288 }, { "epoch": 2.637466755319149, "grad_norm": 0.3531191051006317, "learning_rate": 4.3732268972281336e-07, "loss": 0.3141, "step": 5289 }, { "epoch": 2.637965425531915, "grad_norm": 0.44542622566223145, "learning_rate": 4.3613658974192564e-07, "loss": 0.3698, "step": 5290 }, { "epoch": 2.638464095744681, "grad_norm": 0.38685837388038635, "learning_rate": 4.349520270528601e-07, "loss": 0.3164, "step": 5291 }, { "epoch": 2.638962765957447, "grad_norm": 0.3945256173610687, "learning_rate": 4.3376900205462704e-07, "loss": 0.323, "step": 5292 }, { "epoch": 2.6394614361702127, "grad_norm": 0.39249807596206665, "learning_rate": 4.325875151457137e-07, "loss": 0.3006, "step": 5293 }, { "epoch": 2.6399601063829787, "grad_norm": 0.46106964349746704, "learning_rate": 4.314075667240952e-07, "loss": 0.3262, "step": 5294 }, { "epoch": 2.640458776595745, "grad_norm": 0.4207128584384918, "learning_rate": 4.3022915718722305e-07, "loss": 0.3243, "step": 5295 }, { "epoch": 2.6409574468085104, "grad_norm": 0.38512638211250305, "learning_rate": 4.290522869320346e-07, "loss": 0.3655, "step": 5296 }, { "epoch": 2.6414561170212765, "grad_norm": 0.3808280825614929, "learning_rate": 4.278769563549456e-07, "loss": 0.3446, "step": 5297 }, { "epoch": 2.6419547872340425, "grad_norm": 0.4565633535385132, "learning_rate": 4.267031658518539e-07, "loss": 0.3606, "step": 5298 }, { "epoch": 2.6424534574468086, "grad_norm": 0.42940813302993774, "learning_rate": 4.2553091581814035e-07, "loss": 0.2877, "step": 5299 }, { "epoch": 2.6429521276595747, "grad_norm": 0.42761656641960144, "learning_rate": 4.2436020664866575e-07, "loss": 0.3447, "step": 5300 }, { "epoch": 2.6434507978723403, "grad_norm": 0.4501022696495056, "learning_rate": 4.231910387377708e-07, "loss": 0.3592, "step": 5301 }, { "epoch": 2.6439494680851063, "grad_norm": 0.38908651471138, "learning_rate": 4.2202341247927883e-07, "loss": 0.3606, "step": 5302 }, { "epoch": 2.6444481382978724, "grad_norm": 0.4183308780193329, "learning_rate": 4.208573282664924e-07, "loss": 0.3168, "step": 5303 }, { "epoch": 2.6449468085106385, "grad_norm": 0.4377605617046356, "learning_rate": 4.1969278649219625e-07, "loss": 0.3084, "step": 5304 }, { "epoch": 2.6454454787234045, "grad_norm": 0.4453580677509308, "learning_rate": 4.185297875486538e-07, "loss": 0.3564, "step": 5305 }, { "epoch": 2.64594414893617, "grad_norm": 0.479434072971344, "learning_rate": 4.173683318276095e-07, "loss": 0.3277, "step": 5306 }, { "epoch": 2.646442819148936, "grad_norm": 0.39714768528938293, "learning_rate": 4.1620841972028926e-07, "loss": 0.3127, "step": 5307 }, { "epoch": 2.6469414893617023, "grad_norm": 0.4037000834941864, "learning_rate": 4.1505005161739774e-07, "loss": 0.3107, "step": 5308 }, { "epoch": 2.647440159574468, "grad_norm": 0.4388819634914398, "learning_rate": 4.138932279091179e-07, "loss": 0.3777, "step": 5309 }, { "epoch": 2.647938829787234, "grad_norm": 0.386513888835907, "learning_rate": 4.127379489851169e-07, "loss": 0.2694, "step": 5310 }, { "epoch": 2.6484375, "grad_norm": 0.46991661190986633, "learning_rate": 4.115842152345362e-07, "loss": 0.3989, "step": 5311 }, { "epoch": 2.648936170212766, "grad_norm": 0.4424166679382324, "learning_rate": 4.104320270460016e-07, "loss": 0.3018, "step": 5312 }, { "epoch": 2.649434840425532, "grad_norm": 0.4094339907169342, "learning_rate": 4.092813848076144e-07, "loss": 0.3734, "step": 5313 }, { "epoch": 2.6499335106382977, "grad_norm": 0.45658689737319946, "learning_rate": 4.0813228890695945e-07, "loss": 0.4474, "step": 5314 }, { "epoch": 2.650432180851064, "grad_norm": 0.3442841172218323, "learning_rate": 4.069847397310955e-07, "loss": 0.2737, "step": 5315 }, { "epoch": 2.65093085106383, "grad_norm": 0.44886893033981323, "learning_rate": 4.058387376665651e-07, "loss": 0.3519, "step": 5316 }, { "epoch": 2.6514295212765955, "grad_norm": 0.44353437423706055, "learning_rate": 4.0469428309938553e-07, "loss": 0.374, "step": 5317 }, { "epoch": 2.6519281914893615, "grad_norm": 0.3940211236476898, "learning_rate": 4.0355137641505726e-07, "loss": 0.3658, "step": 5318 }, { "epoch": 2.6524268617021276, "grad_norm": 0.37170469760894775, "learning_rate": 4.0241001799855626e-07, "loss": 0.3349, "step": 5319 }, { "epoch": 2.6529255319148937, "grad_norm": 0.37839171290397644, "learning_rate": 4.0127020823433617e-07, "loss": 0.3633, "step": 5320 }, { "epoch": 2.6534242021276597, "grad_norm": 0.3919380307197571, "learning_rate": 4.00131947506332e-07, "loss": 0.3393, "step": 5321 }, { "epoch": 2.6539228723404253, "grad_norm": 0.4539000988006592, "learning_rate": 3.9899523619795487e-07, "loss": 0.3583, "step": 5322 }, { "epoch": 2.6544215425531914, "grad_norm": 0.36331114172935486, "learning_rate": 3.9786007469209574e-07, "loss": 0.3193, "step": 5323 }, { "epoch": 2.6549202127659575, "grad_norm": 0.43053579330444336, "learning_rate": 3.96726463371121e-07, "loss": 0.3738, "step": 5324 }, { "epoch": 2.6554188829787235, "grad_norm": 0.41358473896980286, "learning_rate": 3.955944026168773e-07, "loss": 0.3509, "step": 5325 }, { "epoch": 2.6559175531914896, "grad_norm": 0.38228893280029297, "learning_rate": 3.9446389281068696e-07, "loss": 0.3304, "step": 5326 }, { "epoch": 2.656416223404255, "grad_norm": 0.4355708658695221, "learning_rate": 3.9333493433335256e-07, "loss": 0.3689, "step": 5327 }, { "epoch": 2.6569148936170213, "grad_norm": 0.44939151406288147, "learning_rate": 3.9220752756515046e-07, "loss": 0.359, "step": 5328 }, { "epoch": 2.6574135638297873, "grad_norm": 0.40452253818511963, "learning_rate": 3.9108167288583855e-07, "loss": 0.367, "step": 5329 }, { "epoch": 2.657912234042553, "grad_norm": 0.3735501766204834, "learning_rate": 3.899573706746485e-07, "loss": 0.3302, "step": 5330 }, { "epoch": 2.658410904255319, "grad_norm": 0.4022245705127716, "learning_rate": 3.888346213102895e-07, "loss": 0.3309, "step": 5331 }, { "epoch": 2.658909574468085, "grad_norm": 0.41459041833877563, "learning_rate": 3.877134251709502e-07, "loss": 0.3773, "step": 5332 }, { "epoch": 2.659408244680851, "grad_norm": 0.4257168769836426, "learning_rate": 3.8659378263429303e-07, "loss": 0.3807, "step": 5333 }, { "epoch": 2.659906914893617, "grad_norm": 0.36674419045448303, "learning_rate": 3.8547569407745955e-07, "loss": 0.3061, "step": 5334 }, { "epoch": 2.660405585106383, "grad_norm": 0.3948860168457031, "learning_rate": 3.843591598770652e-07, "loss": 0.3584, "step": 5335 }, { "epoch": 2.660904255319149, "grad_norm": 0.37124496698379517, "learning_rate": 3.8324418040920464e-07, "loss": 0.3631, "step": 5336 }, { "epoch": 2.661402925531915, "grad_norm": 0.3625965118408203, "learning_rate": 3.8213075604944696e-07, "loss": 0.336, "step": 5337 }, { "epoch": 2.661901595744681, "grad_norm": 0.3605426251888275, "learning_rate": 3.8101888717283765e-07, "loss": 0.3315, "step": 5338 }, { "epoch": 2.662400265957447, "grad_norm": 0.37421926856040955, "learning_rate": 3.7990857415390047e-07, "loss": 0.305, "step": 5339 }, { "epoch": 2.6628989361702127, "grad_norm": 0.4042399525642395, "learning_rate": 3.787998173666313e-07, "loss": 0.3734, "step": 5340 }, { "epoch": 2.6633976063829787, "grad_norm": 0.406089186668396, "learning_rate": 3.776926171845047e-07, "loss": 0.3776, "step": 5341 }, { "epoch": 2.663896276595745, "grad_norm": 0.4600531756877899, "learning_rate": 3.7658697398046953e-07, "loss": 0.3607, "step": 5342 }, { "epoch": 2.6643949468085104, "grad_norm": 0.40909233689308167, "learning_rate": 3.754828881269518e-07, "loss": 0.3217, "step": 5343 }, { "epoch": 2.6648936170212765, "grad_norm": 0.4209268093109131, "learning_rate": 3.743803599958501e-07, "loss": 0.3785, "step": 5344 }, { "epoch": 2.6653922872340425, "grad_norm": 0.3940836489200592, "learning_rate": 3.7327938995854184e-07, "loss": 0.3076, "step": 5345 }, { "epoch": 2.6658909574468086, "grad_norm": 0.4194507598876953, "learning_rate": 3.721799783858754e-07, "loss": 0.3021, "step": 5346 }, { "epoch": 2.6663896276595747, "grad_norm": 0.471437007188797, "learning_rate": 3.7108212564817946e-07, "loss": 0.3746, "step": 5347 }, { "epoch": 2.6668882978723403, "grad_norm": 0.39694303274154663, "learning_rate": 3.699858321152522e-07, "loss": 0.3282, "step": 5348 }, { "epoch": 2.6673869680851063, "grad_norm": 0.44893425703048706, "learning_rate": 3.6889109815637147e-07, "loss": 0.3432, "step": 5349 }, { "epoch": 2.6678856382978724, "grad_norm": 0.41719549894332886, "learning_rate": 3.6779792414028516e-07, "loss": 0.3003, "step": 5350 }, { "epoch": 2.6683843085106385, "grad_norm": 0.40289488434791565, "learning_rate": 3.6670631043521976e-07, "loss": 0.3499, "step": 5351 }, { "epoch": 2.6688829787234045, "grad_norm": 0.37604936957359314, "learning_rate": 3.6561625740887396e-07, "loss": 0.3364, "step": 5352 }, { "epoch": 2.66938164893617, "grad_norm": 0.4263797104358673, "learning_rate": 3.6452776542842073e-07, "loss": 0.3755, "step": 5353 }, { "epoch": 2.669880319148936, "grad_norm": 0.47603538632392883, "learning_rate": 3.6344083486050843e-07, "loss": 0.3773, "step": 5354 }, { "epoch": 2.6703789893617023, "grad_norm": 0.4233412742614746, "learning_rate": 3.623554660712575e-07, "loss": 0.3426, "step": 5355 }, { "epoch": 2.670877659574468, "grad_norm": 0.39961862564086914, "learning_rate": 3.612716594262655e-07, "loss": 0.3027, "step": 5356 }, { "epoch": 2.671376329787234, "grad_norm": 0.3947892189025879, "learning_rate": 3.601894152906005e-07, "loss": 0.3617, "step": 5357 }, { "epoch": 2.671875, "grad_norm": 0.3966520428657532, "learning_rate": 3.591087340288063e-07, "loss": 0.3882, "step": 5358 }, { "epoch": 2.672373670212766, "grad_norm": 0.39615488052368164, "learning_rate": 3.5802961600489905e-07, "loss": 0.3625, "step": 5359 }, { "epoch": 2.672872340425532, "grad_norm": 0.38559749722480774, "learning_rate": 3.569520615823696e-07, "loss": 0.296, "step": 5360 }, { "epoch": 2.6733710106382977, "grad_norm": 0.41400694847106934, "learning_rate": 3.5587607112417987e-07, "loss": 0.3727, "step": 5361 }, { "epoch": 2.673869680851064, "grad_norm": 0.43994858860969543, "learning_rate": 3.548016449927688e-07, "loss": 0.3755, "step": 5362 }, { "epoch": 2.67436835106383, "grad_norm": 0.42362886667251587, "learning_rate": 3.537287835500447e-07, "loss": 0.3611, "step": 5363 }, { "epoch": 2.6748670212765955, "grad_norm": 0.37459471821784973, "learning_rate": 3.5265748715738955e-07, "loss": 0.3089, "step": 5364 }, { "epoch": 2.6753656914893615, "grad_norm": 0.38761579990386963, "learning_rate": 3.5158775617566087e-07, "loss": 0.3313, "step": 5365 }, { "epoch": 2.6758643617021276, "grad_norm": 0.4051671028137207, "learning_rate": 3.505195909651848e-07, "loss": 0.3644, "step": 5366 }, { "epoch": 2.6763630319148937, "grad_norm": 0.42223167419433594, "learning_rate": 3.4945299188576363e-07, "loss": 0.314, "step": 5367 }, { "epoch": 2.6768617021276597, "grad_norm": 0.37236958742141724, "learning_rate": 3.4838795929666934e-07, "loss": 0.3434, "step": 5368 }, { "epoch": 2.6773603723404253, "grad_norm": 0.4053034782409668, "learning_rate": 3.473244935566489e-07, "loss": 0.2758, "step": 5369 }, { "epoch": 2.6778590425531914, "grad_norm": 0.3956943154335022, "learning_rate": 3.4626259502391843e-07, "loss": 0.3532, "step": 5370 }, { "epoch": 2.6783577127659575, "grad_norm": 0.4058333933353424, "learning_rate": 3.452022640561703e-07, "loss": 0.3246, "step": 5371 }, { "epoch": 2.6788563829787235, "grad_norm": 0.426653116941452, "learning_rate": 3.441435010105637e-07, "loss": 0.2699, "step": 5372 }, { "epoch": 2.6793550531914896, "grad_norm": 0.43874165415763855, "learning_rate": 3.430863062437345e-07, "loss": 0.3403, "step": 5373 }, { "epoch": 2.679853723404255, "grad_norm": 0.4610217213630676, "learning_rate": 3.420306801117862e-07, "loss": 0.3809, "step": 5374 }, { "epoch": 2.6803523936170213, "grad_norm": 0.4019734263420105, "learning_rate": 3.409766229702982e-07, "loss": 0.3375, "step": 5375 }, { "epoch": 2.6808510638297873, "grad_norm": 0.40692415833473206, "learning_rate": 3.3992413517431764e-07, "loss": 0.353, "step": 5376 }, { "epoch": 2.681349734042553, "grad_norm": 0.4849611520767212, "learning_rate": 3.388732170783643e-07, "loss": 0.3492, "step": 5377 }, { "epoch": 2.681848404255319, "grad_norm": 0.441057950258255, "learning_rate": 3.3782386903643104e-07, "loss": 0.2959, "step": 5378 }, { "epoch": 2.682347074468085, "grad_norm": 0.38563424348831177, "learning_rate": 3.3677609140197853e-07, "loss": 0.3667, "step": 5379 }, { "epoch": 2.682845744680851, "grad_norm": 0.40629127621650696, "learning_rate": 3.3572988452794167e-07, "loss": 0.3166, "step": 5380 }, { "epoch": 2.683344414893617, "grad_norm": 0.46485939621925354, "learning_rate": 3.346852487667229e-07, "loss": 0.3141, "step": 5381 }, { "epoch": 2.683843085106383, "grad_norm": 0.4546757638454437, "learning_rate": 3.336421844701998e-07, "loss": 0.3119, "step": 5382 }, { "epoch": 2.684341755319149, "grad_norm": 0.41375136375427246, "learning_rate": 3.326006919897168e-07, "loss": 0.368, "step": 5383 }, { "epoch": 2.684840425531915, "grad_norm": 0.38976749777793884, "learning_rate": 3.3156077167608934e-07, "loss": 0.3147, "step": 5384 }, { "epoch": 2.685339095744681, "grad_norm": 0.40771058201789856, "learning_rate": 3.305224238796056e-07, "loss": 0.34, "step": 5385 }, { "epoch": 2.685837765957447, "grad_norm": 0.44064027070999146, "learning_rate": 3.29485648950022e-07, "loss": 0.331, "step": 5386 }, { "epoch": 2.6863364361702127, "grad_norm": 0.37482523918151855, "learning_rate": 3.284504472365668e-07, "loss": 0.2816, "step": 5387 }, { "epoch": 2.6868351063829787, "grad_norm": 0.42515265941619873, "learning_rate": 3.2741681908793563e-07, "loss": 0.3538, "step": 5388 }, { "epoch": 2.687333776595745, "grad_norm": 0.45048919320106506, "learning_rate": 3.2638476485229774e-07, "loss": 0.3163, "step": 5389 }, { "epoch": 2.6878324468085104, "grad_norm": 0.44853416085243225, "learning_rate": 3.253542848772889e-07, "loss": 0.3932, "step": 5390 }, { "epoch": 2.6883311170212765, "grad_norm": 0.3928111791610718, "learning_rate": 3.243253795100171e-07, "loss": 0.3044, "step": 5391 }, { "epoch": 2.6888297872340425, "grad_norm": 0.4021323025226593, "learning_rate": 3.232980490970572e-07, "loss": 0.2992, "step": 5392 }, { "epoch": 2.6893284574468086, "grad_norm": 0.4029253125190735, "learning_rate": 3.2227229398445694e-07, "loss": 0.3564, "step": 5393 }, { "epoch": 2.6898271276595747, "grad_norm": 0.48318955302238464, "learning_rate": 3.21248114517731e-07, "loss": 0.3352, "step": 5394 }, { "epoch": 2.6903257978723403, "grad_norm": 0.4501648247241974, "learning_rate": 3.202255110418634e-07, "loss": 0.3127, "step": 5395 }, { "epoch": 2.6908244680851063, "grad_norm": 0.4655812978744507, "learning_rate": 3.1920448390130866e-07, "loss": 0.34, "step": 5396 }, { "epoch": 2.6913231382978724, "grad_norm": 0.4683706760406494, "learning_rate": 3.181850334399894e-07, "loss": 0.3789, "step": 5397 }, { "epoch": 2.6918218085106385, "grad_norm": 0.36279988288879395, "learning_rate": 3.1716716000129765e-07, "loss": 0.313, "step": 5398 }, { "epoch": 2.6923204787234045, "grad_norm": 0.35572516918182373, "learning_rate": 3.1615086392809245e-07, "loss": 0.3171, "step": 5399 }, { "epoch": 2.69281914893617, "grad_norm": 0.4177791178226471, "learning_rate": 3.1513614556270553e-07, "loss": 0.3441, "step": 5400 }, { "epoch": 2.693317819148936, "grad_norm": 0.4156601130962372, "learning_rate": 3.141230052469324e-07, "loss": 0.3684, "step": 5401 }, { "epoch": 2.6938164893617023, "grad_norm": 0.43917250633239746, "learning_rate": 3.1311144332204057e-07, "loss": 0.3164, "step": 5402 }, { "epoch": 2.694315159574468, "grad_norm": 0.39618608355522156, "learning_rate": 3.121014601287631e-07, "loss": 0.3174, "step": 5403 }, { "epoch": 2.694813829787234, "grad_norm": 0.4454222619533539, "learning_rate": 3.11093056007305e-07, "loss": 0.3462, "step": 5404 }, { "epoch": 2.6953125, "grad_norm": 0.40447667241096497, "learning_rate": 3.100862312973357e-07, "loss": 0.3011, "step": 5405 }, { "epoch": 2.695811170212766, "grad_norm": 0.4152494966983795, "learning_rate": 3.0908098633799345e-07, "loss": 0.3139, "step": 5406 }, { "epoch": 2.696309840425532, "grad_norm": 0.4269630014896393, "learning_rate": 3.0807732146788614e-07, "loss": 0.3563, "step": 5407 }, { "epoch": 2.6968085106382977, "grad_norm": 0.3886622190475464, "learning_rate": 3.0707523702508737e-07, "loss": 0.3414, "step": 5408 }, { "epoch": 2.697307180851064, "grad_norm": 0.3874417245388031, "learning_rate": 3.060747333471409e-07, "loss": 0.3533, "step": 5409 }, { "epoch": 2.69780585106383, "grad_norm": 0.46475473046302795, "learning_rate": 3.0507581077105385e-07, "loss": 0.3103, "step": 5410 }, { "epoch": 2.6983045212765955, "grad_norm": 0.457292765378952, "learning_rate": 3.040784696333055e-07, "loss": 0.3088, "step": 5411 }, { "epoch": 2.6988031914893615, "grad_norm": 0.3962119221687317, "learning_rate": 3.0308271026983973e-07, "loss": 0.3293, "step": 5412 }, { "epoch": 2.6993018617021276, "grad_norm": 0.4314926862716675, "learning_rate": 3.020885330160672e-07, "loss": 0.3568, "step": 5413 }, { "epoch": 2.6998005319148937, "grad_norm": 0.3900313973426819, "learning_rate": 3.010959382068679e-07, "loss": 0.2646, "step": 5414 }, { "epoch": 2.7002992021276597, "grad_norm": 0.4546367824077606, "learning_rate": 3.001049261765865e-07, "loss": 0.3517, "step": 5415 }, { "epoch": 2.7007978723404253, "grad_norm": 0.39359861612319946, "learning_rate": 2.9911549725903656e-07, "loss": 0.3016, "step": 5416 }, { "epoch": 2.7012965425531914, "grad_norm": 0.43419569730758667, "learning_rate": 2.9812765178749594e-07, "loss": 0.3138, "step": 5417 }, { "epoch": 2.7017952127659575, "grad_norm": 0.45188403129577637, "learning_rate": 2.9714139009471177e-07, "loss": 0.4167, "step": 5418 }, { "epoch": 2.7022938829787235, "grad_norm": 0.4741210341453552, "learning_rate": 2.9615671251289503e-07, "loss": 0.3566, "step": 5419 }, { "epoch": 2.7027925531914896, "grad_norm": 0.40261968970298767, "learning_rate": 2.9517361937372646e-07, "loss": 0.3028, "step": 5420 }, { "epoch": 2.703291223404255, "grad_norm": 0.40622517466545105, "learning_rate": 2.9419211100834957e-07, "loss": 0.3449, "step": 5421 }, { "epoch": 2.7037898936170213, "grad_norm": 0.3993648588657379, "learning_rate": 2.9321218774737704e-07, "loss": 0.3796, "step": 5422 }, { "epoch": 2.7042885638297873, "grad_norm": 0.39916160702705383, "learning_rate": 2.922338499208843e-07, "loss": 0.3022, "step": 5423 }, { "epoch": 2.704787234042553, "grad_norm": 0.4753832221031189, "learning_rate": 2.9125709785841673e-07, "loss": 0.3377, "step": 5424 }, { "epoch": 2.705285904255319, "grad_norm": 0.4810241162776947, "learning_rate": 2.9028193188898156e-07, "loss": 0.4018, "step": 5425 }, { "epoch": 2.705784574468085, "grad_norm": 0.3583473265171051, "learning_rate": 2.893083523410556e-07, "loss": 0.2583, "step": 5426 }, { "epoch": 2.706283244680851, "grad_norm": 0.4103499948978424, "learning_rate": 2.883363595425787e-07, "loss": 0.3587, "step": 5427 }, { "epoch": 2.706781914893617, "grad_norm": 0.3877948522567749, "learning_rate": 2.8736595382095555e-07, "loss": 0.2909, "step": 5428 }, { "epoch": 2.707280585106383, "grad_norm": 0.4034360349178314, "learning_rate": 2.8639713550306034e-07, "loss": 0.3404, "step": 5429 }, { "epoch": 2.707779255319149, "grad_norm": 0.4398806691169739, "learning_rate": 2.8542990491522705e-07, "loss": 0.3322, "step": 5430 }, { "epoch": 2.708277925531915, "grad_norm": 0.4162849485874176, "learning_rate": 2.8446426238325996e-07, "loss": 0.2586, "step": 5431 }, { "epoch": 2.708776595744681, "grad_norm": 0.4660519063472748, "learning_rate": 2.8350020823242397e-07, "loss": 0.4079, "step": 5432 }, { "epoch": 2.709275265957447, "grad_norm": 0.4045047461986542, "learning_rate": 2.825377427874532e-07, "loss": 0.3367, "step": 5433 }, { "epoch": 2.7097739361702127, "grad_norm": 0.37894028425216675, "learning_rate": 2.815768663725427e-07, "loss": 0.3507, "step": 5434 }, { "epoch": 2.7102726063829787, "grad_norm": 0.3845669627189636, "learning_rate": 2.806175793113558e-07, "loss": 0.3933, "step": 5435 }, { "epoch": 2.710771276595745, "grad_norm": 0.4266326129436493, "learning_rate": 2.796598819270174e-07, "loss": 0.3585, "step": 5436 }, { "epoch": 2.7112699468085104, "grad_norm": 0.43038883805274963, "learning_rate": 2.787037745421195e-07, "loss": 0.3532, "step": 5437 }, { "epoch": 2.7117686170212765, "grad_norm": 0.42637380957603455, "learning_rate": 2.7774925747871604e-07, "loss": 0.2968, "step": 5438 }, { "epoch": 2.7122672872340425, "grad_norm": 0.4412405788898468, "learning_rate": 2.7679633105832713e-07, "loss": 0.3558, "step": 5439 }, { "epoch": 2.7127659574468086, "grad_norm": 0.37049147486686707, "learning_rate": 2.758449956019371e-07, "loss": 0.3062, "step": 5440 }, { "epoch": 2.7132646276595747, "grad_norm": 0.3644963204860687, "learning_rate": 2.748952514299924e-07, "loss": 0.3636, "step": 5441 }, { "epoch": 2.7137632978723403, "grad_norm": 0.44460931420326233, "learning_rate": 2.7394709886240654e-07, "loss": 0.3733, "step": 5442 }, { "epoch": 2.7142619680851063, "grad_norm": 0.4161943197250366, "learning_rate": 2.730005382185535e-07, "loss": 0.2264, "step": 5443 }, { "epoch": 2.7147606382978724, "grad_norm": 0.40380382537841797, "learning_rate": 2.7205556981727444e-07, "loss": 0.2907, "step": 5444 }, { "epoch": 2.7152593085106385, "grad_norm": 0.4157249927520752, "learning_rate": 2.7111219397687125e-07, "loss": 0.3217, "step": 5445 }, { "epoch": 2.7157579787234045, "grad_norm": 0.4412188231945038, "learning_rate": 2.701704110151121e-07, "loss": 0.3462, "step": 5446 }, { "epoch": 2.71625664893617, "grad_norm": 0.41819897294044495, "learning_rate": 2.6923022124922535e-07, "loss": 0.3412, "step": 5447 }, { "epoch": 2.716755319148936, "grad_norm": 0.40472662448883057, "learning_rate": 2.682916249959061e-07, "loss": 0.3428, "step": 5448 }, { "epoch": 2.7172539893617023, "grad_norm": 0.42342162132263184, "learning_rate": 2.673546225713103e-07, "loss": 0.3573, "step": 5449 }, { "epoch": 2.717752659574468, "grad_norm": 0.37695813179016113, "learning_rate": 2.6641921429105765e-07, "loss": 0.3454, "step": 5450 }, { "epoch": 2.718251329787234, "grad_norm": 0.4307754933834076, "learning_rate": 2.654854004702323e-07, "loss": 0.3505, "step": 5451 }, { "epoch": 2.71875, "grad_norm": 0.42666780948638916, "learning_rate": 2.645531814233787e-07, "loss": 0.3786, "step": 5452 }, { "epoch": 2.719248670212766, "grad_norm": 0.42794349789619446, "learning_rate": 2.636225574645063e-07, "loss": 0.3277, "step": 5453 }, { "epoch": 2.719747340425532, "grad_norm": 0.39480042457580566, "learning_rate": 2.6269352890708644e-07, "loss": 0.3504, "step": 5454 }, { "epoch": 2.7202460106382977, "grad_norm": 0.382810115814209, "learning_rate": 2.6176609606405334e-07, "loss": 0.2888, "step": 5455 }, { "epoch": 2.720744680851064, "grad_norm": 0.39765283465385437, "learning_rate": 2.6084025924780265e-07, "loss": 0.3828, "step": 5456 }, { "epoch": 2.72124335106383, "grad_norm": 0.3760170638561249, "learning_rate": 2.599160187701943e-07, "loss": 0.3223, "step": 5457 }, { "epoch": 2.7217420212765955, "grad_norm": 0.4009666442871094, "learning_rate": 2.5899337494254886e-07, "loss": 0.2984, "step": 5458 }, { "epoch": 2.7222406914893615, "grad_norm": 0.4175800681114197, "learning_rate": 2.580723280756492e-07, "loss": 0.3221, "step": 5459 }, { "epoch": 2.7227393617021276, "grad_norm": 0.4111575186252594, "learning_rate": 2.571528784797417e-07, "loss": 0.362, "step": 5460 }, { "epoch": 2.7232380319148937, "grad_norm": 0.4347602128982544, "learning_rate": 2.56235026464533e-07, "loss": 0.3356, "step": 5461 }, { "epoch": 2.7237367021276597, "grad_norm": 0.41952627897262573, "learning_rate": 2.553187723391937e-07, "loss": 0.3761, "step": 5462 }, { "epoch": 2.7242353723404253, "grad_norm": 0.3786693811416626, "learning_rate": 2.5440411641235285e-07, "loss": 0.3256, "step": 5463 }, { "epoch": 2.7247340425531914, "grad_norm": 0.40594494342803955, "learning_rate": 2.5349105899210523e-07, "loss": 0.3452, "step": 5464 }, { "epoch": 2.7252327127659575, "grad_norm": 0.4253160357475281, "learning_rate": 2.525796003860037e-07, "loss": 0.3712, "step": 5465 }, { "epoch": 2.7257313829787235, "grad_norm": 0.3945719599723816, "learning_rate": 2.516697409010649e-07, "loss": 0.2866, "step": 5466 }, { "epoch": 2.7262300531914896, "grad_norm": 0.4125126600265503, "learning_rate": 2.5076148084376475e-07, "loss": 0.3801, "step": 5467 }, { "epoch": 2.726728723404255, "grad_norm": 0.4276767373085022, "learning_rate": 2.4985482052004296e-07, "loss": 0.3605, "step": 5468 }, { "epoch": 2.7272273936170213, "grad_norm": 0.3617915213108063, "learning_rate": 2.4894976023529807e-07, "loss": 0.3161, "step": 5469 }, { "epoch": 2.7277260638297873, "grad_norm": 0.43971335887908936, "learning_rate": 2.4804630029439056e-07, "loss": 0.3556, "step": 5470 }, { "epoch": 2.728224734042553, "grad_norm": 0.41182419657707214, "learning_rate": 2.471444410016427e-07, "loss": 0.3602, "step": 5471 }, { "epoch": 2.728723404255319, "grad_norm": 0.3914492726325989, "learning_rate": 2.462441826608358e-07, "loss": 0.3085, "step": 5472 }, { "epoch": 2.729222074468085, "grad_norm": 0.36631569266319275, "learning_rate": 2.4534552557521407e-07, "loss": 0.3169, "step": 5473 }, { "epoch": 2.729720744680851, "grad_norm": 0.3886937201023102, "learning_rate": 2.4444847004747975e-07, "loss": 0.3624, "step": 5474 }, { "epoch": 2.730219414893617, "grad_norm": 0.37706172466278076, "learning_rate": 2.435530163797983e-07, "loss": 0.3483, "step": 5475 }, { "epoch": 2.730718085106383, "grad_norm": 0.37352585792541504, "learning_rate": 2.4265916487379306e-07, "loss": 0.2962, "step": 5476 }, { "epoch": 2.731216755319149, "grad_norm": 0.44510379433631897, "learning_rate": 2.417669158305508e-07, "loss": 0.3547, "step": 5477 }, { "epoch": 2.731715425531915, "grad_norm": 0.3695046305656433, "learning_rate": 2.408762695506145e-07, "loss": 0.3314, "step": 5478 }, { "epoch": 2.732214095744681, "grad_norm": 0.3634110391139984, "learning_rate": 2.39987226333992e-07, "loss": 0.3171, "step": 5479 }, { "epoch": 2.732712765957447, "grad_norm": 0.43051207065582275, "learning_rate": 2.3909978648014643e-07, "loss": 0.4032, "step": 5480 }, { "epoch": 2.7332114361702127, "grad_norm": 0.4053167700767517, "learning_rate": 2.3821395028800377e-07, "loss": 0.3341, "step": 5481 }, { "epoch": 2.7337101063829787, "grad_norm": 0.4268244504928589, "learning_rate": 2.3732971805594918e-07, "loss": 0.3286, "step": 5482 }, { "epoch": 2.734208776595745, "grad_norm": 0.43906375765800476, "learning_rate": 2.3644709008182776e-07, "loss": 0.332, "step": 5483 }, { "epoch": 2.7347074468085104, "grad_norm": 0.42156437039375305, "learning_rate": 2.3556606666294334e-07, "loss": 0.3324, "step": 5484 }, { "epoch": 2.7352061170212765, "grad_norm": 0.43227097392082214, "learning_rate": 2.3468664809606024e-07, "loss": 0.312, "step": 5485 }, { "epoch": 2.7357047872340425, "grad_norm": 0.4051015377044678, "learning_rate": 2.33808834677402e-07, "loss": 0.2966, "step": 5486 }, { "epoch": 2.7362034574468086, "grad_norm": 0.38720768690109253, "learning_rate": 2.3293262670265105e-07, "loss": 0.3346, "step": 5487 }, { "epoch": 2.7367021276595747, "grad_norm": 0.409649521112442, "learning_rate": 2.3205802446694847e-07, "loss": 0.3429, "step": 5488 }, { "epoch": 2.7372007978723403, "grad_norm": 0.4180120527744293, "learning_rate": 2.3118502826489697e-07, "loss": 0.3469, "step": 5489 }, { "epoch": 2.7376994680851063, "grad_norm": 0.4601738452911377, "learning_rate": 2.3031363839055577e-07, "loss": 0.3243, "step": 5490 }, { "epoch": 2.7381981382978724, "grad_norm": 0.39982542395591736, "learning_rate": 2.2944385513744394e-07, "loss": 0.3356, "step": 5491 }, { "epoch": 2.7386968085106385, "grad_norm": 0.38339167833328247, "learning_rate": 2.2857567879853882e-07, "loss": 0.3469, "step": 5492 }, { "epoch": 2.7391954787234045, "grad_norm": 0.400400847196579, "learning_rate": 2.2770910966627756e-07, "loss": 0.3022, "step": 5493 }, { "epoch": 2.73969414893617, "grad_norm": 0.46676287055015564, "learning_rate": 2.268441480325545e-07, "loss": 0.3676, "step": 5494 }, { "epoch": 2.740192819148936, "grad_norm": 0.4009566009044647, "learning_rate": 2.2598079418872432e-07, "loss": 0.371, "step": 5495 }, { "epoch": 2.7406914893617023, "grad_norm": 0.3551243245601654, "learning_rate": 2.2511904842559828e-07, "loss": 0.265, "step": 5496 }, { "epoch": 2.741190159574468, "grad_norm": 0.4679122865200043, "learning_rate": 2.242589110334481e-07, "loss": 0.3123, "step": 5497 }, { "epoch": 2.741688829787234, "grad_norm": 0.38378995656967163, "learning_rate": 2.234003823020009e-07, "loss": 0.3684, "step": 5498 }, { "epoch": 2.7421875, "grad_norm": 0.4112095832824707, "learning_rate": 2.2254346252044477e-07, "loss": 0.3497, "step": 5499 }, { "epoch": 2.742686170212766, "grad_norm": 0.41504478454589844, "learning_rate": 2.216881519774239e-07, "loss": 0.3447, "step": 5500 }, { "epoch": 2.743184840425532, "grad_norm": 0.4196903705596924, "learning_rate": 2.2083445096104162e-07, "loss": 0.3213, "step": 5501 }, { "epoch": 2.7436835106382977, "grad_norm": 0.40301501750946045, "learning_rate": 2.1998235975885795e-07, "loss": 0.3243, "step": 5502 }, { "epoch": 2.744182180851064, "grad_norm": 0.423934668302536, "learning_rate": 2.1913187865789109e-07, "loss": 0.3429, "step": 5503 }, { "epoch": 2.74468085106383, "grad_norm": 0.3820303678512573, "learning_rate": 2.1828300794461855e-07, "loss": 0.3209, "step": 5504 }, { "epoch": 2.7451795212765955, "grad_norm": 0.39468252658843994, "learning_rate": 2.1743574790497212e-07, "loss": 0.3926, "step": 5505 }, { "epoch": 2.7456781914893615, "grad_norm": 0.3841380178928375, "learning_rate": 2.165900988243447e-07, "loss": 0.3141, "step": 5506 }, { "epoch": 2.7461768617021276, "grad_norm": 0.3865705132484436, "learning_rate": 2.1574606098758278e-07, "loss": 0.3357, "step": 5507 }, { "epoch": 2.7466755319148937, "grad_norm": 0.38853543996810913, "learning_rate": 2.14903634678994e-07, "loss": 0.2991, "step": 5508 }, { "epoch": 2.7471742021276597, "grad_norm": 0.43209418654441833, "learning_rate": 2.140628201823397e-07, "loss": 0.3534, "step": 5509 }, { "epoch": 2.7476728723404253, "grad_norm": 0.35858801007270813, "learning_rate": 2.132236177808411e-07, "loss": 0.3479, "step": 5510 }, { "epoch": 2.7481715425531914, "grad_norm": 0.3654952645301819, "learning_rate": 2.1238602775717432e-07, "loss": 0.3408, "step": 5511 }, { "epoch": 2.7486702127659575, "grad_norm": 0.3847557306289673, "learning_rate": 2.1155005039347365e-07, "loss": 0.3144, "step": 5512 }, { "epoch": 2.7491688829787235, "grad_norm": 0.3767598867416382, "learning_rate": 2.1071568597132996e-07, "loss": 0.3385, "step": 5513 }, { "epoch": 2.7496675531914896, "grad_norm": 0.40384185314178467, "learning_rate": 2.09882934771789e-07, "loss": 0.3316, "step": 5514 }, { "epoch": 2.750166223404255, "grad_norm": 0.4243508279323578, "learning_rate": 2.0905179707535638e-07, "loss": 0.3936, "step": 5515 }, { "epoch": 2.7506648936170213, "grad_norm": 0.3844226598739624, "learning_rate": 2.0822227316199207e-07, "loss": 0.3591, "step": 5516 }, { "epoch": 2.7511635638297873, "grad_norm": 0.3821580111980438, "learning_rate": 2.0739436331111306e-07, "loss": 0.3561, "step": 5517 }, { "epoch": 2.751662234042553, "grad_norm": 0.36840444803237915, "learning_rate": 2.0656806780159133e-07, "loss": 0.3077, "step": 5518 }, { "epoch": 2.752160904255319, "grad_norm": 0.4326072633266449, "learning_rate": 2.0574338691175867e-07, "loss": 0.3913, "step": 5519 }, { "epoch": 2.752659574468085, "grad_norm": 0.37835949659347534, "learning_rate": 2.0492032091939784e-07, "loss": 0.3372, "step": 5520 }, { "epoch": 2.753158244680851, "grad_norm": 0.4143622815608978, "learning_rate": 2.0409887010175212e-07, "loss": 0.351, "step": 5521 }, { "epoch": 2.753656914893617, "grad_norm": 0.38555553555488586, "learning_rate": 2.0327903473551902e-07, "loss": 0.3477, "step": 5522 }, { "epoch": 2.754155585106383, "grad_norm": 0.3822074830532074, "learning_rate": 2.02460815096851e-07, "loss": 0.3686, "step": 5523 }, { "epoch": 2.754654255319149, "grad_norm": 0.41762593388557434, "learning_rate": 2.0164421146135815e-07, "loss": 0.31, "step": 5524 }, { "epoch": 2.755152925531915, "grad_norm": 0.42522644996643066, "learning_rate": 2.0082922410410432e-07, "loss": 0.3236, "step": 5525 }, { "epoch": 2.755651595744681, "grad_norm": 0.3968725800514221, "learning_rate": 2.0001585329961104e-07, "loss": 0.3706, "step": 5526 }, { "epoch": 2.756150265957447, "grad_norm": 0.4175073802471161, "learning_rate": 1.9920409932185249e-07, "loss": 0.3175, "step": 5527 }, { "epoch": 2.7566489361702127, "grad_norm": 0.3964872360229492, "learning_rate": 1.9839396244426157e-07, "loss": 0.3425, "step": 5528 }, { "epoch": 2.7571476063829787, "grad_norm": 0.4857133626937866, "learning_rate": 1.9758544293972392e-07, "loss": 0.327, "step": 5529 }, { "epoch": 2.757646276595745, "grad_norm": 0.429433137178421, "learning_rate": 1.9677854108058115e-07, "loss": 0.3255, "step": 5530 }, { "epoch": 2.7581449468085104, "grad_norm": 0.37427666783332825, "learning_rate": 1.9597325713863024e-07, "loss": 0.3199, "step": 5531 }, { "epoch": 2.7586436170212765, "grad_norm": 0.4189847707748413, "learning_rate": 1.951695913851237e-07, "loss": 0.3647, "step": 5532 }, { "epoch": 2.7591422872340425, "grad_norm": 0.37527576088905334, "learning_rate": 1.943675440907683e-07, "loss": 0.3012, "step": 5533 }, { "epoch": 2.7596409574468086, "grad_norm": 0.426587849855423, "learning_rate": 1.9356711552572404e-07, "loss": 0.3301, "step": 5534 }, { "epoch": 2.7601396276595747, "grad_norm": 0.4147343635559082, "learning_rate": 1.927683059596086e-07, "loss": 0.3455, "step": 5535 }, { "epoch": 2.7606382978723403, "grad_norm": 0.45991337299346924, "learning_rate": 1.9197111566149285e-07, "loss": 0.357, "step": 5536 }, { "epoch": 2.7611369680851063, "grad_norm": 0.40265771746635437, "learning_rate": 1.9117554489990252e-07, "loss": 0.342, "step": 5537 }, { "epoch": 2.7616356382978724, "grad_norm": 0.3784398138523102, "learning_rate": 1.9038159394281718e-07, "loss": 0.3458, "step": 5538 }, { "epoch": 2.7621343085106385, "grad_norm": 0.38888880610466003, "learning_rate": 1.895892630576718e-07, "loss": 0.3516, "step": 5539 }, { "epoch": 2.7626329787234045, "grad_norm": 0.4030623137950897, "learning_rate": 1.8879855251135505e-07, "loss": 0.3339, "step": 5540 }, { "epoch": 2.76313164893617, "grad_norm": 0.4481988251209259, "learning_rate": 1.8800946257020946e-07, "loss": 0.3488, "step": 5541 }, { "epoch": 2.763630319148936, "grad_norm": 0.43829861283302307, "learning_rate": 1.8722199350003246e-07, "loss": 0.271, "step": 5542 }, { "epoch": 2.7641289893617023, "grad_norm": 0.40774106979370117, "learning_rate": 1.8643614556607515e-07, "loss": 0.3284, "step": 5543 }, { "epoch": 2.764627659574468, "grad_norm": 0.3963404893875122, "learning_rate": 1.8565191903304302e-07, "loss": 0.3483, "step": 5544 }, { "epoch": 2.765126329787234, "grad_norm": 0.4062391519546509, "learning_rate": 1.8486931416509314e-07, "loss": 0.3311, "step": 5545 }, { "epoch": 2.765625, "grad_norm": 0.41731780767440796, "learning_rate": 1.840883312258407e-07, "loss": 0.3703, "step": 5546 }, { "epoch": 2.766123670212766, "grad_norm": 0.371147483587265, "learning_rate": 1.8330897047834972e-07, "loss": 0.3271, "step": 5547 }, { "epoch": 2.766622340425532, "grad_norm": 0.37927865982055664, "learning_rate": 1.825312321851419e-07, "loss": 0.3633, "step": 5548 }, { "epoch": 2.7671210106382977, "grad_norm": 0.3625994920730591, "learning_rate": 1.817551166081888e-07, "loss": 0.2575, "step": 5549 }, { "epoch": 2.767619680851064, "grad_norm": 0.4243813455104828, "learning_rate": 1.8098062400891903e-07, "loss": 0.3611, "step": 5550 }, { "epoch": 2.76811835106383, "grad_norm": 0.38565585017204285, "learning_rate": 1.802077546482117e-07, "loss": 0.3684, "step": 5551 }, { "epoch": 2.7686170212765955, "grad_norm": 0.4059673249721527, "learning_rate": 1.794365087864003e-07, "loss": 0.2786, "step": 5552 }, { "epoch": 2.7691156914893615, "grad_norm": 0.3929993510246277, "learning_rate": 1.786668866832708e-07, "loss": 0.3813, "step": 5553 }, { "epoch": 2.7696143617021276, "grad_norm": 0.3938506543636322, "learning_rate": 1.7789888859806425e-07, "loss": 0.2887, "step": 5554 }, { "epoch": 2.7701130319148937, "grad_norm": 0.4220193326473236, "learning_rate": 1.771325147894709e-07, "loss": 0.3646, "step": 5555 }, { "epoch": 2.7706117021276597, "grad_norm": 0.36979085206985474, "learning_rate": 1.7636776551563872e-07, "loss": 0.3831, "step": 5556 }, { "epoch": 2.7711103723404253, "grad_norm": 0.4447959065437317, "learning_rate": 1.756046410341633e-07, "loss": 0.366, "step": 5557 }, { "epoch": 2.7716090425531914, "grad_norm": 0.4132740795612335, "learning_rate": 1.7484314160209736e-07, "loss": 0.3228, "step": 5558 }, { "epoch": 2.7721077127659575, "grad_norm": 0.38483214378356934, "learning_rate": 1.7408326747594294e-07, "loss": 0.3179, "step": 5559 }, { "epoch": 2.7726063829787235, "grad_norm": 0.38201624155044556, "learning_rate": 1.73325018911657e-07, "loss": 0.3235, "step": 5560 }, { "epoch": 2.7731050531914896, "grad_norm": 0.4203132092952728, "learning_rate": 1.7256839616464848e-07, "loss": 0.3263, "step": 5561 }, { "epoch": 2.773603723404255, "grad_norm": 0.4277400076389313, "learning_rate": 1.7181339948977695e-07, "loss": 0.3124, "step": 5562 }, { "epoch": 2.7741023936170213, "grad_norm": 0.412293404340744, "learning_rate": 1.7106002914135667e-07, "loss": 0.3152, "step": 5563 }, { "epoch": 2.7746010638297873, "grad_norm": 0.3655279576778412, "learning_rate": 1.7030828537315136e-07, "loss": 0.3231, "step": 5564 }, { "epoch": 2.775099734042553, "grad_norm": 0.4365305006504059, "learning_rate": 1.695581684383807e-07, "loss": 0.276, "step": 5565 }, { "epoch": 2.775598404255319, "grad_norm": 0.4485933184623718, "learning_rate": 1.6880967858971197e-07, "loss": 0.3549, "step": 5566 }, { "epoch": 2.776097074468085, "grad_norm": 0.4294889569282532, "learning_rate": 1.680628160792669e-07, "loss": 0.3587, "step": 5567 }, { "epoch": 2.776595744680851, "grad_norm": 0.44273823499679565, "learning_rate": 1.673175811586203e-07, "loss": 0.3632, "step": 5568 }, { "epoch": 2.777094414893617, "grad_norm": 0.3961699903011322, "learning_rate": 1.6657397407879472e-07, "loss": 0.3571, "step": 5569 }, { "epoch": 2.777593085106383, "grad_norm": 0.39416319131851196, "learning_rate": 1.6583199509026815e-07, "loss": 0.2923, "step": 5570 }, { "epoch": 2.778091755319149, "grad_norm": 0.42462414503097534, "learning_rate": 1.6509164444296844e-07, "loss": 0.363, "step": 5571 }, { "epoch": 2.778590425531915, "grad_norm": 0.3883436620235443, "learning_rate": 1.6435292238627609e-07, "loss": 0.3145, "step": 5572 }, { "epoch": 2.779089095744681, "grad_norm": 0.4612589180469513, "learning_rate": 1.6361582916902152e-07, "loss": 0.3621, "step": 5573 }, { "epoch": 2.779587765957447, "grad_norm": 0.42756974697113037, "learning_rate": 1.6288036503948723e-07, "loss": 0.3132, "step": 5574 }, { "epoch": 2.7800864361702127, "grad_norm": 0.37920525670051575, "learning_rate": 1.6214653024540672e-07, "loss": 0.325, "step": 5575 }, { "epoch": 2.7805851063829787, "grad_norm": 0.4142991304397583, "learning_rate": 1.614143250339656e-07, "loss": 0.3364, "step": 5576 }, { "epoch": 2.781083776595745, "grad_norm": 0.4309326410293579, "learning_rate": 1.606837496517999e-07, "loss": 0.3146, "step": 5577 }, { "epoch": 2.7815824468085104, "grad_norm": 0.4075361490249634, "learning_rate": 1.5995480434499555e-07, "loss": 0.4029, "step": 5578 }, { "epoch": 2.7820811170212765, "grad_norm": 0.39477503299713135, "learning_rate": 1.592274893590917e-07, "loss": 0.3673, "step": 5579 }, { "epoch": 2.7825797872340425, "grad_norm": 0.3677620589733124, "learning_rate": 1.585018049390763e-07, "loss": 0.3068, "step": 5580 }, { "epoch": 2.7830784574468086, "grad_norm": 0.38490328192710876, "learning_rate": 1.5777775132938932e-07, "loss": 0.3486, "step": 5581 }, { "epoch": 2.7835771276595747, "grad_norm": 0.37227582931518555, "learning_rate": 1.5705532877392127e-07, "loss": 0.3103, "step": 5582 }, { "epoch": 2.7840757978723403, "grad_norm": 0.45177531242370605, "learning_rate": 1.5633453751601246e-07, "loss": 0.3685, "step": 5583 }, { "epoch": 2.7845744680851063, "grad_norm": 0.412911057472229, "learning_rate": 1.5561537779845426e-07, "loss": 0.3141, "step": 5584 }, { "epoch": 2.7850731382978724, "grad_norm": 0.38289400935173035, "learning_rate": 1.54897849863489e-07, "loss": 0.3588, "step": 5585 }, { "epoch": 2.7855718085106385, "grad_norm": 0.43132802844047546, "learning_rate": 1.5418195395280776e-07, "loss": 0.3609, "step": 5586 }, { "epoch": 2.7860704787234045, "grad_norm": 0.4335956275463104, "learning_rate": 1.5346769030755438e-07, "loss": 0.3037, "step": 5587 }, { "epoch": 2.78656914893617, "grad_norm": 0.4282870888710022, "learning_rate": 1.5275505916832024e-07, "loss": 0.3352, "step": 5588 }, { "epoch": 2.787067819148936, "grad_norm": 0.3923828899860382, "learning_rate": 1.5204406077514832e-07, "loss": 0.3397, "step": 5589 }, { "epoch": 2.7875664893617023, "grad_norm": 0.4018854796886444, "learning_rate": 1.513346953675321e-07, "loss": 0.3099, "step": 5590 }, { "epoch": 2.788065159574468, "grad_norm": 0.4131953716278076, "learning_rate": 1.506269631844126e-07, "loss": 0.351, "step": 5591 }, { "epoch": 2.788563829787234, "grad_norm": 0.36884164810180664, "learning_rate": 1.4992086446418363e-07, "loss": 0.3571, "step": 5592 }, { "epoch": 2.7890625, "grad_norm": 0.4246785044670105, "learning_rate": 1.4921639944468713e-07, "loss": 0.4263, "step": 5593 }, { "epoch": 2.789561170212766, "grad_norm": 0.38491567969322205, "learning_rate": 1.4851356836321495e-07, "loss": 0.3091, "step": 5594 }, { "epoch": 2.790059840425532, "grad_norm": 0.3450469374656677, "learning_rate": 1.478123714565094e-07, "loss": 0.3136, "step": 5595 }, { "epoch": 2.7905585106382977, "grad_norm": 0.3749246597290039, "learning_rate": 1.471128089607604e-07, "loss": 0.393, "step": 5596 }, { "epoch": 2.791057180851064, "grad_norm": 0.44263026118278503, "learning_rate": 1.4641488111161062e-07, "loss": 0.3153, "step": 5597 }, { "epoch": 2.79155585106383, "grad_norm": 0.4007466435432434, "learning_rate": 1.457185881441475e-07, "loss": 0.319, "step": 5598 }, { "epoch": 2.7920545212765955, "grad_norm": 0.3909662365913391, "learning_rate": 1.4502393029291294e-07, "loss": 0.3101, "step": 5599 }, { "epoch": 2.7925531914893615, "grad_norm": 0.3817547857761383, "learning_rate": 1.4433090779189307e-07, "loss": 0.3683, "step": 5600 }, { "epoch": 2.7930518617021276, "grad_norm": 0.37070733308792114, "learning_rate": 1.436395208745278e-07, "loss": 0.3496, "step": 5601 }, { "epoch": 2.7935505319148937, "grad_norm": 0.393147349357605, "learning_rate": 1.42949769773702e-07, "loss": 0.3241, "step": 5602 }, { "epoch": 2.7940492021276597, "grad_norm": 0.3518412709236145, "learning_rate": 1.4226165472175312e-07, "loss": 0.332, "step": 5603 }, { "epoch": 2.7945478723404253, "grad_norm": 0.424282431602478, "learning_rate": 1.4157517595046466e-07, "loss": 0.3556, "step": 5604 }, { "epoch": 2.7950465425531914, "grad_norm": 0.42907387018203735, "learning_rate": 1.4089033369107108e-07, "loss": 0.3523, "step": 5605 }, { "epoch": 2.7955452127659575, "grad_norm": 0.3865559995174408, "learning_rate": 1.402071281742534e-07, "loss": 0.341, "step": 5606 }, { "epoch": 2.7960438829787235, "grad_norm": 0.3541807234287262, "learning_rate": 1.395255596301437e-07, "loss": 0.2983, "step": 5607 }, { "epoch": 2.7965425531914896, "grad_norm": 0.41138580441474915, "learning_rate": 1.388456282883216e-07, "loss": 0.3364, "step": 5608 }, { "epoch": 2.797041223404255, "grad_norm": 0.4510425925254822, "learning_rate": 1.3816733437781393e-07, "loss": 0.352, "step": 5609 }, { "epoch": 2.7975398936170213, "grad_norm": 0.3935955762863159, "learning_rate": 1.3749067812709848e-07, "loss": 0.3207, "step": 5610 }, { "epoch": 2.7980385638297873, "grad_norm": 0.418540358543396, "learning_rate": 1.3681565976409906e-07, "loss": 0.2795, "step": 5611 }, { "epoch": 2.798537234042553, "grad_norm": 0.44218909740448, "learning_rate": 1.361422795161904e-07, "loss": 0.3241, "step": 5612 }, { "epoch": 2.799035904255319, "grad_norm": 0.39401164650917053, "learning_rate": 1.3547053761019224e-07, "loss": 0.3577, "step": 5613 }, { "epoch": 2.799534574468085, "grad_norm": 0.37580716609954834, "learning_rate": 1.3480043427237578e-07, "loss": 0.3412, "step": 5614 }, { "epoch": 2.800033244680851, "grad_norm": 0.4529848098754883, "learning_rate": 1.341319697284571e-07, "loss": 0.3066, "step": 5615 }, { "epoch": 2.800531914893617, "grad_norm": 0.40534698963165283, "learning_rate": 1.334651442036028e-07, "loss": 0.3696, "step": 5616 }, { "epoch": 2.801030585106383, "grad_norm": 0.429269403219223, "learning_rate": 1.327999579224254e-07, "loss": 0.3107, "step": 5617 }, { "epoch": 2.801529255319149, "grad_norm": 0.3839872479438782, "learning_rate": 1.3213641110898788e-07, "loss": 0.2721, "step": 5618 }, { "epoch": 2.802027925531915, "grad_norm": 0.3694447875022888, "learning_rate": 1.3147450398679818e-07, "loss": 0.3488, "step": 5619 }, { "epoch": 2.802526595744681, "grad_norm": 0.38023823499679565, "learning_rate": 1.308142367788129e-07, "loss": 0.3282, "step": 5620 }, { "epoch": 2.803025265957447, "grad_norm": 0.4123170077800751, "learning_rate": 1.301556097074369e-07, "loss": 0.3745, "step": 5621 }, { "epoch": 2.8035239361702127, "grad_norm": 0.4596218466758728, "learning_rate": 1.294986229945222e-07, "loss": 0.3936, "step": 5622 }, { "epoch": 2.8040226063829787, "grad_norm": 0.38687649369239807, "learning_rate": 1.2884327686136843e-07, "loss": 0.2775, "step": 5623 }, { "epoch": 2.804521276595745, "grad_norm": 0.40330955386161804, "learning_rate": 1.281895715287218e-07, "loss": 0.3378, "step": 5624 }, { "epoch": 2.8050199468085104, "grad_norm": 0.4297674000263214, "learning_rate": 1.2753750721677728e-07, "loss": 0.3549, "step": 5625 }, { "epoch": 2.8055186170212765, "grad_norm": 0.40168190002441406, "learning_rate": 1.268870841451747e-07, "loss": 0.3342, "step": 5626 }, { "epoch": 2.8060172872340425, "grad_norm": 0.400017112493515, "learning_rate": 1.262383025330044e-07, "loss": 0.3443, "step": 5627 }, { "epoch": 2.8065159574468086, "grad_norm": 0.4022834599018097, "learning_rate": 1.25591162598801e-07, "loss": 0.3039, "step": 5628 }, { "epoch": 2.8070146276595747, "grad_norm": 0.4131922423839569, "learning_rate": 1.2494566456054736e-07, "loss": 0.3422, "step": 5629 }, { "epoch": 2.8075132978723403, "grad_norm": 0.41592755913734436, "learning_rate": 1.2430180863567233e-07, "loss": 0.3175, "step": 5630 }, { "epoch": 2.8080119680851063, "grad_norm": 0.3999999165534973, "learning_rate": 1.2365959504105351e-07, "loss": 0.3787, "step": 5631 }, { "epoch": 2.8085106382978724, "grad_norm": 0.4215658903121948, "learning_rate": 1.2301902399301347e-07, "loss": 0.3613, "step": 5632 }, { "epoch": 2.8090093085106385, "grad_norm": 0.41183558106422424, "learning_rate": 1.2238009570732179e-07, "loss": 0.2887, "step": 5633 }, { "epoch": 2.8095079787234045, "grad_norm": 0.4231908321380615, "learning_rate": 1.2174281039919634e-07, "loss": 0.3248, "step": 5634 }, { "epoch": 2.81000664893617, "grad_norm": 0.37317556142807007, "learning_rate": 1.2110716828329928e-07, "loss": 0.3564, "step": 5635 }, { "epoch": 2.810505319148936, "grad_norm": 0.3669140040874481, "learning_rate": 1.2047316957374044e-07, "loss": 0.3051, "step": 5636 }, { "epoch": 2.8110039893617023, "grad_norm": 0.42374613881111145, "learning_rate": 1.1984081448407624e-07, "loss": 0.3746, "step": 5637 }, { "epoch": 2.811502659574468, "grad_norm": 0.4285549223423004, "learning_rate": 1.1921010322730907e-07, "loss": 0.2873, "step": 5638 }, { "epoch": 2.812001329787234, "grad_norm": 0.4233493208885193, "learning_rate": 1.1858103601588788e-07, "loss": 0.3719, "step": 5639 }, { "epoch": 2.8125, "grad_norm": 0.3501047194004059, "learning_rate": 1.179536130617076e-07, "loss": 0.3373, "step": 5640 }, { "epoch": 2.812998670212766, "grad_norm": 0.4122050702571869, "learning_rate": 1.1732783457610919e-07, "loss": 0.3765, "step": 5641 }, { "epoch": 2.813497340425532, "grad_norm": 0.3709091544151306, "learning_rate": 1.1670370076987958e-07, "loss": 0.3295, "step": 5642 }, { "epoch": 2.8139960106382977, "grad_norm": 0.4214085638523102, "learning_rate": 1.1608121185325283e-07, "loss": 0.4001, "step": 5643 }, { "epoch": 2.814494680851064, "grad_norm": 0.3894028663635254, "learning_rate": 1.1546036803590732e-07, "loss": 0.3234, "step": 5644 }, { "epoch": 2.81499335106383, "grad_norm": 0.38653528690338135, "learning_rate": 1.1484116952696911e-07, "loss": 0.3379, "step": 5645 }, { "epoch": 2.8154920212765955, "grad_norm": 0.39440426230430603, "learning_rate": 1.1422361653500746e-07, "loss": 0.2818, "step": 5646 }, { "epoch": 2.8159906914893615, "grad_norm": 0.4225783944129944, "learning_rate": 1.1360770926804099e-07, "loss": 0.3883, "step": 5647 }, { "epoch": 2.8164893617021276, "grad_norm": 0.3892861604690552, "learning_rate": 1.1299344793352984e-07, "loss": 0.3001, "step": 5648 }, { "epoch": 2.8169880319148937, "grad_norm": 0.40711960196495056, "learning_rate": 1.1238083273838352e-07, "loss": 0.3689, "step": 5649 }, { "epoch": 2.8174867021276597, "grad_norm": 0.41190576553344727, "learning_rate": 1.1176986388895361e-07, "loss": 0.337, "step": 5650 }, { "epoch": 2.8179853723404253, "grad_norm": 0.37694060802459717, "learning_rate": 1.1116054159104051e-07, "loss": 0.3162, "step": 5651 }, { "epoch": 2.8184840425531914, "grad_norm": 0.40722066164016724, "learning_rate": 1.1055286604988835e-07, "loss": 0.3537, "step": 5652 }, { "epoch": 2.8189827127659575, "grad_norm": 0.41169285774230957, "learning_rate": 1.0994683747018454e-07, "loss": 0.3229, "step": 5653 }, { "epoch": 2.8194813829787235, "grad_norm": 0.3885800242424011, "learning_rate": 1.093424560560663e-07, "loss": 0.3299, "step": 5654 }, { "epoch": 2.8199800531914896, "grad_norm": 0.42760565876960754, "learning_rate": 1.0873972201111138e-07, "loss": 0.3402, "step": 5655 }, { "epoch": 2.820478723404255, "grad_norm": 0.4172482490539551, "learning_rate": 1.0813863553834625e-07, "loss": 0.386, "step": 5656 }, { "epoch": 2.8209773936170213, "grad_norm": 0.38475510478019714, "learning_rate": 1.075391968402395e-07, "loss": 0.2983, "step": 5657 }, { "epoch": 2.8214760638297873, "grad_norm": 0.43619343638420105, "learning_rate": 1.069414061187074e-07, "loss": 0.3458, "step": 5658 }, { "epoch": 2.821974734042553, "grad_norm": 0.3605519235134125, "learning_rate": 1.0634526357510888e-07, "loss": 0.2942, "step": 5659 }, { "epoch": 2.822473404255319, "grad_norm": 0.3903449475765228, "learning_rate": 1.0575076941024942e-07, "loss": 0.3053, "step": 5660 }, { "epoch": 2.822972074468085, "grad_norm": 0.4117943346500397, "learning_rate": 1.0515792382437773e-07, "loss": 0.3632, "step": 5661 }, { "epoch": 2.823470744680851, "grad_norm": 0.44066759943962097, "learning_rate": 1.0456672701718851e-07, "loss": 0.3546, "step": 5662 }, { "epoch": 2.823969414893617, "grad_norm": 0.3649657666683197, "learning_rate": 1.0397717918782024e-07, "loss": 0.3058, "step": 5663 }, { "epoch": 2.824468085106383, "grad_norm": 0.422134131193161, "learning_rate": 1.0338928053485519e-07, "loss": 0.334, "step": 5664 }, { "epoch": 2.824966755319149, "grad_norm": 0.405063271522522, "learning_rate": 1.0280303125632274e-07, "loss": 0.3146, "step": 5665 }, { "epoch": 2.825465425531915, "grad_norm": 0.42732757329940796, "learning_rate": 1.022184315496949e-07, "loss": 0.3702, "step": 5666 }, { "epoch": 2.825964095744681, "grad_norm": 0.37883248925209045, "learning_rate": 1.0163548161188696e-07, "loss": 0.3307, "step": 5667 }, { "epoch": 2.826462765957447, "grad_norm": 0.38422563672065735, "learning_rate": 1.0105418163926128e-07, "loss": 0.3506, "step": 5668 }, { "epoch": 2.8269614361702127, "grad_norm": 0.3640628755092621, "learning_rate": 1.004745318276218e-07, "loss": 0.3158, "step": 5669 }, { "epoch": 2.8274601063829787, "grad_norm": 0.38786581158638, "learning_rate": 9.989653237221897e-08, "loss": 0.344, "step": 5670 }, { "epoch": 2.827958776595745, "grad_norm": 0.37250974774360657, "learning_rate": 9.932018346774486e-08, "loss": 0.2939, "step": 5671 }, { "epoch": 2.8284574468085104, "grad_norm": 0.41143128275871277, "learning_rate": 9.874548530833749e-08, "loss": 0.3259, "step": 5672 }, { "epoch": 2.8289561170212765, "grad_norm": 0.40192726254463196, "learning_rate": 9.817243808757815e-08, "loss": 0.3557, "step": 5673 }, { "epoch": 2.8294547872340425, "grad_norm": 0.4171990752220154, "learning_rate": 9.760104199849241e-08, "loss": 0.3806, "step": 5674 }, { "epoch": 2.8299534574468086, "grad_norm": 0.32714077830314636, "learning_rate": 9.703129723354854e-08, "loss": 0.2752, "step": 5675 }, { "epoch": 2.8304521276595747, "grad_norm": 0.39293766021728516, "learning_rate": 9.646320398466025e-08, "loss": 0.3945, "step": 5676 }, { "epoch": 2.8309507978723403, "grad_norm": 0.38692212104797363, "learning_rate": 9.589676244318336e-08, "loss": 0.3021, "step": 5677 }, { "epoch": 2.8314494680851063, "grad_norm": 0.40292033553123474, "learning_rate": 9.533197279991857e-08, "loss": 0.3678, "step": 5678 }, { "epoch": 2.8319481382978724, "grad_norm": 0.412562757730484, "learning_rate": 9.476883524510928e-08, "loss": 0.3795, "step": 5679 }, { "epoch": 2.8324468085106385, "grad_norm": 0.3843323290348053, "learning_rate": 9.420734996844317e-08, "loss": 0.3199, "step": 5680 }, { "epoch": 2.8329454787234045, "grad_norm": 0.42747387290000916, "learning_rate": 9.364751715905008e-08, "loss": 0.3321, "step": 5681 }, { "epoch": 2.83344414893617, "grad_norm": 0.43183964490890503, "learning_rate": 9.308933700550526e-08, "loss": 0.3144, "step": 5682 }, { "epoch": 2.833942819148936, "grad_norm": 0.3971956670284271, "learning_rate": 9.253280969582557e-08, "loss": 0.3261, "step": 5683 }, { "epoch": 2.8344414893617023, "grad_norm": 0.369940847158432, "learning_rate": 9.197793541747102e-08, "loss": 0.3393, "step": 5684 }, { "epoch": 2.834940159574468, "grad_norm": 0.42436492443084717, "learning_rate": 9.142471435734713e-08, "loss": 0.35, "step": 5685 }, { "epoch": 2.835438829787234, "grad_norm": 0.41212573647499084, "learning_rate": 9.08731467017987e-08, "loss": 0.3151, "step": 5686 }, { "epoch": 2.8359375, "grad_norm": 0.3942498564720154, "learning_rate": 9.032323263661768e-08, "loss": 0.3316, "step": 5687 }, { "epoch": 2.836436170212766, "grad_norm": 0.4272378087043762, "learning_rate": 8.977497234703591e-08, "loss": 0.3952, "step": 5688 }, { "epoch": 2.836934840425532, "grad_norm": 0.4289522171020508, "learning_rate": 8.922836601773066e-08, "loss": 0.2833, "step": 5689 }, { "epoch": 2.8374335106382977, "grad_norm": 0.427200049161911, "learning_rate": 8.868341383281909e-08, "loss": 0.3205, "step": 5690 }, { "epoch": 2.837932180851064, "grad_norm": 0.4024290442466736, "learning_rate": 8.814011597586491e-08, "loss": 0.3455, "step": 5691 }, { "epoch": 2.83843085106383, "grad_norm": 0.4037107527256012, "learning_rate": 8.759847262987065e-08, "loss": 0.3098, "step": 5692 }, { "epoch": 2.8389295212765955, "grad_norm": 0.39882001280784607, "learning_rate": 8.705848397728478e-08, "loss": 0.3672, "step": 5693 }, { "epoch": 2.8394281914893615, "grad_norm": 0.44193679094314575, "learning_rate": 8.652015019999683e-08, "loss": 0.3975, "step": 5694 }, { "epoch": 2.8399268617021276, "grad_norm": 0.3908149302005768, "learning_rate": 8.598347147933839e-08, "loss": 0.3214, "step": 5695 }, { "epoch": 2.8404255319148937, "grad_norm": 0.42177441716194153, "learning_rate": 8.544844799608542e-08, "loss": 0.3229, "step": 5696 }, { "epoch": 2.8409242021276597, "grad_norm": 0.4014332592487335, "learning_rate": 8.491507993045434e-08, "loss": 0.3296, "step": 5697 }, { "epoch": 2.8414228723404253, "grad_norm": 0.36523765325546265, "learning_rate": 8.438336746210584e-08, "loss": 0.3208, "step": 5698 }, { "epoch": 2.8419215425531914, "grad_norm": 0.4658620357513428, "learning_rate": 8.385331077014058e-08, "loss": 0.4422, "step": 5699 }, { "epoch": 2.8424202127659575, "grad_norm": 0.40088051557540894, "learning_rate": 8.332491003310405e-08, "loss": 0.3024, "step": 5700 }, { "epoch": 2.8429188829787235, "grad_norm": 0.38874581456184387, "learning_rate": 8.279816542898222e-08, "loss": 0.3681, "step": 5701 }, { "epoch": 2.8434175531914896, "grad_norm": 0.38658976554870605, "learning_rate": 8.227307713520427e-08, "loss": 0.3925, "step": 5702 }, { "epoch": 2.843916223404255, "grad_norm": 0.3800269067287445, "learning_rate": 8.174964532864038e-08, "loss": 0.2959, "step": 5703 }, { "epoch": 2.8444148936170213, "grad_norm": 0.3771640658378601, "learning_rate": 8.12278701856034e-08, "loss": 0.305, "step": 5704 }, { "epoch": 2.8449135638297873, "grad_norm": 0.390572190284729, "learning_rate": 8.070775188184831e-08, "loss": 0.2982, "step": 5705 }, { "epoch": 2.845412234042553, "grad_norm": 0.4320979416370392, "learning_rate": 8.018929059257108e-08, "loss": 0.3725, "step": 5706 }, { "epoch": 2.845910904255319, "grad_norm": 0.34810540080070496, "learning_rate": 7.967248649241144e-08, "loss": 0.2835, "step": 5707 }, { "epoch": 2.846409574468085, "grad_norm": 0.4329622685909271, "learning_rate": 7.915733975544904e-08, "loss": 0.3664, "step": 5708 }, { "epoch": 2.846908244680851, "grad_norm": 0.4212868809700012, "learning_rate": 7.864385055520562e-08, "loss": 0.331, "step": 5709 }, { "epoch": 2.847406914893617, "grad_norm": 0.39800912141799927, "learning_rate": 7.813201906464562e-08, "loss": 0.3275, "step": 5710 }, { "epoch": 2.847905585106383, "grad_norm": 0.4270269274711609, "learning_rate": 7.762184545617391e-08, "loss": 0.3272, "step": 5711 }, { "epoch": 2.848404255319149, "grad_norm": 0.3824549615383148, "learning_rate": 7.711332990163689e-08, "loss": 0.3833, "step": 5712 }, { "epoch": 2.848902925531915, "grad_norm": 0.37875908613204956, "learning_rate": 7.660647257232368e-08, "loss": 0.3189, "step": 5713 }, { "epoch": 2.849401595744681, "grad_norm": 0.41184836626052856, "learning_rate": 7.61012736389638e-08, "loss": 0.3125, "step": 5714 }, { "epoch": 2.849900265957447, "grad_norm": 0.41316524147987366, "learning_rate": 7.559773327172892e-08, "loss": 0.3309, "step": 5715 }, { "epoch": 2.8503989361702127, "grad_norm": 0.4424660801887512, "learning_rate": 7.509585164023114e-08, "loss": 0.3503, "step": 5716 }, { "epoch": 2.8508976063829787, "grad_norm": 0.38906270265579224, "learning_rate": 7.459562891352412e-08, "loss": 0.337, "step": 5717 }, { "epoch": 2.851396276595745, "grad_norm": 0.4103133976459503, "learning_rate": 7.409706526010362e-08, "loss": 0.3267, "step": 5718 }, { "epoch": 2.8518949468085104, "grad_norm": 0.42484796047210693, "learning_rate": 7.360016084790478e-08, "loss": 0.3589, "step": 5719 }, { "epoch": 2.8523936170212765, "grad_norm": 0.38503560423851013, "learning_rate": 7.31049158443059e-08, "loss": 0.3309, "step": 5720 }, { "epoch": 2.8528922872340425, "grad_norm": 0.42246440052986145, "learning_rate": 7.261133041612468e-08, "loss": 0.3659, "step": 5721 }, { "epoch": 2.8533909574468086, "grad_norm": 0.4016619026660919, "learning_rate": 7.211940472962032e-08, "loss": 0.3466, "step": 5722 }, { "epoch": 2.8538896276595747, "grad_norm": 0.39737173914909363, "learning_rate": 7.162913895049361e-08, "loss": 0.266, "step": 5723 }, { "epoch": 2.8543882978723403, "grad_norm": 0.39004382491111755, "learning_rate": 7.114053324388581e-08, "loss": 0.3905, "step": 5724 }, { "epoch": 2.8548869680851063, "grad_norm": 0.39440053701400757, "learning_rate": 7.065358777437802e-08, "loss": 0.3385, "step": 5725 }, { "epoch": 2.8553856382978724, "grad_norm": 0.41779059171676636, "learning_rate": 7.016830270599407e-08, "loss": 0.2941, "step": 5726 }, { "epoch": 2.8558843085106385, "grad_norm": 0.4468713402748108, "learning_rate": 6.96846782021965e-08, "loss": 0.3622, "step": 5727 }, { "epoch": 2.8563829787234045, "grad_norm": 0.4661322832107544, "learning_rate": 6.920271442588945e-08, "loss": 0.3596, "step": 5728 }, { "epoch": 2.85688164893617, "grad_norm": 0.36580976843833923, "learning_rate": 6.872241153941805e-08, "loss": 0.2776, "step": 5729 }, { "epoch": 2.857380319148936, "grad_norm": 0.3644375205039978, "learning_rate": 6.82437697045668e-08, "loss": 0.3405, "step": 5730 }, { "epoch": 2.8578789893617023, "grad_norm": 0.39044612646102905, "learning_rate": 6.776678908256285e-08, "loss": 0.2841, "step": 5731 }, { "epoch": 2.858377659574468, "grad_norm": 0.45993369817733765, "learning_rate": 6.729146983407042e-08, "loss": 0.3588, "step": 5732 }, { "epoch": 2.858876329787234, "grad_norm": 0.39296388626098633, "learning_rate": 6.681781211919813e-08, "loss": 0.3306, "step": 5733 }, { "epoch": 2.859375, "grad_norm": 0.3865419626235962, "learning_rate": 6.634581609749113e-08, "loss": 0.3361, "step": 5734 }, { "epoch": 2.859873670212766, "grad_norm": 0.408065527677536, "learning_rate": 6.587548192793724e-08, "loss": 0.3437, "step": 5735 }, { "epoch": 2.860372340425532, "grad_norm": 0.37269333004951477, "learning_rate": 6.540680976896419e-08, "loss": 0.3354, "step": 5736 }, { "epoch": 2.8608710106382977, "grad_norm": 0.4165245294570923, "learning_rate": 6.493979977843901e-08, "loss": 0.3476, "step": 5737 }, { "epoch": 2.861369680851064, "grad_norm": 0.4055057764053345, "learning_rate": 6.447445211366921e-08, "loss": 0.386, "step": 5738 }, { "epoch": 2.86186835106383, "grad_norm": 0.45060253143310547, "learning_rate": 6.401076693140329e-08, "loss": 0.3537, "step": 5739 }, { "epoch": 2.8623670212765955, "grad_norm": 0.4480946958065033, "learning_rate": 6.354874438782799e-08, "loss": 0.289, "step": 5740 }, { "epoch": 2.8628656914893615, "grad_norm": 0.4111553430557251, "learning_rate": 6.308838463857159e-08, "loss": 0.3507, "step": 5741 }, { "epoch": 2.8633643617021276, "grad_norm": 0.4163209795951843, "learning_rate": 6.262968783870172e-08, "loss": 0.3453, "step": 5742 }, { "epoch": 2.8638630319148937, "grad_norm": 0.41715380549430847, "learning_rate": 6.217265414272588e-08, "loss": 0.3093, "step": 5743 }, { "epoch": 2.8643617021276597, "grad_norm": 0.4002729654312134, "learning_rate": 6.17172837045904e-08, "loss": 0.3241, "step": 5744 }, { "epoch": 2.8648603723404253, "grad_norm": 0.42753124237060547, "learning_rate": 6.126357667768367e-08, "loss": 0.3538, "step": 5745 }, { "epoch": 2.8653590425531914, "grad_norm": 0.3848763406276703, "learning_rate": 6.081153321483125e-08, "loss": 0.2819, "step": 5746 }, { "epoch": 2.8658577127659575, "grad_norm": 0.40423065423965454, "learning_rate": 6.036115346830018e-08, "loss": 0.3206, "step": 5747 }, { "epoch": 2.8663563829787235, "grad_norm": 0.3927416205406189, "learning_rate": 5.991243758979582e-08, "loss": 0.3743, "step": 5748 }, { "epoch": 2.8668550531914896, "grad_norm": 0.3744502365589142, "learning_rate": 5.9465385730464455e-08, "loss": 0.3786, "step": 5749 }, { "epoch": 2.867353723404255, "grad_norm": 0.37200915813446045, "learning_rate": 5.901999804088954e-08, "loss": 0.3592, "step": 5750 }, { "epoch": 2.8678523936170213, "grad_norm": 0.39301231503486633, "learning_rate": 5.8576274671097166e-08, "loss": 0.3714, "step": 5751 }, { "epoch": 2.8683510638297873, "grad_norm": 0.3974754512310028, "learning_rate": 5.8134215770550003e-08, "loss": 0.3144, "step": 5752 }, { "epoch": 2.868849734042553, "grad_norm": 0.3984546363353729, "learning_rate": 5.7693821488151724e-08, "loss": 0.4135, "step": 5753 }, { "epoch": 2.869348404255319, "grad_norm": 0.3869858682155609, "learning_rate": 5.7255091972244214e-08, "loss": 0.3624, "step": 5754 }, { "epoch": 2.869847074468085, "grad_norm": 0.3918856680393219, "learning_rate": 5.681802737060982e-08, "loss": 0.3669, "step": 5755 }, { "epoch": 2.870345744680851, "grad_norm": 0.39777839183807373, "learning_rate": 5.6382627830469105e-08, "loss": 0.3151, "step": 5756 }, { "epoch": 2.870844414893617, "grad_norm": 0.39498600363731384, "learning_rate": 5.594889349848198e-08, "loss": 0.3549, "step": 5757 }, { "epoch": 2.871343085106383, "grad_norm": 0.4073593318462372, "learning_rate": 5.551682452074714e-08, "loss": 0.3318, "step": 5758 }, { "epoch": 2.871841755319149, "grad_norm": 0.4051569700241089, "learning_rate": 5.508642104280315e-08, "loss": 0.3211, "step": 5759 }, { "epoch": 2.872340425531915, "grad_norm": 0.3641855716705322, "learning_rate": 5.465768320962739e-08, "loss": 0.3424, "step": 5760 }, { "epoch": 2.872839095744681, "grad_norm": 0.3895210921764374, "learning_rate": 5.4230611165635436e-08, "loss": 0.3327, "step": 5761 }, { "epoch": 2.873337765957447, "grad_norm": 0.43462440371513367, "learning_rate": 5.3805205054682786e-08, "loss": 0.3697, "step": 5762 }, { "epoch": 2.8738364361702127, "grad_norm": 0.4268989861011505, "learning_rate": 5.3381465020062584e-08, "loss": 0.3701, "step": 5763 }, { "epoch": 2.8743351063829787, "grad_norm": 0.4476352334022522, "learning_rate": 5.295939120450788e-08, "loss": 0.3533, "step": 5764 }, { "epoch": 2.874833776595745, "grad_norm": 0.4473509192466736, "learning_rate": 5.2538983750189397e-08, "loss": 0.3277, "step": 5765 }, { "epoch": 2.8753324468085104, "grad_norm": 0.40183430910110474, "learning_rate": 5.212024279871775e-08, "loss": 0.2983, "step": 5766 }, { "epoch": 2.8758311170212765, "grad_norm": 0.3940201699733734, "learning_rate": 5.170316849114176e-08, "loss": 0.3864, "step": 5767 }, { "epoch": 2.8763297872340425, "grad_norm": 0.4091455042362213, "learning_rate": 5.128776096794852e-08, "loss": 0.2888, "step": 5768 }, { "epoch": 2.8768284574468086, "grad_norm": 0.4206748902797699, "learning_rate": 5.0874020369064414e-08, "loss": 0.3332, "step": 5769 }, { "epoch": 2.8773271276595747, "grad_norm": 0.3970431387424469, "learning_rate": 5.0461946833852415e-08, "loss": 0.3046, "step": 5770 }, { "epoch": 2.8778257978723403, "grad_norm": 0.39938411116600037, "learning_rate": 5.005154050111705e-08, "loss": 0.3731, "step": 5771 }, { "epoch": 2.8783244680851063, "grad_norm": 0.4046688973903656, "learning_rate": 4.964280150909828e-08, "loss": 0.363, "step": 5772 }, { "epoch": 2.8788231382978724, "grad_norm": 0.432455450296402, "learning_rate": 4.9235729995476545e-08, "loss": 0.2968, "step": 5773 }, { "epoch": 2.8793218085106385, "grad_norm": 0.4166245460510254, "learning_rate": 4.8830326097369375e-08, "loss": 0.3435, "step": 5774 }, { "epoch": 2.8798204787234045, "grad_norm": 0.3986745774745941, "learning_rate": 4.8426589951333645e-08, "loss": 0.328, "step": 5775 }, { "epoch": 2.88031914893617, "grad_norm": 0.3828981816768646, "learning_rate": 4.8024521693363354e-08, "loss": 0.3213, "step": 5776 }, { "epoch": 2.880817819148936, "grad_norm": 0.40259143710136414, "learning_rate": 4.762412145889128e-08, "loss": 0.3658, "step": 5777 }, { "epoch": 2.8813164893617023, "grad_norm": 0.4022573232650757, "learning_rate": 4.722538938278787e-08, "loss": 0.347, "step": 5778 }, { "epoch": 2.881815159574468, "grad_norm": 0.36167749762535095, "learning_rate": 4.68283255993629e-08, "loss": 0.3426, "step": 5779 }, { "epoch": 2.882313829787234, "grad_norm": 0.39603739976882935, "learning_rate": 4.643293024236273e-08, "loss": 0.3638, "step": 5780 }, { "epoch": 2.8828125, "grad_norm": 0.341337114572525, "learning_rate": 4.6039203444971925e-08, "loss": 0.2861, "step": 5781 }, { "epoch": 2.883311170212766, "grad_norm": 0.38121670484542847, "learning_rate": 4.5647145339814956e-08, "loss": 0.358, "step": 5782 }, { "epoch": 2.883809840425532, "grad_norm": 0.43883252143859863, "learning_rate": 4.5256756058950636e-08, "loss": 0.3163, "step": 5783 }, { "epoch": 2.8843085106382977, "grad_norm": 0.38710618019104004, "learning_rate": 4.486803573387932e-08, "loss": 0.3029, "step": 5784 }, { "epoch": 2.884807180851064, "grad_norm": 0.3805430829524994, "learning_rate": 4.448098449553684e-08, "loss": 0.3572, "step": 5785 }, { "epoch": 2.88530585106383, "grad_norm": 0.3931557834148407, "learning_rate": 4.4095602474297785e-08, "loss": 0.3055, "step": 5786 }, { "epoch": 2.8858045212765955, "grad_norm": 0.4181150496006012, "learning_rate": 4.3711889799974426e-08, "loss": 0.3411, "step": 5787 }, { "epoch": 2.8863031914893615, "grad_norm": 0.373139351606369, "learning_rate": 4.332984660181561e-08, "loss": 0.3341, "step": 5788 }, { "epoch": 2.8868018617021276, "grad_norm": 0.41115161776542664, "learning_rate": 4.2949473008510046e-08, "loss": 0.3182, "step": 5789 }, { "epoch": 2.8873005319148937, "grad_norm": 0.43480104207992554, "learning_rate": 4.257076914818192e-08, "loss": 0.3707, "step": 5790 }, { "epoch": 2.8877992021276597, "grad_norm": 0.3884008526802063, "learning_rate": 4.21937351483942e-08, "loss": 0.3605, "step": 5791 }, { "epoch": 2.8882978723404253, "grad_norm": 0.37585577368736267, "learning_rate": 4.1818371136146954e-08, "loss": 0.3694, "step": 5792 }, { "epoch": 2.8887965425531914, "grad_norm": 0.4026452302932739, "learning_rate": 4.1444677237877376e-08, "loss": 0.3261, "step": 5793 }, { "epoch": 2.8892952127659575, "grad_norm": 0.41271600127220154, "learning_rate": 4.107265357946144e-08, "loss": 0.3329, "step": 5794 }, { "epoch": 2.8897938829787235, "grad_norm": 0.3955298364162445, "learning_rate": 4.0702300286210586e-08, "loss": 0.3374, "step": 5795 }, { "epoch": 2.8902925531914896, "grad_norm": 0.4486876428127289, "learning_rate": 4.0333617482875566e-08, "loss": 0.4107, "step": 5796 }, { "epoch": 2.890791223404255, "grad_norm": 0.3736216127872467, "learning_rate": 3.99666052936426e-08, "loss": 0.3322, "step": 5797 }, { "epoch": 2.8912898936170213, "grad_norm": 0.386028915643692, "learning_rate": 3.960126384213669e-08, "loss": 0.3137, "step": 5798 }, { "epoch": 2.8917885638297873, "grad_norm": 0.39285731315612793, "learning_rate": 3.9237593251419406e-08, "loss": 0.325, "step": 5799 }, { "epoch": 2.892287234042553, "grad_norm": 0.4481523633003235, "learning_rate": 3.8875593643989405e-08, "loss": 0.364, "step": 5800 }, { "epoch": 2.892785904255319, "grad_norm": 0.41928407549858093, "learning_rate": 3.851526514178305e-08, "loss": 0.3516, "step": 5801 }, { "epoch": 2.893284574468085, "grad_norm": 0.3574136793613434, "learning_rate": 3.81566078661727e-08, "loss": 0.3157, "step": 5802 }, { "epoch": 2.893783244680851, "grad_norm": 0.4097362756729126, "learning_rate": 3.7799621937968376e-08, "loss": 0.3805, "step": 5803 }, { "epoch": 2.894281914893617, "grad_norm": 0.36105433106422424, "learning_rate": 3.744430747741834e-08, "loss": 0.3046, "step": 5804 }, { "epoch": 2.894780585106383, "grad_norm": 0.40160873532295227, "learning_rate": 3.709066460420574e-08, "loss": 0.3395, "step": 5805 }, { "epoch": 2.895279255319149, "grad_norm": 0.5201047658920288, "learning_rate": 3.673869343745195e-08, "loss": 0.3831, "step": 5806 }, { "epoch": 2.895777925531915, "grad_norm": 0.38784676790237427, "learning_rate": 3.638839409571493e-08, "loss": 0.3142, "step": 5807 }, { "epoch": 2.896276595744681, "grad_norm": 0.4281841516494751, "learning_rate": 3.603976669698972e-08, "loss": 0.2964, "step": 5808 }, { "epoch": 2.896775265957447, "grad_norm": 0.39851731061935425, "learning_rate": 3.569281135870739e-08, "loss": 0.3162, "step": 5809 }, { "epoch": 2.8972739361702127, "grad_norm": 0.417753130197525, "learning_rate": 3.534752819773668e-08, "loss": 0.3548, "step": 5810 }, { "epoch": 2.8977726063829787, "grad_norm": 0.40935245156288147, "learning_rate": 3.5003917330383444e-08, "loss": 0.3635, "step": 5811 }, { "epoch": 2.898271276595745, "grad_norm": 0.3787839114665985, "learning_rate": 3.4661978872388424e-08, "loss": 0.3035, "step": 5812 }, { "epoch": 2.8987699468085104, "grad_norm": 0.4296489655971527, "learning_rate": 3.432171293893116e-08, "loss": 0.3383, "step": 5813 }, { "epoch": 2.8992686170212765, "grad_norm": 0.4015282988548279, "learning_rate": 3.398311964462608e-08, "loss": 0.3264, "step": 5814 }, { "epoch": 2.8997672872340425, "grad_norm": 0.39233842492103577, "learning_rate": 3.3646199103525845e-08, "loss": 0.4015, "step": 5815 }, { "epoch": 2.9002659574468086, "grad_norm": 0.4239775836467743, "learning_rate": 3.3310951429117446e-08, "loss": 0.3193, "step": 5816 }, { "epoch": 2.9007646276595747, "grad_norm": 0.4226128160953522, "learning_rate": 3.297737673432722e-08, "loss": 0.3117, "step": 5817 }, { "epoch": 2.9012632978723403, "grad_norm": 0.4236118495464325, "learning_rate": 3.264547513151528e-08, "loss": 0.3851, "step": 5818 }, { "epoch": 2.9017619680851063, "grad_norm": 0.37081512808799744, "learning_rate": 3.231524673247999e-08, "loss": 0.2552, "step": 5819 }, { "epoch": 2.9022606382978724, "grad_norm": 0.42222362756729126, "learning_rate": 3.198669164845569e-08, "loss": 0.417, "step": 5820 }, { "epoch": 2.9027593085106385, "grad_norm": 0.38850584626197815, "learning_rate": 3.16598099901122e-08, "loss": 0.3163, "step": 5821 }, { "epoch": 2.9032579787234045, "grad_norm": 0.34949350357055664, "learning_rate": 3.1334601867556434e-08, "loss": 0.3121, "step": 5822 }, { "epoch": 2.90375664893617, "grad_norm": 0.3897983133792877, "learning_rate": 3.1011067390331884e-08, "loss": 0.3468, "step": 5823 }, { "epoch": 2.904255319148936, "grad_norm": 0.39258870482444763, "learning_rate": 3.068920666741804e-08, "loss": 0.378, "step": 5824 }, { "epoch": 2.9047539893617023, "grad_norm": 0.363261342048645, "learning_rate": 3.036901980722984e-08, "loss": 0.3175, "step": 5825 }, { "epoch": 2.905252659574468, "grad_norm": 0.3847233057022095, "learning_rate": 3.0050506917619905e-08, "loss": 0.3521, "step": 5826 }, { "epoch": 2.905751329787234, "grad_norm": 0.4237058162689209, "learning_rate": 2.973366810587519e-08, "loss": 0.3817, "step": 5827 }, { "epoch": 2.90625, "grad_norm": 0.36735987663269043, "learning_rate": 2.9418503478720328e-08, "loss": 0.2831, "step": 5828 }, { "epoch": 2.906748670212766, "grad_norm": 0.4349239766597748, "learning_rate": 2.9105013142314842e-08, "loss": 0.3815, "step": 5829 }, { "epoch": 2.907247340425532, "grad_norm": 0.3984867036342621, "learning_rate": 2.879319720225482e-08, "loss": 0.3206, "step": 5830 }, { "epoch": 2.9077460106382977, "grad_norm": 0.39628615975379944, "learning_rate": 2.8483055763573463e-08, "loss": 0.3255, "step": 5831 }, { "epoch": 2.908244680851064, "grad_norm": 0.4123292565345764, "learning_rate": 2.8174588930737213e-08, "loss": 0.3259, "step": 5832 }, { "epoch": 2.90874335106383, "grad_norm": 0.40955328941345215, "learning_rate": 2.7867796807651837e-08, "loss": 0.3464, "step": 5833 }, { "epoch": 2.9092420212765955, "grad_norm": 0.36445459723472595, "learning_rate": 2.7562679497655233e-08, "loss": 0.3214, "step": 5834 }, { "epoch": 2.9097406914893615, "grad_norm": 0.42162761092185974, "learning_rate": 2.7259237103524627e-08, "loss": 0.3794, "step": 5835 }, { "epoch": 2.9102393617021276, "grad_norm": 0.38312485814094543, "learning_rate": 2.6957469727470486e-08, "loss": 0.3999, "step": 5836 }, { "epoch": 2.9107380319148937, "grad_norm": 0.40330252051353455, "learning_rate": 2.66573774711415e-08, "loss": 0.3174, "step": 5837 }, { "epoch": 2.9112367021276597, "grad_norm": 0.4126056432723999, "learning_rate": 2.635896043561903e-08, "loss": 0.3503, "step": 5838 }, { "epoch": 2.9117353723404253, "grad_norm": 0.4097425937652588, "learning_rate": 2.6062218721423227e-08, "loss": 0.3639, "step": 5839 }, { "epoch": 2.9122340425531914, "grad_norm": 0.43302276730537415, "learning_rate": 2.5767152428508023e-08, "loss": 0.3274, "step": 5840 }, { "epoch": 2.9127327127659575, "grad_norm": 0.43061140179634094, "learning_rate": 2.547376165626392e-08, "loss": 0.3654, "step": 5841 }, { "epoch": 2.9132313829787235, "grad_norm": 0.38162246346473694, "learning_rate": 2.5182046503515746e-08, "loss": 0.3071, "step": 5842 }, { "epoch": 2.9137300531914896, "grad_norm": 0.4357090890407562, "learning_rate": 2.4892007068526015e-08, "loss": 0.3294, "step": 5843 }, { "epoch": 2.914228723404255, "grad_norm": 0.4215974807739258, "learning_rate": 2.4603643448991577e-08, "loss": 0.3235, "step": 5844 }, { "epoch": 2.9147273936170213, "grad_norm": 0.3917500972747803, "learning_rate": 2.431695574204307e-08, "loss": 0.3604, "step": 5845 }, { "epoch": 2.9152260638297873, "grad_norm": 0.3653757870197296, "learning_rate": 2.4031944044250465e-08, "loss": 0.347, "step": 5846 }, { "epoch": 2.915724734042553, "grad_norm": 0.40295863151550293, "learning_rate": 2.3748608451616418e-08, "loss": 0.3409, "step": 5847 }, { "epoch": 2.916223404255319, "grad_norm": 0.45972785353660583, "learning_rate": 2.3466949059579025e-08, "loss": 0.3063, "step": 5848 }, { "epoch": 2.916722074468085, "grad_norm": 0.40075939893722534, "learning_rate": 2.318696596301351e-08, "loss": 0.3413, "step": 5849 }, { "epoch": 2.917220744680851, "grad_norm": 0.41148221492767334, "learning_rate": 2.2908659256228318e-08, "loss": 0.341, "step": 5850 }, { "epoch": 2.917719414893617, "grad_norm": 0.36808714270591736, "learning_rate": 2.2632029032969573e-08, "loss": 0.3017, "step": 5851 }, { "epoch": 2.918218085106383, "grad_norm": 0.4392476975917816, "learning_rate": 2.235707538641607e-08, "loss": 0.3898, "step": 5852 }, { "epoch": 2.918716755319149, "grad_norm": 0.36249837279319763, "learning_rate": 2.2083798409183733e-08, "loss": 0.3084, "step": 5853 }, { "epoch": 2.919215425531915, "grad_norm": 0.43364620208740234, "learning_rate": 2.1812198193323363e-08, "loss": 0.393, "step": 5854 }, { "epoch": 2.919714095744681, "grad_norm": 0.40386539697647095, "learning_rate": 2.1542274830321232e-08, "loss": 0.3033, "step": 5855 }, { "epoch": 2.920212765957447, "grad_norm": 0.44753962755203247, "learning_rate": 2.127402841109738e-08, "loss": 0.3251, "step": 5856 }, { "epoch": 2.9207114361702127, "grad_norm": 0.4355054199695587, "learning_rate": 2.100745902600787e-08, "loss": 0.3127, "step": 5857 }, { "epoch": 2.9212101063829787, "grad_norm": 0.4610171616077423, "learning_rate": 2.074256676484476e-08, "loss": 0.3708, "step": 5858 }, { "epoch": 2.921708776595745, "grad_norm": 0.418754905462265, "learning_rate": 2.047935171683446e-08, "loss": 0.2854, "step": 5859 }, { "epoch": 2.9222074468085104, "grad_norm": 0.4089653491973877, "learning_rate": 2.0217813970637156e-08, "loss": 0.3841, "step": 5860 }, { "epoch": 2.9227061170212765, "grad_norm": 0.3769320845603943, "learning_rate": 1.9957953614350155e-08, "loss": 0.323, "step": 5861 }, { "epoch": 2.9232047872340425, "grad_norm": 0.446961909532547, "learning_rate": 1.9699770735504553e-08, "loss": 0.3286, "step": 5862 }, { "epoch": 2.9237034574468086, "grad_norm": 0.387341171503067, "learning_rate": 1.9443265421066893e-08, "loss": 0.2925, "step": 5863 }, { "epoch": 2.9242021276595747, "grad_norm": 0.4063122868537903, "learning_rate": 1.918843775743806e-08, "loss": 0.3282, "step": 5864 }, { "epoch": 2.9247007978723403, "grad_norm": 0.4355519115924835, "learning_rate": 1.8935287830454398e-08, "loss": 0.35, "step": 5865 }, { "epoch": 2.9251994680851063, "grad_norm": 0.43978285789489746, "learning_rate": 1.8683815725387132e-08, "loss": 0.3206, "step": 5866 }, { "epoch": 2.9256981382978724, "grad_norm": 0.3958435654640198, "learning_rate": 1.8434021526941848e-08, "loss": 0.2998, "step": 5867 }, { "epoch": 2.9261968085106385, "grad_norm": 0.4559394121170044, "learning_rate": 1.8185905319259567e-08, "loss": 0.2988, "step": 5868 }, { "epoch": 2.9266954787234045, "grad_norm": 0.42823776602745056, "learning_rate": 1.7939467185915105e-08, "loss": 0.365, "step": 5869 }, { "epoch": 2.92719414893617, "grad_norm": 0.41635861992836, "learning_rate": 1.7694707209919282e-08, "loss": 0.3352, "step": 5870 }, { "epoch": 2.927692819148936, "grad_norm": 0.4135589003562927, "learning_rate": 1.7451625473716704e-08, "loss": 0.3223, "step": 5871 }, { "epoch": 2.9281914893617023, "grad_norm": 0.3909047245979309, "learning_rate": 1.721022205918743e-08, "loss": 0.3415, "step": 5872 }, { "epoch": 2.928690159574468, "grad_norm": 0.38653436303138733, "learning_rate": 1.697049704764475e-08, "loss": 0.3421, "step": 5873 }, { "epoch": 2.929188829787234, "grad_norm": 0.41676896810531616, "learning_rate": 1.6732450519839626e-08, "loss": 0.3305, "step": 5874 }, { "epoch": 2.9296875, "grad_norm": 0.43409261107444763, "learning_rate": 1.6496082555953474e-08, "loss": 0.3648, "step": 5875 }, { "epoch": 2.930186170212766, "grad_norm": 0.3862043619155884, "learning_rate": 1.626139323560594e-08, "loss": 0.3041, "step": 5876 }, { "epoch": 2.930684840425532, "grad_norm": 0.4325001537799835, "learning_rate": 1.602838263784934e-08, "loss": 0.3685, "step": 5877 }, { "epoch": 2.9311835106382977, "grad_norm": 0.40418872237205505, "learning_rate": 1.5797050841170892e-08, "loss": 0.3175, "step": 5878 }, { "epoch": 2.931682180851064, "grad_norm": 0.41348913311958313, "learning_rate": 1.5567397923492157e-08, "loss": 0.2954, "step": 5879 }, { "epoch": 2.93218085106383, "grad_norm": 0.4172312915325165, "learning_rate": 1.533942396217014e-08, "loss": 0.3621, "step": 5880 }, { "epoch": 2.9326795212765955, "grad_norm": 0.4224209487438202, "learning_rate": 1.5113129033995087e-08, "loss": 0.362, "step": 5881 }, { "epoch": 2.9331781914893615, "grad_norm": 0.3892894387245178, "learning_rate": 1.4888513215192135e-08, "loss": 0.3275, "step": 5882 }, { "epoch": 2.9336768617021276, "grad_norm": 0.39508530497550964, "learning_rate": 1.466557658142076e-08, "loss": 0.2811, "step": 5883 }, { "epoch": 2.9341755319148937, "grad_norm": 0.41042426228523254, "learning_rate": 1.4444319207775337e-08, "loss": 0.2744, "step": 5884 }, { "epoch": 2.9346742021276597, "grad_norm": 0.4168742001056671, "learning_rate": 1.4224741168784585e-08, "loss": 0.3461, "step": 5885 }, { "epoch": 2.9351728723404253, "grad_norm": 0.41260647773742676, "learning_rate": 1.4006842538409892e-08, "loss": 0.3649, "step": 5886 }, { "epoch": 2.9356715425531914, "grad_norm": 0.3654003441333771, "learning_rate": 1.3790623390049773e-08, "loss": 0.3083, "step": 5887 }, { "epoch": 2.9361702127659575, "grad_norm": 0.40019696950912476, "learning_rate": 1.3576083796533745e-08, "loss": 0.3974, "step": 5888 }, { "epoch": 2.9366688829787235, "grad_norm": 0.409402459859848, "learning_rate": 1.3363223830129002e-08, "loss": 0.3403, "step": 5889 }, { "epoch": 2.9371675531914896, "grad_norm": 0.3993343710899353, "learning_rate": 1.31520435625343e-08, "loss": 0.4078, "step": 5890 }, { "epoch": 2.937666223404255, "grad_norm": 0.402316689491272, "learning_rate": 1.294254306488385e-08, "loss": 0.3342, "step": 5891 }, { "epoch": 2.9381648936170213, "grad_norm": 0.40308499336242676, "learning_rate": 1.2734722407746203e-08, "loss": 0.3479, "step": 5892 }, { "epoch": 2.9386635638297873, "grad_norm": 0.41103023290634155, "learning_rate": 1.252858166112314e-08, "loss": 0.3301, "step": 5893 }, { "epoch": 2.939162234042553, "grad_norm": 0.4019198417663574, "learning_rate": 1.232412089445134e-08, "loss": 0.3373, "step": 5894 }, { "epoch": 2.939660904255319, "grad_norm": 0.40485212206840515, "learning_rate": 1.2121340176601271e-08, "loss": 0.3361, "step": 5895 }, { "epoch": 2.940159574468085, "grad_norm": 0.3743954598903656, "learning_rate": 1.1920239575877179e-08, "loss": 0.2972, "step": 5896 }, { "epoch": 2.940658244680851, "grad_norm": 0.3931979537010193, "learning_rate": 1.1720819160018771e-08, "loss": 0.3082, "step": 5897 }, { "epoch": 2.941156914893617, "grad_norm": 0.4310149550437927, "learning_rate": 1.1523078996198422e-08, "loss": 0.3758, "step": 5898 }, { "epoch": 2.941655585106383, "grad_norm": 0.3671475648880005, "learning_rate": 1.13270191510223e-08, "loss": 0.3562, "step": 5899 }, { "epoch": 2.942154255319149, "grad_norm": 0.37204086780548096, "learning_rate": 1.1132639690532022e-08, "loss": 0.3588, "step": 5900 }, { "epoch": 2.942652925531915, "grad_norm": 0.41188740730285645, "learning_rate": 1.0939940680202433e-08, "loss": 0.3322, "step": 5901 }, { "epoch": 2.943151595744681, "grad_norm": 0.3744305968284607, "learning_rate": 1.0748922184941057e-08, "loss": 0.3506, "step": 5902 }, { "epoch": 2.943650265957447, "grad_norm": 0.37265780568122864, "learning_rate": 1.0559584269091983e-08, "loss": 0.3815, "step": 5903 }, { "epoch": 2.9441489361702127, "grad_norm": 0.40614449977874756, "learning_rate": 1.0371926996431414e-08, "loss": 0.337, "step": 5904 }, { "epoch": 2.9446476063829787, "grad_norm": 0.401077002286911, "learning_rate": 1.0185950430169345e-08, "loss": 0.3274, "step": 5905 }, { "epoch": 2.945146276595745, "grad_norm": 0.39795398712158203, "learning_rate": 1.000165463295011e-08, "loss": 0.3045, "step": 5906 }, { "epoch": 2.9456449468085104, "grad_norm": 0.41894981265068054, "learning_rate": 9.819039666852937e-09, "loss": 0.3086, "step": 5907 }, { "epoch": 2.9461436170212765, "grad_norm": 0.43029728531837463, "learning_rate": 9.638105593388626e-09, "loss": 0.3852, "step": 5908 }, { "epoch": 2.9466422872340425, "grad_norm": 0.36872148513793945, "learning_rate": 9.458852473503421e-09, "loss": 0.3263, "step": 5909 }, { "epoch": 2.9471409574468086, "grad_norm": 0.37365686893463135, "learning_rate": 9.281280367577361e-09, "loss": 0.3497, "step": 5910 }, { "epoch": 2.9476396276595747, "grad_norm": 0.3785567879676819, "learning_rate": 9.105389335423708e-09, "loss": 0.362, "step": 5911 }, { "epoch": 2.9481382978723403, "grad_norm": 0.4047773480415344, "learning_rate": 8.931179436288961e-09, "loss": 0.3739, "step": 5912 }, { "epoch": 2.9486369680851063, "grad_norm": 0.3790971636772156, "learning_rate": 8.758650728855068e-09, "loss": 0.2789, "step": 5913 }, { "epoch": 2.9491356382978724, "grad_norm": 0.4105313718318939, "learning_rate": 8.58780327123554e-09, "loss": 0.3567, "step": 5914 }, { "epoch": 2.9496343085106385, "grad_norm": 0.42435094714164734, "learning_rate": 8.41863712097879e-09, "loss": 0.3283, "step": 5915 }, { "epoch": 2.9501329787234045, "grad_norm": 0.3979879915714264, "learning_rate": 8.251152335067569e-09, "loss": 0.3144, "step": 5916 }, { "epoch": 2.95063164893617, "grad_norm": 0.450967937707901, "learning_rate": 8.085348969916751e-09, "loss": 0.3343, "step": 5917 }, { "epoch": 2.951130319148936, "grad_norm": 0.39518725872039795, "learning_rate": 7.921227081375549e-09, "loss": 0.2879, "step": 5918 }, { "epoch": 2.9516289893617023, "grad_norm": 0.47718003392219543, "learning_rate": 7.758786724726963e-09, "loss": 0.3711, "step": 5919 }, { "epoch": 2.952127659574468, "grad_norm": 0.42491045594215393, "learning_rate": 7.598027954687781e-09, "loss": 0.3311, "step": 5920 }, { "epoch": 2.952626329787234, "grad_norm": 0.4004131257534027, "learning_rate": 7.43895082540691e-09, "loss": 0.3052, "step": 5921 }, { "epoch": 2.953125, "grad_norm": 0.4105391800403595, "learning_rate": 7.281555390469264e-09, "loss": 0.3161, "step": 5922 }, { "epoch": 2.953623670212766, "grad_norm": 0.41772133111953735, "learning_rate": 7.1258417028907675e-09, "loss": 0.3255, "step": 5923 }, { "epoch": 2.954122340425532, "grad_norm": 0.4364093542098999, "learning_rate": 6.971809815122799e-09, "loss": 0.3397, "step": 5924 }, { "epoch": 2.9546210106382977, "grad_norm": 0.3840574622154236, "learning_rate": 6.819459779048853e-09, "loss": 0.3695, "step": 5925 }, { "epoch": 2.955119680851064, "grad_norm": 0.40923449397087097, "learning_rate": 6.668791645987327e-09, "loss": 0.3347, "step": 5926 }, { "epoch": 2.95561835106383, "grad_norm": 0.37118908762931824, "learning_rate": 6.5198054666881784e-09, "loss": 0.2536, "step": 5927 }, { "epoch": 2.9561170212765955, "grad_norm": 0.40644070506095886, "learning_rate": 6.372501291336819e-09, "loss": 0.3766, "step": 5928 }, { "epoch": 2.9566156914893615, "grad_norm": 0.44686079025268555, "learning_rate": 6.2268791695507815e-09, "loss": 0.3459, "step": 5929 }, { "epoch": 2.9571143617021276, "grad_norm": 0.38261568546295166, "learning_rate": 6.082939150381384e-09, "loss": 0.2823, "step": 5930 }, { "epoch": 2.9576130319148937, "grad_norm": 0.39090797305107117, "learning_rate": 5.9406812823137314e-09, "loss": 0.3596, "step": 5931 }, { "epoch": 2.9581117021276597, "grad_norm": 0.4167637526988983, "learning_rate": 5.800105613265605e-09, "loss": 0.3528, "step": 5932 }, { "epoch": 2.9586103723404253, "grad_norm": 0.3794327974319458, "learning_rate": 5.661212190589127e-09, "loss": 0.3, "step": 5933 }, { "epoch": 2.9591090425531914, "grad_norm": 0.4215754568576813, "learning_rate": 5.52400106106854e-09, "loss": 0.349, "step": 5934 }, { "epoch": 2.9596077127659575, "grad_norm": 0.3634827435016632, "learning_rate": 5.388472270921874e-09, "loss": 0.2647, "step": 5935 }, { "epoch": 2.9601063829787235, "grad_norm": 0.3745673894882202, "learning_rate": 5.254625865800944e-09, "loss": 0.36, "step": 5936 }, { "epoch": 2.9606050531914896, "grad_norm": 0.4051733911037445, "learning_rate": 5.1224618907907975e-09, "loss": 0.3779, "step": 5937 }, { "epoch": 2.961103723404255, "grad_norm": 0.4111439287662506, "learning_rate": 4.9919803904091565e-09, "loss": 0.3207, "step": 5938 }, { "epoch": 2.9616023936170213, "grad_norm": 0.4232427775859833, "learning_rate": 4.863181408608087e-09, "loss": 0.3598, "step": 5939 }, { "epoch": 2.9621010638297873, "grad_norm": 0.39614468812942505, "learning_rate": 4.736064988771216e-09, "loss": 0.311, "step": 5940 }, { "epoch": 2.962599734042553, "grad_norm": 0.4098972678184509, "learning_rate": 4.610631173717628e-09, "loss": 0.3115, "step": 5941 }, { "epoch": 2.963098404255319, "grad_norm": 0.42080503702163696, "learning_rate": 4.486880005697414e-09, "loss": 0.3281, "step": 5942 }, { "epoch": 2.963597074468085, "grad_norm": 0.3869600296020508, "learning_rate": 4.3648115263950074e-09, "loss": 0.3146, "step": 5943 }, { "epoch": 2.964095744680851, "grad_norm": 0.4437698721885681, "learning_rate": 4.244425776928629e-09, "loss": 0.3037, "step": 5944 }, { "epoch": 2.964594414893617, "grad_norm": 0.45583072304725647, "learning_rate": 4.125722797849174e-09, "loss": 0.3808, "step": 5945 }, { "epoch": 2.965093085106383, "grad_norm": 0.4074864387512207, "learning_rate": 4.0087026291391055e-09, "loss": 0.3018, "step": 5946 }, { "epoch": 2.965591755319149, "grad_norm": 0.4016644060611725, "learning_rate": 3.89336531021689e-09, "loss": 0.3329, "step": 5947 }, { "epoch": 2.966090425531915, "grad_norm": 0.4210624694824219, "learning_rate": 3.779710879932563e-09, "loss": 0.3652, "step": 5948 }, { "epoch": 2.966589095744681, "grad_norm": 0.41347214579582214, "learning_rate": 3.667739376568835e-09, "loss": 0.3096, "step": 5949 }, { "epoch": 2.967087765957447, "grad_norm": 0.3872718811035156, "learning_rate": 3.557450837842202e-09, "loss": 0.3286, "step": 5950 }, { "epoch": 2.9675864361702127, "grad_norm": 0.46589359641075134, "learning_rate": 3.448845300903503e-09, "loss": 0.3613, "step": 5951 }, { "epoch": 2.9680851063829787, "grad_norm": 0.44183316826820374, "learning_rate": 3.34192280233403e-09, "loss": 0.3281, "step": 5952 }, { "epoch": 2.968583776595745, "grad_norm": 0.3811807930469513, "learning_rate": 3.236683378149974e-09, "loss": 0.327, "step": 5953 }, { "epoch": 2.9690824468085104, "grad_norm": 0.3800606429576874, "learning_rate": 3.133127063800756e-09, "loss": 0.3314, "step": 5954 }, { "epoch": 2.9695811170212765, "grad_norm": 0.4146391749382019, "learning_rate": 3.0312538941673632e-09, "loss": 0.3747, "step": 5955 }, { "epoch": 2.9700797872340425, "grad_norm": 0.3498390018939972, "learning_rate": 2.9310639035656787e-09, "loss": 0.3186, "step": 5956 }, { "epoch": 2.9705784574468086, "grad_norm": 0.3695351779460907, "learning_rate": 2.8325571257425967e-09, "loss": 0.3225, "step": 5957 }, { "epoch": 2.9710771276595747, "grad_norm": 0.41555550694465637, "learning_rate": 2.735733593880463e-09, "loss": 0.3426, "step": 5958 }, { "epoch": 2.9715757978723403, "grad_norm": 0.4197644889354706, "learning_rate": 2.640593340592079e-09, "loss": 0.3584, "step": 5959 }, { "epoch": 2.9720744680851063, "grad_norm": 0.3700507581233978, "learning_rate": 2.547136397925143e-09, "loss": 0.2937, "step": 5960 }, { "epoch": 2.9725731382978724, "grad_norm": 0.4507772922515869, "learning_rate": 2.455362797360028e-09, "loss": 0.3752, "step": 5961 }, { "epoch": 2.9730718085106385, "grad_norm": 0.41882559657096863, "learning_rate": 2.365272569808674e-09, "loss": 0.3201, "step": 5962 }, { "epoch": 2.9735704787234045, "grad_norm": 0.389226496219635, "learning_rate": 2.276865745618473e-09, "loss": 0.348, "step": 5963 }, { "epoch": 2.97406914893617, "grad_norm": 0.4016020894050598, "learning_rate": 2.1901423545672707e-09, "loss": 0.3133, "step": 5964 }, { "epoch": 2.974567819148936, "grad_norm": 0.34166139364242554, "learning_rate": 2.105102425867256e-09, "loss": 0.3053, "step": 5965 }, { "epoch": 2.9750664893617023, "grad_norm": 0.41892898082733154, "learning_rate": 2.0217459881632927e-09, "loss": 0.3414, "step": 5966 }, { "epoch": 2.975565159574468, "grad_norm": 0.45978495478630066, "learning_rate": 1.940073069533477e-09, "loss": 0.3324, "step": 5967 }, { "epoch": 2.976063829787234, "grad_norm": 0.40890833735466003, "learning_rate": 1.8600836974880243e-09, "loss": 0.2532, "step": 5968 }, { "epoch": 2.9765625, "grad_norm": 0.42907091975212097, "learning_rate": 1.7817778989709379e-09, "loss": 0.3862, "step": 5969 }, { "epoch": 2.977061170212766, "grad_norm": 0.4012555480003357, "learning_rate": 1.7051557003583409e-09, "loss": 0.3755, "step": 5970 }, { "epoch": 2.977559840425532, "grad_norm": 0.43085286021232605, "learning_rate": 1.630217127460143e-09, "loss": 0.3445, "step": 5971 }, { "epoch": 2.9780585106382977, "grad_norm": 0.4151933491230011, "learning_rate": 1.5569622055183752e-09, "loss": 0.2999, "step": 5972 }, { "epoch": 2.978557180851064, "grad_norm": 0.42042234539985657, "learning_rate": 1.4853909592088544e-09, "loss": 0.3625, "step": 5973 }, { "epoch": 2.97905585106383, "grad_norm": 0.4497944414615631, "learning_rate": 1.4155034126384082e-09, "loss": 0.3402, "step": 5974 }, { "epoch": 2.9795545212765955, "grad_norm": 0.39995118975639343, "learning_rate": 1.3472995893487606e-09, "loss": 0.3554, "step": 5975 }, { "epoch": 2.9800531914893615, "grad_norm": 0.36797094345092773, "learning_rate": 1.2807795123137568e-09, "loss": 0.3386, "step": 5976 }, { "epoch": 2.9805518617021276, "grad_norm": 0.41050535440444946, "learning_rate": 1.2159432039393627e-09, "loss": 0.3555, "step": 5977 }, { "epoch": 2.9810505319148937, "grad_norm": 0.39382174611091614, "learning_rate": 1.1527906860658855e-09, "loss": 0.3018, "step": 5978 }, { "epoch": 2.9815492021276597, "grad_norm": 0.3880799114704132, "learning_rate": 1.0913219799651987e-09, "loss": 0.3845, "step": 5979 }, { "epoch": 2.9820478723404253, "grad_norm": 0.38850685954093933, "learning_rate": 1.0315371063424062e-09, "loss": 0.3763, "step": 5980 }, { "epoch": 2.9825465425531914, "grad_norm": 0.4276697635650635, "learning_rate": 9.734360853352887e-10, "loss": 0.3257, "step": 5981 }, { "epoch": 2.9830452127659575, "grad_norm": 0.3782210350036621, "learning_rate": 9.170189365154125e-10, "loss": 0.3465, "step": 5982 }, { "epoch": 2.9835438829787235, "grad_norm": 0.34518033266067505, "learning_rate": 8.622856788848e-10, "loss": 0.3419, "step": 5983 }, { "epoch": 2.9840425531914896, "grad_norm": 0.4192173182964325, "learning_rate": 8.092363308814799e-10, "loss": 0.4036, "step": 5984 }, { "epoch": 2.984541223404255, "grad_norm": 0.4430842995643616, "learning_rate": 7.578709103733817e-10, "loss": 0.3277, "step": 5985 }, { "epoch": 2.9850398936170213, "grad_norm": 0.45330899953842163, "learning_rate": 7.081894346633312e-10, "loss": 0.2996, "step": 5986 }, { "epoch": 2.9855385638297873, "grad_norm": 0.4842875897884369, "learning_rate": 6.601919204851647e-10, "loss": 0.3476, "step": 5987 }, { "epoch": 2.986037234042553, "grad_norm": 0.4060778319835663, "learning_rate": 6.138783840070606e-10, "loss": 0.2847, "step": 5988 }, { "epoch": 2.986535904255319, "grad_norm": 0.4457836449146271, "learning_rate": 5.692488408287622e-10, "loss": 0.3615, "step": 5989 }, { "epoch": 2.987034574468085, "grad_norm": 0.39188581705093384, "learning_rate": 5.263033059837997e-10, "loss": 0.2874, "step": 5990 }, { "epoch": 2.987533244680851, "grad_norm": 0.402339905500412, "learning_rate": 4.850417939372687e-10, "loss": 0.3157, "step": 5991 }, { "epoch": 2.988031914893617, "grad_norm": 0.3923729360103607, "learning_rate": 4.4546431858860653e-10, "loss": 0.3224, "step": 5992 }, { "epoch": 2.988530585106383, "grad_norm": 0.3966045379638672, "learning_rate": 4.075708932682609e-10, "loss": 0.375, "step": 5993 }, { "epoch": 2.989029255319149, "grad_norm": 0.3821544051170349, "learning_rate": 3.713615307410212e-10, "loss": 0.3367, "step": 5994 }, { "epoch": 2.989527925531915, "grad_norm": 0.44633200764656067, "learning_rate": 3.3683624320268725e-10, "loss": 0.3507, "step": 5995 }, { "epoch": 2.990026595744681, "grad_norm": 0.38137248158454895, "learning_rate": 3.0399504228340035e-10, "loss": 0.3424, "step": 5996 }, { "epoch": 2.990525265957447, "grad_norm": 0.37014490365982056, "learning_rate": 2.7283793904542275e-10, "loss": 0.3573, "step": 5997 }, { "epoch": 2.9910239361702127, "grad_norm": 0.35328930616378784, "learning_rate": 2.433649439836927e-10, "loss": 0.3233, "step": 5998 }, { "epoch": 2.9915226063829787, "grad_norm": 0.4061431288719177, "learning_rate": 2.1557606702526934e-10, "loss": 0.4272, "step": 5999 }, { "epoch": 2.992021276595745, "grad_norm": 0.36746183037757874, "learning_rate": 1.8947131753155324e-10, "loss": 0.366, "step": 6000 }, { "epoch": 2.9925199468085104, "grad_norm": 0.47497057914733887, "learning_rate": 1.6505070429495563e-10, "loss": 0.3183, "step": 6001 }, { "epoch": 2.9930186170212765, "grad_norm": 0.38742271065711975, "learning_rate": 1.423142355411189e-10, "loss": 0.3384, "step": 6002 }, { "epoch": 2.9935172872340425, "grad_norm": 0.4465458393096924, "learning_rate": 1.212619189294717e-10, "loss": 0.3532, "step": 6003 }, { "epoch": 2.9940159574468086, "grad_norm": 0.37997621297836304, "learning_rate": 1.0189376155100849e-10, "loss": 0.2906, "step": 6004 }, { "epoch": 2.9945146276595747, "grad_norm": 0.40606212615966797, "learning_rate": 8.420976992939978e-11, "loss": 0.3542, "step": 6005 }, { "epoch": 2.9950132978723403, "grad_norm": 0.41282936930656433, "learning_rate": 6.820995002099206e-11, "loss": 0.2912, "step": 6006 }, { "epoch": 2.9955119680851063, "grad_norm": 0.36680102348327637, "learning_rate": 5.389430721591815e-11, "loss": 0.3243, "step": 6007 }, { "epoch": 2.9960106382978724, "grad_norm": 0.3753984570503235, "learning_rate": 4.1262846335876626e-11, "loss": 0.3403, "step": 6008 }, { "epoch": 2.9965093085106385, "grad_norm": 0.4149220585823059, "learning_rate": 3.031557163579724e-11, "loss": 0.4084, "step": 6009 }, { "epoch": 2.9970079787234045, "grad_norm": 0.3723506033420563, "learning_rate": 2.1052486803285754e-11, "loss": 0.2792, "step": 6010 }, { "epoch": 2.99750664893617, "grad_norm": 0.4082968235015869, "learning_rate": 1.347359495751377e-11, "loss": 0.3166, "step": 6011 }, { "epoch": 2.998005319148936, "grad_norm": 0.4359695613384247, "learning_rate": 7.578898653104461e-12, "loss": 0.3811, "step": 6012 }, { "epoch": 2.9985039893617023, "grad_norm": 0.4311217963695526, "learning_rate": 3.3683998740263733e-12, "loss": 0.3116, "step": 6013 }, { "epoch": 2.999002659574468, "grad_norm": 0.4574633240699768, "learning_rate": 8.421000396996448e-13, "loss": 0.3779, "step": 6014 }, { "epoch": 2.999501329787234, "grad_norm": 0.36362534761428833, "learning_rate": 0.0, "loss": 0.32, "step": 6015 }, { "epoch": 2.999501329787234, "step": 6015, "total_flos": 5081068868993024.0, "train_loss": 0.3983377527615871, "train_runtime": 188442.1438, "train_samples_per_second": 3.065, "train_steps_per_second": 0.032 } ], "logging_steps": 1.0, "max_steps": 6015, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5081068868993024.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }