{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999310276874569, "eval_steps": 400, "global_step": 11415, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005255033336617729, "grad_norm": null, "learning_rate": 0.0, "loss": 9.1222, "step": 2 }, { "epoch": 0.0010510066673235458, "grad_norm": 38.698402404785156, "learning_rate": 9.999123882950763e-05, "loss": 9.1362, "step": 4 }, { "epoch": 0.0015765100009853187, "grad_norm": 23.888906478881836, "learning_rate": 9.997371648852288e-05, "loss": 6.4836, "step": 6 }, { "epoch": 0.0021020133346470915, "grad_norm": 12.829351425170898, "learning_rate": 9.995619414753812e-05, "loss": 5.5976, "step": 8 }, { "epoch": 0.0026275166683088647, "grad_norm": 13.441329956054688, "learning_rate": 9.993867180655336e-05, "loss": 4.922, "step": 10 }, { "epoch": 0.0031530200019706375, "grad_norm": 8.487077713012695, "learning_rate": 9.99211494655686e-05, "loss": 4.3391, "step": 12 }, { "epoch": 0.0036785233356324103, "grad_norm": 7.427611351013184, "learning_rate": 9.990362712458385e-05, "loss": 3.8921, "step": 14 }, { "epoch": 0.004204026669294183, "grad_norm": 6.296542167663574, "learning_rate": 9.988610478359909e-05, "loss": 3.5532, "step": 16 }, { "epoch": 0.004729530002955957, "grad_norm": 5.726244926452637, "learning_rate": 9.986858244261433e-05, "loss": 3.2786, "step": 18 }, { "epoch": 0.0052550333366177295, "grad_norm": 5.131892204284668, "learning_rate": 9.985106010162958e-05, "loss": 3.016, "step": 20 }, { "epoch": 0.005780536670279502, "grad_norm": 4.895572185516357, "learning_rate": 9.983353776064483e-05, "loss": 2.8689, "step": 22 }, { "epoch": 0.006306040003941275, "grad_norm": 4.9487714767456055, "learning_rate": 9.981601541966006e-05, "loss": 2.7813, "step": 24 }, { "epoch": 0.006831543337603048, "grad_norm": 4.282960891723633, "learning_rate": 9.979849307867531e-05, "loss": 2.6648, "step": 26 }, { "epoch": 
0.0073570466712648205, "grad_norm": 3.610912561416626, "learning_rate": 9.978097073769056e-05, "loss": 2.6083, "step": 28 }, { "epoch": 0.007882550004926594, "grad_norm": 3.4365551471710205, "learning_rate": 9.97634483967058e-05, "loss": 2.5791, "step": 30 }, { "epoch": 0.008408053338588366, "grad_norm": 3.2651638984680176, "learning_rate": 9.974592605572105e-05, "loss": 2.5128, "step": 32 }, { "epoch": 0.00893355667225014, "grad_norm": 3.259965658187866, "learning_rate": 9.97284037147363e-05, "loss": 2.4589, "step": 34 }, { "epoch": 0.009459060005911913, "grad_norm": 3.229142904281616, "learning_rate": 9.971088137375154e-05, "loss": 2.4857, "step": 36 }, { "epoch": 0.009984563339573685, "grad_norm": 3.1428916454315186, "learning_rate": 9.969335903276678e-05, "loss": 2.4305, "step": 38 }, { "epoch": 0.010510066673235459, "grad_norm": 2.565325975418091, "learning_rate": 9.967583669178203e-05, "loss": 2.4047, "step": 40 }, { "epoch": 0.01103557000689723, "grad_norm": 2.9086647033691406, "learning_rate": 9.965831435079726e-05, "loss": 2.3909, "step": 42 }, { "epoch": 0.011561073340559004, "grad_norm": 2.6053147315979004, "learning_rate": 9.964079200981251e-05, "loss": 2.3981, "step": 44 }, { "epoch": 0.012086576674220776, "grad_norm": 2.3066649436950684, "learning_rate": 9.962326966882776e-05, "loss": 2.3786, "step": 46 }, { "epoch": 0.01261208000788255, "grad_norm": 2.5991337299346924, "learning_rate": 9.960574732784301e-05, "loss": 2.3309, "step": 48 }, { "epoch": 0.013137583341544324, "grad_norm": 2.6027159690856934, "learning_rate": 9.958822498685824e-05, "loss": 2.3522, "step": 50 }, { "epoch": 0.013663086675206096, "grad_norm": 2.2059361934661865, "learning_rate": 9.957070264587349e-05, "loss": 2.3221, "step": 52 }, { "epoch": 0.01418859000886787, "grad_norm": 2.3481605052948, "learning_rate": 9.955318030488874e-05, "loss": 2.3223, "step": 54 }, { "epoch": 0.014714093342529641, "grad_norm": 2.489518880844116, "learning_rate": 9.953565796390398e-05, "loss": 
2.299, "step": 56 }, { "epoch": 0.015239596676191415, "grad_norm": 1.992858648300171, "learning_rate": 9.951813562291923e-05, "loss": 2.2695, "step": 58 }, { "epoch": 0.01576510000985319, "grad_norm": 2.3498594760894775, "learning_rate": 9.950061328193448e-05, "loss": 2.305, "step": 60 }, { "epoch": 0.01629060334351496, "grad_norm": 2.3116061687469482, "learning_rate": 9.948309094094971e-05, "loss": 2.28, "step": 62 }, { "epoch": 0.016816106677176732, "grad_norm": 1.8108229637145996, "learning_rate": 9.946556859996496e-05, "loss": 2.2844, "step": 64 }, { "epoch": 0.017341610010838508, "grad_norm": 1.7650134563446045, "learning_rate": 9.944804625898021e-05, "loss": 2.2585, "step": 66 }, { "epoch": 0.01786711334450028, "grad_norm": 2.000683546066284, "learning_rate": 9.943052391799544e-05, "loss": 2.2866, "step": 68 }, { "epoch": 0.01839261667816205, "grad_norm": 1.7432600259780884, "learning_rate": 9.941300157701069e-05, "loss": 2.2551, "step": 70 }, { "epoch": 0.018918120011823827, "grad_norm": 1.8153115510940552, "learning_rate": 9.939547923602594e-05, "loss": 2.2829, "step": 72 }, { "epoch": 0.0194436233454856, "grad_norm": 1.7774672508239746, "learning_rate": 9.937795689504118e-05, "loss": 2.2656, "step": 74 }, { "epoch": 0.01996912667914737, "grad_norm": 1.8277621269226074, "learning_rate": 9.936043455405642e-05, "loss": 2.2602, "step": 76 }, { "epoch": 0.020494630012809142, "grad_norm": 1.8183931112289429, "learning_rate": 9.934291221307167e-05, "loss": 2.2581, "step": 78 }, { "epoch": 0.021020133346470918, "grad_norm": 1.5881011486053467, "learning_rate": 9.932538987208691e-05, "loss": 2.2305, "step": 80 }, { "epoch": 0.02154563668013269, "grad_norm": 1.653342843055725, "learning_rate": 9.930786753110216e-05, "loss": 2.2092, "step": 82 }, { "epoch": 0.02207114001379446, "grad_norm": 1.7351162433624268, "learning_rate": 9.929034519011741e-05, "loss": 2.2031, "step": 84 }, { "epoch": 0.022596643347456237, "grad_norm": 1.679905891418457, "learning_rate": 
9.927282284913266e-05, "loss": 2.2373, "step": 86 }, { "epoch": 0.02312214668111801, "grad_norm": 1.742342472076416, "learning_rate": 9.925530050814789e-05, "loss": 2.1937, "step": 88 }, { "epoch": 0.02364765001477978, "grad_norm": 1.5579359531402588, "learning_rate": 9.923777816716314e-05, "loss": 2.2137, "step": 90 }, { "epoch": 0.024173153348441553, "grad_norm": 1.548086404800415, "learning_rate": 9.922025582617839e-05, "loss": 2.2123, "step": 92 }, { "epoch": 0.024698656682103328, "grad_norm": 1.6375842094421387, "learning_rate": 9.920273348519362e-05, "loss": 2.182, "step": 94 }, { "epoch": 0.0252241600157651, "grad_norm": 1.6106681823730469, "learning_rate": 9.918521114420887e-05, "loss": 2.1649, "step": 96 }, { "epoch": 0.025749663349426872, "grad_norm": 1.6409622430801392, "learning_rate": 9.916768880322411e-05, "loss": 2.1702, "step": 98 }, { "epoch": 0.026275166683088647, "grad_norm": 2.1687161922454834, "learning_rate": 9.915016646223936e-05, "loss": 2.1519, "step": 100 }, { "epoch": 0.02680067001675042, "grad_norm": 1.595245122909546, "learning_rate": 9.91326441212546e-05, "loss": 2.1314, "step": 102 }, { "epoch": 0.02732617335041219, "grad_norm": 1.6276413202285767, "learning_rate": 9.911512178026984e-05, "loss": 2.1189, "step": 104 }, { "epoch": 0.027851676684073963, "grad_norm": 1.7238609790802002, "learning_rate": 9.909759943928509e-05, "loss": 2.1617, "step": 106 }, { "epoch": 0.02837718001773574, "grad_norm": 1.5262254476547241, "learning_rate": 9.908007709830034e-05, "loss": 2.1105, "step": 108 }, { "epoch": 0.02890268335139751, "grad_norm": 1.6144453287124634, "learning_rate": 9.906255475731559e-05, "loss": 2.1287, "step": 110 }, { "epoch": 0.029428186685059282, "grad_norm": 1.528970718383789, "learning_rate": 9.904503241633083e-05, "loss": 2.0744, "step": 112 }, { "epoch": 0.029953690018721058, "grad_norm": 1.703235149383545, "learning_rate": 9.902751007534607e-05, "loss": 2.1285, "step": 114 }, { "epoch": 0.03047919335238283, "grad_norm": 
1.567522406578064, "learning_rate": 9.900998773436132e-05, "loss": 2.0736, "step": 116 }, { "epoch": 0.0310046966860446, "grad_norm": 1.4350054264068604, "learning_rate": 9.899246539337655e-05, "loss": 2.0745, "step": 118 }, { "epoch": 0.03153020001970638, "grad_norm": 1.438694953918457, "learning_rate": 9.89749430523918e-05, "loss": 2.0607, "step": 120 }, { "epoch": 0.032055703353368145, "grad_norm": 1.3507471084594727, "learning_rate": 9.895742071140705e-05, "loss": 2.0601, "step": 122 }, { "epoch": 0.03258120668702992, "grad_norm": 1.2798346281051636, "learning_rate": 9.893989837042229e-05, "loss": 2.0925, "step": 124 }, { "epoch": 0.033106710020691696, "grad_norm": 1.1719093322753906, "learning_rate": 9.892237602943754e-05, "loss": 2.069, "step": 126 }, { "epoch": 0.033632213354353464, "grad_norm": 1.3536409139633179, "learning_rate": 9.890485368845279e-05, "loss": 2.0716, "step": 128 }, { "epoch": 0.03415771668801524, "grad_norm": 1.3446725606918335, "learning_rate": 9.888733134746802e-05, "loss": 2.0164, "step": 130 }, { "epoch": 0.034683220021677015, "grad_norm": 1.322689414024353, "learning_rate": 9.886980900648327e-05, "loss": 2.0752, "step": 132 }, { "epoch": 0.035208723355338784, "grad_norm": 1.3145771026611328, "learning_rate": 9.885228666549852e-05, "loss": 2.0852, "step": 134 }, { "epoch": 0.03573422668900056, "grad_norm": 1.326093316078186, "learning_rate": 9.883476432451376e-05, "loss": 2.0414, "step": 136 }, { "epoch": 0.036259730022662334, "grad_norm": 1.234911561012268, "learning_rate": 9.881724198352901e-05, "loss": 2.0627, "step": 138 }, { "epoch": 0.0367852333563241, "grad_norm": 1.2666422128677368, "learning_rate": 9.879971964254426e-05, "loss": 2.039, "step": 140 }, { "epoch": 0.03731073668998588, "grad_norm": 1.285933494567871, "learning_rate": 9.87821973015595e-05, "loss": 2.0428, "step": 142 }, { "epoch": 0.03783624002364765, "grad_norm": 1.330095648765564, "learning_rate": 9.876467496057473e-05, "loss": 2.0458, "step": 144 }, { "epoch": 
0.03836174335730942, "grad_norm": 1.338354229927063, "learning_rate": 9.874715261958998e-05, "loss": 2.0588, "step": 146 }, { "epoch": 0.0388872466909712, "grad_norm": 1.2980471849441528, "learning_rate": 9.872963027860522e-05, "loss": 2.092, "step": 148 }, { "epoch": 0.039412750024632966, "grad_norm": 1.1613810062408447, "learning_rate": 9.871210793762047e-05, "loss": 2.0558, "step": 150 }, { "epoch": 0.03993825335829474, "grad_norm": 1.2073849439620972, "learning_rate": 9.869458559663572e-05, "loss": 2.0317, "step": 152 }, { "epoch": 0.040463756691956516, "grad_norm": 1.2727243900299072, "learning_rate": 9.867706325565097e-05, "loss": 2.0637, "step": 154 }, { "epoch": 0.040989260025618285, "grad_norm": 1.1971285343170166, "learning_rate": 9.86595409146662e-05, "loss": 2.0245, "step": 156 }, { "epoch": 0.04151476335928006, "grad_norm": 1.2773100137710571, "learning_rate": 9.864201857368145e-05, "loss": 1.9971, "step": 158 }, { "epoch": 0.042040266692941836, "grad_norm": 1.1450934410095215, "learning_rate": 9.86244962326967e-05, "loss": 2.0367, "step": 160 }, { "epoch": 0.042565770026603604, "grad_norm": 1.0777571201324463, "learning_rate": 9.860697389171194e-05, "loss": 2.0667, "step": 162 }, { "epoch": 0.04309127336026538, "grad_norm": 1.1527281999588013, "learning_rate": 9.858945155072719e-05, "loss": 2.0556, "step": 164 }, { "epoch": 0.043616776693927155, "grad_norm": 1.2145709991455078, "learning_rate": 9.857192920974244e-05, "loss": 2.0638, "step": 166 }, { "epoch": 0.04414228002758892, "grad_norm": 1.2327994108200073, "learning_rate": 9.855440686875767e-05, "loss": 2.0521, "step": 168 }, { "epoch": 0.0446677833612507, "grad_norm": 1.1262874603271484, "learning_rate": 9.85368845277729e-05, "loss": 2.0456, "step": 170 }, { "epoch": 0.045193286694912474, "grad_norm": 1.261991262435913, "learning_rate": 9.851936218678815e-05, "loss": 2.0438, "step": 172 }, { "epoch": 0.04571879002857424, "grad_norm": 1.1301138401031494, "learning_rate": 9.85018398458034e-05, 
"loss": 2.0385, "step": 174 }, { "epoch": 0.04624429336223602, "grad_norm": 1.1710069179534912, "learning_rate": 9.848431750481865e-05, "loss": 1.9769, "step": 176 }, { "epoch": 0.046769796695897786, "grad_norm": 1.417973279953003, "learning_rate": 9.84667951638339e-05, "loss": 2.0309, "step": 178 }, { "epoch": 0.04729530002955956, "grad_norm": 1.1423699855804443, "learning_rate": 9.844927282284914e-05, "loss": 2.0254, "step": 180 }, { "epoch": 0.04782080336322134, "grad_norm": 1.036130428314209, "learning_rate": 9.843175048186438e-05, "loss": 2.0361, "step": 182 }, { "epoch": 0.048346306696883105, "grad_norm": 1.1310175657272339, "learning_rate": 9.841422814087962e-05, "loss": 2.0172, "step": 184 }, { "epoch": 0.04887181003054488, "grad_norm": 1.1112390756607056, "learning_rate": 9.839670579989487e-05, "loss": 2.021, "step": 186 }, { "epoch": 0.049397313364206656, "grad_norm": 1.1196744441986084, "learning_rate": 9.837918345891012e-05, "loss": 2.0398, "step": 188 }, { "epoch": 0.049922816697868425, "grad_norm": 1.1641274690628052, "learning_rate": 9.836166111792537e-05, "loss": 2.0211, "step": 190 }, { "epoch": 0.0504483200315302, "grad_norm": 1.231698989868164, "learning_rate": 9.834413877694061e-05, "loss": 1.9938, "step": 192 }, { "epoch": 0.050973823365191975, "grad_norm": 1.116532325744629, "learning_rate": 9.832661643595585e-05, "loss": 1.9916, "step": 194 }, { "epoch": 0.051499326698853744, "grad_norm": 1.1912415027618408, "learning_rate": 9.830909409497108e-05, "loss": 2.0206, "step": 196 }, { "epoch": 0.05202483003251552, "grad_norm": 1.0664820671081543, "learning_rate": 9.829157175398633e-05, "loss": 2.0244, "step": 198 }, { "epoch": 0.052550333366177295, "grad_norm": 1.1066045761108398, "learning_rate": 9.827404941300158e-05, "loss": 2.0005, "step": 200 }, { "epoch": 0.05307583669983906, "grad_norm": 1.1267125606536865, "learning_rate": 9.825652707201683e-05, "loss": 2.0185, "step": 202 }, { "epoch": 0.05360134003350084, "grad_norm": 1.14983069896698, 
"learning_rate": 9.823900473103207e-05, "loss": 1.9726, "step": 204 }, { "epoch": 0.054126843367162614, "grad_norm": 1.2234214544296265, "learning_rate": 9.822148239004732e-05, "loss": 2.0497, "step": 206 }, { "epoch": 0.05465234670082438, "grad_norm": 1.1713297367095947, "learning_rate": 9.820396004906255e-05, "loss": 2.0167, "step": 208 }, { "epoch": 0.05517785003448616, "grad_norm": 1.2977114915847778, "learning_rate": 9.81864377080778e-05, "loss": 1.9795, "step": 210 }, { "epoch": 0.055703353368147926, "grad_norm": 1.1145280599594116, "learning_rate": 9.816891536709305e-05, "loss": 2.0099, "step": 212 }, { "epoch": 0.0562288567018097, "grad_norm": 1.126206874847412, "learning_rate": 9.81513930261083e-05, "loss": 2.0053, "step": 214 }, { "epoch": 0.05675436003547148, "grad_norm": 1.1073702573776245, "learning_rate": 9.813387068512355e-05, "loss": 1.985, "step": 216 }, { "epoch": 0.057279863369133245, "grad_norm": 1.2039167881011963, "learning_rate": 9.811634834413879e-05, "loss": 1.9898, "step": 218 }, { "epoch": 0.05780536670279502, "grad_norm": 1.2644699811935425, "learning_rate": 9.809882600315403e-05, "loss": 1.9904, "step": 220 }, { "epoch": 0.058330870036456796, "grad_norm": 0.9593138694763184, "learning_rate": 9.808130366216926e-05, "loss": 2.0065, "step": 222 }, { "epoch": 0.058856373370118564, "grad_norm": 0.9157779216766357, "learning_rate": 9.806378132118451e-05, "loss": 1.9853, "step": 224 }, { "epoch": 0.05938187670378034, "grad_norm": 1.0334917306900024, "learning_rate": 9.804625898019976e-05, "loss": 2.0129, "step": 226 }, { "epoch": 0.059907380037442115, "grad_norm": 1.0476865768432617, "learning_rate": 9.8028736639215e-05, "loss": 1.9816, "step": 228 }, { "epoch": 0.060432883371103883, "grad_norm": 1.1626946926116943, "learning_rate": 9.801121429823025e-05, "loss": 2.0278, "step": 230 }, { "epoch": 0.06095838670476566, "grad_norm": 1.3142833709716797, "learning_rate": 9.79936919572455e-05, "loss": 1.998, "step": 232 }, { "epoch": 
0.061483890038427434, "grad_norm": 1.195894718170166, "learning_rate": 9.797616961626073e-05, "loss": 1.9813, "step": 234 }, { "epoch": 0.0620093933720892, "grad_norm": 1.0905667543411255, "learning_rate": 9.795864727527598e-05, "loss": 1.9725, "step": 236 }, { "epoch": 0.06253489670575098, "grad_norm": 1.0762920379638672, "learning_rate": 9.794112493429123e-05, "loss": 2.0036, "step": 238 }, { "epoch": 0.06306040003941275, "grad_norm": 0.9937522411346436, "learning_rate": 9.792360259330648e-05, "loss": 2.0208, "step": 240 }, { "epoch": 0.06358590337307453, "grad_norm": 1.3950233459472656, "learning_rate": 9.790608025232172e-05, "loss": 1.9667, "step": 242 }, { "epoch": 0.06411140670673629, "grad_norm": 1.0673043727874756, "learning_rate": 9.788855791133697e-05, "loss": 1.984, "step": 244 }, { "epoch": 0.06463691004039807, "grad_norm": 1.0824543237686157, "learning_rate": 9.78710355703522e-05, "loss": 2.0271, "step": 246 }, { "epoch": 0.06516241337405984, "grad_norm": 1.2005363702774048, "learning_rate": 9.785351322936744e-05, "loss": 1.9912, "step": 248 }, { "epoch": 0.06568791670772162, "grad_norm": 1.0987967252731323, "learning_rate": 9.783599088838269e-05, "loss": 1.9886, "step": 250 }, { "epoch": 0.06621342004138339, "grad_norm": 1.0441079139709473, "learning_rate": 9.781846854739793e-05, "loss": 1.9888, "step": 252 }, { "epoch": 0.06673892337504517, "grad_norm": 1.3533669710159302, "learning_rate": 9.780094620641318e-05, "loss": 2.0232, "step": 254 }, { "epoch": 0.06726442670870693, "grad_norm": 0.9276557564735413, "learning_rate": 9.778342386542843e-05, "loss": 1.969, "step": 256 }, { "epoch": 0.0677899300423687, "grad_norm": 1.143988847732544, "learning_rate": 9.776590152444368e-05, "loss": 1.9643, "step": 258 }, { "epoch": 0.06831543337603048, "grad_norm": 0.9717089533805847, "learning_rate": 9.774837918345891e-05, "loss": 2.0169, "step": 260 }, { "epoch": 0.06884093670969225, "grad_norm": 1.328565001487732, "learning_rate": 9.773085684247416e-05, "loss": 
1.9733, "step": 262 }, { "epoch": 0.06936644004335403, "grad_norm": 1.0031944513320923, "learning_rate": 9.77133345014894e-05, "loss": 2.0026, "step": 264 }, { "epoch": 0.06989194337701579, "grad_norm": 0.9560984969139099, "learning_rate": 9.769581216050465e-05, "loss": 1.9893, "step": 266 }, { "epoch": 0.07041744671067757, "grad_norm": 1.1324944496154785, "learning_rate": 9.76782898195199e-05, "loss": 1.9499, "step": 268 }, { "epoch": 0.07094295004433934, "grad_norm": 1.1018491983413696, "learning_rate": 9.766076747853515e-05, "loss": 1.9915, "step": 270 }, { "epoch": 0.07146845337800112, "grad_norm": 1.1043239831924438, "learning_rate": 9.764324513755038e-05, "loss": 2.0021, "step": 272 }, { "epoch": 0.0719939567116629, "grad_norm": 1.1171072721481323, "learning_rate": 9.762572279656562e-05, "loss": 1.9665, "step": 274 }, { "epoch": 0.07251946004532467, "grad_norm": 1.040802001953125, "learning_rate": 9.760820045558086e-05, "loss": 2.0249, "step": 276 }, { "epoch": 0.07304496337898643, "grad_norm": 1.0344669818878174, "learning_rate": 9.759067811459611e-05, "loss": 2.0102, "step": 278 }, { "epoch": 0.0735704667126482, "grad_norm": 0.9199603199958801, "learning_rate": 9.757315577361136e-05, "loss": 1.9836, "step": 280 }, { "epoch": 0.07409597004630998, "grad_norm": 1.129347562789917, "learning_rate": 9.75556334326266e-05, "loss": 1.9141, "step": 282 }, { "epoch": 0.07462147337997176, "grad_norm": 0.9745813012123108, "learning_rate": 9.753811109164185e-05, "loss": 1.9807, "step": 284 }, { "epoch": 0.07514697671363353, "grad_norm": 1.2225230932235718, "learning_rate": 9.752058875065709e-05, "loss": 1.9786, "step": 286 }, { "epoch": 0.0756724800472953, "grad_norm": 1.7652974128723145, "learning_rate": 9.750306640967234e-05, "loss": 1.9575, "step": 288 }, { "epoch": 0.07619798338095707, "grad_norm": 1.075140118598938, "learning_rate": 9.748554406868758e-05, "loss": 2.0034, "step": 290 }, { "epoch": 0.07672348671461884, "grad_norm": 1.0169627666473389, "learning_rate": 
9.746802172770283e-05, "loss": 2.0072, "step": 292 }, { "epoch": 0.07724899004828062, "grad_norm": 1.0737385749816895, "learning_rate": 9.745049938671808e-05, "loss": 1.9446, "step": 294 }, { "epoch": 0.0777744933819424, "grad_norm": 1.2098944187164307, "learning_rate": 9.743297704573331e-05, "loss": 1.9875, "step": 296 }, { "epoch": 0.07829999671560417, "grad_norm": 1.030888557434082, "learning_rate": 9.741545470474856e-05, "loss": 2.0323, "step": 298 }, { "epoch": 0.07882550004926593, "grad_norm": 1.1690922975540161, "learning_rate": 9.73979323637638e-05, "loss": 1.9544, "step": 300 }, { "epoch": 0.0793510033829277, "grad_norm": 0.9702273607254028, "learning_rate": 9.738041002277904e-05, "loss": 1.954, "step": 302 }, { "epoch": 0.07987650671658948, "grad_norm": 1.1210618019104004, "learning_rate": 9.736288768179429e-05, "loss": 1.9786, "step": 304 }, { "epoch": 0.08040201005025126, "grad_norm": 1.0246062278747559, "learning_rate": 9.734536534080954e-05, "loss": 1.9777, "step": 306 }, { "epoch": 0.08092751338391303, "grad_norm": 0.9073227047920227, "learning_rate": 9.732784299982478e-05, "loss": 1.9737, "step": 308 }, { "epoch": 0.08145301671757481, "grad_norm": 0.9827994704246521, "learning_rate": 9.731032065884003e-05, "loss": 1.9794, "step": 310 }, { "epoch": 0.08197852005123657, "grad_norm": 0.9744251370429993, "learning_rate": 9.729279831785527e-05, "loss": 1.9771, "step": 312 }, { "epoch": 0.08250402338489835, "grad_norm": 1.0426684617996216, "learning_rate": 9.727527597687051e-05, "loss": 1.9522, "step": 314 }, { "epoch": 0.08302952671856012, "grad_norm": 1.0296825170516968, "learning_rate": 9.725775363588576e-05, "loss": 1.9727, "step": 316 }, { "epoch": 0.0835550300522219, "grad_norm": 1.2643887996673584, "learning_rate": 9.724023129490101e-05, "loss": 1.9414, "step": 318 }, { "epoch": 0.08408053338588367, "grad_norm": 1.0346665382385254, "learning_rate": 9.722270895391626e-05, "loss": 1.9964, "step": 320 }, { "epoch": 0.08460603671954543, "grad_norm": 
0.9016798734664917, "learning_rate": 9.720518661293149e-05, "loss": 1.9762, "step": 322 }, { "epoch": 0.08513154005320721, "grad_norm": 1.0269689559936523, "learning_rate": 9.718766427194674e-05, "loss": 1.9969, "step": 324 }, { "epoch": 0.08565704338686898, "grad_norm": 1.0594022274017334, "learning_rate": 9.717014193096197e-05, "loss": 1.9624, "step": 326 }, { "epoch": 0.08618254672053076, "grad_norm": 1.1567680835723877, "learning_rate": 9.715261958997722e-05, "loss": 1.959, "step": 328 }, { "epoch": 0.08670805005419253, "grad_norm": 0.870560884475708, "learning_rate": 9.713509724899247e-05, "loss": 1.9569, "step": 330 }, { "epoch": 0.08723355338785431, "grad_norm": 1.0716711282730103, "learning_rate": 9.711757490800771e-05, "loss": 1.951, "step": 332 }, { "epoch": 0.08775905672151607, "grad_norm": 1.1096898317337036, "learning_rate": 9.710005256702296e-05, "loss": 1.9796, "step": 334 }, { "epoch": 0.08828456005517785, "grad_norm": 0.8860819339752197, "learning_rate": 9.708253022603821e-05, "loss": 1.9591, "step": 336 }, { "epoch": 0.08881006338883962, "grad_norm": 0.9616803526878357, "learning_rate": 9.706500788505344e-05, "loss": 1.9743, "step": 338 }, { "epoch": 0.0893355667225014, "grad_norm": 0.8491392135620117, "learning_rate": 9.704748554406869e-05, "loss": 1.959, "step": 340 }, { "epoch": 0.08986107005616317, "grad_norm": 1.068489670753479, "learning_rate": 9.702996320308394e-05, "loss": 1.971, "step": 342 }, { "epoch": 0.09038657338982495, "grad_norm": 0.8963595032691956, "learning_rate": 9.701244086209919e-05, "loss": 1.9239, "step": 344 }, { "epoch": 0.09091207672348671, "grad_norm": 1.232599139213562, "learning_rate": 9.699491852111443e-05, "loss": 1.9282, "step": 346 }, { "epoch": 0.09143758005714848, "grad_norm": 1.0067182779312134, "learning_rate": 9.697739618012967e-05, "loss": 1.9539, "step": 348 }, { "epoch": 0.09196308339081026, "grad_norm": 0.927861750125885, "learning_rate": 9.695987383914491e-05, "loss": 1.9518, "step": 350 }, { "epoch": 
0.09248858672447204, "grad_norm": 1.1443390846252441, "learning_rate": 9.694235149816015e-05, "loss": 1.9527, "step": 352 }, { "epoch": 0.09301409005813381, "grad_norm": 1.3024427890777588, "learning_rate": 9.69248291571754e-05, "loss": 1.9276, "step": 354 }, { "epoch": 0.09353959339179557, "grad_norm": 1.0598125457763672, "learning_rate": 9.690730681619064e-05, "loss": 1.9728, "step": 356 }, { "epoch": 0.09406509672545735, "grad_norm": 1.0384907722473145, "learning_rate": 9.688978447520589e-05, "loss": 1.9543, "step": 358 }, { "epoch": 0.09459060005911912, "grad_norm": 1.4060821533203125, "learning_rate": 9.687226213422114e-05, "loss": 1.9005, "step": 360 }, { "epoch": 0.0951161033927809, "grad_norm": 0.9630605578422546, "learning_rate": 9.685473979323639e-05, "loss": 1.9306, "step": 362 }, { "epoch": 0.09564160672644267, "grad_norm": 0.9025930166244507, "learning_rate": 9.683721745225162e-05, "loss": 1.9391, "step": 364 }, { "epoch": 0.09616711006010445, "grad_norm": 1.0383387804031372, "learning_rate": 9.681969511126687e-05, "loss": 1.9506, "step": 366 }, { "epoch": 0.09669261339376621, "grad_norm": 0.9397739768028259, "learning_rate": 9.680217277028212e-05, "loss": 1.9132, "step": 368 }, { "epoch": 0.09721811672742799, "grad_norm": 0.9472708106040955, "learning_rate": 9.678465042929736e-05, "loss": 1.9731, "step": 370 }, { "epoch": 0.09774362006108976, "grad_norm": 0.8391740322113037, "learning_rate": 9.676712808831261e-05, "loss": 1.9956, "step": 372 }, { "epoch": 0.09826912339475154, "grad_norm": 1.160340666770935, "learning_rate": 9.674960574732785e-05, "loss": 1.9576, "step": 374 }, { "epoch": 0.09879462672841331, "grad_norm": 1.1132557392120361, "learning_rate": 9.673208340634309e-05, "loss": 1.9407, "step": 376 }, { "epoch": 0.09932013006207509, "grad_norm": 0.9549061059951782, "learning_rate": 9.671456106535834e-05, "loss": 1.96, "step": 378 }, { "epoch": 0.09984563339573685, "grad_norm": 1.953350305557251, "learning_rate": 9.669703872437357e-05, "loss": 
1.9146, "step": 380 }, { "epoch": 0.10037113672939862, "grad_norm": 0.8253780603408813, "learning_rate": 9.667951638338882e-05, "loss": 1.9576, "step": 382 }, { "epoch": 0.1008966400630604, "grad_norm": 1.76423180103302, "learning_rate": 9.666199404240407e-05, "loss": 1.9411, "step": 384 }, { "epoch": 0.10142214339672218, "grad_norm": 1.2772194147109985, "learning_rate": 9.664447170141932e-05, "loss": 1.939, "step": 386 }, { "epoch": 0.10194764673038395, "grad_norm": 1.1129158735275269, "learning_rate": 9.662694936043456e-05, "loss": 1.9587, "step": 388 }, { "epoch": 0.10247315006404571, "grad_norm": 1.5891733169555664, "learning_rate": 9.660942701944981e-05, "loss": 1.9677, "step": 390 }, { "epoch": 0.10299865339770749, "grad_norm": 0.8642210364341736, "learning_rate": 9.659190467846505e-05, "loss": 1.9356, "step": 392 }, { "epoch": 0.10352415673136926, "grad_norm": 1.0561408996582031, "learning_rate": 9.65743823374803e-05, "loss": 1.961, "step": 394 }, { "epoch": 0.10404966006503104, "grad_norm": 0.9040923118591309, "learning_rate": 9.655685999649554e-05, "loss": 1.9217, "step": 396 }, { "epoch": 0.10457516339869281, "grad_norm": 0.9783763289451599, "learning_rate": 9.653933765551078e-05, "loss": 1.9042, "step": 398 }, { "epoch": 0.10510066673235459, "grad_norm": 1.0281099081039429, "learning_rate": 9.652181531452602e-05, "loss": 1.8989, "step": 400 }, { "epoch": 0.10510066673235459, "eval_loss": 1.89004647731781, "eval_runtime": 487.2614, "eval_samples_per_second": 249.946, "eval_steps_per_second": 31.244, "step": 400 }, { "epoch": 0.10562617006601635, "grad_norm": 0.9645543694496155, "learning_rate": 9.650429297354127e-05, "loss": 1.9487, "step": 402 }, { "epoch": 0.10615167339967813, "grad_norm": 1.1213475465774536, "learning_rate": 9.648677063255652e-05, "loss": 1.963, "step": 404 }, { "epoch": 0.1066771767333399, "grad_norm": 0.8866876363754272, "learning_rate": 9.646924829157175e-05, "loss": 1.925, "step": 406 }, { "epoch": 0.10720268006700168, "grad_norm": 
0.9458855390548706, "learning_rate": 9.6451725950587e-05, "loss": 1.9641, "step": 408 }, { "epoch": 0.10772818340066345, "grad_norm": 0.9262669086456299, "learning_rate": 9.643420360960225e-05, "loss": 1.9257, "step": 410 }, { "epoch": 0.10825368673432523, "grad_norm": 0.9147223830223083, "learning_rate": 9.64166812686175e-05, "loss": 1.9242, "step": 412 }, { "epoch": 0.10877919006798699, "grad_norm": 1.1678857803344727, "learning_rate": 9.639915892763274e-05, "loss": 1.9474, "step": 414 }, { "epoch": 0.10930469340164876, "grad_norm": 1.0557537078857422, "learning_rate": 9.638163658664799e-05, "loss": 1.9527, "step": 416 }, { "epoch": 0.10983019673531054, "grad_norm": 1.047229528427124, "learning_rate": 9.636411424566322e-05, "loss": 1.9116, "step": 418 }, { "epoch": 0.11035570006897231, "grad_norm": 0.9046667814254761, "learning_rate": 9.634659190467847e-05, "loss": 1.914, "step": 420 }, { "epoch": 0.11088120340263409, "grad_norm": 1.019170880317688, "learning_rate": 9.632906956369372e-05, "loss": 1.9667, "step": 422 }, { "epoch": 0.11140670673629585, "grad_norm": 1.061903953552246, "learning_rate": 9.631154722270895e-05, "loss": 1.8915, "step": 424 }, { "epoch": 0.11193221006995763, "grad_norm": 1.1627757549285889, "learning_rate": 9.62940248817242e-05, "loss": 1.9272, "step": 426 }, { "epoch": 0.1124577134036194, "grad_norm": 0.9440149068832397, "learning_rate": 9.627650254073945e-05, "loss": 1.9686, "step": 428 }, { "epoch": 0.11298321673728118, "grad_norm": 1.0250455141067505, "learning_rate": 9.62589801997547e-05, "loss": 1.937, "step": 430 }, { "epoch": 0.11350872007094295, "grad_norm": 0.8273470401763916, "learning_rate": 9.624145785876993e-05, "loss": 1.9411, "step": 432 }, { "epoch": 0.11403422340460473, "grad_norm": 1.4610145092010498, "learning_rate": 9.622393551778518e-05, "loss": 1.954, "step": 434 }, { "epoch": 0.11455972673826649, "grad_norm": 0.9790822267532349, "learning_rate": 9.620641317680042e-05, "loss": 1.9255, "step": 436 }, { "epoch": 
0.11508523007192827, "grad_norm": 1.1833688020706177, "learning_rate": 9.618889083581567e-05, "loss": 1.9031, "step": 438 }, { "epoch": 0.11561073340559004, "grad_norm": 0.879008948802948, "learning_rate": 9.617136849483092e-05, "loss": 1.937, "step": 440 }, { "epoch": 0.11613623673925182, "grad_norm": 1.2231090068817139, "learning_rate": 9.615384615384617e-05, "loss": 1.9649, "step": 442 }, { "epoch": 0.11666174007291359, "grad_norm": 1.1259514093399048, "learning_rate": 9.61363238128614e-05, "loss": 1.8921, "step": 444 }, { "epoch": 0.11718724340657537, "grad_norm": 1.294339656829834, "learning_rate": 9.611880147187665e-05, "loss": 1.9084, "step": 446 }, { "epoch": 0.11771274674023713, "grad_norm": 0.9451860785484314, "learning_rate": 9.61012791308919e-05, "loss": 1.9441, "step": 448 }, { "epoch": 0.1182382500738989, "grad_norm": 0.9239658117294312, "learning_rate": 9.608375678990713e-05, "loss": 1.9291, "step": 450 }, { "epoch": 0.11876375340756068, "grad_norm": 1.0765795707702637, "learning_rate": 9.606623444892238e-05, "loss": 1.9249, "step": 452 }, { "epoch": 0.11928925674122245, "grad_norm": 0.919032633304596, "learning_rate": 9.604871210793763e-05, "loss": 1.9399, "step": 454 }, { "epoch": 0.11981476007488423, "grad_norm": 0.8893537521362305, "learning_rate": 9.603118976695287e-05, "loss": 1.8985, "step": 456 }, { "epoch": 0.12034026340854599, "grad_norm": 1.4801400899887085, "learning_rate": 9.601366742596811e-05, "loss": 1.8984, "step": 458 }, { "epoch": 0.12086576674220777, "grad_norm": 0.8458815217018127, "learning_rate": 9.599614508498335e-05, "loss": 1.9365, "step": 460 }, { "epoch": 0.12139127007586954, "grad_norm": 1.32188081741333, "learning_rate": 9.59786227439986e-05, "loss": 1.914, "step": 462 }, { "epoch": 0.12191677340953132, "grad_norm": 0.8433484435081482, "learning_rate": 9.596110040301385e-05, "loss": 1.9746, "step": 464 }, { "epoch": 0.1224422767431931, "grad_norm": 1.1092686653137207, "learning_rate": 9.59435780620291e-05, "loss": 
1.9672, "step": 466 }, { "epoch": 0.12296778007685487, "grad_norm": 0.9566219449043274, "learning_rate": 9.592605572104434e-05, "loss": 1.9225, "step": 468 }, { "epoch": 0.12349328341051663, "grad_norm": 1.2604984045028687, "learning_rate": 9.590853338005958e-05, "loss": 1.9309, "step": 470 }, { "epoch": 0.1240187867441784, "grad_norm": 1.2344852685928345, "learning_rate": 9.589101103907483e-05, "loss": 1.9236, "step": 472 }, { "epoch": 0.12454429007784018, "grad_norm": 1.2368342876434326, "learning_rate": 9.587348869809007e-05, "loss": 1.9058, "step": 474 }, { "epoch": 0.12506979341150196, "grad_norm": 1.3377318382263184, "learning_rate": 9.585596635710531e-05, "loss": 1.9345, "step": 476 }, { "epoch": 0.12559529674516373, "grad_norm": 0.981948733329773, "learning_rate": 9.583844401612056e-05, "loss": 1.9253, "step": 478 }, { "epoch": 0.1261208000788255, "grad_norm": 1.3136436939239502, "learning_rate": 9.58209216751358e-05, "loss": 1.933, "step": 480 }, { "epoch": 0.12664630341248728, "grad_norm": 0.9691250324249268, "learning_rate": 9.580339933415105e-05, "loss": 1.9274, "step": 482 }, { "epoch": 0.12717180674614906, "grad_norm": 1.4272805452346802, "learning_rate": 9.578587699316628e-05, "loss": 1.9242, "step": 484 }, { "epoch": 0.1276973100798108, "grad_norm": 1.0407310724258423, "learning_rate": 9.576835465218153e-05, "loss": 1.9399, "step": 486 }, { "epoch": 0.12822281341347258, "grad_norm": 0.8317910432815552, "learning_rate": 9.575083231119678e-05, "loss": 1.8733, "step": 488 }, { "epoch": 0.12874831674713436, "grad_norm": 1.1145474910736084, "learning_rate": 9.573330997021203e-05, "loss": 1.9443, "step": 490 }, { "epoch": 0.12927382008079613, "grad_norm": 1.0935791730880737, "learning_rate": 9.571578762922728e-05, "loss": 1.9324, "step": 492 }, { "epoch": 0.1297993234144579, "grad_norm": 1.340178370475769, "learning_rate": 9.569826528824252e-05, "loss": 1.9046, "step": 494 }, { "epoch": 0.13032482674811968, "grad_norm": 0.8448821902275085, 
"learning_rate": 9.568074294725776e-05, "loss": 1.9172, "step": 496 }, { "epoch": 0.13085033008178146, "grad_norm": 1.1193829774856567, "learning_rate": 9.5663220606273e-05, "loss": 1.9471, "step": 498 }, { "epoch": 0.13137583341544323, "grad_norm": 1.0313880443572998, "learning_rate": 9.564569826528824e-05, "loss": 1.9027, "step": 500 }, { "epoch": 0.131901336749105, "grad_norm": 1.037778615951538, "learning_rate": 9.562817592430349e-05, "loss": 1.9375, "step": 502 }, { "epoch": 0.13242684008276678, "grad_norm": 1.1615263223648071, "learning_rate": 9.561065358331873e-05, "loss": 1.9226, "step": 504 }, { "epoch": 0.13295234341642856, "grad_norm": 1.5699961185455322, "learning_rate": 9.559313124233398e-05, "loss": 1.9097, "step": 506 }, { "epoch": 0.13347784675009033, "grad_norm": 2.0286099910736084, "learning_rate": 9.557560890134923e-05, "loss": 1.9207, "step": 508 }, { "epoch": 0.13400335008375208, "grad_norm": 1.0447641611099243, "learning_rate": 9.555808656036446e-05, "loss": 1.9415, "step": 510 }, { "epoch": 0.13452885341741386, "grad_norm": 1.357127070426941, "learning_rate": 9.554056421937971e-05, "loss": 1.9165, "step": 512 }, { "epoch": 0.13505435675107563, "grad_norm": 1.3550409078598022, "learning_rate": 9.552304187839496e-05, "loss": 1.9043, "step": 514 }, { "epoch": 0.1355798600847374, "grad_norm": 1.1390573978424072, "learning_rate": 9.55055195374102e-05, "loss": 1.8865, "step": 516 }, { "epoch": 0.13610536341839918, "grad_norm": 1.2006975412368774, "learning_rate": 9.548799719642545e-05, "loss": 1.9478, "step": 518 }, { "epoch": 0.13663086675206096, "grad_norm": 0.9007882475852966, "learning_rate": 9.54704748554407e-05, "loss": 1.8973, "step": 520 }, { "epoch": 0.13715637008572273, "grad_norm": 1.0151760578155518, "learning_rate": 9.545295251445593e-05, "loss": 1.9178, "step": 522 }, { "epoch": 0.1376818734193845, "grad_norm": 1.0484004020690918, "learning_rate": 9.543543017347118e-05, "loss": 1.9464, "step": 524 }, { "epoch": 0.13820737675304628, 
"grad_norm": 0.9942461848258972, "learning_rate": 9.541790783248642e-05, "loss": 1.8783, "step": 526 }, { "epoch": 0.13873288008670806, "grad_norm": 1.2193222045898438, "learning_rate": 9.540038549150166e-05, "loss": 1.911, "step": 528 }, { "epoch": 0.13925838342036984, "grad_norm": 0.9065030813217163, "learning_rate": 9.538286315051691e-05, "loss": 1.9214, "step": 530 }, { "epoch": 0.13978388675403158, "grad_norm": 1.1993494033813477, "learning_rate": 9.536534080953216e-05, "loss": 1.9159, "step": 532 }, { "epoch": 0.14030939008769336, "grad_norm": 1.1094871759414673, "learning_rate": 9.53478184685474e-05, "loss": 1.9393, "step": 534 }, { "epoch": 0.14083489342135513, "grad_norm": 1.1761337518692017, "learning_rate": 9.533029612756264e-05, "loss": 1.9315, "step": 536 }, { "epoch": 0.1413603967550169, "grad_norm": 0.8991339802742004, "learning_rate": 9.531277378657789e-05, "loss": 1.9138, "step": 538 }, { "epoch": 0.14188590008867868, "grad_norm": 1.0923110246658325, "learning_rate": 9.529525144559314e-05, "loss": 1.9254, "step": 540 }, { "epoch": 0.14241140342234046, "grad_norm": 0.8643242716789246, "learning_rate": 9.527772910460838e-05, "loss": 1.912, "step": 542 }, { "epoch": 0.14293690675600224, "grad_norm": 0.8578360080718994, "learning_rate": 9.526020676362363e-05, "loss": 1.9403, "step": 544 }, { "epoch": 0.143462410089664, "grad_norm": 0.854049563407898, "learning_rate": 9.524268442263888e-05, "loss": 1.9251, "step": 546 }, { "epoch": 0.1439879134233258, "grad_norm": 1.1244609355926514, "learning_rate": 9.522516208165411e-05, "loss": 1.928, "step": 548 }, { "epoch": 0.14451341675698756, "grad_norm": 1.0357218980789185, "learning_rate": 9.520763974066936e-05, "loss": 1.9137, "step": 550 }, { "epoch": 0.14503892009064934, "grad_norm": 1.2136287689208984, "learning_rate": 9.51901173996846e-05, "loss": 1.9009, "step": 552 }, { "epoch": 0.14556442342431108, "grad_norm": 1.5104690790176392, "learning_rate": 9.517259505869984e-05, "loss": 1.8873, "step": 554 }, { 
"epoch": 0.14608992675797286, "grad_norm": 0.9920956492424011, "learning_rate": 9.515507271771509e-05, "loss": 1.9168, "step": 556 }, { "epoch": 0.14661543009163464, "grad_norm": 1.0754443407058716, "learning_rate": 9.513755037673034e-05, "loss": 1.9258, "step": 558 }, { "epoch": 0.1471409334252964, "grad_norm": 0.9329949617385864, "learning_rate": 9.512002803574558e-05, "loss": 1.8999, "step": 560 }, { "epoch": 0.14766643675895819, "grad_norm": 0.9859107136726379, "learning_rate": 9.510250569476082e-05, "loss": 1.8973, "step": 562 }, { "epoch": 0.14819194009261996, "grad_norm": 0.8728905916213989, "learning_rate": 9.508498335377607e-05, "loss": 1.8981, "step": 564 }, { "epoch": 0.14871744342628174, "grad_norm": 0.9786973595619202, "learning_rate": 9.506746101279131e-05, "loss": 1.9167, "step": 566 }, { "epoch": 0.1492429467599435, "grad_norm": 0.9139885902404785, "learning_rate": 9.504993867180656e-05, "loss": 1.8988, "step": 568 }, { "epoch": 0.1497684500936053, "grad_norm": 0.9002274870872498, "learning_rate": 9.503241633082181e-05, "loss": 1.8964, "step": 570 }, { "epoch": 0.15029395342726706, "grad_norm": 0.9470378756523132, "learning_rate": 9.501489398983706e-05, "loss": 1.8877, "step": 572 }, { "epoch": 0.15081945676092884, "grad_norm": 1.2751479148864746, "learning_rate": 9.499737164885229e-05, "loss": 1.9328, "step": 574 }, { "epoch": 0.1513449600945906, "grad_norm": 1.0107779502868652, "learning_rate": 9.497984930786754e-05, "loss": 1.9271, "step": 576 }, { "epoch": 0.15187046342825236, "grad_norm": 1.3902311325073242, "learning_rate": 9.496232696688277e-05, "loss": 1.8903, "step": 578 }, { "epoch": 0.15239596676191414, "grad_norm": 0.8747096657752991, "learning_rate": 9.494480462589802e-05, "loss": 1.9058, "step": 580 }, { "epoch": 0.1529214700955759, "grad_norm": 0.9474460482597351, "learning_rate": 9.492728228491327e-05, "loss": 1.9357, "step": 582 }, { "epoch": 0.1534469734292377, "grad_norm": 0.9786020517349243, "learning_rate": 
9.490975994392851e-05, "loss": 1.9267, "step": 584 }, { "epoch": 0.15397247676289946, "grad_norm": 0.9129196405410767, "learning_rate": 9.489223760294376e-05, "loss": 1.9129, "step": 586 }, { "epoch": 0.15449798009656124, "grad_norm": 1.2065305709838867, "learning_rate": 9.4874715261959e-05, "loss": 1.8809, "step": 588 }, { "epoch": 0.155023483430223, "grad_norm": 1.0768754482269287, "learning_rate": 9.485719292097424e-05, "loss": 1.8827, "step": 590 }, { "epoch": 0.1555489867638848, "grad_norm": 1.3688833713531494, "learning_rate": 9.483967057998949e-05, "loss": 1.9325, "step": 592 }, { "epoch": 0.15607449009754656, "grad_norm": 1.0570260286331177, "learning_rate": 9.482214823900474e-05, "loss": 1.8599, "step": 594 }, { "epoch": 0.15659999343120834, "grad_norm": 1.0146839618682861, "learning_rate": 9.480462589801999e-05, "loss": 1.9205, "step": 596 }, { "epoch": 0.15712549676487012, "grad_norm": 1.6670814752578735, "learning_rate": 9.478710355703523e-05, "loss": 1.8699, "step": 598 }, { "epoch": 0.15765100009853186, "grad_norm": 1.178289532661438, "learning_rate": 9.476958121605047e-05, "loss": 1.9022, "step": 600 }, { "epoch": 0.15817650343219364, "grad_norm": 1.6808775663375854, "learning_rate": 9.47520588750657e-05, "loss": 1.9264, "step": 602 }, { "epoch": 0.1587020067658554, "grad_norm": 0.9852617383003235, "learning_rate": 9.473453653408095e-05, "loss": 1.9114, "step": 604 }, { "epoch": 0.1592275100995172, "grad_norm": 0.8299278616905212, "learning_rate": 9.47170141930962e-05, "loss": 1.9012, "step": 606 }, { "epoch": 0.15975301343317896, "grad_norm": 0.9899255037307739, "learning_rate": 9.469949185211144e-05, "loss": 1.9187, "step": 608 }, { "epoch": 0.16027851676684074, "grad_norm": 0.9521119594573975, "learning_rate": 9.468196951112669e-05, "loss": 1.8972, "step": 610 }, { "epoch": 0.16080402010050251, "grad_norm": 1.2446365356445312, "learning_rate": 9.466444717014194e-05, "loss": 1.8962, "step": 612 }, { "epoch": 0.1613295234341643, "grad_norm": 
1.1197859048843384, "learning_rate": 9.464692482915717e-05, "loss": 1.8753, "step": 614 }, { "epoch": 0.16185502676782607, "grad_norm": 0.9934601783752441, "learning_rate": 9.462940248817242e-05, "loss": 1.8974, "step": 616 }, { "epoch": 0.16238053010148784, "grad_norm": 0.9261951446533203, "learning_rate": 9.461188014718767e-05, "loss": 1.9052, "step": 618 }, { "epoch": 0.16290603343514962, "grad_norm": 1.2140916585922241, "learning_rate": 9.459435780620292e-05, "loss": 1.8779, "step": 620 }, { "epoch": 0.16343153676881136, "grad_norm": 1.067958116531372, "learning_rate": 9.457683546521816e-05, "loss": 1.8771, "step": 622 }, { "epoch": 0.16395704010247314, "grad_norm": 0.9392744302749634, "learning_rate": 9.455931312423341e-05, "loss": 1.892, "step": 624 }, { "epoch": 0.16448254343613491, "grad_norm": 1.6888792514801025, "learning_rate": 9.454179078324864e-05, "loss": 1.8829, "step": 626 }, { "epoch": 0.1650080467697967, "grad_norm": 0.9804869890213013, "learning_rate": 9.452426844226389e-05, "loss": 1.9113, "step": 628 }, { "epoch": 0.16553355010345847, "grad_norm": 1.0176154375076294, "learning_rate": 9.450674610127913e-05, "loss": 1.9063, "step": 630 }, { "epoch": 0.16605905343712024, "grad_norm": 0.9681494235992432, "learning_rate": 9.448922376029437e-05, "loss": 1.9095, "step": 632 }, { "epoch": 0.16658455677078202, "grad_norm": 0.9195823669433594, "learning_rate": 9.447170141930962e-05, "loss": 1.8881, "step": 634 }, { "epoch": 0.1671100601044438, "grad_norm": 0.9165635108947754, "learning_rate": 9.445417907832487e-05, "loss": 1.8788, "step": 636 }, { "epoch": 0.16763556343810557, "grad_norm": 1.2198373079299927, "learning_rate": 9.443665673734012e-05, "loss": 1.8717, "step": 638 }, { "epoch": 0.16816106677176734, "grad_norm": 0.8923284411430359, "learning_rate": 9.441913439635536e-05, "loss": 1.918, "step": 640 }, { "epoch": 0.16868657010542912, "grad_norm": 1.1938835382461548, "learning_rate": 9.44016120553706e-05, "loss": 1.9118, "step": 642 }, { "epoch": 
0.16921207343909087, "grad_norm": 0.8569881319999695, "learning_rate": 9.438408971438585e-05, "loss": 1.8894, "step": 644 }, { "epoch": 0.16973757677275264, "grad_norm": 0.79718017578125, "learning_rate": 9.43665673734011e-05, "loss": 1.8815, "step": 646 }, { "epoch": 0.17026308010641442, "grad_norm": 0.9728055000305176, "learning_rate": 9.434904503241634e-05, "loss": 1.8942, "step": 648 }, { "epoch": 0.1707885834400762, "grad_norm": 1.2369781732559204, "learning_rate": 9.433152269143159e-05, "loss": 1.8663, "step": 650 }, { "epoch": 0.17131408677373797, "grad_norm": 1.0035600662231445, "learning_rate": 9.431400035044684e-05, "loss": 1.9365, "step": 652 }, { "epoch": 0.17183959010739974, "grad_norm": 0.8765824437141418, "learning_rate": 9.429647800946207e-05, "loss": 1.8818, "step": 654 }, { "epoch": 0.17236509344106152, "grad_norm": 1.0349746942520142, "learning_rate": 9.42789556684773e-05, "loss": 1.8732, "step": 656 }, { "epoch": 0.1728905967747233, "grad_norm": 0.9079018235206604, "learning_rate": 9.426143332749255e-05, "loss": 1.8841, "step": 658 }, { "epoch": 0.17341610010838507, "grad_norm": 0.9596872925758362, "learning_rate": 9.42439109865078e-05, "loss": 1.8824, "step": 660 }, { "epoch": 0.17394160344204684, "grad_norm": 0.8027825355529785, "learning_rate": 9.422638864552305e-05, "loss": 1.9056, "step": 662 }, { "epoch": 0.17446710677570862, "grad_norm": 1.0182610750198364, "learning_rate": 9.42088663045383e-05, "loss": 1.8637, "step": 664 }, { "epoch": 0.1749926101093704, "grad_norm": 1.0941540002822876, "learning_rate": 9.419134396355354e-05, "loss": 1.9068, "step": 666 }, { "epoch": 0.17551811344303214, "grad_norm": 0.8376652598381042, "learning_rate": 9.417382162256878e-05, "loss": 1.88, "step": 668 }, { "epoch": 0.17604361677669392, "grad_norm": 1.1947648525238037, "learning_rate": 9.415629928158402e-05, "loss": 1.8974, "step": 670 }, { "epoch": 0.1765691201103557, "grad_norm": 1.1815409660339355, "learning_rate": 9.413877694059927e-05, "loss": 
1.9003, "step": 672 }, { "epoch": 0.17709462344401747, "grad_norm": 1.298322319984436, "learning_rate": 9.412125459961452e-05, "loss": 1.9069, "step": 674 }, { "epoch": 0.17762012677767924, "grad_norm": 0.9089726209640503, "learning_rate": 9.410373225862977e-05, "loss": 1.8631, "step": 676 }, { "epoch": 0.17814563011134102, "grad_norm": 2.489449977874756, "learning_rate": 9.408620991764501e-05, "loss": 1.8517, "step": 678 }, { "epoch": 0.1786711334450028, "grad_norm": 1.456616759300232, "learning_rate": 9.406868757666025e-05, "loss": 1.9026, "step": 680 }, { "epoch": 0.17919663677866457, "grad_norm": 1.3760594129562378, "learning_rate": 9.405116523567548e-05, "loss": 1.9106, "step": 682 }, { "epoch": 0.17972214011232635, "grad_norm": 1.0166149139404297, "learning_rate": 9.403364289469073e-05, "loss": 1.8985, "step": 684 }, { "epoch": 0.18024764344598812, "grad_norm": 0.9207403659820557, "learning_rate": 9.401612055370598e-05, "loss": 1.9231, "step": 686 }, { "epoch": 0.1807731467796499, "grad_norm": 1.1127294301986694, "learning_rate": 9.399859821272122e-05, "loss": 1.9001, "step": 688 }, { "epoch": 0.18129865011331164, "grad_norm": 1.1024377346038818, "learning_rate": 9.398107587173647e-05, "loss": 1.8673, "step": 690 }, { "epoch": 0.18182415344697342, "grad_norm": 1.2524852752685547, "learning_rate": 9.396355353075172e-05, "loss": 1.8481, "step": 692 }, { "epoch": 0.1823496567806352, "grad_norm": 1.0531530380249023, "learning_rate": 9.394603118976695e-05, "loss": 1.8818, "step": 694 }, { "epoch": 0.18287516011429697, "grad_norm": 1.0276319980621338, "learning_rate": 9.39285088487822e-05, "loss": 1.8516, "step": 696 }, { "epoch": 0.18340066344795874, "grad_norm": 0.9953783750534058, "learning_rate": 9.391098650779745e-05, "loss": 1.8906, "step": 698 }, { "epoch": 0.18392616678162052, "grad_norm": 1.245374321937561, "learning_rate": 9.38934641668127e-05, "loss": 1.8797, "step": 700 }, { "epoch": 0.1844516701152823, "grad_norm": 0.7994580864906311, "learning_rate": 
9.387594182582794e-05, "loss": 1.8902, "step": 702 }, { "epoch": 0.18497717344894407, "grad_norm": 0.941091775894165, "learning_rate": 9.385841948484318e-05, "loss": 1.8964, "step": 704 }, { "epoch": 0.18550267678260585, "grad_norm": 0.8709573149681091, "learning_rate": 9.384089714385843e-05, "loss": 1.894, "step": 706 }, { "epoch": 0.18602818011626762, "grad_norm": 1.1883244514465332, "learning_rate": 9.382337480287366e-05, "loss": 1.9171, "step": 708 }, { "epoch": 0.1865536834499294, "grad_norm": 2.1829898357391357, "learning_rate": 9.380585246188891e-05, "loss": 1.9089, "step": 710 }, { "epoch": 0.18707918678359114, "grad_norm": 0.9389178156852722, "learning_rate": 9.378833012090415e-05, "loss": 1.9028, "step": 712 }, { "epoch": 0.18760469011725292, "grad_norm": 0.8789083957672119, "learning_rate": 9.37708077799194e-05, "loss": 1.8881, "step": 714 }, { "epoch": 0.1881301934509147, "grad_norm": 1.3515336513519287, "learning_rate": 9.375328543893465e-05, "loss": 1.897, "step": 716 }, { "epoch": 0.18865569678457647, "grad_norm": 1.123223066329956, "learning_rate": 9.37357630979499e-05, "loss": 1.9337, "step": 718 }, { "epoch": 0.18918120011823825, "grad_norm": 1.16353440284729, "learning_rate": 9.371824075696513e-05, "loss": 1.9257, "step": 720 }, { "epoch": 0.18970670345190002, "grad_norm": 0.8678929209709167, "learning_rate": 9.370071841598038e-05, "loss": 1.8686, "step": 722 }, { "epoch": 0.1902322067855618, "grad_norm": 0.8892553448677063, "learning_rate": 9.368319607499563e-05, "loss": 1.8724, "step": 724 }, { "epoch": 0.19075771011922357, "grad_norm": 1.2203401327133179, "learning_rate": 9.366567373401087e-05, "loss": 1.8663, "step": 726 }, { "epoch": 0.19128321345288535, "grad_norm": 0.8035688400268555, "learning_rate": 9.364815139302612e-05, "loss": 1.8748, "step": 728 }, { "epoch": 0.19180871678654712, "grad_norm": 0.8692450523376465, "learning_rate": 9.363062905204136e-05, "loss": 1.9093, "step": 730 }, { "epoch": 0.1923342201202089, "grad_norm": 
1.1297273635864258, "learning_rate": 9.36131067110566e-05, "loss": 1.8618, "step": 732 }, { "epoch": 0.19285972345387067, "grad_norm": 0.853670597076416, "learning_rate": 9.359558437007184e-05, "loss": 1.8859, "step": 734 }, { "epoch": 0.19338522678753242, "grad_norm": 1.115268588066101, "learning_rate": 9.357806202908708e-05, "loss": 1.8823, "step": 736 }, { "epoch": 0.1939107301211942, "grad_norm": 1.002034068107605, "learning_rate": 9.356053968810233e-05, "loss": 1.8743, "step": 738 }, { "epoch": 0.19443623345485597, "grad_norm": 1.3825932741165161, "learning_rate": 9.354301734711758e-05, "loss": 1.8897, "step": 740 }, { "epoch": 0.19496173678851775, "grad_norm": 1.4974256753921509, "learning_rate": 9.352549500613283e-05, "loss": 1.899, "step": 742 }, { "epoch": 0.19548724012217952, "grad_norm": 1.2865314483642578, "learning_rate": 9.350797266514808e-05, "loss": 1.9042, "step": 744 }, { "epoch": 0.1960127434558413, "grad_norm": 1.036019206047058, "learning_rate": 9.349045032416331e-05, "loss": 1.9169, "step": 746 }, { "epoch": 0.19653824678950307, "grad_norm": 1.1401413679122925, "learning_rate": 9.347292798317856e-05, "loss": 1.8908, "step": 748 }, { "epoch": 0.19706375012316485, "grad_norm": 1.3356719017028809, "learning_rate": 9.34554056421938e-05, "loss": 1.8642, "step": 750 }, { "epoch": 0.19758925345682662, "grad_norm": 1.025230884552002, "learning_rate": 9.343788330120905e-05, "loss": 1.8733, "step": 752 }, { "epoch": 0.1981147567904884, "grad_norm": 0.981281578540802, "learning_rate": 9.34203609602243e-05, "loss": 1.8894, "step": 754 }, { "epoch": 0.19864026012415018, "grad_norm": 1.0571757555007935, "learning_rate": 9.340283861923953e-05, "loss": 1.8753, "step": 756 }, { "epoch": 0.19916576345781192, "grad_norm": 0.9619776606559753, "learning_rate": 9.338531627825478e-05, "loss": 1.8791, "step": 758 }, { "epoch": 0.1996912667914737, "grad_norm": 0.8430715203285217, "learning_rate": 9.336779393727001e-05, "loss": 1.9133, "step": 760 }, { "epoch": 
0.20021677012513547, "grad_norm": 1.8572043180465698, "learning_rate": 9.335027159628526e-05, "loss": 1.9123, "step": 762 }, { "epoch": 0.20074227345879725, "grad_norm": 1.0308524370193481, "learning_rate": 9.333274925530051e-05, "loss": 1.8696, "step": 764 }, { "epoch": 0.20126777679245902, "grad_norm": 1.3068006038665771, "learning_rate": 9.331522691431576e-05, "loss": 1.8932, "step": 766 }, { "epoch": 0.2017932801261208, "grad_norm": 0.9089499711990356, "learning_rate": 9.3297704573331e-05, "loss": 1.8717, "step": 768 }, { "epoch": 0.20231878345978258, "grad_norm": 1.1663358211517334, "learning_rate": 9.328018223234625e-05, "loss": 1.8356, "step": 770 }, { "epoch": 0.20284428679344435, "grad_norm": 1.1666746139526367, "learning_rate": 9.326265989136149e-05, "loss": 1.8785, "step": 772 }, { "epoch": 0.20336979012710613, "grad_norm": 0.8820154666900635, "learning_rate": 9.324513755037673e-05, "loss": 1.8353, "step": 774 }, { "epoch": 0.2038952934607679, "grad_norm": 1.0544767379760742, "learning_rate": 9.322761520939198e-05, "loss": 1.9217, "step": 776 }, { "epoch": 0.20442079679442968, "grad_norm": 1.0300171375274658, "learning_rate": 9.321009286840723e-05, "loss": 1.8982, "step": 778 }, { "epoch": 0.20494630012809142, "grad_norm": 0.884556770324707, "learning_rate": 9.319257052742248e-05, "loss": 1.8783, "step": 780 }, { "epoch": 0.2054718034617532, "grad_norm": 1.150823712348938, "learning_rate": 9.317504818643771e-05, "loss": 1.9116, "step": 782 }, { "epoch": 0.20599730679541498, "grad_norm": 1.1606664657592773, "learning_rate": 9.315752584545296e-05, "loss": 1.8594, "step": 784 }, { "epoch": 0.20652281012907675, "grad_norm": 0.9920266270637512, "learning_rate": 9.314000350446819e-05, "loss": 1.8659, "step": 786 }, { "epoch": 0.20704831346273853, "grad_norm": 1.0931354761123657, "learning_rate": 9.312248116348344e-05, "loss": 1.8876, "step": 788 }, { "epoch": 0.2075738167964003, "grad_norm": 1.097580075263977, "learning_rate": 9.310495882249869e-05, "loss": 
1.8961, "step": 790 }, { "epoch": 0.20809932013006208, "grad_norm": 1.160636067390442, "learning_rate": 9.308743648151394e-05, "loss": 1.8925, "step": 792 }, { "epoch": 0.20862482346372385, "grad_norm": 0.9510796666145325, "learning_rate": 9.306991414052918e-05, "loss": 1.8449, "step": 794 }, { "epoch": 0.20915032679738563, "grad_norm": 1.0860143899917603, "learning_rate": 9.305239179954443e-05, "loss": 1.8647, "step": 796 }, { "epoch": 0.2096758301310474, "grad_norm": 1.1009169816970825, "learning_rate": 9.303486945855966e-05, "loss": 1.8992, "step": 798 }, { "epoch": 0.21020133346470918, "grad_norm": 1.0895287990570068, "learning_rate": 9.301734711757491e-05, "loss": 1.8563, "step": 800 }, { "epoch": 0.21020133346470918, "eval_loss": 1.8377995491027832, "eval_runtime": 487.1876, "eval_samples_per_second": 249.984, "eval_steps_per_second": 31.249, "step": 800 }, { "epoch": 0.21072683679837095, "grad_norm": 1.2973535060882568, "learning_rate": 9.299982477659016e-05, "loss": 1.8701, "step": 802 }, { "epoch": 0.2112523401320327, "grad_norm": 0.9440937042236328, "learning_rate": 9.298230243560541e-05, "loss": 1.8858, "step": 804 }, { "epoch": 0.21177784346569448, "grad_norm": 0.8491653203964233, "learning_rate": 9.296478009462064e-05, "loss": 1.872, "step": 806 }, { "epoch": 0.21230334679935625, "grad_norm": 1.0194580554962158, "learning_rate": 9.294725775363589e-05, "loss": 1.8803, "step": 808 }, { "epoch": 0.21282885013301803, "grad_norm": 0.957872748374939, "learning_rate": 9.292973541265114e-05, "loss": 1.8825, "step": 810 }, { "epoch": 0.2133543534666798, "grad_norm": 1.1070436239242554, "learning_rate": 9.291221307166637e-05, "loss": 1.8616, "step": 812 }, { "epoch": 0.21387985680034158, "grad_norm": 0.8715662956237793, "learning_rate": 9.289469073068162e-05, "loss": 1.8634, "step": 814 }, { "epoch": 0.21440536013400335, "grad_norm": 1.1930649280548096, "learning_rate": 9.287716838969687e-05, "loss": 1.842, "step": 816 }, { "epoch": 0.21493086346766513, 
"grad_norm": 1.0091701745986938, "learning_rate": 9.285964604871211e-05, "loss": 1.9128, "step": 818 }, { "epoch": 0.2154563668013269, "grad_norm": 1.1418068408966064, "learning_rate": 9.284212370772736e-05, "loss": 1.8697, "step": 820 }, { "epoch": 0.21598187013498868, "grad_norm": 1.0154426097869873, "learning_rate": 9.282460136674261e-05, "loss": 1.9324, "step": 822 }, { "epoch": 0.21650737346865045, "grad_norm": 1.2113468647003174, "learning_rate": 9.280707902575784e-05, "loss": 1.8379, "step": 824 }, { "epoch": 0.2170328768023122, "grad_norm": 1.0505681037902832, "learning_rate": 9.278955668477309e-05, "loss": 1.866, "step": 826 }, { "epoch": 0.21755838013597398, "grad_norm": 0.938463568687439, "learning_rate": 9.277203434378834e-05, "loss": 1.8504, "step": 828 }, { "epoch": 0.21808388346963575, "grad_norm": 0.7944304943084717, "learning_rate": 9.275451200280358e-05, "loss": 1.8919, "step": 830 }, { "epoch": 0.21860938680329753, "grad_norm": 0.9631441235542297, "learning_rate": 9.273698966181882e-05, "loss": 1.8451, "step": 832 }, { "epoch": 0.2191348901369593, "grad_norm": 0.9219480156898499, "learning_rate": 9.271946732083407e-05, "loss": 1.8546, "step": 834 }, { "epoch": 0.21966039347062108, "grad_norm": 0.8851411938667297, "learning_rate": 9.270194497984931e-05, "loss": 1.8685, "step": 836 }, { "epoch": 0.22018589680428285, "grad_norm": 0.9707255959510803, "learning_rate": 9.268442263886455e-05, "loss": 1.887, "step": 838 }, { "epoch": 0.22071140013794463, "grad_norm": 0.9294309616088867, "learning_rate": 9.26669002978798e-05, "loss": 1.8967, "step": 840 }, { "epoch": 0.2212369034716064, "grad_norm": 0.9156199097633362, "learning_rate": 9.264937795689504e-05, "loss": 1.8756, "step": 842 }, { "epoch": 0.22176240680526818, "grad_norm": 0.8118696808815002, "learning_rate": 9.263185561591029e-05, "loss": 1.8766, "step": 844 }, { "epoch": 0.22228791013892996, "grad_norm": 0.9590555429458618, "learning_rate": 9.261433327492554e-05, "loss": 1.8693, "step": 846 }, 
{ "epoch": 0.2228134134725917, "grad_norm": 1.385361671447754, "learning_rate": 9.259681093394079e-05, "loss": 1.8824, "step": 848 }, { "epoch": 0.22333891680625348, "grad_norm": 0.9501360654830933, "learning_rate": 9.257928859295602e-05, "loss": 1.8414, "step": 850 }, { "epoch": 0.22386442013991525, "grad_norm": 1.0095267295837402, "learning_rate": 9.256176625197127e-05, "loss": 1.8499, "step": 852 }, { "epoch": 0.22438992347357703, "grad_norm": 0.8820069432258606, "learning_rate": 9.254424391098651e-05, "loss": 1.8513, "step": 854 }, { "epoch": 0.2249154268072388, "grad_norm": 0.7832709550857544, "learning_rate": 9.252672157000176e-05, "loss": 1.8585, "step": 856 }, { "epoch": 0.22544093014090058, "grad_norm": 1.4282846450805664, "learning_rate": 9.2509199229017e-05, "loss": 1.8776, "step": 858 }, { "epoch": 0.22596643347456236, "grad_norm": 0.8925049901008606, "learning_rate": 9.249167688803224e-05, "loss": 1.8626, "step": 860 }, { "epoch": 0.22649193680822413, "grad_norm": 1.4945679903030396, "learning_rate": 9.247415454704749e-05, "loss": 1.874, "step": 862 }, { "epoch": 0.2270174401418859, "grad_norm": 0.9479649066925049, "learning_rate": 9.245663220606273e-05, "loss": 1.8789, "step": 864 }, { "epoch": 0.22754294347554768, "grad_norm": 1.023941159248352, "learning_rate": 9.243910986507797e-05, "loss": 1.8945, "step": 866 }, { "epoch": 0.22806844680920946, "grad_norm": 1.0005570650100708, "learning_rate": 9.242158752409322e-05, "loss": 1.8736, "step": 868 }, { "epoch": 0.2285939501428712, "grad_norm": 0.9343464374542236, "learning_rate": 9.240406518310847e-05, "loss": 1.844, "step": 870 }, { "epoch": 0.22911945347653298, "grad_norm": 0.8942081332206726, "learning_rate": 9.238654284212372e-05, "loss": 1.8737, "step": 872 }, { "epoch": 0.22964495681019476, "grad_norm": 1.0856554508209229, "learning_rate": 9.236902050113896e-05, "loss": 1.8492, "step": 874 }, { "epoch": 0.23017046014385653, "grad_norm": 0.8268112540245056, "learning_rate": 9.23514981601542e-05, 
"loss": 1.8804, "step": 876 }, { "epoch": 0.2306959634775183, "grad_norm": 0.9946966171264648, "learning_rate": 9.233397581916944e-05, "loss": 1.8592, "step": 878 }, { "epoch": 0.23122146681118008, "grad_norm": 1.0663763284683228, "learning_rate": 9.231645347818469e-05, "loss": 1.8991, "step": 880 }, { "epoch": 0.23174697014484186, "grad_norm": 1.2675755023956299, "learning_rate": 9.229893113719993e-05, "loss": 1.8621, "step": 882 }, { "epoch": 0.23227247347850363, "grad_norm": 1.1921007633209229, "learning_rate": 9.228140879621517e-05, "loss": 1.9014, "step": 884 }, { "epoch": 0.2327979768121654, "grad_norm": 0.8989017009735107, "learning_rate": 9.226388645523042e-05, "loss": 1.8575, "step": 886 }, { "epoch": 0.23332348014582718, "grad_norm": 1.2373161315917969, "learning_rate": 9.224636411424567e-05, "loss": 1.8905, "step": 888 }, { "epoch": 0.23384898347948896, "grad_norm": 0.938845694065094, "learning_rate": 9.222884177326092e-05, "loss": 1.9011, "step": 890 }, { "epoch": 0.23437448681315073, "grad_norm": 0.8705965876579285, "learning_rate": 9.221131943227615e-05, "loss": 1.831, "step": 892 }, { "epoch": 0.23489999014681248, "grad_norm": 1.3164907693862915, "learning_rate": 9.21937970912914e-05, "loss": 1.8455, "step": 894 }, { "epoch": 0.23542549348047426, "grad_norm": 0.9536553025245667, "learning_rate": 9.217627475030665e-05, "loss": 1.8557, "step": 896 }, { "epoch": 0.23595099681413603, "grad_norm": 1.0079597234725952, "learning_rate": 9.21587524093219e-05, "loss": 1.8758, "step": 898 }, { "epoch": 0.2364765001477978, "grad_norm": 2.14665150642395, "learning_rate": 9.214123006833714e-05, "loss": 1.9172, "step": 900 }, { "epoch": 0.23700200348145958, "grad_norm": 1.007122278213501, "learning_rate": 9.212370772735239e-05, "loss": 1.9175, "step": 902 }, { "epoch": 0.23752750681512136, "grad_norm": 0.9156002998352051, "learning_rate": 9.210618538636762e-05, "loss": 1.8589, "step": 904 }, { "epoch": 0.23805301014878313, "grad_norm": 1.2697999477386475, 
"learning_rate": 9.208866304538287e-05, "loss": 1.8996, "step": 906 }, { "epoch": 0.2385785134824449, "grad_norm": 0.8666015863418579, "learning_rate": 9.20711407043981e-05, "loss": 1.8595, "step": 908 }, { "epoch": 0.23910401681610668, "grad_norm": 1.5307285785675049, "learning_rate": 9.205361836341335e-05, "loss": 1.8551, "step": 910 }, { "epoch": 0.23962952014976846, "grad_norm": 1.4660929441452026, "learning_rate": 9.20360960224286e-05, "loss": 1.8653, "step": 912 }, { "epoch": 0.24015502348343024, "grad_norm": 0.9962916970252991, "learning_rate": 9.201857368144385e-05, "loss": 1.8713, "step": 914 }, { "epoch": 0.24068052681709198, "grad_norm": 1.3509130477905273, "learning_rate": 9.20010513404591e-05, "loss": 1.8946, "step": 916 }, { "epoch": 0.24120603015075376, "grad_norm": 0.8232421875, "learning_rate": 9.198352899947433e-05, "loss": 1.8795, "step": 918 }, { "epoch": 0.24173153348441553, "grad_norm": 1.6278520822525024, "learning_rate": 9.196600665848958e-05, "loss": 1.901, "step": 920 }, { "epoch": 0.2422570368180773, "grad_norm": 0.7896947860717773, "learning_rate": 9.194848431750482e-05, "loss": 1.8681, "step": 922 }, { "epoch": 0.24278254015173908, "grad_norm": 1.1732158660888672, "learning_rate": 9.193096197652007e-05, "loss": 1.8632, "step": 924 }, { "epoch": 0.24330804348540086, "grad_norm": 1.7037454843521118, "learning_rate": 9.191343963553532e-05, "loss": 1.8928, "step": 926 }, { "epoch": 0.24383354681906264, "grad_norm": 0.9831770062446594, "learning_rate": 9.189591729455057e-05, "loss": 1.8491, "step": 928 }, { "epoch": 0.2443590501527244, "grad_norm": 1.2588465213775635, "learning_rate": 9.18783949535658e-05, "loss": 1.8743, "step": 930 }, { "epoch": 0.2448845534863862, "grad_norm": 0.8186757564544678, "learning_rate": 9.186087261258105e-05, "loss": 1.8961, "step": 932 }, { "epoch": 0.24541005682004796, "grad_norm": 1.0041245222091675, "learning_rate": 9.184335027159628e-05, "loss": 1.8264, "step": 934 }, { "epoch": 0.24593556015370974, 
"grad_norm": 1.397493600845337, "learning_rate": 9.182582793061153e-05, "loss": 1.8483, "step": 936 }, { "epoch": 0.24646106348737148, "grad_norm": 0.9180475473403931, "learning_rate": 9.180830558962678e-05, "loss": 1.8609, "step": 938 }, { "epoch": 0.24698656682103326, "grad_norm": 0.9013431072235107, "learning_rate": 9.179078324864202e-05, "loss": 1.8537, "step": 940 }, { "epoch": 0.24751207015469504, "grad_norm": 0.7891268134117126, "learning_rate": 9.177326090765727e-05, "loss": 1.8224, "step": 942 }, { "epoch": 0.2480375734883568, "grad_norm": 1.082982063293457, "learning_rate": 9.17557385666725e-05, "loss": 1.8799, "step": 944 }, { "epoch": 0.24856307682201859, "grad_norm": 0.9956438541412354, "learning_rate": 9.173821622568775e-05, "loss": 1.8638, "step": 946 }, { "epoch": 0.24908858015568036, "grad_norm": 1.1269092559814453, "learning_rate": 9.1720693884703e-05, "loss": 1.8589, "step": 948 }, { "epoch": 0.24961408348934214, "grad_norm": 0.88730788230896, "learning_rate": 9.170317154371825e-05, "loss": 1.8901, "step": 950 }, { "epoch": 0.2501395868230039, "grad_norm": 0.9314135313034058, "learning_rate": 9.16856492027335e-05, "loss": 1.8457, "step": 952 }, { "epoch": 0.25066509015666566, "grad_norm": 1.0120025873184204, "learning_rate": 9.166812686174874e-05, "loss": 1.8666, "step": 954 }, { "epoch": 0.25119059349032746, "grad_norm": 1.0400328636169434, "learning_rate": 9.165060452076398e-05, "loss": 1.8666, "step": 956 }, { "epoch": 0.2517160968239892, "grad_norm": 1.084693431854248, "learning_rate": 9.163308217977923e-05, "loss": 1.8318, "step": 958 }, { "epoch": 0.252241600157651, "grad_norm": 0.911669909954071, "learning_rate": 9.161555983879446e-05, "loss": 1.859, "step": 960 }, { "epoch": 0.25276710349131276, "grad_norm": 1.0443209409713745, "learning_rate": 9.159803749780971e-05, "loss": 1.8647, "step": 962 }, { "epoch": 0.25329260682497456, "grad_norm": 0.8681670427322388, "learning_rate": 9.158051515682495e-05, "loss": 1.8975, "step": 964 }, { 
"epoch": 0.2538181101586363, "grad_norm": 1.2208961248397827, "learning_rate": 9.15629928158402e-05, "loss": 1.8722, "step": 966 }, { "epoch": 0.2543436134922981, "grad_norm": 0.8562275767326355, "learning_rate": 9.154547047485545e-05, "loss": 1.8535, "step": 968 }, { "epoch": 0.25486911682595986, "grad_norm": 0.8852279782295227, "learning_rate": 9.152794813387068e-05, "loss": 1.8713, "step": 970 }, { "epoch": 0.2553946201596216, "grad_norm": 0.8528086543083191, "learning_rate": 9.151042579288593e-05, "loss": 1.8736, "step": 972 }, { "epoch": 0.2559201234932834, "grad_norm": 0.886330246925354, "learning_rate": 9.149290345190118e-05, "loss": 1.8915, "step": 974 }, { "epoch": 0.25644562682694516, "grad_norm": 0.8512532711029053, "learning_rate": 9.147538111091643e-05, "loss": 1.8293, "step": 976 }, { "epoch": 0.25697113016060696, "grad_norm": 0.9382111430168152, "learning_rate": 9.145785876993167e-05, "loss": 1.8717, "step": 978 }, { "epoch": 0.2574966334942687, "grad_norm": 0.8720589876174927, "learning_rate": 9.144033642894692e-05, "loss": 1.8839, "step": 980 }, { "epoch": 0.2580221368279305, "grad_norm": 1.6592185497283936, "learning_rate": 9.142281408796216e-05, "loss": 1.8409, "step": 982 }, { "epoch": 0.25854764016159226, "grad_norm": 1.2780932188034058, "learning_rate": 9.140529174697739e-05, "loss": 1.8717, "step": 984 }, { "epoch": 0.25907314349525407, "grad_norm": 0.9220293164253235, "learning_rate": 9.138776940599264e-05, "loss": 1.8539, "step": 986 }, { "epoch": 0.2595986468289158, "grad_norm": 0.8890568017959595, "learning_rate": 9.137024706500788e-05, "loss": 1.8678, "step": 988 }, { "epoch": 0.2601241501625776, "grad_norm": 1.0034205913543701, "learning_rate": 9.135272472402313e-05, "loss": 1.8262, "step": 990 }, { "epoch": 0.26064965349623936, "grad_norm": 1.0338081121444702, "learning_rate": 9.133520238303838e-05, "loss": 1.8566, "step": 992 }, { "epoch": 0.26117515682990117, "grad_norm": 1.4746791124343872, "learning_rate": 9.131768004205363e-05, 
"loss": 1.8843, "step": 994 }, { "epoch": 0.2617006601635629, "grad_norm": 1.0208336114883423, "learning_rate": 9.130015770106886e-05, "loss": 1.8503, "step": 996 }, { "epoch": 0.26222616349722466, "grad_norm": 0.9133326411247253, "learning_rate": 9.128263536008411e-05, "loss": 1.8805, "step": 998 }, { "epoch": 0.26275166683088647, "grad_norm": 1.1855682134628296, "learning_rate": 9.126511301909936e-05, "loss": 1.8862, "step": 1000 }, { "epoch": 0.2632771701645482, "grad_norm": 0.9511350393295288, "learning_rate": 9.12475906781146e-05, "loss": 1.816, "step": 1002 }, { "epoch": 0.26380267349821, "grad_norm": 1.5805948972702026, "learning_rate": 9.123006833712985e-05, "loss": 1.8539, "step": 1004 }, { "epoch": 0.26432817683187176, "grad_norm": 1.7137740850448608, "learning_rate": 9.12125459961451e-05, "loss": 1.8516, "step": 1006 }, { "epoch": 0.26485368016553357, "grad_norm": 1.1085962057113647, "learning_rate": 9.119502365516033e-05, "loss": 1.8548, "step": 1008 }, { "epoch": 0.2653791834991953, "grad_norm": 0.927699625492096, "learning_rate": 9.117750131417557e-05, "loss": 1.8411, "step": 1010 }, { "epoch": 0.2659046868328571, "grad_norm": 1.0528203248977661, "learning_rate": 9.115997897319081e-05, "loss": 1.8672, "step": 1012 }, { "epoch": 0.26643019016651887, "grad_norm": 0.8325463533401489, "learning_rate": 9.114245663220606e-05, "loss": 1.8081, "step": 1014 }, { "epoch": 0.26695569350018067, "grad_norm": 0.9019527435302734, "learning_rate": 9.112493429122131e-05, "loss": 1.869, "step": 1016 }, { "epoch": 0.2674811968338424, "grad_norm": 1.3394633531570435, "learning_rate": 9.110741195023656e-05, "loss": 1.8063, "step": 1018 }, { "epoch": 0.26800670016750416, "grad_norm": 1.0652636289596558, "learning_rate": 9.10898896092518e-05, "loss": 1.8685, "step": 1020 }, { "epoch": 0.26853220350116597, "grad_norm": 1.0782673358917236, "learning_rate": 9.107236726826704e-05, "loss": 1.8749, "step": 1022 }, { "epoch": 0.2690577068348277, "grad_norm": 1.4112943410873413, 
"learning_rate": 9.105484492728229e-05, "loss": 1.8942, "step": 1024 }, { "epoch": 0.2695832101684895, "grad_norm": 0.9202266931533813, "learning_rate": 9.103732258629753e-05, "loss": 1.8578, "step": 1026 }, { "epoch": 0.27010871350215127, "grad_norm": 1.4176150560379028, "learning_rate": 9.101980024531278e-05, "loss": 1.8788, "step": 1028 }, { "epoch": 0.27063421683581307, "grad_norm": 1.0629339218139648, "learning_rate": 9.100227790432803e-05, "loss": 1.8597, "step": 1030 }, { "epoch": 0.2711597201694748, "grad_norm": 1.4514985084533691, "learning_rate": 9.098475556334328e-05, "loss": 1.8748, "step": 1032 }, { "epoch": 0.2716852235031366, "grad_norm": 1.7836532592773438, "learning_rate": 9.096723322235851e-05, "loss": 1.8205, "step": 1034 }, { "epoch": 0.27221072683679837, "grad_norm": 0.9807853698730469, "learning_rate": 9.094971088137374e-05, "loss": 1.8556, "step": 1036 }, { "epoch": 0.27273623017046017, "grad_norm": 0.9897574782371521, "learning_rate": 9.093218854038899e-05, "loss": 1.8298, "step": 1038 }, { "epoch": 0.2732617335041219, "grad_norm": 0.8160204887390137, "learning_rate": 9.091466619940424e-05, "loss": 1.8612, "step": 1040 }, { "epoch": 0.27378723683778367, "grad_norm": 1.4802687168121338, "learning_rate": 9.089714385841949e-05, "loss": 1.8657, "step": 1042 }, { "epoch": 0.27431274017144547, "grad_norm": 1.0503878593444824, "learning_rate": 9.087962151743474e-05, "loss": 1.8625, "step": 1044 }, { "epoch": 0.2748382435051072, "grad_norm": 1.03403639793396, "learning_rate": 9.086209917644998e-05, "loss": 1.8339, "step": 1046 }, { "epoch": 0.275363746838769, "grad_norm": 1.1939598321914673, "learning_rate": 9.084457683546522e-05, "loss": 1.8322, "step": 1048 }, { "epoch": 0.27588925017243077, "grad_norm": 0.8081462383270264, "learning_rate": 9.082705449448046e-05, "loss": 1.8527, "step": 1050 }, { "epoch": 0.27641475350609257, "grad_norm": 0.9340723156929016, "learning_rate": 9.080953215349571e-05, "loss": 1.8462, "step": 1052 }, { "epoch": 
0.2769402568397543, "grad_norm": 2.116253137588501, "learning_rate": 9.079200981251096e-05, "loss": 1.8157, "step": 1054 }, { "epoch": 0.2774657601734161, "grad_norm": 1.5195270776748657, "learning_rate": 9.077448747152621e-05, "loss": 1.8554, "step": 1056 }, { "epoch": 0.27799126350707787, "grad_norm": 1.1265277862548828, "learning_rate": 9.075696513054145e-05, "loss": 1.8535, "step": 1058 }, { "epoch": 0.27851676684073967, "grad_norm": 0.8470209240913391, "learning_rate": 9.073944278955669e-05, "loss": 1.8552, "step": 1060 }, { "epoch": 0.2790422701744014, "grad_norm": 1.014785885810852, "learning_rate": 9.072192044857192e-05, "loss": 1.8314, "step": 1062 }, { "epoch": 0.27956777350806317, "grad_norm": 0.9315053820610046, "learning_rate": 9.070439810758717e-05, "loss": 1.852, "step": 1064 }, { "epoch": 0.28009327684172497, "grad_norm": 0.8854875564575195, "learning_rate": 9.068687576660242e-05, "loss": 1.853, "step": 1066 }, { "epoch": 0.2806187801753867, "grad_norm": 1.0083775520324707, "learning_rate": 9.066935342561767e-05, "loss": 1.8525, "step": 1068 }, { "epoch": 0.2811442835090485, "grad_norm": 0.8299185633659363, "learning_rate": 9.065183108463291e-05, "loss": 1.858, "step": 1070 }, { "epoch": 0.28166978684271027, "grad_norm": 0.859104573726654, "learning_rate": 9.063430874364816e-05, "loss": 1.8478, "step": 1072 }, { "epoch": 0.28219529017637207, "grad_norm": 0.8011692762374878, "learning_rate": 9.06167864026634e-05, "loss": 1.8531, "step": 1074 }, { "epoch": 0.2827207935100338, "grad_norm": 0.8882426023483276, "learning_rate": 9.059926406167864e-05, "loss": 1.87, "step": 1076 }, { "epoch": 0.2832462968436956, "grad_norm": 0.9461469650268555, "learning_rate": 9.058174172069389e-05, "loss": 1.8531, "step": 1078 }, { "epoch": 0.28377180017735737, "grad_norm": 0.92154461145401, "learning_rate": 9.056421937970914e-05, "loss": 1.8378, "step": 1080 }, { "epoch": 0.2842973035110192, "grad_norm": 0.9303539395332336, "learning_rate": 9.054669703872438e-05, 
"loss": 1.8897, "step": 1082 }, { "epoch": 0.2848228068446809, "grad_norm": 0.8764487504959106, "learning_rate": 9.052917469773963e-05, "loss": 1.8908, "step": 1084 }, { "epoch": 0.28534831017834267, "grad_norm": 1.0205122232437134, "learning_rate": 9.051165235675487e-05, "loss": 1.8712, "step": 1086 }, { "epoch": 0.28587381351200447, "grad_norm": 1.2372097969055176, "learning_rate": 9.04941300157701e-05, "loss": 1.8288, "step": 1088 }, { "epoch": 0.2863993168456662, "grad_norm": 0.9842033386230469, "learning_rate": 9.047660767478535e-05, "loss": 1.8767, "step": 1090 }, { "epoch": 0.286924820179328, "grad_norm": 1.4316095113754272, "learning_rate": 9.04590853338006e-05, "loss": 1.8624, "step": 1092 }, { "epoch": 0.28745032351298977, "grad_norm": 1.0971202850341797, "learning_rate": 9.044156299281584e-05, "loss": 1.858, "step": 1094 }, { "epoch": 0.2879758268466516, "grad_norm": 1.3766525983810425, "learning_rate": 9.042404065183109e-05, "loss": 1.8366, "step": 1096 }, { "epoch": 0.2885013301803133, "grad_norm": 1.5556044578552246, "learning_rate": 9.040651831084634e-05, "loss": 1.827, "step": 1098 }, { "epoch": 0.2890268335139751, "grad_norm": 0.803501307964325, "learning_rate": 9.038899596986157e-05, "loss": 1.9033, "step": 1100 }, { "epoch": 0.28955233684763687, "grad_norm": 1.090751051902771, "learning_rate": 9.037147362887682e-05, "loss": 1.8605, "step": 1102 }, { "epoch": 0.2900778401812987, "grad_norm": 1.6796822547912598, "learning_rate": 9.035395128789207e-05, "loss": 1.8421, "step": 1104 }, { "epoch": 0.2906033435149604, "grad_norm": 0.8966239094734192, "learning_rate": 9.033642894690731e-05, "loss": 1.8679, "step": 1106 }, { "epoch": 0.29112884684862217, "grad_norm": 1.390019416809082, "learning_rate": 9.031890660592256e-05, "loss": 1.8322, "step": 1108 }, { "epoch": 0.291654350182284, "grad_norm": 0.8526667356491089, "learning_rate": 9.030138426493781e-05, "loss": 1.8364, "step": 1110 }, { "epoch": 0.2921798535159457, "grad_norm": 1.1773560047149658, 
"learning_rate": 9.028386192395304e-05, "loss": 1.8466, "step": 1112 }, { "epoch": 0.2927053568496075, "grad_norm": 1.0654343366622925, "learning_rate": 9.026633958296828e-05, "loss": 1.8767, "step": 1114 }, { "epoch": 0.29323086018326927, "grad_norm": 1.023926854133606, "learning_rate": 9.024881724198353e-05, "loss": 1.8461, "step": 1116 }, { "epoch": 0.2937563635169311, "grad_norm": 0.94902503490448, "learning_rate": 9.023129490099877e-05, "loss": 1.8412, "step": 1118 }, { "epoch": 0.2942818668505928, "grad_norm": 1.0602984428405762, "learning_rate": 9.021377256001402e-05, "loss": 1.8704, "step": 1120 }, { "epoch": 0.2948073701842546, "grad_norm": 0.8396863341331482, "learning_rate": 9.019625021902927e-05, "loss": 1.8557, "step": 1122 }, { "epoch": 0.29533287351791637, "grad_norm": 1.0940845012664795, "learning_rate": 9.017872787804452e-05, "loss": 1.8773, "step": 1124 }, { "epoch": 0.2958583768515782, "grad_norm": 0.8471454977989197, "learning_rate": 9.016120553705975e-05, "loss": 1.8232, "step": 1126 }, { "epoch": 0.2963838801852399, "grad_norm": 0.7603086829185486, "learning_rate": 9.0143683196075e-05, "loss": 1.855, "step": 1128 }, { "epoch": 0.29690938351890167, "grad_norm": 0.8293117880821228, "learning_rate": 9.012616085509024e-05, "loss": 1.8497, "step": 1130 }, { "epoch": 0.2974348868525635, "grad_norm": 0.8437036275863647, "learning_rate": 9.010863851410549e-05, "loss": 1.8476, "step": 1132 }, { "epoch": 0.2979603901862252, "grad_norm": 0.9667044878005981, "learning_rate": 9.009111617312074e-05, "loss": 1.7826, "step": 1134 }, { "epoch": 0.298485893519887, "grad_norm": 0.7626157402992249, "learning_rate": 9.007359383213599e-05, "loss": 1.8588, "step": 1136 }, { "epoch": 0.29901139685354877, "grad_norm": 0.782361626625061, "learning_rate": 9.005607149115122e-05, "loss": 1.8495, "step": 1138 }, { "epoch": 0.2995369001872106, "grad_norm": 0.9417736530303955, "learning_rate": 9.003854915016647e-05, "loss": 1.8875, "step": 1140 }, { "epoch": 
0.3000624035208723, "grad_norm": 0.8616002202033997, "learning_rate": 9.00210268091817e-05, "loss": 1.852, "step": 1142 }, { "epoch": 0.3005879068545341, "grad_norm": 0.8036054372787476, "learning_rate": 9.000350446819695e-05, "loss": 1.8638, "step": 1144 }, { "epoch": 0.3011134101881959, "grad_norm": 1.0912983417510986, "learning_rate": 8.99859821272122e-05, "loss": 1.8191, "step": 1146 }, { "epoch": 0.3016389135218577, "grad_norm": 0.9259098172187805, "learning_rate": 8.996845978622745e-05, "loss": 1.8228, "step": 1148 }, { "epoch": 0.3021644168555194, "grad_norm": 1.3986306190490723, "learning_rate": 8.99509374452427e-05, "loss": 1.8365, "step": 1150 }, { "epoch": 0.3026899201891812, "grad_norm": 0.8834369778633118, "learning_rate": 8.993341510425794e-05, "loss": 1.857, "step": 1152 }, { "epoch": 0.303215423522843, "grad_norm": 0.9686596989631653, "learning_rate": 8.991589276327317e-05, "loss": 1.8559, "step": 1154 }, { "epoch": 0.3037409268565047, "grad_norm": 0.913817822933197, "learning_rate": 8.989837042228842e-05, "loss": 1.8618, "step": 1156 }, { "epoch": 0.3042664301901665, "grad_norm": 1.0107851028442383, "learning_rate": 8.988084808130367e-05, "loss": 1.8706, "step": 1158 }, { "epoch": 0.3047919335238283, "grad_norm": 1.2873750925064087, "learning_rate": 8.986332574031892e-05, "loss": 1.8451, "step": 1160 }, { "epoch": 0.3053174368574901, "grad_norm": 0.9408276677131653, "learning_rate": 8.984580339933417e-05, "loss": 1.8417, "step": 1162 }, { "epoch": 0.3058429401911518, "grad_norm": 1.078941822052002, "learning_rate": 8.98282810583494e-05, "loss": 1.8526, "step": 1164 }, { "epoch": 0.3063684435248136, "grad_norm": 0.9041505455970764, "learning_rate": 8.981075871736465e-05, "loss": 1.8779, "step": 1166 }, { "epoch": 0.3068939468584754, "grad_norm": 0.8624897599220276, "learning_rate": 8.979323637637988e-05, "loss": 1.7867, "step": 1168 }, { "epoch": 0.3074194501921372, "grad_norm": 0.9410212635993958, "learning_rate": 8.977571403539513e-05, "loss": 
1.8498, "step": 1170 }, { "epoch": 0.3079449535257989, "grad_norm": 0.9149646162986755, "learning_rate": 8.975819169441038e-05, "loss": 1.8229, "step": 1172 }, { "epoch": 0.30847045685946073, "grad_norm": 0.7817291021347046, "learning_rate": 8.974066935342562e-05, "loss": 1.8886, "step": 1174 }, { "epoch": 0.3089959601931225, "grad_norm": 1.3264005184173584, "learning_rate": 8.972314701244087e-05, "loss": 1.851, "step": 1176 }, { "epoch": 0.3095214635267842, "grad_norm": 1.0288749933242798, "learning_rate": 8.970562467145612e-05, "loss": 1.8514, "step": 1178 }, { "epoch": 0.310046966860446, "grad_norm": 0.9613611698150635, "learning_rate": 8.968810233047135e-05, "loss": 1.8633, "step": 1180 }, { "epoch": 0.3105724701941078, "grad_norm": 1.0935230255126953, "learning_rate": 8.96705799894866e-05, "loss": 1.8175, "step": 1182 }, { "epoch": 0.3110979735277696, "grad_norm": 0.821371853351593, "learning_rate": 8.965305764850185e-05, "loss": 1.8275, "step": 1184 }, { "epoch": 0.3116234768614313, "grad_norm": 1.0035851001739502, "learning_rate": 8.96355353075171e-05, "loss": 1.8379, "step": 1186 }, { "epoch": 0.31214898019509313, "grad_norm": 1.4299391508102417, "learning_rate": 8.961801296653233e-05, "loss": 1.8456, "step": 1188 }, { "epoch": 0.3126744835287549, "grad_norm": 1.1284465789794922, "learning_rate": 8.960049062554758e-05, "loss": 1.8317, "step": 1190 }, { "epoch": 0.3131999868624167, "grad_norm": 0.8965946435928345, "learning_rate": 8.958296828456282e-05, "loss": 1.8265, "step": 1192 }, { "epoch": 0.3137254901960784, "grad_norm": 0.7729229927062988, "learning_rate": 8.956544594357806e-05, "loss": 1.8479, "step": 1194 }, { "epoch": 0.31425099352974023, "grad_norm": 1.0302493572235107, "learning_rate": 8.95479236025933e-05, "loss": 1.8563, "step": 1196 }, { "epoch": 0.314776496863402, "grad_norm": 1.2506144046783447, "learning_rate": 8.953040126160855e-05, "loss": 1.8672, "step": 1198 }, { "epoch": 0.3153020001970637, "grad_norm": 1.190073847770691, 
"learning_rate": 8.95128789206238e-05, "loss": 1.8476, "step": 1200 }, { "epoch": 0.3153020001970637, "eval_loss": 1.7993388175964355, "eval_runtime": 487.2115, "eval_samples_per_second": 249.972, "eval_steps_per_second": 31.247, "step": 1200 }, { "epoch": 0.31582750353072553, "grad_norm": 1.772186279296875, "learning_rate": 8.949535657963905e-05, "loss": 1.8635, "step": 1202 }, { "epoch": 0.3163530068643873, "grad_norm": 0.8721492290496826, "learning_rate": 8.94778342386543e-05, "loss": 1.8243, "step": 1204 }, { "epoch": 0.3168785101980491, "grad_norm": 0.9871326684951782, "learning_rate": 8.946031189766953e-05, "loss": 1.8491, "step": 1206 }, { "epoch": 0.3174040135317108, "grad_norm": 0.8752848505973816, "learning_rate": 8.944278955668478e-05, "loss": 1.8638, "step": 1208 }, { "epoch": 0.31792951686537263, "grad_norm": 0.9636614918708801, "learning_rate": 8.942526721570003e-05, "loss": 1.8779, "step": 1210 }, { "epoch": 0.3184550201990344, "grad_norm": 1.027335524559021, "learning_rate": 8.940774487471527e-05, "loss": 1.857, "step": 1212 }, { "epoch": 0.3189805235326962, "grad_norm": 0.8861249089241028, "learning_rate": 8.939022253373051e-05, "loss": 1.875, "step": 1214 }, { "epoch": 0.31950602686635793, "grad_norm": 0.8626223802566528, "learning_rate": 8.937270019274575e-05, "loss": 1.8096, "step": 1216 }, { "epoch": 0.32003153020001973, "grad_norm": 0.8333232402801514, "learning_rate": 8.9355177851761e-05, "loss": 1.8275, "step": 1218 }, { "epoch": 0.3205570335336815, "grad_norm": 0.7367919087409973, "learning_rate": 8.933765551077624e-05, "loss": 1.8653, "step": 1220 }, { "epoch": 0.3210825368673432, "grad_norm": 1.1696327924728394, "learning_rate": 8.932013316979148e-05, "loss": 1.8558, "step": 1222 }, { "epoch": 0.32160804020100503, "grad_norm": 0.9526751637458801, "learning_rate": 8.930261082880673e-05, "loss": 1.8429, "step": 1224 }, { "epoch": 0.3221335435346668, "grad_norm": 1.0763540267944336, "learning_rate": 8.928508848782198e-05, "loss": 1.8365, 
"step": 1226 }, { "epoch": 0.3226590468683286, "grad_norm": 0.8945477604866028, "learning_rate": 8.926756614683723e-05, "loss": 1.8303, "step": 1228 }, { "epoch": 0.32318455020199033, "grad_norm": 0.8092701435089111, "learning_rate": 8.925004380585247e-05, "loss": 1.8454, "step": 1230 }, { "epoch": 0.32371005353565213, "grad_norm": 1.0602396726608276, "learning_rate": 8.923252146486771e-05, "loss": 1.8256, "step": 1232 }, { "epoch": 0.3242355568693139, "grad_norm": 0.8251585960388184, "learning_rate": 8.921499912388296e-05, "loss": 1.8057, "step": 1234 }, { "epoch": 0.3247610602029757, "grad_norm": 1.2272833585739136, "learning_rate": 8.91974767828982e-05, "loss": 1.8549, "step": 1236 }, { "epoch": 0.32528656353663743, "grad_norm": 0.9338359832763672, "learning_rate": 8.917995444191345e-05, "loss": 1.8618, "step": 1238 }, { "epoch": 0.32581206687029923, "grad_norm": 0.8794339299201965, "learning_rate": 8.916243210092868e-05, "loss": 1.846, "step": 1240 }, { "epoch": 0.326337570203961, "grad_norm": 0.9014391899108887, "learning_rate": 8.914490975994393e-05, "loss": 1.8469, "step": 1242 }, { "epoch": 0.32686307353762273, "grad_norm": 0.9050634503364563, "learning_rate": 8.912738741895918e-05, "loss": 1.798, "step": 1244 }, { "epoch": 0.32738857687128453, "grad_norm": 0.9596816897392273, "learning_rate": 8.910986507797441e-05, "loss": 1.8617, "step": 1246 }, { "epoch": 0.3279140802049463, "grad_norm": 0.8555053472518921, "learning_rate": 8.909234273698966e-05, "loss": 1.8262, "step": 1248 }, { "epoch": 0.3284395835386081, "grad_norm": 0.7877684831619263, "learning_rate": 8.907482039600491e-05, "loss": 1.8536, "step": 1250 }, { "epoch": 0.32896508687226983, "grad_norm": 0.9233472347259521, "learning_rate": 8.905729805502016e-05, "loss": 1.8681, "step": 1252 }, { "epoch": 0.32949059020593163, "grad_norm": 0.8724532127380371, "learning_rate": 8.90397757140354e-05, "loss": 1.8169, "step": 1254 }, { "epoch": 0.3300160935395934, "grad_norm": 0.8664790987968445, 
"learning_rate": 8.902225337305065e-05, "loss": 1.8327, "step": 1256 }, { "epoch": 0.3305415968732552, "grad_norm": 1.0527539253234863, "learning_rate": 8.900473103206589e-05, "loss": 1.849, "step": 1258 }, { "epoch": 0.33106710020691693, "grad_norm": 1.0830262899398804, "learning_rate": 8.898720869108113e-05, "loss": 1.8293, "step": 1260 }, { "epoch": 0.33159260354057873, "grad_norm": 0.9514210820198059, "learning_rate": 8.896968635009638e-05, "loss": 1.8509, "step": 1262 }, { "epoch": 0.3321181068742405, "grad_norm": 1.2806978225708008, "learning_rate": 8.895216400911163e-05, "loss": 1.8622, "step": 1264 }, { "epoch": 0.33264361020790223, "grad_norm": 0.8732459545135498, "learning_rate": 8.893464166812686e-05, "loss": 1.8385, "step": 1266 }, { "epoch": 0.33316911354156403, "grad_norm": 0.9644619822502136, "learning_rate": 8.891711932714211e-05, "loss": 1.8368, "step": 1268 }, { "epoch": 0.3336946168752258, "grad_norm": 0.9549365043640137, "learning_rate": 8.889959698615736e-05, "loss": 1.8363, "step": 1270 }, { "epoch": 0.3342201202088876, "grad_norm": 0.9528286457061768, "learning_rate": 8.888207464517259e-05, "loss": 1.8548, "step": 1272 }, { "epoch": 0.33474562354254933, "grad_norm": 1.0315399169921875, "learning_rate": 8.886455230418784e-05, "loss": 1.8354, "step": 1274 }, { "epoch": 0.33527112687621113, "grad_norm": 0.8084585070610046, "learning_rate": 8.884702996320309e-05, "loss": 1.8423, "step": 1276 }, { "epoch": 0.3357966302098729, "grad_norm": 1.1312843561172485, "learning_rate": 8.882950762221833e-05, "loss": 1.8374, "step": 1278 }, { "epoch": 0.3363221335435347, "grad_norm": 1.1717549562454224, "learning_rate": 8.881198528123358e-05, "loss": 1.8204, "step": 1280 }, { "epoch": 0.33684763687719643, "grad_norm": 0.8000698089599609, "learning_rate": 8.879446294024883e-05, "loss": 1.8577, "step": 1282 }, { "epoch": 0.33737314021085824, "grad_norm": 1.3130030632019043, "learning_rate": 8.877694059926406e-05, "loss": 1.8072, "step": 1284 }, { "epoch": 
0.33789864354452, "grad_norm": 0.8949963450431824, "learning_rate": 8.875941825827931e-05, "loss": 1.8586, "step": 1286 }, { "epoch": 0.33842414687818173, "grad_norm": 0.8775150775909424, "learning_rate": 8.874189591729456e-05, "loss": 1.831, "step": 1288 }, { "epoch": 0.33894965021184353, "grad_norm": 0.8946396708488464, "learning_rate": 8.872437357630979e-05, "loss": 1.8243, "step": 1290 }, { "epoch": 0.3394751535455053, "grad_norm": 1.271799921989441, "learning_rate": 8.870685123532504e-05, "loss": 1.8174, "step": 1292 }, { "epoch": 0.3400006568791671, "grad_norm": 0.8557697534561157, "learning_rate": 8.868932889434029e-05, "loss": 1.8429, "step": 1294 }, { "epoch": 0.34052616021282883, "grad_norm": 0.9884776473045349, "learning_rate": 8.867180655335554e-05, "loss": 1.7985, "step": 1296 }, { "epoch": 0.34105166354649064, "grad_norm": 0.9385315775871277, "learning_rate": 8.865428421237077e-05, "loss": 1.7948, "step": 1298 }, { "epoch": 0.3415771668801524, "grad_norm": 1.1224939823150635, "learning_rate": 8.863676187138602e-05, "loss": 1.8564, "step": 1300 }, { "epoch": 0.3421026702138142, "grad_norm": 0.9227058291435242, "learning_rate": 8.861923953040126e-05, "loss": 1.8432, "step": 1302 }, { "epoch": 0.34262817354747593, "grad_norm": 1.0591615438461304, "learning_rate": 8.860171718941651e-05, "loss": 1.8267, "step": 1304 }, { "epoch": 0.34315367688113774, "grad_norm": 0.8901565670967102, "learning_rate": 8.858419484843176e-05, "loss": 1.8452, "step": 1306 }, { "epoch": 0.3436791802147995, "grad_norm": 0.7925954461097717, "learning_rate": 8.856667250744701e-05, "loss": 1.8395, "step": 1308 }, { "epoch": 0.3442046835484613, "grad_norm": 0.8542584776878357, "learning_rate": 8.854915016646224e-05, "loss": 1.821, "step": 1310 }, { "epoch": 0.34473018688212304, "grad_norm": 0.9138728380203247, "learning_rate": 8.853162782547749e-05, "loss": 1.8466, "step": 1312 }, { "epoch": 0.3452556902157848, "grad_norm": 1.0735788345336914, "learning_rate": 8.851410548449274e-05, 
"loss": 1.8318, "step": 1314 }, { "epoch": 0.3457811935494466, "grad_norm": 1.3310229778289795, "learning_rate": 8.849658314350797e-05, "loss": 1.8334, "step": 1316 }, { "epoch": 0.34630669688310833, "grad_norm": 0.7600061297416687, "learning_rate": 8.847906080252322e-05, "loss": 1.8089, "step": 1318 }, { "epoch": 0.34683220021677014, "grad_norm": 0.818154513835907, "learning_rate": 8.846153846153847e-05, "loss": 1.8577, "step": 1320 }, { "epoch": 0.3473577035504319, "grad_norm": 1.0234004259109497, "learning_rate": 8.844401612055371e-05, "loss": 1.8364, "step": 1322 }, { "epoch": 0.3478832068840937, "grad_norm": 0.880425751209259, "learning_rate": 8.842649377956895e-05, "loss": 1.8671, "step": 1324 }, { "epoch": 0.34840871021775544, "grad_norm": 0.8950909376144409, "learning_rate": 8.84089714385842e-05, "loss": 1.8217, "step": 1326 }, { "epoch": 0.34893421355141724, "grad_norm": 1.029801607131958, "learning_rate": 8.839144909759944e-05, "loss": 1.8273, "step": 1328 }, { "epoch": 0.349459716885079, "grad_norm": 1.1284875869750977, "learning_rate": 8.837392675661469e-05, "loss": 1.8523, "step": 1330 }, { "epoch": 0.3499852202187408, "grad_norm": 1.3472214937210083, "learning_rate": 8.835640441562994e-05, "loss": 1.8878, "step": 1332 }, { "epoch": 0.35051072355240254, "grad_norm": 0.8898762464523315, "learning_rate": 8.833888207464518e-05, "loss": 1.8225, "step": 1334 }, { "epoch": 0.3510362268860643, "grad_norm": 1.2737003564834595, "learning_rate": 8.832135973366042e-05, "loss": 1.7853, "step": 1336 }, { "epoch": 0.3515617302197261, "grad_norm": 0.9682241678237915, "learning_rate": 8.830383739267567e-05, "loss": 1.8502, "step": 1338 }, { "epoch": 0.35208723355338784, "grad_norm": 0.8494237661361694, "learning_rate": 8.828631505169091e-05, "loss": 1.8515, "step": 1340 }, { "epoch": 0.35261273688704964, "grad_norm": 0.923283040523529, "learning_rate": 8.826879271070615e-05, "loss": 1.849, "step": 1342 }, { "epoch": 0.3531382402207114, "grad_norm": 1.0368821620941162, 
"learning_rate": 8.82512703697214e-05, "loss": 1.8228, "step": 1344 }, { "epoch": 0.3536637435543732, "grad_norm": 0.76881343126297, "learning_rate": 8.823374802873664e-05, "loss": 1.8156, "step": 1346 }, { "epoch": 0.35418924688803494, "grad_norm": 0.7315630316734314, "learning_rate": 8.821622568775189e-05, "loss": 1.8068, "step": 1348 }, { "epoch": 0.35471475022169674, "grad_norm": 1.254550814628601, "learning_rate": 8.819870334676712e-05, "loss": 1.8498, "step": 1350 }, { "epoch": 0.3552402535553585, "grad_norm": 1.1354317665100098, "learning_rate": 8.818118100578237e-05, "loss": 1.8547, "step": 1352 }, { "epoch": 0.3557657568890203, "grad_norm": 1.0078952312469482, "learning_rate": 8.816365866479762e-05, "loss": 1.8635, "step": 1354 }, { "epoch": 0.35629126022268204, "grad_norm": 1.4484366178512573, "learning_rate": 8.814613632381287e-05, "loss": 1.8405, "step": 1356 }, { "epoch": 0.3568167635563438, "grad_norm": 0.8407228589057922, "learning_rate": 8.812861398282811e-05, "loss": 1.8295, "step": 1358 }, { "epoch": 0.3573422668900056, "grad_norm": 0.9024233818054199, "learning_rate": 8.811109164184336e-05, "loss": 1.8052, "step": 1360 }, { "epoch": 0.35786777022366734, "grad_norm": 0.9681188464164734, "learning_rate": 8.80935693008586e-05, "loss": 1.8024, "step": 1362 }, { "epoch": 0.35839327355732914, "grad_norm": 0.9130085706710815, "learning_rate": 8.807604695987384e-05, "loss": 1.8159, "step": 1364 }, { "epoch": 0.3589187768909909, "grad_norm": 0.938353419303894, "learning_rate": 8.805852461888909e-05, "loss": 1.8134, "step": 1366 }, { "epoch": 0.3594442802246527, "grad_norm": 0.8700679540634155, "learning_rate": 8.804100227790433e-05, "loss": 1.8091, "step": 1368 }, { "epoch": 0.35996978355831444, "grad_norm": 0.8863296508789062, "learning_rate": 8.802347993691957e-05, "loss": 1.7961, "step": 1370 }, { "epoch": 0.36049528689197624, "grad_norm": 0.9155923128128052, "learning_rate": 8.800595759593482e-05, "loss": 1.8098, "step": 1372 }, { "epoch": 
0.361020790225638, "grad_norm": 1.020551323890686, "learning_rate": 8.798843525495007e-05, "loss": 1.848, "step": 1374 }, { "epoch": 0.3615462935592998, "grad_norm": 0.9836577773094177, "learning_rate": 8.79709129139653e-05, "loss": 1.7839, "step": 1376 }, { "epoch": 0.36207179689296154, "grad_norm": 0.9969834089279175, "learning_rate": 8.795339057298055e-05, "loss": 1.8062, "step": 1378 }, { "epoch": 0.3625973002266233, "grad_norm": 0.8620086312294006, "learning_rate": 8.79358682319958e-05, "loss": 1.7944, "step": 1380 }, { "epoch": 0.3631228035602851, "grad_norm": 1.2116692066192627, "learning_rate": 8.791834589101104e-05, "loss": 1.8718, "step": 1382 }, { "epoch": 0.36364830689394684, "grad_norm": 0.8402097225189209, "learning_rate": 8.790082355002629e-05, "loss": 1.8276, "step": 1384 }, { "epoch": 0.36417381022760864, "grad_norm": 0.9271780848503113, "learning_rate": 8.788330120904154e-05, "loss": 1.8222, "step": 1386 }, { "epoch": 0.3646993135612704, "grad_norm": 0.8769554495811462, "learning_rate": 8.786577886805677e-05, "loss": 1.8416, "step": 1388 }, { "epoch": 0.3652248168949322, "grad_norm": 0.9306502938270569, "learning_rate": 8.784825652707202e-05, "loss": 1.8506, "step": 1390 }, { "epoch": 0.36575032022859394, "grad_norm": 0.8423568606376648, "learning_rate": 8.783073418608726e-05, "loss": 1.8617, "step": 1392 }, { "epoch": 0.36627582356225574, "grad_norm": 0.9485574960708618, "learning_rate": 8.78132118451025e-05, "loss": 1.8448, "step": 1394 }, { "epoch": 0.3668013268959175, "grad_norm": 1.1368005275726318, "learning_rate": 8.779568950411775e-05, "loss": 1.8217, "step": 1396 }, { "epoch": 0.3673268302295793, "grad_norm": 0.9294119477272034, "learning_rate": 8.7778167163133e-05, "loss": 1.8099, "step": 1398 }, { "epoch": 0.36785233356324104, "grad_norm": 0.8389936685562134, "learning_rate": 8.776064482214825e-05, "loss": 1.8601, "step": 1400 }, { "epoch": 0.3683778368969028, "grad_norm": 0.7817425727844238, "learning_rate": 8.77431224811635e-05, 
"loss": 1.8218, "step": 1402 }, { "epoch": 0.3689033402305646, "grad_norm": 1.1421295404434204, "learning_rate": 8.772560014017873e-05, "loss": 1.8673, "step": 1404 }, { "epoch": 0.36942884356422634, "grad_norm": 0.8209173083305359, "learning_rate": 8.770807779919397e-05, "loss": 1.8298, "step": 1406 }, { "epoch": 0.36995434689788814, "grad_norm": 1.0874011516571045, "learning_rate": 8.769055545820922e-05, "loss": 1.8008, "step": 1408 }, { "epoch": 0.3704798502315499, "grad_norm": 0.839116096496582, "learning_rate": 8.767303311722447e-05, "loss": 1.8511, "step": 1410 }, { "epoch": 0.3710053535652117, "grad_norm": 0.956777036190033, "learning_rate": 8.765551077623972e-05, "loss": 1.8761, "step": 1412 }, { "epoch": 0.37153085689887344, "grad_norm": 0.7702937722206116, "learning_rate": 8.763798843525497e-05, "loss": 1.8231, "step": 1414 }, { "epoch": 0.37205636023253524, "grad_norm": 0.8248230814933777, "learning_rate": 8.76204660942702e-05, "loss": 1.8538, "step": 1416 }, { "epoch": 0.372581863566197, "grad_norm": 1.0771416425704956, "learning_rate": 8.760294375328543e-05, "loss": 1.8515, "step": 1418 }, { "epoch": 0.3731073668998588, "grad_norm": 0.8044272661209106, "learning_rate": 8.758542141230068e-05, "loss": 1.8295, "step": 1420 }, { "epoch": 0.37363287023352054, "grad_norm": 1.0227196216583252, "learning_rate": 8.756789907131593e-05, "loss": 1.8659, "step": 1422 }, { "epoch": 0.3741583735671823, "grad_norm": 0.8310641646385193, "learning_rate": 8.755037673033118e-05, "loss": 1.8286, "step": 1424 }, { "epoch": 0.3746838769008441, "grad_norm": 0.9817164540290833, "learning_rate": 8.753285438934642e-05, "loss": 1.8863, "step": 1426 }, { "epoch": 0.37520938023450584, "grad_norm": 0.872424840927124, "learning_rate": 8.751533204836167e-05, "loss": 1.8489, "step": 1428 }, { "epoch": 0.37573488356816764, "grad_norm": 0.9776557087898254, "learning_rate": 8.74978097073769e-05, "loss": 1.8536, "step": 1430 }, { "epoch": 0.3762603869018294, "grad_norm": 
1.3488025665283203, "learning_rate": 8.748028736639215e-05, "loss": 1.8432, "step": 1432 }, { "epoch": 0.3767858902354912, "grad_norm": 0.8517011404037476, "learning_rate": 8.74627650254074e-05, "loss": 1.847, "step": 1434 }, { "epoch": 0.37731139356915294, "grad_norm": 0.8631575703620911, "learning_rate": 8.744524268442265e-05, "loss": 1.8053, "step": 1436 }, { "epoch": 0.37783689690281475, "grad_norm": 0.881100058555603, "learning_rate": 8.74277203434379e-05, "loss": 1.8662, "step": 1438 }, { "epoch": 0.3783624002364765, "grad_norm": 0.8032435178756714, "learning_rate": 8.741019800245314e-05, "loss": 1.8438, "step": 1440 }, { "epoch": 0.3788879035701383, "grad_norm": 0.7921327948570251, "learning_rate": 8.739267566146838e-05, "loss": 1.8685, "step": 1442 }, { "epoch": 0.37941340690380004, "grad_norm": 1.060738444328308, "learning_rate": 8.737515332048361e-05, "loss": 1.8365, "step": 1444 }, { "epoch": 0.37993891023746185, "grad_norm": 1.0198917388916016, "learning_rate": 8.735763097949886e-05, "loss": 1.8278, "step": 1446 }, { "epoch": 0.3804644135711236, "grad_norm": 0.9688281416893005, "learning_rate": 8.73401086385141e-05, "loss": 1.8513, "step": 1448 }, { "epoch": 0.38098991690478534, "grad_norm": 1.2723430395126343, "learning_rate": 8.732258629752935e-05, "loss": 1.7984, "step": 1450 }, { "epoch": 0.38151542023844714, "grad_norm": 0.8690189123153687, "learning_rate": 8.73050639565446e-05, "loss": 1.8288, "step": 1452 }, { "epoch": 0.3820409235721089, "grad_norm": 0.9124467968940735, "learning_rate": 8.728754161555985e-05, "loss": 1.8684, "step": 1454 }, { "epoch": 0.3825664269057707, "grad_norm": 1.1950373649597168, "learning_rate": 8.727001927457508e-05, "loss": 1.8431, "step": 1456 }, { "epoch": 0.38309193023943244, "grad_norm": 0.7846235632896423, "learning_rate": 8.725249693359033e-05, "loss": 1.8409, "step": 1458 }, { "epoch": 0.38361743357309425, "grad_norm": 1.450654149055481, "learning_rate": 8.723497459260558e-05, "loss": 1.844, "step": 1460 }, { 
"epoch": 0.384142936906756, "grad_norm": 1.0545793771743774, "learning_rate": 8.721745225162083e-05, "loss": 1.8332, "step": 1462 }, { "epoch": 0.3846684402404178, "grad_norm": 1.000705599784851, "learning_rate": 8.719992991063607e-05, "loss": 1.8486, "step": 1464 }, { "epoch": 0.38519394357407954, "grad_norm": 1.2795532941818237, "learning_rate": 8.718240756965132e-05, "loss": 1.8323, "step": 1466 }, { "epoch": 0.38571944690774135, "grad_norm": 0.7551513314247131, "learning_rate": 8.716488522866655e-05, "loss": 1.8573, "step": 1468 }, { "epoch": 0.3862449502414031, "grad_norm": 1.2810308933258057, "learning_rate": 8.714736288768179e-05, "loss": 1.8167, "step": 1470 }, { "epoch": 0.38677045357506484, "grad_norm": 1.0538434982299805, "learning_rate": 8.712984054669704e-05, "loss": 1.8501, "step": 1472 }, { "epoch": 0.38729595690872665, "grad_norm": 1.2018911838531494, "learning_rate": 8.711231820571228e-05, "loss": 1.8291, "step": 1474 }, { "epoch": 0.3878214602423884, "grad_norm": 1.4515736103057861, "learning_rate": 8.709479586472753e-05, "loss": 1.8728, "step": 1476 }, { "epoch": 0.3883469635760502, "grad_norm": 0.855747640132904, "learning_rate": 8.707727352374278e-05, "loss": 1.8048, "step": 1478 }, { "epoch": 0.38887246690971194, "grad_norm": 1.3377580642700195, "learning_rate": 8.705975118275803e-05, "loss": 1.8599, "step": 1480 }, { "epoch": 0.38939797024337375, "grad_norm": 0.9842968583106995, "learning_rate": 8.704222884177326e-05, "loss": 1.8038, "step": 1482 }, { "epoch": 0.3899234735770355, "grad_norm": 1.4240106344223022, "learning_rate": 8.702470650078851e-05, "loss": 1.8305, "step": 1484 }, { "epoch": 0.3904489769106973, "grad_norm": 0.7605730295181274, "learning_rate": 8.700718415980376e-05, "loss": 1.8321, "step": 1486 }, { "epoch": 0.39097448024435905, "grad_norm": 0.9584787487983704, "learning_rate": 8.6989661818819e-05, "loss": 1.799, "step": 1488 }, { "epoch": 0.39149998357802085, "grad_norm": 0.8087942004203796, "learning_rate": 
8.697213947783425e-05, "loss": 1.8011, "step": 1490 }, { "epoch": 0.3920254869116826, "grad_norm": 0.7870105504989624, "learning_rate": 8.69546171368495e-05, "loss": 1.7972, "step": 1492 }, { "epoch": 0.39255099024534434, "grad_norm": 1.1304738521575928, "learning_rate": 8.693709479586473e-05, "loss": 1.8088, "step": 1494 }, { "epoch": 0.39307649357900615, "grad_norm": 0.8902273178100586, "learning_rate": 8.691957245487997e-05, "loss": 1.8101, "step": 1496 }, { "epoch": 0.3936019969126679, "grad_norm": 1.1424989700317383, "learning_rate": 8.690205011389521e-05, "loss": 1.8018, "step": 1498 }, { "epoch": 0.3941275002463297, "grad_norm": 0.9772897362709045, "learning_rate": 8.688452777291046e-05, "loss": 1.8191, "step": 1500 }, { "epoch": 0.39465300357999145, "grad_norm": 0.9879363775253296, "learning_rate": 8.686700543192571e-05, "loss": 1.7934, "step": 1502 }, { "epoch": 0.39517850691365325, "grad_norm": 0.8215435147285461, "learning_rate": 8.684948309094096e-05, "loss": 1.8125, "step": 1504 }, { "epoch": 0.395704010247315, "grad_norm": 0.8453714847564697, "learning_rate": 8.68319607499562e-05, "loss": 1.8385, "step": 1506 }, { "epoch": 0.3962295135809768, "grad_norm": 0.9266200661659241, "learning_rate": 8.681443840897144e-05, "loss": 1.8537, "step": 1508 }, { "epoch": 0.39675501691463855, "grad_norm": 1.2535603046417236, "learning_rate": 8.679691606798669e-05, "loss": 1.8138, "step": 1510 }, { "epoch": 0.39728052024830035, "grad_norm": 1.0080575942993164, "learning_rate": 8.677939372700193e-05, "loss": 1.8193, "step": 1512 }, { "epoch": 0.3978060235819621, "grad_norm": 0.8419904112815857, "learning_rate": 8.676187138601718e-05, "loss": 1.845, "step": 1514 }, { "epoch": 0.39833152691562385, "grad_norm": 1.2089139223098755, "learning_rate": 8.674434904503243e-05, "loss": 1.8274, "step": 1516 }, { "epoch": 0.39885703024928565, "grad_norm": 0.9421194791793823, "learning_rate": 8.672682670404768e-05, "loss": 1.8132, "step": 1518 }, { "epoch": 0.3993825335829474, 
"grad_norm": 1.0286279916763306, "learning_rate": 8.670930436306291e-05, "loss": 1.8045, "step": 1520 }, { "epoch": 0.3999080369166092, "grad_norm": 1.1791476011276245, "learning_rate": 8.669178202207814e-05, "loss": 1.8148, "step": 1522 }, { "epoch": 0.40043354025027095, "grad_norm": 0.9198878407478333, "learning_rate": 8.667425968109339e-05, "loss": 1.8261, "step": 1524 }, { "epoch": 0.40095904358393275, "grad_norm": 1.0204558372497559, "learning_rate": 8.665673734010864e-05, "loss": 1.8324, "step": 1526 }, { "epoch": 0.4014845469175945, "grad_norm": 1.1662263870239258, "learning_rate": 8.663921499912389e-05, "loss": 1.7949, "step": 1528 }, { "epoch": 0.4020100502512563, "grad_norm": 1.2517145872116089, "learning_rate": 8.662169265813913e-05, "loss": 1.7924, "step": 1530 }, { "epoch": 0.40253555358491805, "grad_norm": 0.7785090208053589, "learning_rate": 8.660417031715438e-05, "loss": 1.8023, "step": 1532 }, { "epoch": 0.40306105691857985, "grad_norm": 0.8084584474563599, "learning_rate": 8.658664797616962e-05, "loss": 1.8132, "step": 1534 }, { "epoch": 0.4035865602522416, "grad_norm": 0.8784323930740356, "learning_rate": 8.656912563518486e-05, "loss": 1.8047, "step": 1536 }, { "epoch": 0.40411206358590335, "grad_norm": 0.8626761436462402, "learning_rate": 8.655160329420011e-05, "loss": 1.7985, "step": 1538 }, { "epoch": 0.40463756691956515, "grad_norm": 0.8983022570610046, "learning_rate": 8.653408095321536e-05, "loss": 1.8068, "step": 1540 }, { "epoch": 0.4051630702532269, "grad_norm": 1.2110192775726318, "learning_rate": 8.65165586122306e-05, "loss": 1.815, "step": 1542 }, { "epoch": 0.4056885735868887, "grad_norm": 0.9025999307632446, "learning_rate": 8.649903627124585e-05, "loss": 1.8305, "step": 1544 }, { "epoch": 0.40621407692055045, "grad_norm": 0.8550492525100708, "learning_rate": 8.648151393026109e-05, "loss": 1.7915, "step": 1546 }, { "epoch": 0.40673958025421225, "grad_norm": 0.8646672368049622, "learning_rate": 8.646399158927632e-05, "loss": 1.8338, 
"step": 1548 }, { "epoch": 0.407265083587874, "grad_norm": 0.8966619372367859, "learning_rate": 8.644646924829157e-05, "loss": 1.8072, "step": 1550 }, { "epoch": 0.4077905869215358, "grad_norm": 0.9003387689590454, "learning_rate": 8.642894690730682e-05, "loss": 1.8041, "step": 1552 }, { "epoch": 0.40831609025519755, "grad_norm": 1.035400152206421, "learning_rate": 8.641142456632206e-05, "loss": 1.8484, "step": 1554 }, { "epoch": 0.40884159358885935, "grad_norm": 0.8281182050704956, "learning_rate": 8.639390222533731e-05, "loss": 1.7943, "step": 1556 }, { "epoch": 0.4093670969225211, "grad_norm": 0.8706338405609131, "learning_rate": 8.637637988435256e-05, "loss": 1.848, "step": 1558 }, { "epoch": 0.40989260025618285, "grad_norm": 0.9510646462440491, "learning_rate": 8.63588575433678e-05, "loss": 1.81, "step": 1560 }, { "epoch": 0.41041810358984465, "grad_norm": 1.2758607864379883, "learning_rate": 8.634133520238304e-05, "loss": 1.831, "step": 1562 }, { "epoch": 0.4109436069235064, "grad_norm": 0.8133296966552734, "learning_rate": 8.632381286139829e-05, "loss": 1.833, "step": 1564 }, { "epoch": 0.4114691102571682, "grad_norm": 0.8663495779037476, "learning_rate": 8.630629052041354e-05, "loss": 1.7999, "step": 1566 }, { "epoch": 0.41199461359082995, "grad_norm": 0.8473132252693176, "learning_rate": 8.628876817942878e-05, "loss": 1.8148, "step": 1568 }, { "epoch": 0.41252011692449175, "grad_norm": 0.7791121006011963, "learning_rate": 8.627124583844402e-05, "loss": 1.8107, "step": 1570 }, { "epoch": 0.4130456202581535, "grad_norm": 0.8510565161705017, "learning_rate": 8.625372349745927e-05, "loss": 1.8434, "step": 1572 }, { "epoch": 0.4135711235918153, "grad_norm": 0.7872109413146973, "learning_rate": 8.62362011564745e-05, "loss": 1.86, "step": 1574 }, { "epoch": 0.41409662692547705, "grad_norm": 0.9160329699516296, "learning_rate": 8.621867881548975e-05, "loss": 1.8408, "step": 1576 }, { "epoch": 0.41462213025913885, "grad_norm": 0.9173769354820251, "learning_rate": 
8.6201156474505e-05, "loss": 1.8057, "step": 1578 }, { "epoch": 0.4151476335928006, "grad_norm": 1.086470603942871, "learning_rate": 8.618363413352024e-05, "loss": 1.8199, "step": 1580 }, { "epoch": 0.41567313692646235, "grad_norm": 0.8830829858779907, "learning_rate": 8.616611179253549e-05, "loss": 1.8107, "step": 1582 }, { "epoch": 0.41619864026012415, "grad_norm": 0.8462435007095337, "learning_rate": 8.614858945155074e-05, "loss": 1.8159, "step": 1584 }, { "epoch": 0.4167241435937859, "grad_norm": 0.9439370632171631, "learning_rate": 8.613106711056597e-05, "loss": 1.8311, "step": 1586 }, { "epoch": 0.4172496469274477, "grad_norm": 0.933134138584137, "learning_rate": 8.611354476958122e-05, "loss": 1.8203, "step": 1588 }, { "epoch": 0.41777515026110945, "grad_norm": 0.7939304709434509, "learning_rate": 8.609602242859647e-05, "loss": 1.7991, "step": 1590 }, { "epoch": 0.41830065359477125, "grad_norm": 0.9367759823799133, "learning_rate": 8.607850008761171e-05, "loss": 1.8391, "step": 1592 }, { "epoch": 0.418826156928433, "grad_norm": 0.8476933240890503, "learning_rate": 8.606097774662696e-05, "loss": 1.8135, "step": 1594 }, { "epoch": 0.4193516602620948, "grad_norm": 0.9385167360305786, "learning_rate": 8.60434554056422e-05, "loss": 1.8396, "step": 1596 }, { "epoch": 0.41987716359575655, "grad_norm": 0.9880960583686829, "learning_rate": 8.602593306465744e-05, "loss": 1.8282, "step": 1598 }, { "epoch": 0.42040266692941836, "grad_norm": 1.3682297468185425, "learning_rate": 8.600841072367268e-05, "loss": 1.8063, "step": 1600 }, { "epoch": 0.42040266692941836, "eval_loss": 1.785941243171692, "eval_runtime": 487.1976, "eval_samples_per_second": 249.979, "eval_steps_per_second": 31.248, "step": 1600 }, { "epoch": 0.4209281702630801, "grad_norm": 0.8974788784980774, "learning_rate": 8.599088838268792e-05, "loss": 1.8264, "step": 1602 }, { "epoch": 0.4214536735967419, "grad_norm": 1.3051773309707642, "learning_rate": 8.597336604170317e-05, "loss": 1.8204, "step": 1604 }, { 
"epoch": 0.42197917693040365, "grad_norm": 0.8313725590705872, "learning_rate": 8.595584370071842e-05, "loss": 1.8276, "step": 1606 }, { "epoch": 0.4225046802640654, "grad_norm": 0.7489058375358582, "learning_rate": 8.593832135973367e-05, "loss": 1.8039, "step": 1608 }, { "epoch": 0.4230301835977272, "grad_norm": 1.303904414176941, "learning_rate": 8.592079901874891e-05, "loss": 1.8117, "step": 1610 }, { "epoch": 0.42355568693138895, "grad_norm": 0.7930120229721069, "learning_rate": 8.590327667776415e-05, "loss": 1.816, "step": 1612 }, { "epoch": 0.42408119026505076, "grad_norm": 1.1683326959609985, "learning_rate": 8.58857543367794e-05, "loss": 1.8485, "step": 1614 }, { "epoch": 0.4246066935987125, "grad_norm": 1.129786491394043, "learning_rate": 8.586823199579464e-05, "loss": 1.8091, "step": 1616 }, { "epoch": 0.4251321969323743, "grad_norm": 1.006664752960205, "learning_rate": 8.585070965480989e-05, "loss": 1.801, "step": 1618 }, { "epoch": 0.42565770026603605, "grad_norm": 1.2593824863433838, "learning_rate": 8.583318731382514e-05, "loss": 1.874, "step": 1620 }, { "epoch": 0.42618320359969786, "grad_norm": 0.7356145977973938, "learning_rate": 8.581566497284037e-05, "loss": 1.8156, "step": 1622 }, { "epoch": 0.4267087069333596, "grad_norm": 1.3224732875823975, "learning_rate": 8.579814263185562e-05, "loss": 1.8485, "step": 1624 }, { "epoch": 0.4272342102670214, "grad_norm": 1.0026780366897583, "learning_rate": 8.578062029087085e-05, "loss": 1.8051, "step": 1626 }, { "epoch": 0.42775971360068316, "grad_norm": 1.235370397567749, "learning_rate": 8.57630979498861e-05, "loss": 1.8267, "step": 1628 }, { "epoch": 0.4282852169343449, "grad_norm": 1.0629823207855225, "learning_rate": 8.574557560890135e-05, "loss": 1.7937, "step": 1630 }, { "epoch": 0.4288107202680067, "grad_norm": 0.9405169486999512, "learning_rate": 8.57280532679166e-05, "loss": 1.8023, "step": 1632 }, { "epoch": 0.42933622360166845, "grad_norm": 1.5205514430999756, "learning_rate": 
8.571053092693184e-05, "loss": 1.8112, "step": 1634 }, { "epoch": 0.42986172693533026, "grad_norm": 1.038159728050232, "learning_rate": 8.569300858594709e-05, "loss": 1.7833, "step": 1636 }, { "epoch": 0.430387230268992, "grad_norm": 0.8901309370994568, "learning_rate": 8.567548624496233e-05, "loss": 1.8241, "step": 1638 }, { "epoch": 0.4309127336026538, "grad_norm": 1.166390061378479, "learning_rate": 8.565796390397757e-05, "loss": 1.7899, "step": 1640 }, { "epoch": 0.43143823693631556, "grad_norm": 1.0582796335220337, "learning_rate": 8.564044156299282e-05, "loss": 1.8208, "step": 1642 }, { "epoch": 0.43196374026997736, "grad_norm": 0.8496580123901367, "learning_rate": 8.562291922200807e-05, "loss": 1.8206, "step": 1644 }, { "epoch": 0.4324892436036391, "grad_norm": 0.761249303817749, "learning_rate": 8.560539688102332e-05, "loss": 1.7858, "step": 1646 }, { "epoch": 0.4330147469373009, "grad_norm": 0.8980756402015686, "learning_rate": 8.558787454003855e-05, "loss": 1.8053, "step": 1648 }, { "epoch": 0.43354025027096266, "grad_norm": 0.9203025698661804, "learning_rate": 8.55703521990538e-05, "loss": 1.8379, "step": 1650 }, { "epoch": 0.4340657536046244, "grad_norm": 0.9592378735542297, "learning_rate": 8.555282985806905e-05, "loss": 1.861, "step": 1652 }, { "epoch": 0.4345912569382862, "grad_norm": 1.0187515020370483, "learning_rate": 8.553530751708428e-05, "loss": 1.7764, "step": 1654 }, { "epoch": 0.43511676027194796, "grad_norm": 0.8016685247421265, "learning_rate": 8.551778517609953e-05, "loss": 1.8176, "step": 1656 }, { "epoch": 0.43564226360560976, "grad_norm": 0.7380330562591553, "learning_rate": 8.550026283511477e-05, "loss": 1.8218, "step": 1658 }, { "epoch": 0.4361677669392715, "grad_norm": 0.6815687417984009, "learning_rate": 8.548274049413002e-05, "loss": 1.8127, "step": 1660 }, { "epoch": 0.4366932702729333, "grad_norm": 0.6906920075416565, "learning_rate": 8.546521815314527e-05, "loss": 1.773, "step": 1662 }, { "epoch": 0.43721877360659506, 
"grad_norm": 0.9510621428489685, "learning_rate": 8.544769581216052e-05, "loss": 1.8305, "step": 1664 }, { "epoch": 0.43774427694025686, "grad_norm": 0.727105438709259, "learning_rate": 8.543017347117575e-05, "loss": 1.8192, "step": 1666 }, { "epoch": 0.4382697802739186, "grad_norm": 0.7399454712867737, "learning_rate": 8.5412651130191e-05, "loss": 1.8133, "step": 1668 }, { "epoch": 0.4387952836075804, "grad_norm": 0.8177588582038879, "learning_rate": 8.539512878920625e-05, "loss": 1.7933, "step": 1670 }, { "epoch": 0.43932078694124216, "grad_norm": 0.7681954503059387, "learning_rate": 8.537760644822148e-05, "loss": 1.837, "step": 1672 }, { "epoch": 0.4398462902749039, "grad_norm": 1.5765389204025269, "learning_rate": 8.536008410723673e-05, "loss": 1.8291, "step": 1674 }, { "epoch": 0.4403717936085657, "grad_norm": 0.7724891304969788, "learning_rate": 8.534256176625198e-05, "loss": 1.8124, "step": 1676 }, { "epoch": 0.44089729694222746, "grad_norm": 0.8893011808395386, "learning_rate": 8.532503942526722e-05, "loss": 1.8405, "step": 1678 }, { "epoch": 0.44142280027588926, "grad_norm": 0.878136932849884, "learning_rate": 8.530751708428246e-05, "loss": 1.8106, "step": 1680 }, { "epoch": 0.441948303609551, "grad_norm": 0.9325633645057678, "learning_rate": 8.52899947432977e-05, "loss": 1.7963, "step": 1682 }, { "epoch": 0.4424738069432128, "grad_norm": 1.0837180614471436, "learning_rate": 8.527247240231295e-05, "loss": 1.8144, "step": 1684 }, { "epoch": 0.44299931027687456, "grad_norm": 0.8428369164466858, "learning_rate": 8.52549500613282e-05, "loss": 1.8001, "step": 1686 }, { "epoch": 0.44352481361053636, "grad_norm": 0.930844783782959, "learning_rate": 8.523742772034345e-05, "loss": 1.8038, "step": 1688 }, { "epoch": 0.4440503169441981, "grad_norm": 0.7409669756889343, "learning_rate": 8.52199053793587e-05, "loss": 1.7926, "step": 1690 }, { "epoch": 0.4445758202778599, "grad_norm": 1.0107098817825317, "learning_rate": 8.520238303837393e-05, "loss": 1.7864, "step": 
1692 }, { "epoch": 0.44510132361152166, "grad_norm": 0.760370671749115, "learning_rate": 8.518486069738918e-05, "loss": 1.7881, "step": 1694 }, { "epoch": 0.4456268269451834, "grad_norm": 1.2437944412231445, "learning_rate": 8.516733835640442e-05, "loss": 1.8152, "step": 1696 }, { "epoch": 0.4461523302788452, "grad_norm": 0.8944531679153442, "learning_rate": 8.514981601541966e-05, "loss": 1.8272, "step": 1698 }, { "epoch": 0.44667783361250696, "grad_norm": 0.9550314545631409, "learning_rate": 8.51322936744349e-05, "loss": 1.7936, "step": 1700 }, { "epoch": 0.44720333694616876, "grad_norm": 0.7015916109085083, "learning_rate": 8.511477133345015e-05, "loss": 1.8737, "step": 1702 }, { "epoch": 0.4477288402798305, "grad_norm": 1.0813145637512207, "learning_rate": 8.50972489924654e-05, "loss": 1.8118, "step": 1704 }, { "epoch": 0.4482543436134923, "grad_norm": 0.8479138016700745, "learning_rate": 8.507972665148064e-05, "loss": 1.8074, "step": 1706 }, { "epoch": 0.44877984694715406, "grad_norm": 0.9790395498275757, "learning_rate": 8.506220431049588e-05, "loss": 1.7785, "step": 1708 }, { "epoch": 0.44930535028081586, "grad_norm": 0.8719221353530884, "learning_rate": 8.504468196951113e-05, "loss": 1.8162, "step": 1710 }, { "epoch": 0.4498308536144776, "grad_norm": 1.064282774925232, "learning_rate": 8.502715962852638e-05, "loss": 1.8148, "step": 1712 }, { "epoch": 0.4503563569481394, "grad_norm": 0.8482780456542969, "learning_rate": 8.500963728754163e-05, "loss": 1.7768, "step": 1714 }, { "epoch": 0.45088186028180116, "grad_norm": 0.901155412197113, "learning_rate": 8.499211494655687e-05, "loss": 1.8182, "step": 1716 }, { "epoch": 0.4514073636154629, "grad_norm": 1.0124598741531372, "learning_rate": 8.497459260557211e-05, "loss": 1.8142, "step": 1718 }, { "epoch": 0.4519328669491247, "grad_norm": 0.8708586692810059, "learning_rate": 8.495707026458735e-05, "loss": 1.8247, "step": 1720 }, { "epoch": 0.45245837028278646, "grad_norm": 0.9597557187080383, "learning_rate": 
8.49395479236026e-05, "loss": 1.7786, "step": 1722 }, { "epoch": 0.45298387361644826, "grad_norm": 1.1022772789001465, "learning_rate": 8.492202558261784e-05, "loss": 1.8139, "step": 1724 }, { "epoch": 0.45350937695011, "grad_norm": 1.0891538858413696, "learning_rate": 8.490450324163308e-05, "loss": 1.7986, "step": 1726 }, { "epoch": 0.4540348802837718, "grad_norm": 0.8127626776695251, "learning_rate": 8.488698090064833e-05, "loss": 1.8026, "step": 1728 }, { "epoch": 0.45456038361743356, "grad_norm": 0.9313668608665466, "learning_rate": 8.486945855966358e-05, "loss": 1.788, "step": 1730 }, { "epoch": 0.45508588695109536, "grad_norm": 0.8581985235214233, "learning_rate": 8.485193621867881e-05, "loss": 1.8097, "step": 1732 }, { "epoch": 0.4556113902847571, "grad_norm": 0.7745251059532166, "learning_rate": 8.483441387769406e-05, "loss": 1.8095, "step": 1734 }, { "epoch": 0.4561368936184189, "grad_norm": 0.7251246571540833, "learning_rate": 8.481689153670931e-05, "loss": 1.8195, "step": 1736 }, { "epoch": 0.45666239695208066, "grad_norm": 0.8949863314628601, "learning_rate": 8.479936919572456e-05, "loss": 1.8665, "step": 1738 }, { "epoch": 0.4571879002857424, "grad_norm": 0.8396829962730408, "learning_rate": 8.47818468547398e-05, "loss": 1.808, "step": 1740 }, { "epoch": 0.4577134036194042, "grad_norm": 0.8307755589485168, "learning_rate": 8.476432451375505e-05, "loss": 1.8332, "step": 1742 }, { "epoch": 0.45823890695306596, "grad_norm": 0.8172729015350342, "learning_rate": 8.474680217277028e-05, "loss": 1.7829, "step": 1744 }, { "epoch": 0.45876441028672776, "grad_norm": 0.7859178185462952, "learning_rate": 8.472927983178553e-05, "loss": 1.8111, "step": 1746 }, { "epoch": 0.4592899136203895, "grad_norm": 0.8412219882011414, "learning_rate": 8.471175749080078e-05, "loss": 1.8106, "step": 1748 }, { "epoch": 0.4598154169540513, "grad_norm": 1.1843855381011963, "learning_rate": 8.469423514981601e-05, "loss": 1.8064, "step": 1750 }, { "epoch": 0.46034092028771306, 
"grad_norm": 0.7547696232795715, "learning_rate": 8.467671280883126e-05, "loss": 1.7785, "step": 1752 }, { "epoch": 0.46086642362137487, "grad_norm": 0.8816393613815308, "learning_rate": 8.465919046784651e-05, "loss": 1.7851, "step": 1754 }, { "epoch": 0.4613919269550366, "grad_norm": 0.985379695892334, "learning_rate": 8.464166812686176e-05, "loss": 1.7841, "step": 1756 }, { "epoch": 0.4619174302886984, "grad_norm": 0.7585499286651611, "learning_rate": 8.462414578587699e-05, "loss": 1.7928, "step": 1758 }, { "epoch": 0.46244293362236016, "grad_norm": 0.8072088956832886, "learning_rate": 8.460662344489224e-05, "loss": 1.7867, "step": 1760 }, { "epoch": 0.46296843695602197, "grad_norm": 0.8818538784980774, "learning_rate": 8.458910110390749e-05, "loss": 1.8085, "step": 1762 }, { "epoch": 0.4634939402896837, "grad_norm": 0.7545977830886841, "learning_rate": 8.457157876292273e-05, "loss": 1.8134, "step": 1764 }, { "epoch": 0.46401944362334546, "grad_norm": 0.7525529265403748, "learning_rate": 8.455405642193798e-05, "loss": 1.8235, "step": 1766 }, { "epoch": 0.46454494695700727, "grad_norm": 1.0930671691894531, "learning_rate": 8.453653408095323e-05, "loss": 1.7796, "step": 1768 }, { "epoch": 0.465070450290669, "grad_norm": 0.8903842568397522, "learning_rate": 8.451901173996846e-05, "loss": 1.8413, "step": 1770 }, { "epoch": 0.4655959536243308, "grad_norm": 0.9519714117050171, "learning_rate": 8.450148939898371e-05, "loss": 1.8189, "step": 1772 }, { "epoch": 0.46612145695799256, "grad_norm": 0.816856861114502, "learning_rate": 8.448396705799894e-05, "loss": 1.8338, "step": 1774 }, { "epoch": 0.46664696029165437, "grad_norm": 0.8597956299781799, "learning_rate": 8.446644471701419e-05, "loss": 1.7963, "step": 1776 }, { "epoch": 0.4671724636253161, "grad_norm": 0.8783820271492004, "learning_rate": 8.444892237602944e-05, "loss": 1.7985, "step": 1778 }, { "epoch": 0.4676979669589779, "grad_norm": 0.9130274653434753, "learning_rate": 8.443140003504469e-05, "loss": 1.791, 
"step": 1780 }, { "epoch": 0.46822347029263967, "grad_norm": 1.001056432723999, "learning_rate": 8.441387769405993e-05, "loss": 1.8115, "step": 1782 }, { "epoch": 0.46874897362630147, "grad_norm": 0.8054397702217102, "learning_rate": 8.439635535307517e-05, "loss": 1.8411, "step": 1784 }, { "epoch": 0.4692744769599632, "grad_norm": 0.8605888485908508, "learning_rate": 8.437883301209042e-05, "loss": 1.7763, "step": 1786 }, { "epoch": 0.46979998029362496, "grad_norm": 1.1046404838562012, "learning_rate": 8.436131067110566e-05, "loss": 1.8331, "step": 1788 }, { "epoch": 0.47032548362728677, "grad_norm": 0.8004000186920166, "learning_rate": 8.434378833012091e-05, "loss": 1.826, "step": 1790 }, { "epoch": 0.4708509869609485, "grad_norm": 1.05780827999115, "learning_rate": 8.432626598913616e-05, "loss": 1.8101, "step": 1792 }, { "epoch": 0.4713764902946103, "grad_norm": 0.9407793879508972, "learning_rate": 8.43087436481514e-05, "loss": 1.7962, "step": 1794 }, { "epoch": 0.47190199362827207, "grad_norm": 0.7705127000808716, "learning_rate": 8.429122130716664e-05, "loss": 1.7978, "step": 1796 }, { "epoch": 0.47242749696193387, "grad_norm": 0.8127232193946838, "learning_rate": 8.427369896618189e-05, "loss": 1.7736, "step": 1798 }, { "epoch": 0.4729530002955956, "grad_norm": 1.0332077741622925, "learning_rate": 8.425617662519712e-05, "loss": 1.7882, "step": 1800 }, { "epoch": 0.4734785036292574, "grad_norm": 0.8787586092948914, "learning_rate": 8.423865428421237e-05, "loss": 1.7942, "step": 1802 }, { "epoch": 0.47400400696291917, "grad_norm": 1.0014612674713135, "learning_rate": 8.422113194322762e-05, "loss": 1.8046, "step": 1804 }, { "epoch": 0.47452951029658097, "grad_norm": 0.7657825946807861, "learning_rate": 8.420360960224286e-05, "loss": 1.7933, "step": 1806 }, { "epoch": 0.4750550136302427, "grad_norm": 0.8316423296928406, "learning_rate": 8.418608726125811e-05, "loss": 1.7988, "step": 1808 }, { "epoch": 0.47558051696390446, "grad_norm": 0.7967455387115479, 
"learning_rate": 8.416856492027335e-05, "loss": 1.8043, "step": 1810 }, { "epoch": 0.47610602029756627, "grad_norm": 0.7839574217796326, "learning_rate": 8.41510425792886e-05, "loss": 1.8401, "step": 1812 }, { "epoch": 0.476631523631228, "grad_norm": 0.8147667646408081, "learning_rate": 8.413352023830384e-05, "loss": 1.8019, "step": 1814 }, { "epoch": 0.4771570269648898, "grad_norm": 1.0701135396957397, "learning_rate": 8.411599789731909e-05, "loss": 1.8301, "step": 1816 }, { "epoch": 0.47768253029855157, "grad_norm": 0.8631109595298767, "learning_rate": 8.409847555633434e-05, "loss": 1.8335, "step": 1818 }, { "epoch": 0.47820803363221337, "grad_norm": 0.9797492623329163, "learning_rate": 8.408095321534958e-05, "loss": 1.8189, "step": 1820 }, { "epoch": 0.4787335369658751, "grad_norm": 0.9087586998939514, "learning_rate": 8.406343087436482e-05, "loss": 1.8405, "step": 1822 }, { "epoch": 0.4792590402995369, "grad_norm": 0.7700434923171997, "learning_rate": 8.404590853338007e-05, "loss": 1.7943, "step": 1824 }, { "epoch": 0.47978454363319867, "grad_norm": 0.9100522398948669, "learning_rate": 8.40283861923953e-05, "loss": 1.8021, "step": 1826 }, { "epoch": 0.48031004696686047, "grad_norm": 1.0668331384658813, "learning_rate": 8.401086385141055e-05, "loss": 1.7966, "step": 1828 }, { "epoch": 0.4808355503005222, "grad_norm": 0.9680224061012268, "learning_rate": 8.39933415104258e-05, "loss": 1.7945, "step": 1830 }, { "epoch": 0.48136105363418397, "grad_norm": 1.0217275619506836, "learning_rate": 8.397581916944104e-05, "loss": 1.821, "step": 1832 }, { "epoch": 0.48188655696784577, "grad_norm": 0.854264497756958, "learning_rate": 8.395829682845629e-05, "loss": 1.809, "step": 1834 }, { "epoch": 0.4824120603015075, "grad_norm": 0.9226179718971252, "learning_rate": 8.394077448747152e-05, "loss": 1.8021, "step": 1836 }, { "epoch": 0.4829375636351693, "grad_norm": 1.205917239189148, "learning_rate": 8.392325214648677e-05, "loss": 1.7912, "step": 1838 }, { "epoch": 
0.48346306696883107, "grad_norm": 1.0180691480636597, "learning_rate": 8.390572980550202e-05, "loss": 1.8032, "step": 1840 }, { "epoch": 0.48398857030249287, "grad_norm": 0.9707418084144592, "learning_rate": 8.388820746451727e-05, "loss": 1.8038, "step": 1842 }, { "epoch": 0.4845140736361546, "grad_norm": 1.21326744556427, "learning_rate": 8.387068512353251e-05, "loss": 1.8264, "step": 1844 }, { "epoch": 0.4850395769698164, "grad_norm": 0.8171116709709167, "learning_rate": 8.385316278254776e-05, "loss": 1.8037, "step": 1846 }, { "epoch": 0.48556508030347817, "grad_norm": 0.9211159348487854, "learning_rate": 8.3835640441563e-05, "loss": 1.8085, "step": 1848 }, { "epoch": 0.48609058363714, "grad_norm": 1.2191238403320312, "learning_rate": 8.381811810057824e-05, "loss": 1.7982, "step": 1850 }, { "epoch": 0.4866160869708017, "grad_norm": 0.8354179263114929, "learning_rate": 8.380059575959348e-05, "loss": 1.7862, "step": 1852 }, { "epoch": 0.48714159030446347, "grad_norm": 0.7723087072372437, "learning_rate": 8.378307341860872e-05, "loss": 1.7629, "step": 1854 }, { "epoch": 0.48766709363812527, "grad_norm": 1.1398297548294067, "learning_rate": 8.376555107762397e-05, "loss": 1.8118, "step": 1856 }, { "epoch": 0.488192596971787, "grad_norm": 0.904444694519043, "learning_rate": 8.374802873663922e-05, "loss": 1.8417, "step": 1858 }, { "epoch": 0.4887181003054488, "grad_norm": 1.8874095678329468, "learning_rate": 8.373050639565447e-05, "loss": 1.7985, "step": 1860 }, { "epoch": 0.48924360363911057, "grad_norm": 1.9317784309387207, "learning_rate": 8.37129840546697e-05, "loss": 1.7974, "step": 1862 }, { "epoch": 0.4897691069727724, "grad_norm": 0.9307408332824707, "learning_rate": 8.369546171368495e-05, "loss": 1.8137, "step": 1864 }, { "epoch": 0.4902946103064341, "grad_norm": 0.7302669882774353, "learning_rate": 8.36779393727002e-05, "loss": 1.7555, "step": 1866 }, { "epoch": 0.4908201136400959, "grad_norm": 0.9116623997688293, "learning_rate": 8.366041703171544e-05, 
"loss": 1.8122, "step": 1868 }, { "epoch": 0.49134561697375767, "grad_norm": 0.8545958995819092, "learning_rate": 8.364289469073069e-05, "loss": 1.8192, "step": 1870 }, { "epoch": 0.4918711203074195, "grad_norm": 0.8263342380523682, "learning_rate": 8.362537234974594e-05, "loss": 1.8059, "step": 1872 }, { "epoch": 0.4923966236410812, "grad_norm": 1.1116405725479126, "learning_rate": 8.360785000876117e-05, "loss": 1.819, "step": 1874 }, { "epoch": 0.49292212697474297, "grad_norm": 0.8676914572715759, "learning_rate": 8.359032766777641e-05, "loss": 1.801, "step": 1876 }, { "epoch": 0.49344763030840477, "grad_norm": 0.7704113125801086, "learning_rate": 8.357280532679165e-05, "loss": 1.7834, "step": 1878 }, { "epoch": 0.4939731336420665, "grad_norm": 1.1784600019454956, "learning_rate": 8.35552829858069e-05, "loss": 1.8283, "step": 1880 }, { "epoch": 0.4944986369757283, "grad_norm": 1.0245088338851929, "learning_rate": 8.353776064482215e-05, "loss": 1.8172, "step": 1882 }, { "epoch": 0.49502414030939007, "grad_norm": 0.9373153448104858, "learning_rate": 8.35202383038374e-05, "loss": 1.786, "step": 1884 }, { "epoch": 0.4955496436430519, "grad_norm": 1.362306833267212, "learning_rate": 8.350271596285264e-05, "loss": 1.8157, "step": 1886 }, { "epoch": 0.4960751469767136, "grad_norm": 0.9665769934654236, "learning_rate": 8.348519362186788e-05, "loss": 1.7878, "step": 1888 }, { "epoch": 0.4966006503103754, "grad_norm": 0.8806145191192627, "learning_rate": 8.346767128088313e-05, "loss": 1.8323, "step": 1890 }, { "epoch": 0.49712615364403717, "grad_norm": 1.2314025163650513, "learning_rate": 8.345014893989837e-05, "loss": 1.8569, "step": 1892 }, { "epoch": 0.497651656977699, "grad_norm": 1.8855247497558594, "learning_rate": 8.343262659891362e-05, "loss": 1.8152, "step": 1894 }, { "epoch": 0.4981771603113607, "grad_norm": 1.1224102973937988, "learning_rate": 8.341510425792887e-05, "loss": 1.8022, "step": 1896 }, { "epoch": 0.49870266364502247, "grad_norm": 0.7415306568145752, 
"learning_rate": 8.339758191694412e-05, "loss": 1.7825, "step": 1898 }, { "epoch": 0.4992281669786843, "grad_norm": 1.4165035486221313, "learning_rate": 8.338005957595935e-05, "loss": 1.8148, "step": 1900 }, { "epoch": 0.499753670312346, "grad_norm": 0.7428869605064392, "learning_rate": 8.33625372349746e-05, "loss": 1.8034, "step": 1902 }, { "epoch": 0.5002791736460078, "grad_norm": 0.6850984692573547, "learning_rate": 8.334501489398983e-05, "loss": 1.7891, "step": 1904 }, { "epoch": 0.5008046769796696, "grad_norm": 0.7525566816329956, "learning_rate": 8.332749255300508e-05, "loss": 1.8046, "step": 1906 }, { "epoch": 0.5013301803133313, "grad_norm": 0.8629323840141296, "learning_rate": 8.330997021202033e-05, "loss": 1.82, "step": 1908 }, { "epoch": 0.5018556836469932, "grad_norm": 0.885529637336731, "learning_rate": 8.329244787103557e-05, "loss": 1.7576, "step": 1910 }, { "epoch": 0.5023811869806549, "grad_norm": 0.7622796893119812, "learning_rate": 8.327492553005082e-05, "loss": 1.8287, "step": 1912 }, { "epoch": 0.5029066903143167, "grad_norm": 0.8797925710678101, "learning_rate": 8.325740318906607e-05, "loss": 1.8291, "step": 1914 }, { "epoch": 0.5034321936479784, "grad_norm": 0.8444175124168396, "learning_rate": 8.32398808480813e-05, "loss": 1.8036, "step": 1916 }, { "epoch": 0.5039576969816403, "grad_norm": 0.9204092025756836, "learning_rate": 8.322235850709655e-05, "loss": 1.8346, "step": 1918 }, { "epoch": 0.504483200315302, "grad_norm": 0.7571083307266235, "learning_rate": 8.32048361661118e-05, "loss": 1.7909, "step": 1920 }, { "epoch": 0.5050087036489638, "grad_norm": 0.861875593662262, "learning_rate": 8.318731382512705e-05, "loss": 1.8077, "step": 1922 }, { "epoch": 0.5055342069826255, "grad_norm": 0.8385462164878845, "learning_rate": 8.31697914841423e-05, "loss": 1.7738, "step": 1924 }, { "epoch": 0.5060597103162873, "grad_norm": 0.9641711115837097, "learning_rate": 8.315226914315754e-05, "loss": 1.7992, "step": 1926 }, { "epoch": 0.5065852136499491, 
"grad_norm": 1.5050584077835083, "learning_rate": 8.313474680217278e-05, "loss": 1.7867, "step": 1928 }, { "epoch": 0.5071107169836109, "grad_norm": 1.0225639343261719, "learning_rate": 8.311722446118801e-05, "loss": 1.7787, "step": 1930 }, { "epoch": 0.5076362203172726, "grad_norm": 0.9370511174201965, "learning_rate": 8.309970212020326e-05, "loss": 1.7945, "step": 1932 }, { "epoch": 0.5081617236509344, "grad_norm": 1.1602392196655273, "learning_rate": 8.30821797792185e-05, "loss": 1.8071, "step": 1934 }, { "epoch": 0.5086872269845962, "grad_norm": 1.095885157585144, "learning_rate": 8.306465743823375e-05, "loss": 1.8048, "step": 1936 }, { "epoch": 0.509212730318258, "grad_norm": 1.124812364578247, "learning_rate": 8.3047135097249e-05, "loss": 1.8092, "step": 1938 }, { "epoch": 0.5097382336519197, "grad_norm": 0.7736151218414307, "learning_rate": 8.302961275626425e-05, "loss": 1.8173, "step": 1940 }, { "epoch": 0.5102637369855815, "grad_norm": 1.199781894683838, "learning_rate": 8.301209041527948e-05, "loss": 1.7725, "step": 1942 }, { "epoch": 0.5107892403192432, "grad_norm": 0.772127091884613, "learning_rate": 8.299456807429473e-05, "loss": 1.8158, "step": 1944 }, { "epoch": 0.5113147436529051, "grad_norm": 0.8498915433883667, "learning_rate": 8.297704573330998e-05, "loss": 1.7641, "step": 1946 }, { "epoch": 0.5118402469865668, "grad_norm": 1.0101765394210815, "learning_rate": 8.295952339232522e-05, "loss": 1.8214, "step": 1948 }, { "epoch": 0.5123657503202286, "grad_norm": 0.8255197405815125, "learning_rate": 8.294200105134047e-05, "loss": 1.7684, "step": 1950 }, { "epoch": 0.5128912536538903, "grad_norm": 0.884064257144928, "learning_rate": 8.292447871035572e-05, "loss": 1.7953, "step": 1952 }, { "epoch": 0.5134167569875522, "grad_norm": 1.1106665134429932, "learning_rate": 8.290695636937095e-05, "loss": 1.8086, "step": 1954 }, { "epoch": 0.5139422603212139, "grad_norm": 0.8464235663414001, "learning_rate": 8.288943402838619e-05, "loss": 1.8182, "step": 1956 }, 
{ "epoch": 0.5144677636548757, "grad_norm": 0.8477456569671631, "learning_rate": 8.287191168740144e-05, "loss": 1.7845, "step": 1958 }, { "epoch": 0.5149932669885374, "grad_norm": 0.8464748859405518, "learning_rate": 8.285438934641668e-05, "loss": 1.7325, "step": 1960 }, { "epoch": 0.5155187703221993, "grad_norm": 0.8044800758361816, "learning_rate": 8.283686700543193e-05, "loss": 1.7715, "step": 1962 }, { "epoch": 0.516044273655861, "grad_norm": 0.8983359336853027, "learning_rate": 8.281934466444718e-05, "loss": 1.8041, "step": 1964 }, { "epoch": 0.5165697769895228, "grad_norm": 1.2751051187515259, "learning_rate": 8.280182232346243e-05, "loss": 1.8038, "step": 1966 }, { "epoch": 0.5170952803231845, "grad_norm": 0.9036842584609985, "learning_rate": 8.278429998247766e-05, "loss": 1.7891, "step": 1968 }, { "epoch": 0.5176207836568463, "grad_norm": 0.8021928668022156, "learning_rate": 8.276677764149291e-05, "loss": 1.8139, "step": 1970 }, { "epoch": 0.5181462869905081, "grad_norm": 0.8150444030761719, "learning_rate": 8.274925530050815e-05, "loss": 1.7707, "step": 1972 }, { "epoch": 0.5186717903241699, "grad_norm": 0.7655881643295288, "learning_rate": 8.27317329595234e-05, "loss": 1.822, "step": 1974 }, { "epoch": 0.5191972936578316, "grad_norm": 1.1329301595687866, "learning_rate": 8.271421061853865e-05, "loss": 1.7675, "step": 1976 }, { "epoch": 0.5197227969914934, "grad_norm": 1.0726947784423828, "learning_rate": 8.269668827755388e-05, "loss": 1.8203, "step": 1978 }, { "epoch": 0.5202483003251552, "grad_norm": 0.7917013168334961, "learning_rate": 8.267916593656913e-05, "loss": 1.8326, "step": 1980 }, { "epoch": 0.520773803658817, "grad_norm": 0.955825686454773, "learning_rate": 8.266164359558437e-05, "loss": 1.7993, "step": 1982 }, { "epoch": 0.5212993069924787, "grad_norm": 1.0151026248931885, "learning_rate": 8.264412125459961e-05, "loss": 1.7764, "step": 1984 }, { "epoch": 0.5218248103261405, "grad_norm": 1.11459219455719, "learning_rate": 
8.262659891361486e-05, "loss": 1.8025, "step": 1986 }, { "epoch": 0.5223503136598023, "grad_norm": 0.9148701429367065, "learning_rate": 8.260907657263011e-05, "loss": 1.8218, "step": 1988 }, { "epoch": 0.5228758169934641, "grad_norm": 0.7594835758209229, "learning_rate": 8.259155423164536e-05, "loss": 1.8266, "step": 1990 }, { "epoch": 0.5234013203271258, "grad_norm": 1.0429881811141968, "learning_rate": 8.25740318906606e-05, "loss": 1.8223, "step": 1992 }, { "epoch": 0.5239268236607876, "grad_norm": 0.9307808876037598, "learning_rate": 8.255650954967584e-05, "loss": 1.8322, "step": 1994 }, { "epoch": 0.5244523269944493, "grad_norm": 0.7135612964630127, "learning_rate": 8.253898720869108e-05, "loss": 1.8104, "step": 1996 }, { "epoch": 0.5249778303281112, "grad_norm": 0.8590657711029053, "learning_rate": 8.252146486770633e-05, "loss": 1.7808, "step": 1998 }, { "epoch": 0.5255033336617729, "grad_norm": 0.9131346940994263, "learning_rate": 8.250394252672158e-05, "loss": 1.7846, "step": 2000 }, { "epoch": 0.5255033336617729, "eval_loss": 1.7627384662628174, "eval_runtime": 487.1945, "eval_samples_per_second": 249.98, "eval_steps_per_second": 31.248, "step": 2000 }, { "epoch": 0.5260288369954347, "grad_norm": 0.9146105647087097, "learning_rate": 8.248642018573683e-05, "loss": 1.8062, "step": 2002 }, { "epoch": 0.5265543403290964, "grad_norm": 0.8270906209945679, "learning_rate": 8.246889784475206e-05, "loss": 1.7992, "step": 2004 }, { "epoch": 0.5270798436627583, "grad_norm": 0.7374973297119141, "learning_rate": 8.245137550376731e-05, "loss": 1.8103, "step": 2006 }, { "epoch": 0.52760534699642, "grad_norm": 0.748988926410675, "learning_rate": 8.243385316278254e-05, "loss": 1.8284, "step": 2008 }, { "epoch": 0.5281308503300818, "grad_norm": 0.7624616622924805, "learning_rate": 8.241633082179779e-05, "loss": 1.7947, "step": 2010 }, { "epoch": 0.5286563536637435, "grad_norm": 0.8599483966827393, "learning_rate": 8.239880848081304e-05, "loss": 1.7732, "step": 2012 }, { 
"epoch": 0.5291818569974053, "grad_norm": 0.8122212886810303, "learning_rate": 8.238128613982829e-05, "loss": 1.8128, "step": 2014 }, { "epoch": 0.5297073603310671, "grad_norm": 0.8008537292480469, "learning_rate": 8.236376379884353e-05, "loss": 1.8028, "step": 2016 }, { "epoch": 0.5302328636647289, "grad_norm": 0.8155772686004639, "learning_rate": 8.234624145785878e-05, "loss": 1.8124, "step": 2018 }, { "epoch": 0.5307583669983906, "grad_norm": 0.9014889001846313, "learning_rate": 8.232871911687401e-05, "loss": 1.8056, "step": 2020 }, { "epoch": 0.5312838703320524, "grad_norm": 0.7495489716529846, "learning_rate": 8.231119677588926e-05, "loss": 1.7891, "step": 2022 }, { "epoch": 0.5318093736657142, "grad_norm": 0.6256895065307617, "learning_rate": 8.229367443490451e-05, "loss": 1.7671, "step": 2024 }, { "epoch": 0.532334876999376, "grad_norm": 0.9852064847946167, "learning_rate": 8.227615209391976e-05, "loss": 1.7945, "step": 2026 }, { "epoch": 0.5328603803330377, "grad_norm": 0.9755546450614929, "learning_rate": 8.2258629752935e-05, "loss": 1.7999, "step": 2028 }, { "epoch": 0.5333858836666995, "grad_norm": 0.7656162977218628, "learning_rate": 8.224110741195024e-05, "loss": 1.7869, "step": 2030 }, { "epoch": 0.5339113870003613, "grad_norm": 0.9118050336837769, "learning_rate": 8.222358507096549e-05, "loss": 1.8278, "step": 2032 }, { "epoch": 0.5344368903340231, "grad_norm": 0.6935544013977051, "learning_rate": 8.220606272998072e-05, "loss": 1.8328, "step": 2034 }, { "epoch": 0.5349623936676848, "grad_norm": 0.9424465298652649, "learning_rate": 8.218854038899597e-05, "loss": 1.7714, "step": 2036 }, { "epoch": 0.5354878970013466, "grad_norm": 0.7186869978904724, "learning_rate": 8.217101804801122e-05, "loss": 1.8039, "step": 2038 }, { "epoch": 0.5360134003350083, "grad_norm": 0.6855552792549133, "learning_rate": 8.215349570702646e-05, "loss": 1.7952, "step": 2040 }, { "epoch": 0.5365389036686702, "grad_norm": 0.6616165637969971, "learning_rate": 
8.213597336604171e-05, "loss": 1.7949, "step": 2042 }, { "epoch": 0.5370644070023319, "grad_norm": 0.7983983755111694, "learning_rate": 8.211845102505696e-05, "loss": 1.8027, "step": 2044 }, { "epoch": 0.5375899103359937, "grad_norm": 0.7051743865013123, "learning_rate": 8.210092868407219e-05, "loss": 1.8062, "step": 2046 }, { "epoch": 0.5381154136696554, "grad_norm": 0.7773457169532776, "learning_rate": 8.208340634308744e-05, "loss": 1.7559, "step": 2048 }, { "epoch": 0.5386409170033173, "grad_norm": 1.415022373199463, "learning_rate": 8.206588400210269e-05, "loss": 1.8263, "step": 2050 }, { "epoch": 0.539166420336979, "grad_norm": 0.8128517270088196, "learning_rate": 8.204836166111794e-05, "loss": 1.8131, "step": 2052 }, { "epoch": 0.5396919236706408, "grad_norm": 0.8620209097862244, "learning_rate": 8.203083932013317e-05, "loss": 1.7608, "step": 2054 }, { "epoch": 0.5402174270043025, "grad_norm": 0.7338502407073975, "learning_rate": 8.201331697914842e-05, "loss": 1.8006, "step": 2056 }, { "epoch": 0.5407429303379643, "grad_norm": 0.8432105183601379, "learning_rate": 8.199579463816366e-05, "loss": 1.7716, "step": 2058 }, { "epoch": 0.5412684336716261, "grad_norm": 0.600993812084198, "learning_rate": 8.19782722971789e-05, "loss": 1.7858, "step": 2060 }, { "epoch": 0.5417939370052879, "grad_norm": 0.7413330674171448, "learning_rate": 8.196074995619415e-05, "loss": 1.7714, "step": 2062 }, { "epoch": 0.5423194403389496, "grad_norm": 0.7860500812530518, "learning_rate": 8.19432276152094e-05, "loss": 1.8065, "step": 2064 }, { "epoch": 0.5428449436726114, "grad_norm": 0.8075912594795227, "learning_rate": 8.192570527422464e-05, "loss": 1.7946, "step": 2066 }, { "epoch": 0.5433704470062732, "grad_norm": 0.7681949734687805, "learning_rate": 8.190818293323989e-05, "loss": 1.8246, "step": 2068 }, { "epoch": 0.543895950339935, "grad_norm": 0.7330343127250671, "learning_rate": 8.189066059225514e-05, "loss": 1.7884, "step": 2070 }, { "epoch": 0.5444214536735967, "grad_norm": 
0.8136972784996033, "learning_rate": 8.187313825127037e-05, "loss": 1.7783, "step": 2072 }, { "epoch": 0.5449469570072585, "grad_norm": 0.9508219361305237, "learning_rate": 8.185561591028562e-05, "loss": 1.822, "step": 2074 }, { "epoch": 0.5454724603409203, "grad_norm": 1.0187771320343018, "learning_rate": 8.183809356930087e-05, "loss": 1.7858, "step": 2076 }, { "epoch": 0.5459979636745821, "grad_norm": 0.872322678565979, "learning_rate": 8.182057122831611e-05, "loss": 1.7715, "step": 2078 }, { "epoch": 0.5465234670082438, "grad_norm": 0.9391134977340698, "learning_rate": 8.180304888733135e-05, "loss": 1.8014, "step": 2080 }, { "epoch": 0.5470489703419056, "grad_norm": 0.7798128128051758, "learning_rate": 8.17855265463466e-05, "loss": 1.7803, "step": 2082 }, { "epoch": 0.5475744736755673, "grad_norm": 0.9748620390892029, "learning_rate": 8.176800420536184e-05, "loss": 1.8054, "step": 2084 }, { "epoch": 0.5480999770092292, "grad_norm": 0.9456170201301575, "learning_rate": 8.175048186437708e-05, "loss": 1.8084, "step": 2086 }, { "epoch": 0.5486254803428909, "grad_norm": 1.295296549797058, "learning_rate": 8.173295952339232e-05, "loss": 1.8044, "step": 2088 }, { "epoch": 0.5491509836765527, "grad_norm": 0.7323461174964905, "learning_rate": 8.171543718240757e-05, "loss": 1.787, "step": 2090 }, { "epoch": 0.5496764870102144, "grad_norm": 1.0989707708358765, "learning_rate": 8.169791484142282e-05, "loss": 1.7789, "step": 2092 }, { "epoch": 0.5502019903438763, "grad_norm": 0.9566003680229187, "learning_rate": 8.168039250043807e-05, "loss": 1.7832, "step": 2094 }, { "epoch": 0.550727493677538, "grad_norm": 0.8293377757072449, "learning_rate": 8.166287015945331e-05, "loss": 1.8081, "step": 2096 }, { "epoch": 0.5512529970111998, "grad_norm": 0.9931288361549377, "learning_rate": 8.164534781846855e-05, "loss": 1.81, "step": 2098 }, { "epoch": 0.5517785003448615, "grad_norm": 0.7140156626701355, "learning_rate": 8.16278254774838e-05, "loss": 1.7824, "step": 2100 }, { "epoch": 
0.5523040036785233, "grad_norm": 0.8644457459449768, "learning_rate": 8.161030313649904e-05, "loss": 1.8058, "step": 2102 }, { "epoch": 0.5528295070121851, "grad_norm": 0.9265533685684204, "learning_rate": 8.159278079551429e-05, "loss": 1.7759, "step": 2104 }, { "epoch": 0.5533550103458469, "grad_norm": 0.7051352262496948, "learning_rate": 8.157525845452952e-05, "loss": 1.7986, "step": 2106 }, { "epoch": 0.5538805136795086, "grad_norm": 0.8235836625099182, "learning_rate": 8.155773611354477e-05, "loss": 1.8038, "step": 2108 }, { "epoch": 0.5544060170131704, "grad_norm": 0.7595791816711426, "learning_rate": 8.154021377256002e-05, "loss": 1.8321, "step": 2110 }, { "epoch": 0.5549315203468322, "grad_norm": 0.7582104802131653, "learning_rate": 8.152269143157525e-05, "loss": 1.8297, "step": 2112 }, { "epoch": 0.555457023680494, "grad_norm": 0.6678735613822937, "learning_rate": 8.15051690905905e-05, "loss": 1.7559, "step": 2114 }, { "epoch": 0.5559825270141557, "grad_norm": 0.8775631189346313, "learning_rate": 8.148764674960575e-05, "loss": 1.8064, "step": 2116 }, { "epoch": 0.5565080303478175, "grad_norm": 0.8533100485801697, "learning_rate": 8.1470124408621e-05, "loss": 1.7946, "step": 2118 }, { "epoch": 0.5570335336814793, "grad_norm": 0.6834318041801453, "learning_rate": 8.145260206763624e-05, "loss": 1.7833, "step": 2120 }, { "epoch": 0.5575590370151411, "grad_norm": 0.9778022766113281, "learning_rate": 8.143507972665149e-05, "loss": 1.8012, "step": 2122 }, { "epoch": 0.5580845403488028, "grad_norm": 0.7117162942886353, "learning_rate": 8.141755738566673e-05, "loss": 1.8246, "step": 2124 }, { "epoch": 0.5586100436824646, "grad_norm": 0.8154255747795105, "learning_rate": 8.140003504468197e-05, "loss": 1.7692, "step": 2126 }, { "epoch": 0.5591355470161263, "grad_norm": 0.7914754152297974, "learning_rate": 8.138251270369722e-05, "loss": 1.7623, "step": 2128 }, { "epoch": 0.5596610503497882, "grad_norm": 0.657900333404541, "learning_rate": 8.136499036271247e-05, "loss": 
1.7882, "step": 2130 }, { "epoch": 0.5601865536834499, "grad_norm": 0.6770361065864563, "learning_rate": 8.13474680217277e-05, "loss": 1.7737, "step": 2132 }, { "epoch": 0.5607120570171117, "grad_norm": 0.9176309108734131, "learning_rate": 8.132994568074295e-05, "loss": 1.7706, "step": 2134 }, { "epoch": 0.5612375603507734, "grad_norm": 1.037473201751709, "learning_rate": 8.13124233397582e-05, "loss": 1.8172, "step": 2136 }, { "epoch": 0.5617630636844353, "grad_norm": 1.409900426864624, "learning_rate": 8.129490099877343e-05, "loss": 1.7831, "step": 2138 }, { "epoch": 0.562288567018097, "grad_norm": 0.7014243602752686, "learning_rate": 8.127737865778868e-05, "loss": 1.7844, "step": 2140 }, { "epoch": 0.5628140703517588, "grad_norm": 0.7743593454360962, "learning_rate": 8.125985631680393e-05, "loss": 1.7591, "step": 2142 }, { "epoch": 0.5633395736854205, "grad_norm": 0.8568558096885681, "learning_rate": 8.124233397581917e-05, "loss": 1.8341, "step": 2144 }, { "epoch": 0.5638650770190824, "grad_norm": 0.792972981929779, "learning_rate": 8.122481163483442e-05, "loss": 1.7639, "step": 2146 }, { "epoch": 0.5643905803527441, "grad_norm": 0.983025312423706, "learning_rate": 8.120728929384967e-05, "loss": 1.7952, "step": 2148 }, { "epoch": 0.5649160836864059, "grad_norm": 0.7422555088996887, "learning_rate": 8.11897669528649e-05, "loss": 1.7856, "step": 2150 }, { "epoch": 0.5654415870200676, "grad_norm": 0.6807308793067932, "learning_rate": 8.117224461188015e-05, "loss": 1.841, "step": 2152 }, { "epoch": 0.5659670903537294, "grad_norm": 0.845778226852417, "learning_rate": 8.11547222708954e-05, "loss": 1.7815, "step": 2154 }, { "epoch": 0.5664925936873912, "grad_norm": 0.8135868906974792, "learning_rate": 8.113719992991063e-05, "loss": 1.8014, "step": 2156 }, { "epoch": 0.567018097021053, "grad_norm": 0.7737998366355896, "learning_rate": 8.111967758892588e-05, "loss": 1.7906, "step": 2158 }, { "epoch": 0.5675436003547147, "grad_norm": 0.8078686594963074, "learning_rate": 
8.110215524794113e-05, "loss": 1.7612, "step": 2160 }, { "epoch": 0.5680691036883765, "grad_norm": 0.8356254696846008, "learning_rate": 8.108463290695637e-05, "loss": 1.7736, "step": 2162 }, { "epoch": 0.5685946070220383, "grad_norm": 0.7324886322021484, "learning_rate": 8.106711056597162e-05, "loss": 1.8031, "step": 2164 }, { "epoch": 0.5691201103557001, "grad_norm": 0.6843339800834656, "learning_rate": 8.104958822498686e-05, "loss": 1.7733, "step": 2166 }, { "epoch": 0.5696456136893618, "grad_norm": 1.1155132055282593, "learning_rate": 8.10320658840021e-05, "loss": 1.8057, "step": 2168 }, { "epoch": 0.5701711170230236, "grad_norm": 0.944843053817749, "learning_rate": 8.101454354301735e-05, "loss": 1.7848, "step": 2170 }, { "epoch": 0.5706966203566853, "grad_norm": 0.7923617959022522, "learning_rate": 8.09970212020326e-05, "loss": 1.7636, "step": 2172 }, { "epoch": 0.5712221236903472, "grad_norm": 0.7884588837623596, "learning_rate": 8.097949886104785e-05, "loss": 1.7856, "step": 2174 }, { "epoch": 0.5717476270240089, "grad_norm": 0.8496378064155579, "learning_rate": 8.09619765200631e-05, "loss": 1.8124, "step": 2176 }, { "epoch": 0.5722731303576707, "grad_norm": 0.6795907020568848, "learning_rate": 8.094445417907833e-05, "loss": 1.825, "step": 2178 }, { "epoch": 0.5727986336913324, "grad_norm": 1.0210143327713013, "learning_rate": 8.092693183809358e-05, "loss": 1.8193, "step": 2180 }, { "epoch": 0.5733241370249943, "grad_norm": 0.9265308380126953, "learning_rate": 8.090940949710881e-05, "loss": 1.7955, "step": 2182 }, { "epoch": 0.573849640358656, "grad_norm": 0.7709974646568298, "learning_rate": 8.089188715612406e-05, "loss": 1.7748, "step": 2184 }, { "epoch": 0.5743751436923178, "grad_norm": 1.015137791633606, "learning_rate": 8.08743648151393e-05, "loss": 1.7767, "step": 2186 }, { "epoch": 0.5749006470259795, "grad_norm": 0.7084217071533203, "learning_rate": 8.085684247415455e-05, "loss": 1.7737, "step": 2188 }, { "epoch": 0.5754261503596414, "grad_norm": 
0.7948693633079529, "learning_rate": 8.08393201331698e-05, "loss": 1.8202, "step": 2190 }, { "epoch": 0.5759516536933031, "grad_norm": 0.921947181224823, "learning_rate": 8.082179779218503e-05, "loss": 1.7906, "step": 2192 }, { "epoch": 0.5764771570269649, "grad_norm": 1.3730195760726929, "learning_rate": 8.080427545120028e-05, "loss": 1.7645, "step": 2194 }, { "epoch": 0.5770026603606266, "grad_norm": 0.7949815392494202, "learning_rate": 8.078675311021553e-05, "loss": 1.7519, "step": 2196 }, { "epoch": 0.5775281636942884, "grad_norm": 0.8247926831245422, "learning_rate": 8.076923076923078e-05, "loss": 1.7875, "step": 2198 }, { "epoch": 0.5780536670279502, "grad_norm": 0.8016488552093506, "learning_rate": 8.075170842824602e-05, "loss": 1.7832, "step": 2200 }, { "epoch": 0.578579170361612, "grad_norm": 0.8356485366821289, "learning_rate": 8.073418608726127e-05, "loss": 1.7964, "step": 2202 }, { "epoch": 0.5791046736952737, "grad_norm": 0.6765563488006592, "learning_rate": 8.07166637462765e-05, "loss": 1.7864, "step": 2204 }, { "epoch": 0.5796301770289355, "grad_norm": 0.6917481422424316, "learning_rate": 8.069914140529175e-05, "loss": 1.7662, "step": 2206 }, { "epoch": 0.5801556803625973, "grad_norm": 0.7653847336769104, "learning_rate": 8.068161906430699e-05, "loss": 1.7586, "step": 2208 }, { "epoch": 0.5806811836962591, "grad_norm": 0.7687065005302429, "learning_rate": 8.066409672332224e-05, "loss": 1.8408, "step": 2210 }, { "epoch": 0.5812066870299208, "grad_norm": 1.3475931882858276, "learning_rate": 8.064657438233748e-05, "loss": 1.7714, "step": 2212 }, { "epoch": 0.5817321903635826, "grad_norm": 0.9233903288841248, "learning_rate": 8.062905204135273e-05, "loss": 1.7366, "step": 2214 }, { "epoch": 0.5822576936972443, "grad_norm": 1.3624159097671509, "learning_rate": 8.061152970036798e-05, "loss": 1.8115, "step": 2216 }, { "epoch": 0.5827831970309062, "grad_norm": 1.2570765018463135, "learning_rate": 8.059400735938321e-05, "loss": 1.8475, "step": 2218 }, { 
"epoch": 0.583308700364568, "grad_norm": 1.3805052042007446, "learning_rate": 8.057648501839846e-05, "loss": 1.7814, "step": 2220 }, { "epoch": 0.5838342036982297, "grad_norm": 1.1308529376983643, "learning_rate": 8.055896267741371e-05, "loss": 1.7531, "step": 2222 }, { "epoch": 0.5843597070318914, "grad_norm": 0.8926995992660522, "learning_rate": 8.054144033642895e-05, "loss": 1.7973, "step": 2224 }, { "epoch": 0.5848852103655533, "grad_norm": 1.3856247663497925, "learning_rate": 8.05239179954442e-05, "loss": 1.8255, "step": 2226 }, { "epoch": 0.585410713699215, "grad_norm": 1.143256664276123, "learning_rate": 8.050639565445945e-05, "loss": 1.7983, "step": 2228 }, { "epoch": 0.5859362170328768, "grad_norm": 0.7704371809959412, "learning_rate": 8.048887331347468e-05, "loss": 1.787, "step": 2230 }, { "epoch": 0.5864617203665385, "grad_norm": 0.8830547332763672, "learning_rate": 8.047135097248993e-05, "loss": 1.8016, "step": 2232 }, { "epoch": 0.5869872237002004, "grad_norm": 1.3716325759887695, "learning_rate": 8.045382863150517e-05, "loss": 1.8184, "step": 2234 }, { "epoch": 0.5875127270338621, "grad_norm": 0.6707213521003723, "learning_rate": 8.043630629052041e-05, "loss": 1.7814, "step": 2236 }, { "epoch": 0.5880382303675239, "grad_norm": 0.8658749461174011, "learning_rate": 8.041878394953566e-05, "loss": 1.7797, "step": 2238 }, { "epoch": 0.5885637337011856, "grad_norm": 0.8602432608604431, "learning_rate": 8.040126160855091e-05, "loss": 1.7646, "step": 2240 }, { "epoch": 0.5890892370348474, "grad_norm": 0.6430072784423828, "learning_rate": 8.038373926756616e-05, "loss": 1.7872, "step": 2242 }, { "epoch": 0.5896147403685092, "grad_norm": 0.8540019989013672, "learning_rate": 8.036621692658139e-05, "loss": 1.7961, "step": 2244 }, { "epoch": 0.590140243702171, "grad_norm": 0.7394554615020752, "learning_rate": 8.034869458559664e-05, "loss": 1.814, "step": 2246 }, { "epoch": 0.5906657470358327, "grad_norm": 0.6837593913078308, "learning_rate": 8.033117224461188e-05, 
"loss": 1.7913, "step": 2248 }, { "epoch": 0.5911912503694945, "grad_norm": 0.7128705382347107, "learning_rate": 8.031364990362713e-05, "loss": 1.7655, "step": 2250 }, { "epoch": 0.5917167537031564, "grad_norm": 0.6362258791923523, "learning_rate": 8.029612756264238e-05, "loss": 1.8152, "step": 2252 }, { "epoch": 0.5922422570368181, "grad_norm": 0.8071087002754211, "learning_rate": 8.027860522165763e-05, "loss": 1.774, "step": 2254 }, { "epoch": 0.5927677603704798, "grad_norm": 0.713575005531311, "learning_rate": 8.026108288067286e-05, "loss": 1.7543, "step": 2256 }, { "epoch": 0.5932932637041416, "grad_norm": 1.3430129289627075, "learning_rate": 8.02435605396881e-05, "loss": 1.8361, "step": 2258 }, { "epoch": 0.5938187670378033, "grad_norm": 0.7131674885749817, "learning_rate": 8.022603819870334e-05, "loss": 1.7964, "step": 2260 }, { "epoch": 0.5943442703714652, "grad_norm": 0.8277941942214966, "learning_rate": 8.020851585771859e-05, "loss": 1.7872, "step": 2262 }, { "epoch": 0.594869773705127, "grad_norm": 1.3098149299621582, "learning_rate": 8.019099351673384e-05, "loss": 1.7578, "step": 2264 }, { "epoch": 0.5953952770387887, "grad_norm": 0.7214488983154297, "learning_rate": 8.017347117574909e-05, "loss": 1.8064, "step": 2266 }, { "epoch": 0.5959207803724504, "grad_norm": 0.6886647343635559, "learning_rate": 8.015594883476433e-05, "loss": 1.8442, "step": 2268 }, { "epoch": 0.5964462837061123, "grad_norm": 0.7065162062644958, "learning_rate": 8.013842649377957e-05, "loss": 1.7493, "step": 2270 }, { "epoch": 0.596971787039774, "grad_norm": 0.647866427898407, "learning_rate": 8.012090415279481e-05, "loss": 1.7716, "step": 2272 }, { "epoch": 0.5974972903734358, "grad_norm": 1.0947537422180176, "learning_rate": 8.010338181181006e-05, "loss": 1.7847, "step": 2274 }, { "epoch": 0.5980227937070975, "grad_norm": 0.7569209337234497, "learning_rate": 8.008585947082531e-05, "loss": 1.8247, "step": 2276 }, { "epoch": 0.5985482970407594, "grad_norm": 0.8344804644584656, 
"learning_rate": 8.006833712984056e-05, "loss": 1.7553, "step": 2278 }, { "epoch": 0.5990738003744212, "grad_norm": 0.7028648257255554, "learning_rate": 8.00508147888558e-05, "loss": 1.7851, "step": 2280 }, { "epoch": 0.5995993037080829, "grad_norm": 0.8421052098274231, "learning_rate": 8.003329244787104e-05, "loss": 1.7872, "step": 2282 }, { "epoch": 0.6001248070417446, "grad_norm": 0.6548582911491394, "learning_rate": 8.001577010688627e-05, "loss": 1.7568, "step": 2284 }, { "epoch": 0.6006503103754064, "grad_norm": 0.7493446469306946, "learning_rate": 7.999824776590152e-05, "loss": 1.766, "step": 2286 }, { "epoch": 0.6011758137090683, "grad_norm": 0.6293224096298218, "learning_rate": 7.998072542491677e-05, "loss": 1.7701, "step": 2288 }, { "epoch": 0.60170131704273, "grad_norm": 0.8307104706764221, "learning_rate": 7.996320308393202e-05, "loss": 1.7783, "step": 2290 }, { "epoch": 0.6022268203763917, "grad_norm": 0.696723997592926, "learning_rate": 7.994568074294726e-05, "loss": 1.8009, "step": 2292 }, { "epoch": 0.6027523237100535, "grad_norm": 0.6556825637817383, "learning_rate": 7.992815840196251e-05, "loss": 1.7693, "step": 2294 }, { "epoch": 0.6032778270437154, "grad_norm": 0.9338749051094055, "learning_rate": 7.991063606097774e-05, "loss": 1.7614, "step": 2296 }, { "epoch": 0.6038033303773771, "grad_norm": 0.8844968676567078, "learning_rate": 7.989311371999299e-05, "loss": 1.7809, "step": 2298 }, { "epoch": 0.6043288337110388, "grad_norm": 0.6624906063079834, "learning_rate": 7.987559137900824e-05, "loss": 1.7607, "step": 2300 }, { "epoch": 0.6048543370447006, "grad_norm": 0.7565969824790955, "learning_rate": 7.985806903802349e-05, "loss": 1.7374, "step": 2302 }, { "epoch": 0.6053798403783625, "grad_norm": 1.3834869861602783, "learning_rate": 7.984054669703873e-05, "loss": 1.8123, "step": 2304 }, { "epoch": 0.6059053437120242, "grad_norm": 0.9589686989784241, "learning_rate": 7.982302435605398e-05, "loss": 1.7766, "step": 2306 }, { "epoch": 
0.606430847045686, "grad_norm": 1.1011096239089966, "learning_rate": 7.980550201506922e-05, "loss": 1.7737, "step": 2308 }, { "epoch": 0.6069563503793477, "grad_norm": 1.1299936771392822, "learning_rate": 7.978797967408445e-05, "loss": 1.8042, "step": 2310 }, { "epoch": 0.6074818537130094, "grad_norm": 0.7697176933288574, "learning_rate": 7.97704573330997e-05, "loss": 1.742, "step": 2312 }, { "epoch": 0.6080073570466713, "grad_norm": 0.991256833076477, "learning_rate": 7.975293499211495e-05, "loss": 1.8302, "step": 2314 }, { "epoch": 0.608532860380333, "grad_norm": 0.7879564166069031, "learning_rate": 7.97354126511302e-05, "loss": 1.7624, "step": 2316 }, { "epoch": 0.6090583637139948, "grad_norm": 0.745040774345398, "learning_rate": 7.971789031014544e-05, "loss": 1.807, "step": 2318 }, { "epoch": 0.6095838670476565, "grad_norm": 0.9064005613327026, "learning_rate": 7.970036796916069e-05, "loss": 1.7371, "step": 2320 }, { "epoch": 0.6101093703813184, "grad_norm": 0.9049443602561951, "learning_rate": 7.968284562817592e-05, "loss": 1.8108, "step": 2322 }, { "epoch": 0.6106348737149802, "grad_norm": 0.8754010200500488, "learning_rate": 7.966532328719117e-05, "loss": 1.7804, "step": 2324 }, { "epoch": 0.6111603770486419, "grad_norm": 0.7384723424911499, "learning_rate": 7.964780094620642e-05, "loss": 1.7683, "step": 2326 }, { "epoch": 0.6116858803823036, "grad_norm": 0.739629328250885, "learning_rate": 7.963027860522167e-05, "loss": 1.7826, "step": 2328 }, { "epoch": 0.6122113837159654, "grad_norm": 0.824571967124939, "learning_rate": 7.961275626423691e-05, "loss": 1.7686, "step": 2330 }, { "epoch": 0.6127368870496273, "grad_norm": 0.9385930299758911, "learning_rate": 7.959523392325216e-05, "loss": 1.7893, "step": 2332 }, { "epoch": 0.613262390383289, "grad_norm": 0.7331735491752625, "learning_rate": 7.95777115822674e-05, "loss": 1.7835, "step": 2334 }, { "epoch": 0.6137878937169507, "grad_norm": 0.6689608693122864, "learning_rate": 7.956018924128263e-05, "loss": 1.755, 
"step": 2336 }, { "epoch": 0.6143133970506125, "grad_norm": 0.7041564583778381, "learning_rate": 7.954266690029788e-05, "loss": 1.7593, "step": 2338 }, { "epoch": 0.6148389003842744, "grad_norm": 0.818277895450592, "learning_rate": 7.952514455931312e-05, "loss": 1.8126, "step": 2340 }, { "epoch": 0.6153644037179361, "grad_norm": 1.1610822677612305, "learning_rate": 7.950762221832837e-05, "loss": 1.8324, "step": 2342 }, { "epoch": 0.6158899070515979, "grad_norm": 0.9594135284423828, "learning_rate": 7.949009987734362e-05, "loss": 1.7777, "step": 2344 }, { "epoch": 0.6164154103852596, "grad_norm": 0.9519843459129333, "learning_rate": 7.947257753635887e-05, "loss": 1.7905, "step": 2346 }, { "epoch": 0.6169409137189215, "grad_norm": 0.7015102505683899, "learning_rate": 7.94550551953741e-05, "loss": 1.7753, "step": 2348 }, { "epoch": 0.6174664170525832, "grad_norm": 0.6667357087135315, "learning_rate": 7.943753285438935e-05, "loss": 1.7925, "step": 2350 }, { "epoch": 0.617991920386245, "grad_norm": 0.6917058229446411, "learning_rate": 7.94200105134046e-05, "loss": 1.7737, "step": 2352 }, { "epoch": 0.6185174237199067, "grad_norm": 0.8922196626663208, "learning_rate": 7.940248817241984e-05, "loss": 1.77, "step": 2354 }, { "epoch": 0.6190429270535684, "grad_norm": 0.8834502696990967, "learning_rate": 7.938496583143509e-05, "loss": 1.7626, "step": 2356 }, { "epoch": 0.6195684303872303, "grad_norm": 0.6824911832809448, "learning_rate": 7.936744349045034e-05, "loss": 1.7694, "step": 2358 }, { "epoch": 0.620093933720892, "grad_norm": 0.8218874931335449, "learning_rate": 7.934992114946557e-05, "loss": 1.753, "step": 2360 }, { "epoch": 0.6206194370545538, "grad_norm": 0.808892011642456, "learning_rate": 7.93323988084808e-05, "loss": 1.7619, "step": 2362 }, { "epoch": 0.6211449403882155, "grad_norm": 0.783028781414032, "learning_rate": 7.931487646749605e-05, "loss": 1.8033, "step": 2364 }, { "epoch": 0.6216704437218774, "grad_norm": 0.8071235418319702, "learning_rate": 
7.92973541265113e-05, "loss": 1.7536, "step": 2366 }, { "epoch": 0.6221959470555392, "grad_norm": 0.7900059819221497, "learning_rate": 7.927983178552655e-05, "loss": 1.7715, "step": 2368 }, { "epoch": 0.6227214503892009, "grad_norm": 0.8198074102401733, "learning_rate": 7.92623094445418e-05, "loss": 1.7898, "step": 2370 }, { "epoch": 0.6232469537228627, "grad_norm": 0.6880433559417725, "learning_rate": 7.924478710355704e-05, "loss": 1.7617, "step": 2372 }, { "epoch": 0.6237724570565244, "grad_norm": 0.7786495685577393, "learning_rate": 7.922726476257228e-05, "loss": 1.7574, "step": 2374 }, { "epoch": 0.6242979603901863, "grad_norm": 0.8043944239616394, "learning_rate": 7.920974242158753e-05, "loss": 1.797, "step": 2376 }, { "epoch": 0.624823463723848, "grad_norm": 0.9602116942405701, "learning_rate": 7.919222008060277e-05, "loss": 1.7835, "step": 2378 }, { "epoch": 0.6253489670575098, "grad_norm": 0.6723161339759827, "learning_rate": 7.917469773961802e-05, "loss": 1.751, "step": 2380 }, { "epoch": 0.6258744703911715, "grad_norm": 0.7045361399650574, "learning_rate": 7.915717539863327e-05, "loss": 1.7788, "step": 2382 }, { "epoch": 0.6263999737248334, "grad_norm": 0.7056633234024048, "learning_rate": 7.913965305764852e-05, "loss": 1.7581, "step": 2384 }, { "epoch": 0.6269254770584951, "grad_norm": 0.8192391395568848, "learning_rate": 7.912213071666375e-05, "loss": 1.751, "step": 2386 }, { "epoch": 0.6274509803921569, "grad_norm": 0.8521485924720764, "learning_rate": 7.910460837567898e-05, "loss": 1.811, "step": 2388 }, { "epoch": 0.6279764837258186, "grad_norm": 0.7382224202156067, "learning_rate": 7.908708603469423e-05, "loss": 1.7848, "step": 2390 }, { "epoch": 0.6285019870594805, "grad_norm": 0.6544625163078308, "learning_rate": 7.906956369370948e-05, "loss": 1.7902, "step": 2392 }, { "epoch": 0.6290274903931422, "grad_norm": 0.7634027600288391, "learning_rate": 7.905204135272473e-05, "loss": 1.79, "step": 2394 }, { "epoch": 0.629552993726804, "grad_norm": 
1.0319316387176514, "learning_rate": 7.903451901173997e-05, "loss": 1.8045, "step": 2396 }, { "epoch": 0.6300784970604657, "grad_norm": 0.6364408731460571, "learning_rate": 7.901699667075522e-05, "loss": 1.7666, "step": 2398 }, { "epoch": 0.6306040003941275, "grad_norm": 0.919385552406311, "learning_rate": 7.899947432977046e-05, "loss": 1.7625, "step": 2400 }, { "epoch": 0.6306040003941275, "eval_loss": 1.7536036968231201, "eval_runtime": 487.1509, "eval_samples_per_second": 250.003, "eval_steps_per_second": 31.251, "step": 2400 }, { "epoch": 0.6311295037277893, "grad_norm": 0.9996768832206726, "learning_rate": 7.89819519887857e-05, "loss": 1.8133, "step": 2402 }, { "epoch": 0.6316550070614511, "grad_norm": 0.7376594543457031, "learning_rate": 7.896442964780095e-05, "loss": 1.8095, "step": 2404 }, { "epoch": 0.6321805103951128, "grad_norm": 0.970077633857727, "learning_rate": 7.89469073068162e-05, "loss": 1.7818, "step": 2406 }, { "epoch": 0.6327060137287746, "grad_norm": 0.8934677839279175, "learning_rate": 7.892938496583145e-05, "loss": 1.7905, "step": 2408 }, { "epoch": 0.6332315170624364, "grad_norm": 0.8888778686523438, "learning_rate": 7.891186262484669e-05, "loss": 1.7646, "step": 2410 }, { "epoch": 0.6337570203960982, "grad_norm": 0.7317706942558289, "learning_rate": 7.889434028386193e-05, "loss": 1.8039, "step": 2412 }, { "epoch": 0.6342825237297599, "grad_norm": 0.768997848033905, "learning_rate": 7.887681794287717e-05, "loss": 1.8152, "step": 2414 }, { "epoch": 0.6348080270634217, "grad_norm": 0.8444989323616028, "learning_rate": 7.885929560189241e-05, "loss": 1.7889, "step": 2416 }, { "epoch": 0.6353335303970834, "grad_norm": 0.7109376788139343, "learning_rate": 7.884177326090766e-05, "loss": 1.7729, "step": 2418 }, { "epoch": 0.6358590337307453, "grad_norm": 0.630806565284729, "learning_rate": 7.88242509199229e-05, "loss": 1.7954, "step": 2420 }, { "epoch": 0.636384537064407, "grad_norm": 0.6395316123962402, "learning_rate": 7.880672857893815e-05, 
"loss": 1.7797, "step": 2422 }, { "epoch": 0.6369100403980688, "grad_norm": 1.0122566223144531, "learning_rate": 7.87892062379534e-05, "loss": 1.7841, "step": 2424 }, { "epoch": 0.6374355437317305, "grad_norm": 0.7840449810028076, "learning_rate": 7.877168389696865e-05, "loss": 1.7888, "step": 2426 }, { "epoch": 0.6379610470653924, "grad_norm": 0.7690210342407227, "learning_rate": 7.875416155598388e-05, "loss": 1.8122, "step": 2428 }, { "epoch": 0.6384865503990541, "grad_norm": 0.7418575286865234, "learning_rate": 7.873663921499913e-05, "loss": 1.8056, "step": 2430 }, { "epoch": 0.6390120537327159, "grad_norm": 0.8744335770606995, "learning_rate": 7.871911687401438e-05, "loss": 1.7914, "step": 2432 }, { "epoch": 0.6395375570663776, "grad_norm": 0.9662806391716003, "learning_rate": 7.870159453302962e-05, "loss": 1.7712, "step": 2434 }, { "epoch": 0.6400630604000395, "grad_norm": 1.1848862171173096, "learning_rate": 7.868407219204487e-05, "loss": 1.7627, "step": 2436 }, { "epoch": 0.6405885637337012, "grad_norm": 0.7062596082687378, "learning_rate": 7.86665498510601e-05, "loss": 1.7691, "step": 2438 }, { "epoch": 0.641114067067363, "grad_norm": 0.9317710399627686, "learning_rate": 7.864902751007535e-05, "loss": 1.7697, "step": 2440 }, { "epoch": 0.6416395704010247, "grad_norm": 0.8188003301620483, "learning_rate": 7.863150516909059e-05, "loss": 1.7934, "step": 2442 }, { "epoch": 0.6421650737346865, "grad_norm": 1.0256218910217285, "learning_rate": 7.861398282810583e-05, "loss": 1.8175, "step": 2444 }, { "epoch": 0.6426905770683483, "grad_norm": 0.7887519598007202, "learning_rate": 7.859646048712108e-05, "loss": 1.7842, "step": 2446 }, { "epoch": 0.6432160804020101, "grad_norm": 0.906284749507904, "learning_rate": 7.857893814613633e-05, "loss": 1.7407, "step": 2448 }, { "epoch": 0.6437415837356718, "grad_norm": 1.160643458366394, "learning_rate": 7.856141580515158e-05, "loss": 1.7891, "step": 2450 }, { "epoch": 0.6442670870693336, "grad_norm": 0.7816227078437805, 
"learning_rate": 7.854389346416682e-05, "loss": 1.802, "step": 2452 }, { "epoch": 0.6447925904029954, "grad_norm": 0.6715728640556335, "learning_rate": 7.852637112318206e-05, "loss": 1.754, "step": 2454 }, { "epoch": 0.6453180937366572, "grad_norm": 0.7650018334388733, "learning_rate": 7.85088487821973e-05, "loss": 1.7936, "step": 2456 }, { "epoch": 0.6458435970703189, "grad_norm": 0.8200324773788452, "learning_rate": 7.849132644121255e-05, "loss": 1.7879, "step": 2458 }, { "epoch": 0.6463691004039807, "grad_norm": 0.7117056846618652, "learning_rate": 7.84738041002278e-05, "loss": 1.7869, "step": 2460 }, { "epoch": 0.6468946037376425, "grad_norm": 0.7513619661331177, "learning_rate": 7.845628175924303e-05, "loss": 1.7929, "step": 2462 }, { "epoch": 0.6474201070713043, "grad_norm": 0.6291913986206055, "learning_rate": 7.843875941825828e-05, "loss": 1.7597, "step": 2464 }, { "epoch": 0.647945610404966, "grad_norm": 0.7736865878105164, "learning_rate": 7.842123707727353e-05, "loss": 1.8084, "step": 2466 }, { "epoch": 0.6484711137386278, "grad_norm": 0.9892921447753906, "learning_rate": 7.840371473628876e-05, "loss": 1.7724, "step": 2468 }, { "epoch": 0.6489966170722895, "grad_norm": 0.5670979619026184, "learning_rate": 7.838619239530401e-05, "loss": 1.7806, "step": 2470 }, { "epoch": 0.6495221204059514, "grad_norm": 0.728164792060852, "learning_rate": 7.836867005431926e-05, "loss": 1.7687, "step": 2472 }, { "epoch": 0.6500476237396131, "grad_norm": 1.0032124519348145, "learning_rate": 7.835114771333451e-05, "loss": 1.7992, "step": 2474 }, { "epoch": 0.6505731270732749, "grad_norm": 0.70088130235672, "learning_rate": 7.833362537234975e-05, "loss": 1.7664, "step": 2476 }, { "epoch": 0.6510986304069366, "grad_norm": 0.8623471856117249, "learning_rate": 7.8316103031365e-05, "loss": 1.8202, "step": 2478 }, { "epoch": 0.6516241337405985, "grad_norm": 0.8462334275245667, "learning_rate": 7.829858069038024e-05, "loss": 1.8086, "step": 2480 }, { "epoch": 0.6521496370742602, 
"grad_norm": 0.6222977638244629, "learning_rate": 7.828105834939548e-05, "loss": 1.7396, "step": 2482 }, { "epoch": 0.652675140407922, "grad_norm": 0.7277782559394836, "learning_rate": 7.826353600841073e-05, "loss": 1.787, "step": 2484 }, { "epoch": 0.6532006437415837, "grad_norm": 0.6451889276504517, "learning_rate": 7.824601366742598e-05, "loss": 1.7687, "step": 2486 }, { "epoch": 0.6537261470752455, "grad_norm": 0.6663830280303955, "learning_rate": 7.822849132644121e-05, "loss": 1.7627, "step": 2488 }, { "epoch": 0.6542516504089073, "grad_norm": 0.7570757269859314, "learning_rate": 7.821096898545646e-05, "loss": 1.7423, "step": 2490 }, { "epoch": 0.6547771537425691, "grad_norm": 0.5967277884483337, "learning_rate": 7.819344664447171e-05, "loss": 1.7834, "step": 2492 }, { "epoch": 0.6553026570762308, "grad_norm": 0.6351729035377502, "learning_rate": 7.817592430348694e-05, "loss": 1.7541, "step": 2494 }, { "epoch": 0.6558281604098926, "grad_norm": 0.9284831881523132, "learning_rate": 7.815840196250219e-05, "loss": 1.7658, "step": 2496 }, { "epoch": 0.6563536637435544, "grad_norm": 0.754885196685791, "learning_rate": 7.814087962151744e-05, "loss": 1.7602, "step": 2498 }, { "epoch": 0.6568791670772162, "grad_norm": 0.7284504771232605, "learning_rate": 7.812335728053268e-05, "loss": 1.7983, "step": 2500 }, { "epoch": 0.6574046704108779, "grad_norm": 0.6399169564247131, "learning_rate": 7.810583493954793e-05, "loss": 1.7851, "step": 2502 }, { "epoch": 0.6579301737445397, "grad_norm": 1.2791913747787476, "learning_rate": 7.808831259856318e-05, "loss": 1.8401, "step": 2504 }, { "epoch": 0.6584556770782015, "grad_norm": 0.8345859050750732, "learning_rate": 7.807079025757841e-05, "loss": 1.7636, "step": 2506 }, { "epoch": 0.6589811804118633, "grad_norm": 0.6579688191413879, "learning_rate": 7.805326791659366e-05, "loss": 1.815, "step": 2508 }, { "epoch": 0.659506683745525, "grad_norm": 0.744471549987793, "learning_rate": 7.803574557560891e-05, "loss": 1.787, "step": 2510 
}, { "epoch": 0.6600321870791868, "grad_norm": 0.6032891869544983, "learning_rate": 7.801822323462416e-05, "loss": 1.7615, "step": 2512 }, { "epoch": 0.6605576904128485, "grad_norm": 0.6453471183776855, "learning_rate": 7.800070089363939e-05, "loss": 1.7519, "step": 2514 }, { "epoch": 0.6610831937465104, "grad_norm": 0.6828714609146118, "learning_rate": 7.798317855265464e-05, "loss": 1.7563, "step": 2516 }, { "epoch": 0.6616086970801721, "grad_norm": 0.7225235104560852, "learning_rate": 7.796565621166989e-05, "loss": 1.7685, "step": 2518 }, { "epoch": 0.6621342004138339, "grad_norm": 0.7070510387420654, "learning_rate": 7.794813387068512e-05, "loss": 1.7816, "step": 2520 }, { "epoch": 0.6626597037474956, "grad_norm": 0.7075088620185852, "learning_rate": 7.793061152970037e-05, "loss": 1.7918, "step": 2522 }, { "epoch": 0.6631852070811575, "grad_norm": 0.8367542028427124, "learning_rate": 7.791308918871561e-05, "loss": 1.7769, "step": 2524 }, { "epoch": 0.6637107104148192, "grad_norm": 0.7209259867668152, "learning_rate": 7.789556684773086e-05, "loss": 1.7997, "step": 2526 }, { "epoch": 0.664236213748481, "grad_norm": 0.9258558750152588, "learning_rate": 7.787804450674611e-05, "loss": 1.7706, "step": 2528 }, { "epoch": 0.6647617170821427, "grad_norm": 0.7765336632728577, "learning_rate": 7.786052216576136e-05, "loss": 1.7774, "step": 2530 }, { "epoch": 0.6652872204158045, "grad_norm": 0.8592368960380554, "learning_rate": 7.784299982477659e-05, "loss": 1.7496, "step": 2532 }, { "epoch": 0.6658127237494663, "grad_norm": 0.7928656339645386, "learning_rate": 7.782547748379184e-05, "loss": 1.7735, "step": 2534 }, { "epoch": 0.6663382270831281, "grad_norm": 0.7750053405761719, "learning_rate": 7.780795514280709e-05, "loss": 1.7732, "step": 2536 }, { "epoch": 0.6668637304167898, "grad_norm": 1.0451760292053223, "learning_rate": 7.779043280182233e-05, "loss": 1.7658, "step": 2538 }, { "epoch": 0.6673892337504516, "grad_norm": 0.6935849189758301, "learning_rate": 
7.777291046083757e-05, "loss": 1.7494, "step": 2540 }, { "epoch": 0.6679147370841134, "grad_norm": 0.9037797451019287, "learning_rate": 7.775538811985282e-05, "loss": 1.7664, "step": 2542 }, { "epoch": 0.6684402404177752, "grad_norm": 0.6649421453475952, "learning_rate": 7.773786577886806e-05, "loss": 1.7503, "step": 2544 }, { "epoch": 0.6689657437514369, "grad_norm": 0.6808927059173584, "learning_rate": 7.77203434378833e-05, "loss": 1.7481, "step": 2546 }, { "epoch": 0.6694912470850987, "grad_norm": 0.697090208530426, "learning_rate": 7.770282109689854e-05, "loss": 1.7843, "step": 2548 }, { "epoch": 0.6700167504187605, "grad_norm": 0.9084567427635193, "learning_rate": 7.768529875591379e-05, "loss": 1.7643, "step": 2550 }, { "epoch": 0.6705422537524223, "grad_norm": 0.6704044938087463, "learning_rate": 7.766777641492904e-05, "loss": 1.7687, "step": 2552 }, { "epoch": 0.671067757086084, "grad_norm": 0.7542858719825745, "learning_rate": 7.765025407394429e-05, "loss": 1.771, "step": 2554 }, { "epoch": 0.6715932604197458, "grad_norm": 0.8199552893638611, "learning_rate": 7.763273173295953e-05, "loss": 1.8042, "step": 2556 }, { "epoch": 0.6721187637534075, "grad_norm": 0.8224305510520935, "learning_rate": 7.761520939197477e-05, "loss": 1.7625, "step": 2558 }, { "epoch": 0.6726442670870694, "grad_norm": 0.7174823880195618, "learning_rate": 7.759768705099002e-05, "loss": 1.7857, "step": 2560 }, { "epoch": 0.6731697704207311, "grad_norm": 0.6753933429718018, "learning_rate": 7.758016471000526e-05, "loss": 1.7732, "step": 2562 }, { "epoch": 0.6736952737543929, "grad_norm": 0.985587477684021, "learning_rate": 7.75626423690205e-05, "loss": 1.797, "step": 2564 }, { "epoch": 0.6742207770880546, "grad_norm": 0.892760694026947, "learning_rate": 7.754512002803575e-05, "loss": 1.7881, "step": 2566 }, { "epoch": 0.6747462804217165, "grad_norm": 0.7832928895950317, "learning_rate": 7.752759768705099e-05, "loss": 1.781, "step": 2568 }, { "epoch": 0.6752717837553782, "grad_norm": 
0.7372546195983887, "learning_rate": 7.751007534606624e-05, "loss": 1.7774, "step": 2570 }, { "epoch": 0.67579728708904, "grad_norm": 0.7375915050506592, "learning_rate": 7.749255300508147e-05, "loss": 1.8019, "step": 2572 }, { "epoch": 0.6763227904227017, "grad_norm": 0.77280193567276, "learning_rate": 7.747503066409672e-05, "loss": 1.7743, "step": 2574 }, { "epoch": 0.6768482937563635, "grad_norm": 0.7224514484405518, "learning_rate": 7.745750832311197e-05, "loss": 1.7741, "step": 2576 }, { "epoch": 0.6773737970900253, "grad_norm": 0.6910998821258545, "learning_rate": 7.743998598212722e-05, "loss": 1.7893, "step": 2578 }, { "epoch": 0.6778993004236871, "grad_norm": 0.61247718334198, "learning_rate": 7.742246364114247e-05, "loss": 1.7628, "step": 2580 }, { "epoch": 0.6784248037573488, "grad_norm": 0.6819799542427063, "learning_rate": 7.740494130015771e-05, "loss": 1.7355, "step": 2582 }, { "epoch": 0.6789503070910106, "grad_norm": 0.6858199834823608, "learning_rate": 7.738741895917295e-05, "loss": 1.772, "step": 2584 }, { "epoch": 0.6794758104246724, "grad_norm": 0.7759047150611877, "learning_rate": 7.73698966181882e-05, "loss": 1.7586, "step": 2586 }, { "epoch": 0.6800013137583342, "grad_norm": 0.6968771815299988, "learning_rate": 7.735237427720344e-05, "loss": 1.7886, "step": 2588 }, { "epoch": 0.6805268170919959, "grad_norm": 0.7313429117202759, "learning_rate": 7.733485193621868e-05, "loss": 1.7884, "step": 2590 }, { "epoch": 0.6810523204256577, "grad_norm": 0.6731216311454773, "learning_rate": 7.731732959523392e-05, "loss": 1.8149, "step": 2592 }, { "epoch": 0.6815778237593195, "grad_norm": 0.6619842648506165, "learning_rate": 7.729980725424917e-05, "loss": 1.7993, "step": 2594 }, { "epoch": 0.6821033270929813, "grad_norm": 0.6684291362762451, "learning_rate": 7.728228491326442e-05, "loss": 1.7386, "step": 2596 }, { "epoch": 0.682628830426643, "grad_norm": 0.6439480781555176, "learning_rate": 7.726476257227965e-05, "loss": 1.7711, "step": 2598 }, { "epoch": 
0.6831543337603048, "grad_norm": 0.9060842990875244, "learning_rate": 7.72472402312949e-05, "loss": 1.7901, "step": 2600 }, { "epoch": 0.6836798370939665, "grad_norm": 0.8999879360198975, "learning_rate": 7.722971789031015e-05, "loss": 1.755, "step": 2602 }, { "epoch": 0.6842053404276284, "grad_norm": 0.931769609451294, "learning_rate": 7.72121955493254e-05, "loss": 1.7682, "step": 2604 }, { "epoch": 0.6847308437612901, "grad_norm": 0.6413145065307617, "learning_rate": 7.719467320834064e-05, "loss": 1.7774, "step": 2606 }, { "epoch": 0.6852563470949519, "grad_norm": 0.7932469248771667, "learning_rate": 7.717715086735589e-05, "loss": 1.7702, "step": 2608 }, { "epoch": 0.6857818504286136, "grad_norm": 0.9040171504020691, "learning_rate": 7.715962852637112e-05, "loss": 1.7799, "step": 2610 }, { "epoch": 0.6863073537622755, "grad_norm": 0.8220160603523254, "learning_rate": 7.714210618538637e-05, "loss": 1.776, "step": 2612 }, { "epoch": 0.6868328570959372, "grad_norm": 0.727171778678894, "learning_rate": 7.712458384440162e-05, "loss": 1.7711, "step": 2614 }, { "epoch": 0.687358360429599, "grad_norm": 0.877223789691925, "learning_rate": 7.710706150341685e-05, "loss": 1.7487, "step": 2616 }, { "epoch": 0.6878838637632607, "grad_norm": 0.7638031244277954, "learning_rate": 7.70895391624321e-05, "loss": 1.7925, "step": 2618 }, { "epoch": 0.6884093670969226, "grad_norm": 0.8082540035247803, "learning_rate": 7.707201682144735e-05, "loss": 1.756, "step": 2620 }, { "epoch": 0.6889348704305843, "grad_norm": 0.8931254148483276, "learning_rate": 7.70544944804626e-05, "loss": 1.7783, "step": 2622 }, { "epoch": 0.6894603737642461, "grad_norm": 0.760176420211792, "learning_rate": 7.703697213947783e-05, "loss": 1.7603, "step": 2624 }, { "epoch": 0.6899858770979078, "grad_norm": 0.7421066761016846, "learning_rate": 7.701944979849308e-05, "loss": 1.7872, "step": 2626 }, { "epoch": 0.6905113804315696, "grad_norm": 0.7226428389549255, "learning_rate": 7.700192745750833e-05, "loss": 
1.7947, "step": 2628 }, { "epoch": 0.6910368837652314, "grad_norm": 0.7266933917999268, "learning_rate": 7.698440511652357e-05, "loss": 1.7512, "step": 2630 }, { "epoch": 0.6915623870988932, "grad_norm": 0.6181286573410034, "learning_rate": 7.696688277553882e-05, "loss": 1.7382, "step": 2632 }, { "epoch": 0.6920878904325549, "grad_norm": 0.7923069000244141, "learning_rate": 7.694936043455407e-05, "loss": 1.772, "step": 2634 }, { "epoch": 0.6926133937662167, "grad_norm": 0.7019143104553223, "learning_rate": 7.69318380935693e-05, "loss": 1.7768, "step": 2636 }, { "epoch": 0.6931388970998785, "grad_norm": 0.6440560817718506, "learning_rate": 7.691431575258455e-05, "loss": 1.784, "step": 2638 }, { "epoch": 0.6936644004335403, "grad_norm": 0.6912879943847656, "learning_rate": 7.689679341159978e-05, "loss": 1.7714, "step": 2640 }, { "epoch": 0.694189903767202, "grad_norm": 0.7078375816345215, "learning_rate": 7.687927107061503e-05, "loss": 1.7774, "step": 2642 }, { "epoch": 0.6947154071008638, "grad_norm": 0.744601309299469, "learning_rate": 7.686174872963028e-05, "loss": 1.781, "step": 2644 }, { "epoch": 0.6952409104345255, "grad_norm": 0.6456273198127747, "learning_rate": 7.684422638864553e-05, "loss": 1.7704, "step": 2646 }, { "epoch": 0.6957664137681874, "grad_norm": 1.024562954902649, "learning_rate": 7.682670404766077e-05, "loss": 1.7578, "step": 2648 }, { "epoch": 0.6962919171018491, "grad_norm": 0.7917899489402771, "learning_rate": 7.680918170667601e-05, "loss": 1.7853, "step": 2650 }, { "epoch": 0.6968174204355109, "grad_norm": 0.8094285130500793, "learning_rate": 7.679165936569126e-05, "loss": 1.7531, "step": 2652 }, { "epoch": 0.6973429237691726, "grad_norm": 0.5724372863769531, "learning_rate": 7.67741370247065e-05, "loss": 1.7458, "step": 2654 }, { "epoch": 0.6978684271028345, "grad_norm": 0.8631569743156433, "learning_rate": 7.675661468372175e-05, "loss": 1.7559, "step": 2656 }, { "epoch": 0.6983939304364962, "grad_norm": 0.8773946762084961, 
"learning_rate": 7.6739092342737e-05, "loss": 1.7658, "step": 2658 }, { "epoch": 0.698919433770158, "grad_norm": 0.7718110084533691, "learning_rate": 7.672157000175225e-05, "loss": 1.8134, "step": 2660 }, { "epoch": 0.6994449371038197, "grad_norm": 0.7242169380187988, "learning_rate": 7.670404766076748e-05, "loss": 1.7982, "step": 2662 }, { "epoch": 0.6999704404374816, "grad_norm": 0.8516372442245483, "learning_rate": 7.668652531978273e-05, "loss": 1.7878, "step": 2664 }, { "epoch": 0.7004959437711433, "grad_norm": 0.7234377861022949, "learning_rate": 7.666900297879796e-05, "loss": 1.808, "step": 2666 }, { "epoch": 0.7010214471048051, "grad_norm": 0.6989150643348694, "learning_rate": 7.665148063781321e-05, "loss": 1.7998, "step": 2668 }, { "epoch": 0.7015469504384668, "grad_norm": 0.8885288834571838, "learning_rate": 7.663395829682846e-05, "loss": 1.7768, "step": 2670 }, { "epoch": 0.7020724537721286, "grad_norm": 0.7263723015785217, "learning_rate": 7.66164359558437e-05, "loss": 1.7789, "step": 2672 }, { "epoch": 0.7025979571057904, "grad_norm": 0.7461345195770264, "learning_rate": 7.659891361485895e-05, "loss": 1.7583, "step": 2674 }, { "epoch": 0.7031234604394522, "grad_norm": 1.028860330581665, "learning_rate": 7.65813912738742e-05, "loss": 1.7666, "step": 2676 }, { "epoch": 0.7036489637731139, "grad_norm": 0.7666818499565125, "learning_rate": 7.656386893288943e-05, "loss": 1.7621, "step": 2678 }, { "epoch": 0.7041744671067757, "grad_norm": 0.8842204809188843, "learning_rate": 7.654634659190468e-05, "loss": 1.7664, "step": 2680 }, { "epoch": 0.7046999704404375, "grad_norm": 1.2368742227554321, "learning_rate": 7.652882425091993e-05, "loss": 1.7572, "step": 2682 }, { "epoch": 0.7052254737740993, "grad_norm": 0.9390943646430969, "learning_rate": 7.651130190993518e-05, "loss": 1.7482, "step": 2684 }, { "epoch": 0.705750977107761, "grad_norm": 0.7266187071800232, "learning_rate": 7.649377956895042e-05, "loss": 1.7961, "step": 2686 }, { "epoch": 0.7062764804414228, 
"grad_norm": 0.6799050569534302, "learning_rate": 7.647625722796567e-05, "loss": 1.7712, "step": 2688 }, { "epoch": 0.7068019837750845, "grad_norm": 0.8757466077804565, "learning_rate": 7.64587348869809e-05, "loss": 1.7721, "step": 2690 }, { "epoch": 0.7073274871087464, "grad_norm": 0.8347486257553101, "learning_rate": 7.644121254599614e-05, "loss": 1.7481, "step": 2692 }, { "epoch": 0.7078529904424081, "grad_norm": 0.7270652055740356, "learning_rate": 7.642369020501139e-05, "loss": 1.7818, "step": 2694 }, { "epoch": 0.7083784937760699, "grad_norm": 0.563240110874176, "learning_rate": 7.640616786402663e-05, "loss": 1.769, "step": 2696 }, { "epoch": 0.7089039971097316, "grad_norm": 0.8982949256896973, "learning_rate": 7.638864552304188e-05, "loss": 1.7806, "step": 2698 }, { "epoch": 0.7094295004433935, "grad_norm": 0.723839282989502, "learning_rate": 7.637112318205713e-05, "loss": 1.7947, "step": 2700 }, { "epoch": 0.7099550037770552, "grad_norm": 0.7281327247619629, "learning_rate": 7.635360084107238e-05, "loss": 1.7763, "step": 2702 }, { "epoch": 0.710480507110717, "grad_norm": 0.8034355044364929, "learning_rate": 7.633607850008761e-05, "loss": 1.7894, "step": 2704 }, { "epoch": 0.7110060104443787, "grad_norm": 0.7230488061904907, "learning_rate": 7.631855615910286e-05, "loss": 1.7715, "step": 2706 }, { "epoch": 0.7115315137780406, "grad_norm": 0.9625870585441589, "learning_rate": 7.63010338181181e-05, "loss": 1.7966, "step": 2708 }, { "epoch": 0.7120570171117023, "grad_norm": 0.6187042593955994, "learning_rate": 7.628351147713335e-05, "loss": 1.791, "step": 2710 }, { "epoch": 0.7125825204453641, "grad_norm": 0.7607492208480835, "learning_rate": 7.62659891361486e-05, "loss": 1.8201, "step": 2712 }, { "epoch": 0.7131080237790258, "grad_norm": 0.8220197558403015, "learning_rate": 7.624846679516385e-05, "loss": 1.7512, "step": 2714 }, { "epoch": 0.7136335271126876, "grad_norm": 0.7219741344451904, "learning_rate": 7.623094445417908e-05, "loss": 1.775, "step": 2716 }, 
{ "epoch": 0.7141590304463494, "grad_norm": 0.9267223477363586, "learning_rate": 7.621342211319432e-05, "loss": 1.7833, "step": 2718 }, { "epoch": 0.7146845337800112, "grad_norm": 0.6037401556968689, "learning_rate": 7.619589977220956e-05, "loss": 1.7956, "step": 2720 }, { "epoch": 0.7152100371136729, "grad_norm": 0.7281090021133423, "learning_rate": 7.617837743122481e-05, "loss": 1.7341, "step": 2722 }, { "epoch": 0.7157355404473347, "grad_norm": 0.8907596468925476, "learning_rate": 7.616085509024006e-05, "loss": 1.754, "step": 2724 }, { "epoch": 0.7162610437809965, "grad_norm": 0.7699323296546936, "learning_rate": 7.61433327492553e-05, "loss": 1.7591, "step": 2726 }, { "epoch": 0.7167865471146583, "grad_norm": 0.7458289265632629, "learning_rate": 7.612581040827055e-05, "loss": 1.8033, "step": 2728 }, { "epoch": 0.71731205044832, "grad_norm": 1.1340159177780151, "learning_rate": 7.610828806728579e-05, "loss": 1.7821, "step": 2730 }, { "epoch": 0.7178375537819818, "grad_norm": 0.6701779365539551, "learning_rate": 7.609076572630104e-05, "loss": 1.7681, "step": 2732 }, { "epoch": 0.7183630571156436, "grad_norm": 0.7182425260543823, "learning_rate": 7.607324338531628e-05, "loss": 1.7735, "step": 2734 }, { "epoch": 0.7188885604493054, "grad_norm": 0.8409538865089417, "learning_rate": 7.605572104433153e-05, "loss": 1.7535, "step": 2736 }, { "epoch": 0.7194140637829671, "grad_norm": 0.7824596166610718, "learning_rate": 7.603819870334678e-05, "loss": 1.7853, "step": 2738 }, { "epoch": 0.7199395671166289, "grad_norm": 1.0866069793701172, "learning_rate": 7.602067636236203e-05, "loss": 1.7729, "step": 2740 }, { "epoch": 0.7204650704502906, "grad_norm": 0.7940483093261719, "learning_rate": 7.600315402137726e-05, "loss": 1.7643, "step": 2742 }, { "epoch": 0.7209905737839525, "grad_norm": 1.0263162851333618, "learning_rate": 7.59856316803925e-05, "loss": 1.7533, "step": 2744 }, { "epoch": 0.7215160771176142, "grad_norm": 0.7818773984909058, "learning_rate": 
7.596810933940774e-05, "loss": 1.7526, "step": 2746 }, { "epoch": 0.722041580451276, "grad_norm": 0.7003962397575378, "learning_rate": 7.595058699842299e-05, "loss": 1.7635, "step": 2748 }, { "epoch": 0.7225670837849377, "grad_norm": 0.7112312316894531, "learning_rate": 7.593306465743824e-05, "loss": 1.7608, "step": 2750 }, { "epoch": 0.7230925871185996, "grad_norm": 0.8331362009048462, "learning_rate": 7.591554231645348e-05, "loss": 1.7899, "step": 2752 }, { "epoch": 0.7236180904522613, "grad_norm": 0.7973011136054993, "learning_rate": 7.589801997546873e-05, "loss": 1.757, "step": 2754 }, { "epoch": 0.7241435937859231, "grad_norm": 0.6794710159301758, "learning_rate": 7.588049763448397e-05, "loss": 1.7516, "step": 2756 }, { "epoch": 0.7246690971195848, "grad_norm": 0.9882004857063293, "learning_rate": 7.586297529349921e-05, "loss": 1.7499, "step": 2758 }, { "epoch": 0.7251946004532466, "grad_norm": 0.6301134824752808, "learning_rate": 7.584545295251446e-05, "loss": 1.7785, "step": 2760 }, { "epoch": 0.7257201037869084, "grad_norm": 0.8858461976051331, "learning_rate": 7.582793061152971e-05, "loss": 1.7956, "step": 2762 }, { "epoch": 0.7262456071205702, "grad_norm": 0.7689917087554932, "learning_rate": 7.581040827054496e-05, "loss": 1.7588, "step": 2764 }, { "epoch": 0.7267711104542319, "grad_norm": 0.7277519702911377, "learning_rate": 7.57928859295602e-05, "loss": 1.7878, "step": 2766 }, { "epoch": 0.7272966137878937, "grad_norm": 0.7690980434417725, "learning_rate": 7.577536358857544e-05, "loss": 1.7738, "step": 2768 }, { "epoch": 0.7278221171215555, "grad_norm": 0.9652357697486877, "learning_rate": 7.575784124759067e-05, "loss": 1.8106, "step": 2770 }, { "epoch": 0.7283476204552173, "grad_norm": 0.7968404293060303, "learning_rate": 7.574031890660592e-05, "loss": 1.7581, "step": 2772 }, { "epoch": 0.728873123788879, "grad_norm": 0.7790765762329102, "learning_rate": 7.572279656562117e-05, "loss": 1.7257, "step": 2774 }, { "epoch": 0.7293986271225408, "grad_norm": 
0.93352872133255, "learning_rate": 7.570527422463641e-05, "loss": 1.7343, "step": 2776 }, { "epoch": 0.7299241304562026, "grad_norm": 0.8516173958778381, "learning_rate": 7.568775188365166e-05, "loss": 1.7442, "step": 2778 }, { "epoch": 0.7304496337898644, "grad_norm": 0.6152732968330383, "learning_rate": 7.567022954266691e-05, "loss": 1.7601, "step": 2780 }, { "epoch": 0.7309751371235261, "grad_norm": 0.6822034120559692, "learning_rate": 7.565270720168214e-05, "loss": 1.7915, "step": 2782 }, { "epoch": 0.7315006404571879, "grad_norm": 0.6148113012313843, "learning_rate": 7.563518486069739e-05, "loss": 1.7642, "step": 2784 }, { "epoch": 0.7320261437908496, "grad_norm": 0.9175143241882324, "learning_rate": 7.561766251971264e-05, "loss": 1.7588, "step": 2786 }, { "epoch": 0.7325516471245115, "grad_norm": 0.6481335759162903, "learning_rate": 7.560014017872789e-05, "loss": 1.7838, "step": 2788 }, { "epoch": 0.7330771504581732, "grad_norm": 0.712563693523407, "learning_rate": 7.558261783774313e-05, "loss": 1.7449, "step": 2790 }, { "epoch": 0.733602653791835, "grad_norm": 0.7027430534362793, "learning_rate": 7.556509549675838e-05, "loss": 1.7544, "step": 2792 }, { "epoch": 0.7341281571254967, "grad_norm": 0.7059524059295654, "learning_rate": 7.554757315577362e-05, "loss": 1.7766, "step": 2794 }, { "epoch": 0.7346536604591586, "grad_norm": 0.7603355050086975, "learning_rate": 7.553005081478885e-05, "loss": 1.7778, "step": 2796 }, { "epoch": 0.7351791637928203, "grad_norm": 0.8169555068016052, "learning_rate": 7.55125284738041e-05, "loss": 1.768, "step": 2798 }, { "epoch": 0.7357046671264821, "grad_norm": 0.7924453616142273, "learning_rate": 7.549500613281934e-05, "loss": 1.7617, "step": 2800 }, { "epoch": 0.7357046671264821, "eval_loss": 1.736789345741272, "eval_runtime": 487.2587, "eval_samples_per_second": 249.947, "eval_steps_per_second": 31.244, "step": 2800 }, { "epoch": 0.7362301704601438, "grad_norm": 0.6122123003005981, "learning_rate": 7.547748379183459e-05, 
"loss": 1.7861, "step": 2802 }, { "epoch": 0.7367556737938056, "grad_norm": 0.7799514532089233, "learning_rate": 7.545996145084984e-05, "loss": 1.7612, "step": 2804 }, { "epoch": 0.7372811771274674, "grad_norm": 0.6509242057800293, "learning_rate": 7.544243910986509e-05, "loss": 1.8034, "step": 2806 }, { "epoch": 0.7378066804611292, "grad_norm": 0.6152432560920715, "learning_rate": 7.542491676888032e-05, "loss": 1.7565, "step": 2808 }, { "epoch": 0.7383321837947909, "grad_norm": 0.6861807703971863, "learning_rate": 7.540739442789557e-05, "loss": 1.7622, "step": 2810 }, { "epoch": 0.7388576871284527, "grad_norm": 0.6434677839279175, "learning_rate": 7.538987208691082e-05, "loss": 1.7778, "step": 2812 }, { "epoch": 0.7393831904621145, "grad_norm": 0.8894173502922058, "learning_rate": 7.537234974592606e-05, "loss": 1.7921, "step": 2814 }, { "epoch": 0.7399086937957763, "grad_norm": 0.6787139177322388, "learning_rate": 7.535482740494131e-05, "loss": 1.76, "step": 2816 }, { "epoch": 0.740434197129438, "grad_norm": 0.6434260010719299, "learning_rate": 7.533730506395656e-05, "loss": 1.7702, "step": 2818 }, { "epoch": 0.7409597004630998, "grad_norm": 0.6635193228721619, "learning_rate": 7.531978272297179e-05, "loss": 1.7631, "step": 2820 }, { "epoch": 0.7414852037967616, "grad_norm": 0.6992824077606201, "learning_rate": 7.530226038198703e-05, "loss": 1.8017, "step": 2822 }, { "epoch": 0.7420107071304234, "grad_norm": 0.6800288558006287, "learning_rate": 7.528473804100227e-05, "loss": 1.764, "step": 2824 }, { "epoch": 0.7425362104640851, "grad_norm": 1.0732684135437012, "learning_rate": 7.526721570001752e-05, "loss": 1.7818, "step": 2826 }, { "epoch": 0.7430617137977469, "grad_norm": 0.7014878392219543, "learning_rate": 7.524969335903277e-05, "loss": 1.7774, "step": 2828 }, { "epoch": 0.7435872171314086, "grad_norm": 0.5895276069641113, "learning_rate": 7.523217101804802e-05, "loss": 1.7622, "step": 2830 }, { "epoch": 0.7441127204650705, "grad_norm": 0.8901596069335938, 
"learning_rate": 7.521464867706326e-05, "loss": 1.7579, "step": 2832 }, { "epoch": 0.7446382237987322, "grad_norm": 0.9512175917625427, "learning_rate": 7.51971263360785e-05, "loss": 1.7481, "step": 2834 }, { "epoch": 0.745163727132394, "grad_norm": 0.8138533234596252, "learning_rate": 7.517960399509375e-05, "loss": 1.7612, "step": 2836 }, { "epoch": 0.7456892304660557, "grad_norm": 0.7234623432159424, "learning_rate": 7.5162081654109e-05, "loss": 1.7587, "step": 2838 }, { "epoch": 0.7462147337997176, "grad_norm": 1.0871793031692505, "learning_rate": 7.514455931312424e-05, "loss": 1.7368, "step": 2840 }, { "epoch": 0.7467402371333793, "grad_norm": 0.7515408992767334, "learning_rate": 7.512703697213949e-05, "loss": 1.8096, "step": 2842 }, { "epoch": 0.7472657404670411, "grad_norm": 0.6623795032501221, "learning_rate": 7.510951463115472e-05, "loss": 1.7683, "step": 2844 }, { "epoch": 0.7477912438007028, "grad_norm": 0.648363471031189, "learning_rate": 7.509199229016997e-05, "loss": 1.8195, "step": 2846 }, { "epoch": 0.7483167471343646, "grad_norm": 0.7021984457969666, "learning_rate": 7.50744699491852e-05, "loss": 1.7918, "step": 2848 }, { "epoch": 0.7488422504680264, "grad_norm": 0.7864859104156494, "learning_rate": 7.505694760820045e-05, "loss": 1.8118, "step": 2850 }, { "epoch": 0.7493677538016882, "grad_norm": 0.6326330900192261, "learning_rate": 7.50394252672157e-05, "loss": 1.7781, "step": 2852 }, { "epoch": 0.7498932571353499, "grad_norm": 0.6461377143859863, "learning_rate": 7.502190292623095e-05, "loss": 1.77, "step": 2854 }, { "epoch": 0.7504187604690117, "grad_norm": 0.6641056537628174, "learning_rate": 7.50043805852462e-05, "loss": 1.7442, "step": 2856 }, { "epoch": 0.7509442638026735, "grad_norm": 0.6882733702659607, "learning_rate": 7.498685824426144e-05, "loss": 1.7583, "step": 2858 }, { "epoch": 0.7514697671363353, "grad_norm": 0.9753492474555969, "learning_rate": 7.496933590327668e-05, "loss": 1.7557, "step": 2860 }, { "epoch": 0.751995270469997, 
"grad_norm": 0.6425254940986633, "learning_rate": 7.495181356229192e-05, "loss": 1.8, "step": 2862 }, { "epoch": 0.7525207738036588, "grad_norm": 0.7045446038246155, "learning_rate": 7.493429122130717e-05, "loss": 1.7532, "step": 2864 }, { "epoch": 0.7530462771373206, "grad_norm": 0.6855698227882385, "learning_rate": 7.491676888032242e-05, "loss": 1.7652, "step": 2866 }, { "epoch": 0.7535717804709824, "grad_norm": 0.7373823523521423, "learning_rate": 7.489924653933767e-05, "loss": 1.7896, "step": 2868 }, { "epoch": 0.7540972838046441, "grad_norm": 0.772221565246582, "learning_rate": 7.48817241983529e-05, "loss": 1.7712, "step": 2870 }, { "epoch": 0.7546227871383059, "grad_norm": 0.7220898270606995, "learning_rate": 7.486420185736815e-05, "loss": 1.7298, "step": 2872 }, { "epoch": 0.7551482904719676, "grad_norm": 0.6807803511619568, "learning_rate": 7.484667951638338e-05, "loss": 1.7767, "step": 2874 }, { "epoch": 0.7556737938056295, "grad_norm": 0.7383838891983032, "learning_rate": 7.482915717539863e-05, "loss": 1.7722, "step": 2876 }, { "epoch": 0.7561992971392912, "grad_norm": 0.7259317636489868, "learning_rate": 7.481163483441388e-05, "loss": 1.7623, "step": 2878 }, { "epoch": 0.756724800472953, "grad_norm": 0.7161348462104797, "learning_rate": 7.479411249342913e-05, "loss": 1.7379, "step": 2880 }, { "epoch": 0.7572503038066147, "grad_norm": 0.5866290330886841, "learning_rate": 7.477659015244437e-05, "loss": 1.7687, "step": 2882 }, { "epoch": 0.7577758071402766, "grad_norm": 0.6606796979904175, "learning_rate": 7.475906781145962e-05, "loss": 1.7833, "step": 2884 }, { "epoch": 0.7583013104739383, "grad_norm": 0.6400638818740845, "learning_rate": 7.474154547047485e-05, "loss": 1.75, "step": 2886 }, { "epoch": 0.7588268138076001, "grad_norm": 0.6338980793952942, "learning_rate": 7.47240231294901e-05, "loss": 1.7836, "step": 2888 }, { "epoch": 0.7593523171412618, "grad_norm": 0.6896232962608337, "learning_rate": 7.470650078850535e-05, "loss": 1.7473, "step": 2890 }, 
{ "epoch": 0.7598778204749237, "grad_norm": 0.7504851222038269, "learning_rate": 7.46889784475206e-05, "loss": 1.7587, "step": 2892 }, { "epoch": 0.7604033238085854, "grad_norm": 0.6796631217002869, "learning_rate": 7.467145610653584e-05, "loss": 1.7469, "step": 2894 }, { "epoch": 0.7609288271422472, "grad_norm": 0.6032044887542725, "learning_rate": 7.465393376555108e-05, "loss": 1.7651, "step": 2896 }, { "epoch": 0.7614543304759089, "grad_norm": 0.6399370431900024, "learning_rate": 7.463641142456633e-05, "loss": 1.8041, "step": 2898 }, { "epoch": 0.7619798338095707, "grad_norm": 0.6167407631874084, "learning_rate": 7.461888908358156e-05, "loss": 1.7743, "step": 2900 }, { "epoch": 0.7625053371432325, "grad_norm": 0.5767862200737, "learning_rate": 7.460136674259681e-05, "loss": 1.7598, "step": 2902 }, { "epoch": 0.7630308404768943, "grad_norm": 0.6222682595252991, "learning_rate": 7.458384440161206e-05, "loss": 1.7659, "step": 2904 }, { "epoch": 0.763556343810556, "grad_norm": 0.6252115368843079, "learning_rate": 7.45663220606273e-05, "loss": 1.7671, "step": 2906 }, { "epoch": 0.7640818471442178, "grad_norm": 0.6266006231307983, "learning_rate": 7.454879971964255e-05, "loss": 1.7786, "step": 2908 }, { "epoch": 0.7646073504778796, "grad_norm": 0.7761850357055664, "learning_rate": 7.45312773786578e-05, "loss": 1.7185, "step": 2910 }, { "epoch": 0.7651328538115414, "grad_norm": 0.6215353608131409, "learning_rate": 7.451375503767303e-05, "loss": 1.7478, "step": 2912 }, { "epoch": 0.7656583571452031, "grad_norm": 0.7758198380470276, "learning_rate": 7.449623269668828e-05, "loss": 1.7437, "step": 2914 }, { "epoch": 0.7661838604788649, "grad_norm": 0.5999752879142761, "learning_rate": 7.447871035570353e-05, "loss": 1.7534, "step": 2916 }, { "epoch": 0.7667093638125266, "grad_norm": 0.6463642120361328, "learning_rate": 7.446118801471877e-05, "loss": 1.7589, "step": 2918 }, { "epoch": 0.7672348671461885, "grad_norm": 0.7435876727104187, "learning_rate": 
7.444366567373402e-05, "loss": 1.8053, "step": 2920 }, { "epoch": 0.7677603704798502, "grad_norm": 0.7085327506065369, "learning_rate": 7.442614333274926e-05, "loss": 1.748, "step": 2922 }, { "epoch": 0.768285873813512, "grad_norm": 0.6690971255302429, "learning_rate": 7.44086209917645e-05, "loss": 1.808, "step": 2924 }, { "epoch": 0.7688113771471737, "grad_norm": 0.652035653591156, "learning_rate": 7.439109865077975e-05, "loss": 1.7491, "step": 2926 }, { "epoch": 0.7693368804808356, "grad_norm": 0.6042243242263794, "learning_rate": 7.437357630979499e-05, "loss": 1.751, "step": 2928 }, { "epoch": 0.7698623838144973, "grad_norm": 0.5847947597503662, "learning_rate": 7.435605396881023e-05, "loss": 1.7539, "step": 2930 }, { "epoch": 0.7703878871481591, "grad_norm": 0.690543532371521, "learning_rate": 7.433853162782548e-05, "loss": 1.7448, "step": 2932 }, { "epoch": 0.7709133904818208, "grad_norm": 0.7835954427719116, "learning_rate": 7.432100928684073e-05, "loss": 1.7352, "step": 2934 }, { "epoch": 0.7714388938154827, "grad_norm": 0.8784381747245789, "learning_rate": 7.430348694585598e-05, "loss": 1.7802, "step": 2936 }, { "epoch": 0.7719643971491444, "grad_norm": 0.8104349970817566, "learning_rate": 7.428596460487122e-05, "loss": 1.755, "step": 2938 }, { "epoch": 0.7724899004828062, "grad_norm": 0.6043236255645752, "learning_rate": 7.426844226388646e-05, "loss": 1.8118, "step": 2940 }, { "epoch": 0.7730154038164679, "grad_norm": 1.0382120609283447, "learning_rate": 7.42509199229017e-05, "loss": 1.7832, "step": 2942 }, { "epoch": 0.7735409071501297, "grad_norm": 0.6753326654434204, "learning_rate": 7.423339758191695e-05, "loss": 1.7886, "step": 2944 }, { "epoch": 0.7740664104837915, "grad_norm": 1.226515531539917, "learning_rate": 7.421587524093219e-05, "loss": 1.7406, "step": 2946 }, { "epoch": 0.7745919138174533, "grad_norm": 0.7555555105209351, "learning_rate": 7.419835289994743e-05, "loss": 1.726, "step": 2948 }, { "epoch": 0.775117417151115, "grad_norm": 
1.073789119720459, "learning_rate": 7.418083055896268e-05, "loss": 1.7905, "step": 2950 }, { "epoch": 0.7756429204847768, "grad_norm": 0.883283793926239, "learning_rate": 7.416330821797793e-05, "loss": 1.7516, "step": 2952 }, { "epoch": 0.7761684238184386, "grad_norm": 0.8123281598091125, "learning_rate": 7.414578587699316e-05, "loss": 1.7689, "step": 2954 }, { "epoch": 0.7766939271521004, "grad_norm": 0.6473522782325745, "learning_rate": 7.412826353600841e-05, "loss": 1.7966, "step": 2956 }, { "epoch": 0.7772194304857621, "grad_norm": 0.6950981616973877, "learning_rate": 7.411074119502366e-05, "loss": 1.7494, "step": 2958 }, { "epoch": 0.7777449338194239, "grad_norm": 0.7919045686721802, "learning_rate": 7.40932188540389e-05, "loss": 1.7992, "step": 2960 }, { "epoch": 0.7782704371530856, "grad_norm": 0.8082287311553955, "learning_rate": 7.407569651305415e-05, "loss": 1.758, "step": 2962 }, { "epoch": 0.7787959404867475, "grad_norm": 0.634069561958313, "learning_rate": 7.40581741720694e-05, "loss": 1.7855, "step": 2964 }, { "epoch": 0.7793214438204092, "grad_norm": 1.0830199718475342, "learning_rate": 7.404065183108463e-05, "loss": 1.7509, "step": 2966 }, { "epoch": 0.779846947154071, "grad_norm": 0.6961039304733276, "learning_rate": 7.402312949009988e-05, "loss": 1.7491, "step": 2968 }, { "epoch": 0.7803724504877327, "grad_norm": 0.7842292189598083, "learning_rate": 7.400560714911513e-05, "loss": 1.736, "step": 2970 }, { "epoch": 0.7808979538213946, "grad_norm": 0.6680390238761902, "learning_rate": 7.398808480813036e-05, "loss": 1.7594, "step": 2972 }, { "epoch": 0.7814234571550563, "grad_norm": 0.8992615342140198, "learning_rate": 7.397056246714561e-05, "loss": 1.7811, "step": 2974 }, { "epoch": 0.7819489604887181, "grad_norm": 0.6779314279556274, "learning_rate": 7.395304012616086e-05, "loss": 1.7675, "step": 2976 }, { "epoch": 0.7824744638223798, "grad_norm": 0.5508474111557007, "learning_rate": 7.39355177851761e-05, "loss": 1.8152, "step": 2978 }, { "epoch": 
0.7829999671560417, "grad_norm": 0.6748946905136108, "learning_rate": 7.391799544419134e-05, "loss": 1.7596, "step": 2980 }, { "epoch": 0.7835254704897034, "grad_norm": 0.5707883834838867, "learning_rate": 7.390047310320659e-05, "loss": 1.765, "step": 2982 }, { "epoch": 0.7840509738233652, "grad_norm": 0.6725517511367798, "learning_rate": 7.388295076222184e-05, "loss": 1.7528, "step": 2984 }, { "epoch": 0.7845764771570269, "grad_norm": 0.6516979336738586, "learning_rate": 7.386542842123708e-05, "loss": 1.7613, "step": 2986 }, { "epoch": 0.7851019804906887, "grad_norm": 0.6247593760490417, "learning_rate": 7.384790608025233e-05, "loss": 1.7519, "step": 2988 }, { "epoch": 0.7856274838243505, "grad_norm": 0.6806461811065674, "learning_rate": 7.383038373926758e-05, "loss": 1.7501, "step": 2990 }, { "epoch": 0.7861529871580123, "grad_norm": 0.8175075054168701, "learning_rate": 7.381286139828281e-05, "loss": 1.7543, "step": 2992 }, { "epoch": 0.786678490491674, "grad_norm": 0.8868039846420288, "learning_rate": 7.379533905729806e-05, "loss": 1.7285, "step": 2994 }, { "epoch": 0.7872039938253358, "grad_norm": 0.9624283909797668, "learning_rate": 7.377781671631331e-05, "loss": 1.7679, "step": 2996 }, { "epoch": 0.7877294971589976, "grad_norm": 0.8716928958892822, "learning_rate": 7.376029437532854e-05, "loss": 1.7642, "step": 2998 }, { "epoch": 0.7882550004926594, "grad_norm": 0.6427202224731445, "learning_rate": 7.374277203434379e-05, "loss": 1.777, "step": 3000 }, { "epoch": 0.7887805038263211, "grad_norm": 0.6569937467575073, "learning_rate": 7.372524969335904e-05, "loss": 1.7414, "step": 3002 }, { "epoch": 0.7893060071599829, "grad_norm": 1.117759346961975, "learning_rate": 7.370772735237428e-05, "loss": 1.7709, "step": 3004 }, { "epoch": 0.7898315104936446, "grad_norm": 0.6267141103744507, "learning_rate": 7.369020501138952e-05, "loss": 1.7871, "step": 3006 }, { "epoch": 0.7903570138273065, "grad_norm": 0.8071964979171753, "learning_rate": 7.367268267040477e-05, 
"loss": 1.7471, "step": 3008 }, { "epoch": 0.7908825171609682, "grad_norm": 0.702384352684021, "learning_rate": 7.365516032942001e-05, "loss": 1.7434, "step": 3010 }, { "epoch": 0.79140802049463, "grad_norm": 0.6770474314689636, "learning_rate": 7.363763798843526e-05, "loss": 1.782, "step": 3012 }, { "epoch": 0.7919335238282917, "grad_norm": 0.6293635964393616, "learning_rate": 7.362011564745051e-05, "loss": 1.7682, "step": 3014 }, { "epoch": 0.7924590271619536, "grad_norm": 0.8292271494865417, "learning_rate": 7.360259330646576e-05, "loss": 1.7729, "step": 3016 }, { "epoch": 0.7929845304956153, "grad_norm": 0.6828389167785645, "learning_rate": 7.358507096548099e-05, "loss": 1.7521, "step": 3018 }, { "epoch": 0.7935100338292771, "grad_norm": 0.6849939823150635, "learning_rate": 7.356754862449624e-05, "loss": 1.7619, "step": 3020 }, { "epoch": 0.7940355371629388, "grad_norm": 0.720439612865448, "learning_rate": 7.355002628351149e-05, "loss": 1.7725, "step": 3022 }, { "epoch": 0.7945610404966007, "grad_norm": 0.6729586124420166, "learning_rate": 7.353250394252672e-05, "loss": 1.7711, "step": 3024 }, { "epoch": 0.7950865438302624, "grad_norm": 1.0918195247650146, "learning_rate": 7.351498160154197e-05, "loss": 1.7764, "step": 3026 }, { "epoch": 0.7956120471639242, "grad_norm": 0.658743143081665, "learning_rate": 7.349745926055721e-05, "loss": 1.7641, "step": 3028 }, { "epoch": 0.7961375504975859, "grad_norm": 0.5973094701766968, "learning_rate": 7.347993691957246e-05, "loss": 1.7554, "step": 3030 }, { "epoch": 0.7966630538312477, "grad_norm": 0.6641756296157837, "learning_rate": 7.34624145785877e-05, "loss": 1.7656, "step": 3032 }, { "epoch": 0.7971885571649096, "grad_norm": 0.7535148859024048, "learning_rate": 7.344489223760294e-05, "loss": 1.7521, "step": 3034 }, { "epoch": 0.7977140604985713, "grad_norm": 0.9680157899856567, "learning_rate": 7.342736989661819e-05, "loss": 1.7794, "step": 3036 }, { "epoch": 0.798239563832233, "grad_norm": 0.7855157852172852, 
"learning_rate": 7.340984755563344e-05, "loss": 1.7853, "step": 3038 }, { "epoch": 0.7987650671658948, "grad_norm": 0.6831044554710388, "learning_rate": 7.339232521464869e-05, "loss": 1.7443, "step": 3040 }, { "epoch": 0.7992905704995567, "grad_norm": 0.7435096502304077, "learning_rate": 7.337480287366393e-05, "loss": 1.7585, "step": 3042 }, { "epoch": 0.7998160738332184, "grad_norm": 0.6614308953285217, "learning_rate": 7.335728053267917e-05, "loss": 1.7872, "step": 3044 }, { "epoch": 0.8003415771668801, "grad_norm": 0.5782405138015747, "learning_rate": 7.333975819169442e-05, "loss": 1.7504, "step": 3046 }, { "epoch": 0.8008670805005419, "grad_norm": 0.6519070267677307, "learning_rate": 7.332223585070965e-05, "loss": 1.7396, "step": 3048 }, { "epoch": 0.8013925838342038, "grad_norm": 0.5866365432739258, "learning_rate": 7.33047135097249e-05, "loss": 1.7591, "step": 3050 }, { "epoch": 0.8019180871678655, "grad_norm": 0.6096078753471375, "learning_rate": 7.328719116874014e-05, "loss": 1.7871, "step": 3052 }, { "epoch": 0.8024435905015272, "grad_norm": 0.6846382021903992, "learning_rate": 7.326966882775539e-05, "loss": 1.7773, "step": 3054 }, { "epoch": 0.802969093835189, "grad_norm": 0.6193353533744812, "learning_rate": 7.325214648677064e-05, "loss": 1.753, "step": 3056 }, { "epoch": 0.8034945971688507, "grad_norm": 0.6320629119873047, "learning_rate": 7.323462414578587e-05, "loss": 1.7679, "step": 3058 }, { "epoch": 0.8040201005025126, "grad_norm": 0.5982667803764343, "learning_rate": 7.321710180480112e-05, "loss": 1.7307, "step": 3060 }, { "epoch": 0.8045456038361744, "grad_norm": 0.7248689532279968, "learning_rate": 7.319957946381637e-05, "loss": 1.7933, "step": 3062 }, { "epoch": 0.8050711071698361, "grad_norm": 0.7433560490608215, "learning_rate": 7.318205712283162e-05, "loss": 1.7393, "step": 3064 }, { "epoch": 0.8055966105034978, "grad_norm": 0.6755779981613159, "learning_rate": 7.316453478184686e-05, "loss": 1.7589, "step": 3066 }, { "epoch": 
0.8061221138371597, "grad_norm": 0.6949239373207092, "learning_rate": 7.314701244086211e-05, "loss": 1.7677, "step": 3068 }, { "epoch": 0.8066476171708215, "grad_norm": 0.6781786680221558, "learning_rate": 7.312949009987735e-05, "loss": 1.7027, "step": 3070 }, { "epoch": 0.8071731205044832, "grad_norm": 0.683310866355896, "learning_rate": 7.311196775889259e-05, "loss": 1.7205, "step": 3072 }, { "epoch": 0.807698623838145, "grad_norm": 0.5861634016036987, "learning_rate": 7.309444541790783e-05, "loss": 1.7412, "step": 3074 }, { "epoch": 0.8082241271718067, "grad_norm": 0.7344016432762146, "learning_rate": 7.307692307692307e-05, "loss": 1.7988, "step": 3076 }, { "epoch": 0.8087496305054686, "grad_norm": 0.5996577143669128, "learning_rate": 7.305940073593832e-05, "loss": 1.765, "step": 3078 }, { "epoch": 0.8092751338391303, "grad_norm": 0.5766566395759583, "learning_rate": 7.304187839495357e-05, "loss": 1.8002, "step": 3080 }, { "epoch": 0.809800637172792, "grad_norm": 0.6364811658859253, "learning_rate": 7.302435605396882e-05, "loss": 1.7687, "step": 3082 }, { "epoch": 0.8103261405064538, "grad_norm": 0.839227557182312, "learning_rate": 7.300683371298405e-05, "loss": 1.7501, "step": 3084 }, { "epoch": 0.8108516438401157, "grad_norm": 0.6285102367401123, "learning_rate": 7.29893113719993e-05, "loss": 1.7503, "step": 3086 }, { "epoch": 0.8113771471737774, "grad_norm": 0.6087677478790283, "learning_rate": 7.297178903101455e-05, "loss": 1.7873, "step": 3088 }, { "epoch": 0.8119026505074391, "grad_norm": 0.9094337821006775, "learning_rate": 7.29542666900298e-05, "loss": 1.7501, "step": 3090 }, { "epoch": 0.8124281538411009, "grad_norm": 0.6166443228721619, "learning_rate": 7.293674434904504e-05, "loss": 1.7809, "step": 3092 }, { "epoch": 0.8129536571747628, "grad_norm": 0.6993762850761414, "learning_rate": 7.291922200806029e-05, "loss": 1.776, "step": 3094 }, { "epoch": 0.8134791605084245, "grad_norm": 0.6359695792198181, "learning_rate": 7.290169966707552e-05, "loss": 
1.7177, "step": 3096 }, { "epoch": 0.8140046638420863, "grad_norm": 0.8014838695526123, "learning_rate": 7.288417732609077e-05, "loss": 1.7488, "step": 3098 }, { "epoch": 0.814530167175748, "grad_norm": 0.6601728200912476, "learning_rate": 7.2866654985106e-05, "loss": 1.8006, "step": 3100 }, { "epoch": 0.8150556705094097, "grad_norm": 0.9497177004814148, "learning_rate": 7.284913264412125e-05, "loss": 1.8234, "step": 3102 }, { "epoch": 0.8155811738430716, "grad_norm": 0.7122120261192322, "learning_rate": 7.28316103031365e-05, "loss": 1.7306, "step": 3104 }, { "epoch": 0.8161066771767334, "grad_norm": 0.7118192911148071, "learning_rate": 7.281408796215175e-05, "loss": 1.7178, "step": 3106 }, { "epoch": 0.8166321805103951, "grad_norm": 0.727682888507843, "learning_rate": 7.2796565621167e-05, "loss": 1.7857, "step": 3108 }, { "epoch": 0.8171576838440568, "grad_norm": 0.6266892552375793, "learning_rate": 7.277904328018223e-05, "loss": 1.7531, "step": 3110 }, { "epoch": 0.8176831871777187, "grad_norm": 1.1099108457565308, "learning_rate": 7.276152093919748e-05, "loss": 1.7579, "step": 3112 }, { "epoch": 0.8182086905113805, "grad_norm": 0.7087392807006836, "learning_rate": 7.274399859821272e-05, "loss": 1.7393, "step": 3114 }, { "epoch": 0.8187341938450422, "grad_norm": 0.9023381471633911, "learning_rate": 7.272647625722797e-05, "loss": 1.7488, "step": 3116 }, { "epoch": 0.819259697178704, "grad_norm": 0.6250995993614197, "learning_rate": 7.270895391624322e-05, "loss": 1.7567, "step": 3118 }, { "epoch": 0.8197852005123657, "grad_norm": 0.7343935370445251, "learning_rate": 7.269143157525847e-05, "loss": 1.7801, "step": 3120 }, { "epoch": 0.8203107038460276, "grad_norm": 0.7513467073440552, "learning_rate": 7.26739092342737e-05, "loss": 1.7638, "step": 3122 }, { "epoch": 0.8208362071796893, "grad_norm": 0.7709615230560303, "learning_rate": 7.265638689328895e-05, "loss": 1.7477, "step": 3124 }, { "epoch": 0.821361710513351, "grad_norm": 0.6068372130393982, "learning_rate": 
7.263886455230418e-05, "loss": 1.7597, "step": 3126 }, { "epoch": 0.8218872138470128, "grad_norm": 0.6252472400665283, "learning_rate": 7.262134221131943e-05, "loss": 1.7413, "step": 3128 }, { "epoch": 0.8224127171806747, "grad_norm": 0.6601640582084656, "learning_rate": 7.260381987033468e-05, "loss": 1.7298, "step": 3130 }, { "epoch": 0.8229382205143364, "grad_norm": 0.6942901015281677, "learning_rate": 7.258629752934993e-05, "loss": 1.7788, "step": 3132 }, { "epoch": 0.8234637238479982, "grad_norm": 0.7625568509101868, "learning_rate": 7.256877518836517e-05, "loss": 1.752, "step": 3134 }, { "epoch": 0.8239892271816599, "grad_norm": 0.7857789993286133, "learning_rate": 7.25512528473804e-05, "loss": 1.7903, "step": 3136 }, { "epoch": 0.8245147305153218, "grad_norm": 0.76043301820755, "learning_rate": 7.253373050639565e-05, "loss": 1.7619, "step": 3138 }, { "epoch": 0.8250402338489835, "grad_norm": 0.7142301201820374, "learning_rate": 7.25162081654109e-05, "loss": 1.7328, "step": 3140 }, { "epoch": 0.8255657371826453, "grad_norm": 0.8170753717422485, "learning_rate": 7.249868582442615e-05, "loss": 1.7779, "step": 3142 }, { "epoch": 0.826091240516307, "grad_norm": 0.6277784109115601, "learning_rate": 7.24811634834414e-05, "loss": 1.7421, "step": 3144 }, { "epoch": 0.8266167438499687, "grad_norm": 0.8075504899024963, "learning_rate": 7.246364114245664e-05, "loss": 1.7504, "step": 3146 }, { "epoch": 0.8271422471836306, "grad_norm": 0.5615305304527283, "learning_rate": 7.244611880147188e-05, "loss": 1.7537, "step": 3148 }, { "epoch": 0.8276677505172924, "grad_norm": 0.7479259967803955, "learning_rate": 7.242859646048711e-05, "loss": 1.7801, "step": 3150 }, { "epoch": 0.8281932538509541, "grad_norm": 0.7294136881828308, "learning_rate": 7.241107411950236e-05, "loss": 1.7752, "step": 3152 }, { "epoch": 0.8287187571846159, "grad_norm": 0.6307454705238342, "learning_rate": 7.239355177851761e-05, "loss": 1.8088, "step": 3154 }, { "epoch": 0.8292442605182777, "grad_norm": 
0.641791582107544, "learning_rate": 7.237602943753286e-05, "loss": 1.758, "step": 3156 }, { "epoch": 0.8297697638519395, "grad_norm": 0.7693712711334229, "learning_rate": 7.23585070965481e-05, "loss": 1.7816, "step": 3158 }, { "epoch": 0.8302952671856012, "grad_norm": 0.6319524049758911, "learning_rate": 7.234098475556335e-05, "loss": 1.7644, "step": 3160 }, { "epoch": 0.830820770519263, "grad_norm": 0.6433089375495911, "learning_rate": 7.232346241457858e-05, "loss": 1.7876, "step": 3162 }, { "epoch": 0.8313462738529247, "grad_norm": 0.5543965697288513, "learning_rate": 7.230594007359383e-05, "loss": 1.7678, "step": 3164 }, { "epoch": 0.8318717771865866, "grad_norm": 0.7125136852264404, "learning_rate": 7.228841773260908e-05, "loss": 1.7816, "step": 3166 }, { "epoch": 0.8323972805202483, "grad_norm": 0.6893459558486938, "learning_rate": 7.227089539162433e-05, "loss": 1.7556, "step": 3168 }, { "epoch": 0.83292278385391, "grad_norm": 0.7028675675392151, "learning_rate": 7.225337305063957e-05, "loss": 1.752, "step": 3170 }, { "epoch": 0.8334482871875718, "grad_norm": 0.6112826466560364, "learning_rate": 7.223585070965482e-05, "loss": 1.7367, "step": 3172 }, { "epoch": 0.8339737905212337, "grad_norm": 0.6377979516983032, "learning_rate": 7.221832836867006e-05, "loss": 1.7452, "step": 3174 }, { "epoch": 0.8344992938548954, "grad_norm": 0.6925122141838074, "learning_rate": 7.22008060276853e-05, "loss": 1.7776, "step": 3176 }, { "epoch": 0.8350247971885572, "grad_norm": 0.6226949095726013, "learning_rate": 7.218328368670054e-05, "loss": 1.748, "step": 3178 }, { "epoch": 0.8355503005222189, "grad_norm": 0.7138300538063049, "learning_rate": 7.216576134571579e-05, "loss": 1.7662, "step": 3180 }, { "epoch": 0.8360758038558808, "grad_norm": 0.6248802542686462, "learning_rate": 7.214823900473103e-05, "loss": 1.7652, "step": 3182 }, { "epoch": 0.8366013071895425, "grad_norm": 0.6834786534309387, "learning_rate": 7.213071666374628e-05, "loss": 1.7537, "step": 3184 }, { "epoch": 
0.8371268105232043, "grad_norm": 0.6411855220794678, "learning_rate": 7.211319432276153e-05, "loss": 1.7582, "step": 3186 }, { "epoch": 0.837652313856866, "grad_norm": 0.6991042494773865, "learning_rate": 7.209567198177678e-05, "loss": 1.7643, "step": 3188 }, { "epoch": 0.8381778171905278, "grad_norm": 0.6581319570541382, "learning_rate": 7.207814964079201e-05, "loss": 1.7283, "step": 3190 }, { "epoch": 0.8387033205241896, "grad_norm": 0.6132378578186035, "learning_rate": 7.206062729980726e-05, "loss": 1.7724, "step": 3192 }, { "epoch": 0.8392288238578514, "grad_norm": 0.6994782090187073, "learning_rate": 7.20431049588225e-05, "loss": 1.7765, "step": 3194 }, { "epoch": 0.8397543271915131, "grad_norm": 0.7247324585914612, "learning_rate": 7.202558261783775e-05, "loss": 1.7665, "step": 3196 }, { "epoch": 0.8402798305251749, "grad_norm": 0.8014911413192749, "learning_rate": 7.2008060276853e-05, "loss": 1.7471, "step": 3198 }, { "epoch": 0.8408053338588367, "grad_norm": 0.7040480971336365, "learning_rate": 7.199053793586825e-05, "loss": 1.7527, "step": 3200 }, { "epoch": 0.8408053338588367, "eval_loss": 1.7256534099578857, "eval_runtime": 487.1811, "eval_samples_per_second": 249.987, "eval_steps_per_second": 31.249, "step": 3200 }, { "epoch": 0.8413308371924985, "grad_norm": 0.8999149799346924, "learning_rate": 7.197301559488348e-05, "loss": 1.7836, "step": 3202 }, { "epoch": 0.8418563405261602, "grad_norm": 0.6944252252578735, "learning_rate": 7.195549325389872e-05, "loss": 1.7336, "step": 3204 }, { "epoch": 0.842381843859822, "grad_norm": 0.5566868185997009, "learning_rate": 7.193797091291396e-05, "loss": 1.7681, "step": 3206 }, { "epoch": 0.8429073471934838, "grad_norm": 0.769062340259552, "learning_rate": 7.192044857192921e-05, "loss": 1.7477, "step": 3208 }, { "epoch": 0.8434328505271456, "grad_norm": 0.6222507953643799, "learning_rate": 7.190292623094446e-05, "loss": 1.7576, "step": 3210 }, { "epoch": 0.8439583538608073, "grad_norm": 0.6584329009056091, 
"learning_rate": 7.18854038899597e-05, "loss": 1.7658, "step": 3212 }, { "epoch": 0.8444838571944691, "grad_norm": 0.6822264194488525, "learning_rate": 7.186788154897495e-05, "loss": 1.7709, "step": 3214 }, { "epoch": 0.8450093605281308, "grad_norm": 0.6141505241394043, "learning_rate": 7.185035920799019e-05, "loss": 1.776, "step": 3216 }, { "epoch": 0.8455348638617927, "grad_norm": 0.5974141359329224, "learning_rate": 7.183283686700543e-05, "loss": 1.7246, "step": 3218 }, { "epoch": 0.8460603671954544, "grad_norm": 0.6883708238601685, "learning_rate": 7.181531452602068e-05, "loss": 1.7609, "step": 3220 }, { "epoch": 0.8465858705291162, "grad_norm": 0.6883281469345093, "learning_rate": 7.179779218503593e-05, "loss": 1.7315, "step": 3222 }, { "epoch": 0.8471113738627779, "grad_norm": 0.6654126048088074, "learning_rate": 7.178026984405118e-05, "loss": 1.7469, "step": 3224 }, { "epoch": 0.8476368771964398, "grad_norm": 0.8119237422943115, "learning_rate": 7.176274750306641e-05, "loss": 1.7882, "step": 3226 }, { "epoch": 0.8481623805301015, "grad_norm": 0.7793521285057068, "learning_rate": 7.174522516208166e-05, "loss": 1.7826, "step": 3228 }, { "epoch": 0.8486878838637633, "grad_norm": 0.8359899520874023, "learning_rate": 7.172770282109689e-05, "loss": 1.7612, "step": 3230 }, { "epoch": 0.849213387197425, "grad_norm": 0.5970791578292847, "learning_rate": 7.171018048011214e-05, "loss": 1.7498, "step": 3232 }, { "epoch": 0.8497388905310868, "grad_norm": 0.6869466304779053, "learning_rate": 7.169265813912739e-05, "loss": 1.7787, "step": 3234 }, { "epoch": 0.8502643938647486, "grad_norm": 0.6755763292312622, "learning_rate": 7.167513579814264e-05, "loss": 1.7883, "step": 3236 }, { "epoch": 0.8507898971984104, "grad_norm": 0.8182030320167542, "learning_rate": 7.165761345715788e-05, "loss": 1.7821, "step": 3238 }, { "epoch": 0.8513154005320721, "grad_norm": 0.6454432010650635, "learning_rate": 7.164009111617313e-05, "loss": 1.7527, "step": 3240 }, { "epoch": 
0.8518409038657339, "grad_norm": 0.7202356457710266, "learning_rate": 7.162256877518836e-05, "loss": 1.7821, "step": 3242 }, { "epoch": 0.8523664071993957, "grad_norm": 0.7236865758895874, "learning_rate": 7.160504643420361e-05, "loss": 1.7579, "step": 3244 }, { "epoch": 0.8528919105330575, "grad_norm": 0.6406378746032715, "learning_rate": 7.158752409321886e-05, "loss": 1.7459, "step": 3246 }, { "epoch": 0.8534174138667192, "grad_norm": 0.755293607711792, "learning_rate": 7.157000175223411e-05, "loss": 1.7638, "step": 3248 }, { "epoch": 0.853942917200381, "grad_norm": 0.7986418604850769, "learning_rate": 7.155247941124936e-05, "loss": 1.7686, "step": 3250 }, { "epoch": 0.8544684205340428, "grad_norm": 0.7703737616539001, "learning_rate": 7.153495707026459e-05, "loss": 1.7372, "step": 3252 }, { "epoch": 0.8549939238677046, "grad_norm": 0.6074422597885132, "learning_rate": 7.151743472927984e-05, "loss": 1.7765, "step": 3254 }, { "epoch": 0.8555194272013663, "grad_norm": 0.7662899494171143, "learning_rate": 7.149991238829507e-05, "loss": 1.7563, "step": 3256 }, { "epoch": 0.8560449305350281, "grad_norm": 0.7859123945236206, "learning_rate": 7.148239004731032e-05, "loss": 1.7462, "step": 3258 }, { "epoch": 0.8565704338686898, "grad_norm": 0.5845335721969604, "learning_rate": 7.146486770632557e-05, "loss": 1.7524, "step": 3260 }, { "epoch": 0.8570959372023517, "grad_norm": 0.6083472967147827, "learning_rate": 7.144734536534081e-05, "loss": 1.7705, "step": 3262 }, { "epoch": 0.8576214405360134, "grad_norm": 0.779712438583374, "learning_rate": 7.142982302435606e-05, "loss": 1.7768, "step": 3264 }, { "epoch": 0.8581469438696752, "grad_norm": 0.7343006134033203, "learning_rate": 7.141230068337131e-05, "loss": 1.7489, "step": 3266 }, { "epoch": 0.8586724472033369, "grad_norm": 0.578289270401001, "learning_rate": 7.139477834238654e-05, "loss": 1.7436, "step": 3268 }, { "epoch": 0.8591979505369988, "grad_norm": 0.6657706499099731, "learning_rate": 7.137725600140179e-05, 
"loss": 1.7674, "step": 3270 }, { "epoch": 0.8597234538706605, "grad_norm": 0.7936644554138184, "learning_rate": 7.135973366041704e-05, "loss": 1.7482, "step": 3272 }, { "epoch": 0.8602489572043223, "grad_norm": 0.5892787575721741, "learning_rate": 7.134221131943229e-05, "loss": 1.7448, "step": 3274 }, { "epoch": 0.860774460537984, "grad_norm": 0.6341478228569031, "learning_rate": 7.132468897844753e-05, "loss": 1.7613, "step": 3276 }, { "epoch": 0.8612999638716458, "grad_norm": 0.684622585773468, "learning_rate": 7.130716663746277e-05, "loss": 1.7685, "step": 3278 }, { "epoch": 0.8618254672053076, "grad_norm": 0.5765745639801025, "learning_rate": 7.128964429647801e-05, "loss": 1.7889, "step": 3280 }, { "epoch": 0.8623509705389694, "grad_norm": 0.5164791941642761, "learning_rate": 7.127212195549325e-05, "loss": 1.7573, "step": 3282 }, { "epoch": 0.8628764738726311, "grad_norm": 0.5809277892112732, "learning_rate": 7.12545996145085e-05, "loss": 1.8003, "step": 3284 }, { "epoch": 0.8634019772062929, "grad_norm": 0.6859455108642578, "learning_rate": 7.123707727352374e-05, "loss": 1.726, "step": 3286 }, { "epoch": 0.8639274805399547, "grad_norm": 0.665773332118988, "learning_rate": 7.121955493253899e-05, "loss": 1.7629, "step": 3288 }, { "epoch": 0.8644529838736165, "grad_norm": 0.7473315596580505, "learning_rate": 7.120203259155424e-05, "loss": 1.7538, "step": 3290 }, { "epoch": 0.8649784872072782, "grad_norm": 0.7632318735122681, "learning_rate": 7.118451025056949e-05, "loss": 1.7755, "step": 3292 }, { "epoch": 0.86550399054094, "grad_norm": 0.7813208699226379, "learning_rate": 7.116698790958472e-05, "loss": 1.7835, "step": 3294 }, { "epoch": 0.8660294938746018, "grad_norm": 0.9293962121009827, "learning_rate": 7.114946556859997e-05, "loss": 1.7332, "step": 3296 }, { "epoch": 0.8665549972082636, "grad_norm": 0.9213319420814514, "learning_rate": 7.113194322761522e-05, "loss": 1.709, "step": 3298 }, { "epoch": 0.8670805005419253, "grad_norm": 0.5981359481811523, 
"learning_rate": 7.111442088663046e-05, "loss": 1.7406, "step": 3300 }, { "epoch": 0.8676060038755871, "grad_norm": 0.645785927772522, "learning_rate": 7.109689854564571e-05, "loss": 1.7659, "step": 3302 }, { "epoch": 0.8681315072092488, "grad_norm": 0.9009891748428345, "learning_rate": 7.107937620466094e-05, "loss": 1.7335, "step": 3304 }, { "epoch": 0.8686570105429107, "grad_norm": 0.7231364250183105, "learning_rate": 7.106185386367619e-05, "loss": 1.7378, "step": 3306 }, { "epoch": 0.8691825138765724, "grad_norm": 0.8442168235778809, "learning_rate": 7.104433152269143e-05, "loss": 1.7722, "step": 3308 }, { "epoch": 0.8697080172102342, "grad_norm": 0.7314670085906982, "learning_rate": 7.102680918170667e-05, "loss": 1.7594, "step": 3310 }, { "epoch": 0.8702335205438959, "grad_norm": 0.6394951343536377, "learning_rate": 7.100928684072192e-05, "loss": 1.7693, "step": 3312 }, { "epoch": 0.8707590238775578, "grad_norm": 0.612392008304596, "learning_rate": 7.099176449973717e-05, "loss": 1.7777, "step": 3314 }, { "epoch": 0.8712845272112195, "grad_norm": 0.6011560559272766, "learning_rate": 7.097424215875242e-05, "loss": 1.7429, "step": 3316 }, { "epoch": 0.8718100305448813, "grad_norm": 0.9169111847877502, "learning_rate": 7.095671981776766e-05, "loss": 1.7723, "step": 3318 }, { "epoch": 0.872335533878543, "grad_norm": 0.5939310193061829, "learning_rate": 7.09391974767829e-05, "loss": 1.718, "step": 3320 }, { "epoch": 0.8728610372122048, "grad_norm": 0.8067646026611328, "learning_rate": 7.092167513579815e-05, "loss": 1.7679, "step": 3322 }, { "epoch": 0.8733865405458666, "grad_norm": 0.6771594882011414, "learning_rate": 7.090415279481339e-05, "loss": 1.7405, "step": 3324 }, { "epoch": 0.8739120438795284, "grad_norm": 0.7373068332672119, "learning_rate": 7.088663045382864e-05, "loss": 1.7363, "step": 3326 }, { "epoch": 0.8744375472131901, "grad_norm": 0.5647407174110413, "learning_rate": 7.086910811284387e-05, "loss": 1.8027, "step": 3328 }, { "epoch": 
0.8749630505468519, "grad_norm": 0.5459885001182556, "learning_rate": 7.085158577185912e-05, "loss": 1.7588, "step": 3330 }, { "epoch": 0.8754885538805137, "grad_norm": 0.6479038596153259, "learning_rate": 7.083406343087437e-05, "loss": 1.8066, "step": 3332 }, { "epoch": 0.8760140572141755, "grad_norm": 0.6865916848182678, "learning_rate": 7.08165410898896e-05, "loss": 1.7605, "step": 3334 }, { "epoch": 0.8765395605478372, "grad_norm": 0.6146122813224792, "learning_rate": 7.079901874890485e-05, "loss": 1.7711, "step": 3336 }, { "epoch": 0.877065063881499, "grad_norm": 0.6822938323020935, "learning_rate": 7.07814964079201e-05, "loss": 1.7355, "step": 3338 }, { "epoch": 0.8775905672151608, "grad_norm": 0.8722227215766907, "learning_rate": 7.076397406693535e-05, "loss": 1.7405, "step": 3340 }, { "epoch": 0.8781160705488226, "grad_norm": 0.6625831127166748, "learning_rate": 7.07464517259506e-05, "loss": 1.748, "step": 3342 }, { "epoch": 0.8786415738824843, "grad_norm": 0.5351794362068176, "learning_rate": 7.072892938496584e-05, "loss": 1.747, "step": 3344 }, { "epoch": 0.8791670772161461, "grad_norm": 0.7087706923484802, "learning_rate": 7.071140704398108e-05, "loss": 1.7475, "step": 3346 }, { "epoch": 0.8796925805498078, "grad_norm": 0.6210707426071167, "learning_rate": 7.069388470299632e-05, "loss": 1.7666, "step": 3348 }, { "epoch": 0.8802180838834697, "grad_norm": 0.5860678553581238, "learning_rate": 7.067636236201157e-05, "loss": 1.728, "step": 3350 }, { "epoch": 0.8807435872171314, "grad_norm": 0.5733693242073059, "learning_rate": 7.065884002102682e-05, "loss": 1.8006, "step": 3352 }, { "epoch": 0.8812690905507932, "grad_norm": 0.6897549629211426, "learning_rate": 7.064131768004205e-05, "loss": 1.7615, "step": 3354 }, { "epoch": 0.8817945938844549, "grad_norm": 0.8119019865989685, "learning_rate": 7.06237953390573e-05, "loss": 1.7428, "step": 3356 }, { "epoch": 0.8823200972181168, "grad_norm": 0.6543797254562378, "learning_rate": 7.060627299807255e-05, "loss": 
1.7592, "step": 3358 }, { "epoch": 0.8828456005517785, "grad_norm": 0.6914211511611938, "learning_rate": 7.058875065708778e-05, "loss": 1.7549, "step": 3360 }, { "epoch": 0.8833711038854403, "grad_norm": 0.7868301868438721, "learning_rate": 7.057122831610303e-05, "loss": 1.7475, "step": 3362 }, { "epoch": 0.883896607219102, "grad_norm": 0.7719436287879944, "learning_rate": 7.055370597511828e-05, "loss": 1.7549, "step": 3364 }, { "epoch": 0.8844221105527639, "grad_norm": 0.6737284064292908, "learning_rate": 7.053618363413352e-05, "loss": 1.7392, "step": 3366 }, { "epoch": 0.8849476138864256, "grad_norm": 1.0450035333633423, "learning_rate": 7.051866129314877e-05, "loss": 1.736, "step": 3368 }, { "epoch": 0.8854731172200874, "grad_norm": 0.6924079060554504, "learning_rate": 7.050113895216402e-05, "loss": 1.7431, "step": 3370 }, { "epoch": 0.8859986205537491, "grad_norm": 0.689011812210083, "learning_rate": 7.048361661117925e-05, "loss": 1.7567, "step": 3372 }, { "epoch": 0.8865241238874109, "grad_norm": 0.6025006771087646, "learning_rate": 7.04660942701945e-05, "loss": 1.7339, "step": 3374 }, { "epoch": 0.8870496272210727, "grad_norm": 0.6246035695075989, "learning_rate": 7.044857192920975e-05, "loss": 1.7728, "step": 3376 }, { "epoch": 0.8875751305547345, "grad_norm": 0.92098069190979, "learning_rate": 7.0431049588225e-05, "loss": 1.7452, "step": 3378 }, { "epoch": 0.8881006338883962, "grad_norm": 0.6286795139312744, "learning_rate": 7.041352724724023e-05, "loss": 1.752, "step": 3380 }, { "epoch": 0.888626137222058, "grad_norm": 0.5665922164916992, "learning_rate": 7.039600490625548e-05, "loss": 1.7561, "step": 3382 }, { "epoch": 0.8891516405557198, "grad_norm": 0.7606804966926575, "learning_rate": 7.037848256527073e-05, "loss": 1.7903, "step": 3384 }, { "epoch": 0.8896771438893816, "grad_norm": 0.8399646282196045, "learning_rate": 7.036096022428596e-05, "loss": 1.8045, "step": 3386 }, { "epoch": 0.8902026472230433, "grad_norm": 0.6141343116760254, "learning_rate": 
7.03434378833012e-05, "loss": 1.7665, "step": 3388 }, { "epoch": 0.8907281505567051, "grad_norm": 0.7554699778556824, "learning_rate": 7.032591554231645e-05, "loss": 1.7219, "step": 3390 }, { "epoch": 0.8912536538903668, "grad_norm": 0.7068594694137573, "learning_rate": 7.03083932013317e-05, "loss": 1.7689, "step": 3392 }, { "epoch": 0.8917791572240287, "grad_norm": 0.8859004378318787, "learning_rate": 7.029087086034695e-05, "loss": 1.7261, "step": 3394 }, { "epoch": 0.8923046605576904, "grad_norm": 0.7646594047546387, "learning_rate": 7.02733485193622e-05, "loss": 1.7813, "step": 3396 }, { "epoch": 0.8928301638913522, "grad_norm": 0.7105104923248291, "learning_rate": 7.025582617837743e-05, "loss": 1.7528, "step": 3398 }, { "epoch": 0.8933556672250139, "grad_norm": 0.5750377178192139, "learning_rate": 7.023830383739268e-05, "loss": 1.7324, "step": 3400 }, { "epoch": 0.8938811705586758, "grad_norm": 0.6942424178123474, "learning_rate": 7.022078149640793e-05, "loss": 1.7352, "step": 3402 }, { "epoch": 0.8944066738923375, "grad_norm": 0.7094271779060364, "learning_rate": 7.020325915542317e-05, "loss": 1.7449, "step": 3404 }, { "epoch": 0.8949321772259993, "grad_norm": 0.6620864868164062, "learning_rate": 7.018573681443841e-05, "loss": 1.7718, "step": 3406 }, { "epoch": 0.895457680559661, "grad_norm": 0.6577492952346802, "learning_rate": 7.016821447345366e-05, "loss": 1.745, "step": 3408 }, { "epoch": 0.8959831838933229, "grad_norm": 0.6020835041999817, "learning_rate": 7.01506921324689e-05, "loss": 1.7486, "step": 3410 }, { "epoch": 0.8965086872269846, "grad_norm": 0.6429753303527832, "learning_rate": 7.013316979148414e-05, "loss": 1.7656, "step": 3412 }, { "epoch": 0.8970341905606464, "grad_norm": 0.6767374277114868, "learning_rate": 7.011564745049938e-05, "loss": 1.7193, "step": 3414 }, { "epoch": 0.8975596938943081, "grad_norm": 0.7432959079742432, "learning_rate": 7.009812510951463e-05, "loss": 1.7336, "step": 3416 }, { "epoch": 0.8980851972279699, "grad_norm": 
0.6830999851226807, "learning_rate": 7.008060276852988e-05, "loss": 1.7438, "step": 3418 }, { "epoch": 0.8986107005616317, "grad_norm": 0.5865710377693176, "learning_rate": 7.006308042754513e-05, "loss": 1.8067, "step": 3420 }, { "epoch": 0.8991362038952935, "grad_norm": 1.0097007751464844, "learning_rate": 7.004555808656037e-05, "loss": 1.7458, "step": 3422 }, { "epoch": 0.8996617072289552, "grad_norm": 0.7260006666183472, "learning_rate": 7.002803574557561e-05, "loss": 1.7707, "step": 3424 }, { "epoch": 0.900187210562617, "grad_norm": 0.7570676803588867, "learning_rate": 7.001051340459086e-05, "loss": 1.7631, "step": 3426 }, { "epoch": 0.9007127138962788, "grad_norm": 1.079424500465393, "learning_rate": 6.99929910636061e-05, "loss": 1.7318, "step": 3428 }, { "epoch": 0.9012382172299406, "grad_norm": 0.637510359287262, "learning_rate": 6.997546872262134e-05, "loss": 1.7427, "step": 3430 }, { "epoch": 0.9017637205636023, "grad_norm": 0.9443916082382202, "learning_rate": 6.995794638163659e-05, "loss": 1.7182, "step": 3432 }, { "epoch": 0.9022892238972641, "grad_norm": 0.7777066826820374, "learning_rate": 6.994042404065183e-05, "loss": 1.7543, "step": 3434 }, { "epoch": 0.9028147272309258, "grad_norm": 0.6093029975891113, "learning_rate": 6.992290169966708e-05, "loss": 1.7501, "step": 3436 }, { "epoch": 0.9033402305645877, "grad_norm": 0.6203290820121765, "learning_rate": 6.990537935868233e-05, "loss": 1.8313, "step": 3438 }, { "epoch": 0.9038657338982494, "grad_norm": 0.6452112793922424, "learning_rate": 6.988785701769756e-05, "loss": 1.7691, "step": 3440 }, { "epoch": 0.9043912372319112, "grad_norm": 0.6220587491989136, "learning_rate": 6.987033467671281e-05, "loss": 1.7458, "step": 3442 }, { "epoch": 0.9049167405655729, "grad_norm": 0.6492230296134949, "learning_rate": 6.985281233572806e-05, "loss": 1.7624, "step": 3444 }, { "epoch": 0.9054422438992348, "grad_norm": 0.8058958053588867, "learning_rate": 6.98352899947433e-05, "loss": 1.7341, "step": 3446 }, { 
"epoch": 0.9059677472328965, "grad_norm": 0.765034019947052, "learning_rate": 6.981776765375855e-05, "loss": 1.7312, "step": 3448 }, { "epoch": 0.9064932505665583, "grad_norm": 0.9301319122314453, "learning_rate": 6.98002453127738e-05, "loss": 1.7785, "step": 3450 }, { "epoch": 0.90701875390022, "grad_norm": 0.723552942276001, "learning_rate": 6.978272297178903e-05, "loss": 1.7547, "step": 3452 }, { "epoch": 0.9075442572338819, "grad_norm": 0.8970544338226318, "learning_rate": 6.976520063080428e-05, "loss": 1.7424, "step": 3454 }, { "epoch": 0.9080697605675436, "grad_norm": 0.6986632943153381, "learning_rate": 6.974767828981952e-05, "loss": 1.76, "step": 3456 }, { "epoch": 0.9085952639012054, "grad_norm": 0.6767635941505432, "learning_rate": 6.973015594883476e-05, "loss": 1.7517, "step": 3458 }, { "epoch": 0.9091207672348671, "grad_norm": 0.7813493013381958, "learning_rate": 6.971263360785001e-05, "loss": 1.7303, "step": 3460 }, { "epoch": 0.9096462705685289, "grad_norm": 0.7122093439102173, "learning_rate": 6.969511126686526e-05, "loss": 1.7735, "step": 3462 }, { "epoch": 0.9101717739021907, "grad_norm": 0.9538240432739258, "learning_rate": 6.96775889258805e-05, "loss": 1.7279, "step": 3464 }, { "epoch": 0.9106972772358525, "grad_norm": 0.9474038481712341, "learning_rate": 6.966006658489574e-05, "loss": 1.7636, "step": 3466 }, { "epoch": 0.9112227805695142, "grad_norm": 0.6011683344841003, "learning_rate": 6.964254424391099e-05, "loss": 1.7414, "step": 3468 }, { "epoch": 0.911748283903176, "grad_norm": 0.7591129541397095, "learning_rate": 6.962502190292623e-05, "loss": 1.7654, "step": 3470 }, { "epoch": 0.9122737872368378, "grad_norm": 0.8848958015441895, "learning_rate": 6.960749956194148e-05, "loss": 1.7581, "step": 3472 }, { "epoch": 0.9127992905704996, "grad_norm": 0.7266464233398438, "learning_rate": 6.958997722095673e-05, "loss": 1.769, "step": 3474 }, { "epoch": 0.9133247939041613, "grad_norm": 0.8937695026397705, "learning_rate": 6.957245487997198e-05, 
"loss": 1.7574, "step": 3476 }, { "epoch": 0.9138502972378231, "grad_norm": 0.6543369293212891, "learning_rate": 6.955493253898721e-05, "loss": 1.7587, "step": 3478 }, { "epoch": 0.9143758005714848, "grad_norm": 1.0419197082519531, "learning_rate": 6.953741019800246e-05, "loss": 1.7311, "step": 3480 }, { "epoch": 0.9149013039051467, "grad_norm": 0.6583675742149353, "learning_rate": 6.951988785701769e-05, "loss": 1.7415, "step": 3482 }, { "epoch": 0.9154268072388084, "grad_norm": 0.7123555541038513, "learning_rate": 6.950236551603294e-05, "loss": 1.7227, "step": 3484 }, { "epoch": 0.9159523105724702, "grad_norm": 0.606636106967926, "learning_rate": 6.948484317504819e-05, "loss": 1.7555, "step": 3486 }, { "epoch": 0.9164778139061319, "grad_norm": 1.0369200706481934, "learning_rate": 6.946732083406344e-05, "loss": 1.7494, "step": 3488 }, { "epoch": 0.9170033172397938, "grad_norm": 0.6828787922859192, "learning_rate": 6.944979849307868e-05, "loss": 1.7302, "step": 3490 }, { "epoch": 0.9175288205734555, "grad_norm": 0.7840218544006348, "learning_rate": 6.943227615209392e-05, "loss": 1.734, "step": 3492 }, { "epoch": 0.9180543239071173, "grad_norm": 0.6639379858970642, "learning_rate": 6.941475381110916e-05, "loss": 1.7197, "step": 3494 }, { "epoch": 0.918579827240779, "grad_norm": 0.6590544581413269, "learning_rate": 6.939723147012441e-05, "loss": 1.758, "step": 3496 }, { "epoch": 0.9191053305744409, "grad_norm": 0.5985316038131714, "learning_rate": 6.937970912913966e-05, "loss": 1.7545, "step": 3498 }, { "epoch": 0.9196308339081026, "grad_norm": 0.6269810795783997, "learning_rate": 6.936218678815491e-05, "loss": 1.7476, "step": 3500 }, { "epoch": 0.9201563372417644, "grad_norm": 0.7149941921234131, "learning_rate": 6.934466444717016e-05, "loss": 1.7629, "step": 3502 }, { "epoch": 0.9206818405754261, "grad_norm": 0.641220211982727, "learning_rate": 6.932714210618539e-05, "loss": 1.7613, "step": 3504 }, { "epoch": 0.9212073439090879, "grad_norm": 0.7803055644035339, 
"learning_rate": 6.930961976520064e-05, "loss": 1.7401, "step": 3506 }, { "epoch": 0.9217328472427497, "grad_norm": 0.6284143924713135, "learning_rate": 6.929209742421587e-05, "loss": 1.7267, "step": 3508 }, { "epoch": 0.9222583505764115, "grad_norm": 0.594203531742096, "learning_rate": 6.927457508323112e-05, "loss": 1.7553, "step": 3510 }, { "epoch": 0.9227838539100732, "grad_norm": 0.7015509009361267, "learning_rate": 6.925705274224637e-05, "loss": 1.7122, "step": 3512 }, { "epoch": 0.923309357243735, "grad_norm": 0.6864806413650513, "learning_rate": 6.923953040126161e-05, "loss": 1.7099, "step": 3514 }, { "epoch": 0.9238348605773968, "grad_norm": 0.7701146602630615, "learning_rate": 6.922200806027686e-05, "loss": 1.7508, "step": 3516 }, { "epoch": 0.9243603639110586, "grad_norm": 0.6888076663017273, "learning_rate": 6.92044857192921e-05, "loss": 1.7455, "step": 3518 }, { "epoch": 0.9248858672447203, "grad_norm": 0.6871370077133179, "learning_rate": 6.918696337830734e-05, "loss": 1.7597, "step": 3520 }, { "epoch": 0.9254113705783821, "grad_norm": 1.1456379890441895, "learning_rate": 6.916944103732259e-05, "loss": 1.7585, "step": 3522 }, { "epoch": 0.9259368739120439, "grad_norm": 0.6293717622756958, "learning_rate": 6.915191869633784e-05, "loss": 1.7279, "step": 3524 }, { "epoch": 0.9264623772457057, "grad_norm": 0.6523435711860657, "learning_rate": 6.913439635535309e-05, "loss": 1.7486, "step": 3526 }, { "epoch": 0.9269878805793674, "grad_norm": 0.6591430306434631, "learning_rate": 6.911687401436833e-05, "loss": 1.7544, "step": 3528 }, { "epoch": 0.9275133839130292, "grad_norm": 0.7993916273117065, "learning_rate": 6.909935167338357e-05, "loss": 1.7468, "step": 3530 }, { "epoch": 0.9280388872466909, "grad_norm": 0.5962069630622864, "learning_rate": 6.90818293323988e-05, "loss": 1.73, "step": 3532 }, { "epoch": 0.9285643905803528, "grad_norm": 0.7057903409004211, "learning_rate": 6.906430699141405e-05, "loss": 1.7613, "step": 3534 }, { "epoch": 
0.9290898939140145, "grad_norm": 0.8017992377281189, "learning_rate": 6.90467846504293e-05, "loss": 1.7576, "step": 3536 }, { "epoch": 0.9296153972476763, "grad_norm": 0.717413067817688, "learning_rate": 6.902926230944454e-05, "loss": 1.6982, "step": 3538 }, { "epoch": 0.930140900581338, "grad_norm": 0.7504727840423584, "learning_rate": 6.901173996845979e-05, "loss": 1.7657, "step": 3540 }, { "epoch": 0.9306664039149999, "grad_norm": 1.548189401626587, "learning_rate": 6.899421762747504e-05, "loss": 1.7614, "step": 3542 }, { "epoch": 0.9311919072486616, "grad_norm": 0.7025728821754456, "learning_rate": 6.897669528649027e-05, "loss": 1.748, "step": 3544 }, { "epoch": 0.9317174105823234, "grad_norm": 0.8109622001647949, "learning_rate": 6.895917294550552e-05, "loss": 1.7346, "step": 3546 }, { "epoch": 0.9322429139159851, "grad_norm": 0.6339906454086304, "learning_rate": 6.894165060452077e-05, "loss": 1.7594, "step": 3548 }, { "epoch": 0.9327684172496469, "grad_norm": 0.7020452618598938, "learning_rate": 6.892412826353602e-05, "loss": 1.7461, "step": 3550 }, { "epoch": 0.9332939205833087, "grad_norm": 0.755463182926178, "learning_rate": 6.890660592255126e-05, "loss": 1.7522, "step": 3552 }, { "epoch": 0.9338194239169705, "grad_norm": 0.8311240673065186, "learning_rate": 6.888908358156651e-05, "loss": 1.7438, "step": 3554 }, { "epoch": 0.9343449272506322, "grad_norm": 0.8520714044570923, "learning_rate": 6.887156124058174e-05, "loss": 1.7354, "step": 3556 }, { "epoch": 0.934870430584294, "grad_norm": 0.5846522450447083, "learning_rate": 6.885403889959698e-05, "loss": 1.7784, "step": 3558 }, { "epoch": 0.9353959339179558, "grad_norm": 0.6541776657104492, "learning_rate": 6.883651655861223e-05, "loss": 1.7437, "step": 3560 }, { "epoch": 0.9359214372516176, "grad_norm": 0.8012898564338684, "learning_rate": 6.881899421762747e-05, "loss": 1.7559, "step": 3562 }, { "epoch": 0.9364469405852793, "grad_norm": 0.6521446704864502, "learning_rate": 6.880147187664272e-05, "loss": 
1.7359, "step": 3564 }, { "epoch": 0.9369724439189411, "grad_norm": 0.6322072744369507, "learning_rate": 6.878394953565797e-05, "loss": 1.7682, "step": 3566 }, { "epoch": 0.9374979472526029, "grad_norm": 0.8116897940635681, "learning_rate": 6.876642719467322e-05, "loss": 1.7303, "step": 3568 }, { "epoch": 0.9380234505862647, "grad_norm": 0.6734224557876587, "learning_rate": 6.874890485368845e-05, "loss": 1.7101, "step": 3570 }, { "epoch": 0.9385489539199264, "grad_norm": 0.7724919319152832, "learning_rate": 6.87313825127037e-05, "loss": 1.754, "step": 3572 }, { "epoch": 0.9390744572535882, "grad_norm": 0.6816089749336243, "learning_rate": 6.871386017171895e-05, "loss": 1.7592, "step": 3574 }, { "epoch": 0.9395999605872499, "grad_norm": 0.6413043737411499, "learning_rate": 6.869633783073419e-05, "loss": 1.7446, "step": 3576 }, { "epoch": 0.9401254639209118, "grad_norm": 0.8377385139465332, "learning_rate": 6.867881548974944e-05, "loss": 1.7528, "step": 3578 }, { "epoch": 0.9406509672545735, "grad_norm": 0.7805231809616089, "learning_rate": 6.866129314876469e-05, "loss": 1.8016, "step": 3580 }, { "epoch": 0.9411764705882353, "grad_norm": 0.5867004990577698, "learning_rate": 6.864377080777992e-05, "loss": 1.7655, "step": 3582 }, { "epoch": 0.941701973921897, "grad_norm": 0.6897924542427063, "learning_rate": 6.862624846679516e-05, "loss": 1.7547, "step": 3584 }, { "epoch": 0.9422274772555589, "grad_norm": 0.8123984336853027, "learning_rate": 6.86087261258104e-05, "loss": 1.7478, "step": 3586 }, { "epoch": 0.9427529805892206, "grad_norm": 0.6391046643257141, "learning_rate": 6.859120378482565e-05, "loss": 1.7409, "step": 3588 }, { "epoch": 0.9432784839228824, "grad_norm": 0.7633985280990601, "learning_rate": 6.85736814438409e-05, "loss": 1.742, "step": 3590 }, { "epoch": 0.9438039872565441, "grad_norm": 0.6767701506614685, "learning_rate": 6.855615910285615e-05, "loss": 1.723, "step": 3592 }, { "epoch": 0.9443294905902059, "grad_norm": 0.7079693675041199, 
"learning_rate": 6.85386367618714e-05, "loss": 1.7801, "step": 3594 }, { "epoch": 0.9448549939238677, "grad_norm": 0.6894028782844543, "learning_rate": 6.852111442088663e-05, "loss": 1.7508, "step": 3596 }, { "epoch": 0.9453804972575295, "grad_norm": 0.6596964597702026, "learning_rate": 6.850359207990188e-05, "loss": 1.7529, "step": 3598 }, { "epoch": 0.9459060005911912, "grad_norm": 0.6702237725257874, "learning_rate": 6.848606973891712e-05, "loss": 1.7714, "step": 3600 }, { "epoch": 0.9459060005911912, "eval_loss": 1.7171640396118164, "eval_runtime": 487.1544, "eval_samples_per_second": 250.001, "eval_steps_per_second": 31.251, "step": 3600 }, { "epoch": 0.946431503924853, "grad_norm": 0.5935298800468445, "learning_rate": 6.846854739793237e-05, "loss": 1.7409, "step": 3602 }, { "epoch": 0.9469570072585148, "grad_norm": 0.844529926776886, "learning_rate": 6.845102505694762e-05, "loss": 1.7598, "step": 3604 }, { "epoch": 0.9474825105921766, "grad_norm": 0.6452075839042664, "learning_rate": 6.843350271596287e-05, "loss": 1.7641, "step": 3606 }, { "epoch": 0.9480080139258383, "grad_norm": 0.7184598445892334, "learning_rate": 6.84159803749781e-05, "loss": 1.7575, "step": 3608 }, { "epoch": 0.9485335172595001, "grad_norm": 0.6628120541572571, "learning_rate": 6.839845803399333e-05, "loss": 1.7427, "step": 3610 }, { "epoch": 0.9490590205931619, "grad_norm": 0.6025474667549133, "learning_rate": 6.838093569300858e-05, "loss": 1.7132, "step": 3612 }, { "epoch": 0.9495845239268237, "grad_norm": 0.6190858483314514, "learning_rate": 6.836341335202383e-05, "loss": 1.7773, "step": 3614 }, { "epoch": 0.9501100272604854, "grad_norm": 0.6773670315742493, "learning_rate": 6.834589101103908e-05, "loss": 1.7139, "step": 3616 }, { "epoch": 0.9506355305941472, "grad_norm": 0.6356403827667236, "learning_rate": 6.832836867005432e-05, "loss": 1.7366, "step": 3618 }, { "epoch": 0.9511610339278089, "grad_norm": 0.7546253800392151, "learning_rate": 6.831084632906957e-05, "loss": 1.7313, 
"step": 3620 }, { "epoch": 0.9516865372614708, "grad_norm": 0.6066844463348389, "learning_rate": 6.82933239880848e-05, "loss": 1.7141, "step": 3622 }, { "epoch": 0.9522120405951325, "grad_norm": 0.5842781066894531, "learning_rate": 6.827580164710005e-05, "loss": 1.7677, "step": 3624 }, { "epoch": 0.9527375439287943, "grad_norm": 0.6750701069831848, "learning_rate": 6.82582793061153e-05, "loss": 1.7065, "step": 3626 }, { "epoch": 0.953263047262456, "grad_norm": 0.5796899199485779, "learning_rate": 6.824075696513055e-05, "loss": 1.7272, "step": 3628 }, { "epoch": 0.9537885505961179, "grad_norm": 0.7888158559799194, "learning_rate": 6.82232346241458e-05, "loss": 1.7441, "step": 3630 }, { "epoch": 0.9543140539297796, "grad_norm": 0.5940207839012146, "learning_rate": 6.820571228316104e-05, "loss": 1.7261, "step": 3632 }, { "epoch": 0.9548395572634414, "grad_norm": 0.7521408200263977, "learning_rate": 6.818818994217628e-05, "loss": 1.7826, "step": 3634 }, { "epoch": 0.9553650605971031, "grad_norm": 0.6173054575920105, "learning_rate": 6.817066760119151e-05, "loss": 1.7113, "step": 3636 }, { "epoch": 0.9558905639307649, "grad_norm": 0.6263679265975952, "learning_rate": 6.815314526020676e-05, "loss": 1.6943, "step": 3638 }, { "epoch": 0.9564160672644267, "grad_norm": 0.6220889687538147, "learning_rate": 6.8135622919222e-05, "loss": 1.7146, "step": 3640 }, { "epoch": 0.9569415705980885, "grad_norm": 0.6978549957275391, "learning_rate": 6.811810057823725e-05, "loss": 1.791, "step": 3642 }, { "epoch": 0.9574670739317502, "grad_norm": 0.6536712050437927, "learning_rate": 6.81005782372525e-05, "loss": 1.7397, "step": 3644 }, { "epoch": 0.957992577265412, "grad_norm": 0.6998410820960999, "learning_rate": 6.808305589626775e-05, "loss": 1.736, "step": 3646 }, { "epoch": 0.9585180805990738, "grad_norm": 0.6182027459144592, "learning_rate": 6.806553355528298e-05, "loss": 1.7462, "step": 3648 }, { "epoch": 0.9590435839327356, "grad_norm": 0.7864842414855957, "learning_rate": 
6.804801121429823e-05, "loss": 1.7495, "step": 3650 }, { "epoch": 0.9595690872663973, "grad_norm": 0.6455209255218506, "learning_rate": 6.803048887331348e-05, "loss": 1.7324, "step": 3652 }, { "epoch": 0.9600945906000591, "grad_norm": 0.6225829720497131, "learning_rate": 6.801296653232873e-05, "loss": 1.748, "step": 3654 }, { "epoch": 0.9606200939337209, "grad_norm": 0.6527931094169617, "learning_rate": 6.799544419134397e-05, "loss": 1.7133, "step": 3656 }, { "epoch": 0.9611455972673827, "grad_norm": 0.619773805141449, "learning_rate": 6.797792185035922e-05, "loss": 1.7876, "step": 3658 }, { "epoch": 0.9616711006010444, "grad_norm": 0.681759774684906, "learning_rate": 6.796039950937446e-05, "loss": 1.7069, "step": 3660 }, { "epoch": 0.9621966039347062, "grad_norm": 0.746255099773407, "learning_rate": 6.794287716838969e-05, "loss": 1.7784, "step": 3662 }, { "epoch": 0.9627221072683679, "grad_norm": 0.5940551161766052, "learning_rate": 6.792535482740494e-05, "loss": 1.7495, "step": 3664 }, { "epoch": 0.9632476106020298, "grad_norm": 0.6246922016143799, "learning_rate": 6.790783248642018e-05, "loss": 1.7758, "step": 3666 }, { "epoch": 0.9637731139356915, "grad_norm": 0.6583105325698853, "learning_rate": 6.789031014543543e-05, "loss": 1.7431, "step": 3668 }, { "epoch": 0.9642986172693533, "grad_norm": 0.6988399624824524, "learning_rate": 6.787278780445068e-05, "loss": 1.7269, "step": 3670 }, { "epoch": 0.964824120603015, "grad_norm": 0.60069739818573, "learning_rate": 6.785526546346593e-05, "loss": 1.7466, "step": 3672 }, { "epoch": 0.9653496239366769, "grad_norm": 0.615967869758606, "learning_rate": 6.783774312248116e-05, "loss": 1.7208, "step": 3674 }, { "epoch": 0.9658751272703386, "grad_norm": 0.7129417061805725, "learning_rate": 6.782022078149641e-05, "loss": 1.7441, "step": 3676 }, { "epoch": 0.9664006306040004, "grad_norm": 0.6287668347358704, "learning_rate": 6.780269844051166e-05, "loss": 1.7413, "step": 3678 }, { "epoch": 0.9669261339376621, "grad_norm": 
0.7385637760162354, "learning_rate": 6.77851760995269e-05, "loss": 1.7569, "step": 3680 }, { "epoch": 0.967451637271324, "grad_norm": 0.6797763109207153, "learning_rate": 6.776765375854215e-05, "loss": 1.7417, "step": 3682 }, { "epoch": 0.9679771406049857, "grad_norm": 0.667385458946228, "learning_rate": 6.77501314175574e-05, "loss": 1.7554, "step": 3684 }, { "epoch": 0.9685026439386475, "grad_norm": 0.7547754645347595, "learning_rate": 6.773260907657263e-05, "loss": 1.6905, "step": 3686 }, { "epoch": 0.9690281472723092, "grad_norm": 0.6124489903450012, "learning_rate": 6.771508673558788e-05, "loss": 1.7276, "step": 3688 }, { "epoch": 0.969553650605971, "grad_norm": 0.6314408779144287, "learning_rate": 6.769756439460311e-05, "loss": 1.7375, "step": 3690 }, { "epoch": 0.9700791539396328, "grad_norm": 1.1786826848983765, "learning_rate": 6.768004205361836e-05, "loss": 1.7797, "step": 3692 }, { "epoch": 0.9706046572732946, "grad_norm": 0.6495144963264465, "learning_rate": 6.766251971263361e-05, "loss": 1.7767, "step": 3694 }, { "epoch": 0.9711301606069563, "grad_norm": 0.8259857892990112, "learning_rate": 6.764499737164886e-05, "loss": 1.7568, "step": 3696 }, { "epoch": 0.9716556639406181, "grad_norm": 0.9718241095542908, "learning_rate": 6.76274750306641e-05, "loss": 1.7382, "step": 3698 }, { "epoch": 0.97218116727428, "grad_norm": 0.7467637062072754, "learning_rate": 6.760995268967935e-05, "loss": 1.7168, "step": 3700 }, { "epoch": 0.9727066706079417, "grad_norm": 0.6963891386985779, "learning_rate": 6.759243034869459e-05, "loss": 1.75, "step": 3702 }, { "epoch": 0.9732321739416034, "grad_norm": 0.5882383584976196, "learning_rate": 6.757490800770983e-05, "loss": 1.7664, "step": 3704 }, { "epoch": 0.9737576772752652, "grad_norm": 0.5909221768379211, "learning_rate": 6.755738566672508e-05, "loss": 1.7569, "step": 3706 }, { "epoch": 0.9742831806089269, "grad_norm": 0.6651691794395447, "learning_rate": 6.753986332574033e-05, "loss": 1.7332, "step": 3708 }, { "epoch": 
0.9748086839425888, "grad_norm": 0.6473085880279541, "learning_rate": 6.752234098475558e-05, "loss": 1.7668, "step": 3710 }, { "epoch": 0.9753341872762505, "grad_norm": 0.6437013745307922, "learning_rate": 6.750481864377081e-05, "loss": 1.747, "step": 3712 }, { "epoch": 0.9758596906099123, "grad_norm": 0.6409528255462646, "learning_rate": 6.748729630278606e-05, "loss": 1.7105, "step": 3714 }, { "epoch": 0.976385193943574, "grad_norm": 0.7331600785255432, "learning_rate": 6.746977396180129e-05, "loss": 1.7328, "step": 3716 }, { "epoch": 0.9769106972772359, "grad_norm": 0.7643489837646484, "learning_rate": 6.745225162081654e-05, "loss": 1.7464, "step": 3718 }, { "epoch": 0.9774362006108976, "grad_norm": 0.8822628259658813, "learning_rate": 6.743472927983179e-05, "loss": 1.7681, "step": 3720 }, { "epoch": 0.9779617039445594, "grad_norm": 0.9610887765884399, "learning_rate": 6.741720693884703e-05, "loss": 1.7329, "step": 3722 }, { "epoch": 0.9784872072782211, "grad_norm": 0.7626636028289795, "learning_rate": 6.739968459786228e-05, "loss": 1.7161, "step": 3724 }, { "epoch": 0.979012710611883, "grad_norm": 0.6538355946540833, "learning_rate": 6.738216225687753e-05, "loss": 1.7716, "step": 3726 }, { "epoch": 0.9795382139455447, "grad_norm": 0.7273457050323486, "learning_rate": 6.736463991589276e-05, "loss": 1.7367, "step": 3728 }, { "epoch": 0.9800637172792065, "grad_norm": 0.7992196679115295, "learning_rate": 6.734711757490801e-05, "loss": 1.7294, "step": 3730 }, { "epoch": 0.9805892206128682, "grad_norm": 0.6318385601043701, "learning_rate": 6.732959523392326e-05, "loss": 1.7498, "step": 3732 }, { "epoch": 0.98111472394653, "grad_norm": 0.8519952893257141, "learning_rate": 6.73120728929385e-05, "loss": 1.7366, "step": 3734 }, { "epoch": 0.9816402272801918, "grad_norm": 0.7661817073822021, "learning_rate": 6.729455055195374e-05, "loss": 1.7565, "step": 3736 }, { "epoch": 0.9821657306138536, "grad_norm": 0.5940839052200317, "learning_rate": 6.727702821096899e-05, "loss": 
1.7384, "step": 3738 }, { "epoch": 0.9826912339475153, "grad_norm": 0.6861841082572937, "learning_rate": 6.725950586998424e-05, "loss": 1.7129, "step": 3740 }, { "epoch": 0.9832167372811771, "grad_norm": 0.9989137649536133, "learning_rate": 6.724198352899947e-05, "loss": 1.7758, "step": 3742 }, { "epoch": 0.983742240614839, "grad_norm": 0.6522616147994995, "learning_rate": 6.722446118801472e-05, "loss": 1.7114, "step": 3744 }, { "epoch": 0.9842677439485007, "grad_norm": 0.6522443294525146, "learning_rate": 6.720693884702996e-05, "loss": 1.7375, "step": 3746 }, { "epoch": 0.9847932472821624, "grad_norm": 0.5641542077064514, "learning_rate": 6.718941650604521e-05, "loss": 1.7371, "step": 3748 }, { "epoch": 0.9853187506158242, "grad_norm": 0.5788604021072388, "learning_rate": 6.717189416506046e-05, "loss": 1.7184, "step": 3750 }, { "epoch": 0.9858442539494859, "grad_norm": 0.5921624302864075, "learning_rate": 6.715437182407571e-05, "loss": 1.7304, "step": 3752 }, { "epoch": 0.9863697572831478, "grad_norm": 0.6481342911720276, "learning_rate": 6.713684948309094e-05, "loss": 1.7838, "step": 3754 }, { "epoch": 0.9868952606168095, "grad_norm": 0.6901116967201233, "learning_rate": 6.711932714210619e-05, "loss": 1.7189, "step": 3756 }, { "epoch": 0.9874207639504713, "grad_norm": 0.7430614233016968, "learning_rate": 6.710180480112144e-05, "loss": 1.7532, "step": 3758 }, { "epoch": 0.987946267284133, "grad_norm": 0.8305982947349548, "learning_rate": 6.708428246013668e-05, "loss": 1.7417, "step": 3760 }, { "epoch": 0.9884717706177949, "grad_norm": 0.801034152507782, "learning_rate": 6.706676011915192e-05, "loss": 1.7058, "step": 3762 }, { "epoch": 0.9889972739514566, "grad_norm": 0.6561713218688965, "learning_rate": 6.704923777816717e-05, "loss": 1.7347, "step": 3764 }, { "epoch": 0.9895227772851184, "grad_norm": 0.6591217517852783, "learning_rate": 6.703171543718241e-05, "loss": 1.7693, "step": 3766 }, { "epoch": 0.9900482806187801, "grad_norm": 0.6448448300361633, 
"learning_rate": 6.701419309619765e-05, "loss": 1.7327, "step": 3768 }, { "epoch": 0.990573783952442, "grad_norm": 0.7278120517730713, "learning_rate": 6.69966707552129e-05, "loss": 1.7049, "step": 3770 }, { "epoch": 0.9910992872861037, "grad_norm": 0.5780648589134216, "learning_rate": 6.697914841422814e-05, "loss": 1.7643, "step": 3772 }, { "epoch": 0.9916247906197655, "grad_norm": 0.599757969379425, "learning_rate": 6.696162607324339e-05, "loss": 1.7419, "step": 3774 }, { "epoch": 0.9921502939534272, "grad_norm": 0.6809077858924866, "learning_rate": 6.694410373225864e-05, "loss": 1.7638, "step": 3776 }, { "epoch": 0.992675797287089, "grad_norm": 0.7125533223152161, "learning_rate": 6.692658139127389e-05, "loss": 1.7598, "step": 3778 }, { "epoch": 0.9932013006207508, "grad_norm": 0.7388641834259033, "learning_rate": 6.690905905028912e-05, "loss": 1.7484, "step": 3780 }, { "epoch": 0.9937268039544126, "grad_norm": 0.622369647026062, "learning_rate": 6.689153670930437e-05, "loss": 1.7587, "step": 3782 }, { "epoch": 0.9942523072880743, "grad_norm": 0.7785460948944092, "learning_rate": 6.687401436831961e-05, "loss": 1.7575, "step": 3784 }, { "epoch": 0.9947778106217361, "grad_norm": 0.6789509057998657, "learning_rate": 6.685649202733486e-05, "loss": 1.7346, "step": 3786 }, { "epoch": 0.995303313955398, "grad_norm": 0.6360666155815125, "learning_rate": 6.68389696863501e-05, "loss": 1.7838, "step": 3788 }, { "epoch": 0.9958288172890597, "grad_norm": 0.6261754035949707, "learning_rate": 6.682144734536534e-05, "loss": 1.7563, "step": 3790 }, { "epoch": 0.9963543206227214, "grad_norm": 0.6586030125617981, "learning_rate": 6.680392500438059e-05, "loss": 1.7034, "step": 3792 }, { "epoch": 0.9968798239563832, "grad_norm": 0.7084933519363403, "learning_rate": 6.678640266339583e-05, "loss": 1.7995, "step": 3794 }, { "epoch": 0.9974053272900449, "grad_norm": 0.8571730256080627, "learning_rate": 6.676888032241107e-05, "loss": 1.7874, "step": 3796 }, { "epoch": 0.9979308306237068, 
"grad_norm": 0.6537512540817261, "learning_rate": 6.675135798142632e-05, "loss": 1.7672, "step": 3798 }, { "epoch": 0.9984563339573685, "grad_norm": 0.6276223659515381, "learning_rate": 6.673383564044157e-05, "loss": 1.7106, "step": 3800 }, { "epoch": 0.9989818372910303, "grad_norm": 0.6775636076927185, "learning_rate": 6.671631329945682e-05, "loss": 1.7228, "step": 3802 }, { "epoch": 0.999507340624692, "grad_norm": 0.776142954826355, "learning_rate": 6.669879095847206e-05, "loss": 1.7674, "step": 3804 }, { "epoch": 1.000032843958354, "grad_norm": 0.6469699740409851, "learning_rate": 6.66812686174873e-05, "loss": 1.7201, "step": 3806 }, { "epoch": 1.0005583472920156, "grad_norm": 0.6889809966087341, "learning_rate": 6.666374627650254e-05, "loss": 1.6785, "step": 3808 }, { "epoch": 1.0010838506256774, "grad_norm": 0.5873193740844727, "learning_rate": 6.664622393551779e-05, "loss": 1.6643, "step": 3810 }, { "epoch": 1.0016093539593391, "grad_norm": 0.6216626167297363, "learning_rate": 6.662870159453303e-05, "loss": 1.6835, "step": 3812 }, { "epoch": 1.002134857293001, "grad_norm": 0.621909499168396, "learning_rate": 6.661117925354827e-05, "loss": 1.6972, "step": 3814 }, { "epoch": 1.0026603606266626, "grad_norm": 0.6056811213493347, "learning_rate": 6.659365691256352e-05, "loss": 1.7327, "step": 3816 }, { "epoch": 1.0031858639603246, "grad_norm": 0.6430692076683044, "learning_rate": 6.657613457157877e-05, "loss": 1.7151, "step": 3818 }, { "epoch": 1.0037113672939864, "grad_norm": 0.6576054692268372, "learning_rate": 6.6558612230594e-05, "loss": 1.7058, "step": 3820 }, { "epoch": 1.004236870627648, "grad_norm": 0.666572093963623, "learning_rate": 6.654108988960925e-05, "loss": 1.6745, "step": 3822 }, { "epoch": 1.0047623739613099, "grad_norm": 0.873776376247406, "learning_rate": 6.65235675486245e-05, "loss": 1.7123, "step": 3824 }, { "epoch": 1.0052878772949716, "grad_norm": 0.6568595767021179, "learning_rate": 6.650604520763975e-05, "loss": 1.6813, "step": 3826 }, { 
"epoch": 1.0058133806286333, "grad_norm": 0.7071711421012878, "learning_rate": 6.648852286665499e-05, "loss": 1.6926, "step": 3828 }, { "epoch": 1.006338883962295, "grad_norm": 0.756188154220581, "learning_rate": 6.647100052567024e-05, "loss": 1.6939, "step": 3830 }, { "epoch": 1.0068643872959568, "grad_norm": 0.6261985301971436, "learning_rate": 6.645347818468547e-05, "loss": 1.6729, "step": 3832 }, { "epoch": 1.0073898906296186, "grad_norm": 0.6476467847824097, "learning_rate": 6.643595584370072e-05, "loss": 1.6905, "step": 3834 }, { "epoch": 1.0079153939632806, "grad_norm": 0.7380629777908325, "learning_rate": 6.641843350271597e-05, "loss": 1.6695, "step": 3836 }, { "epoch": 1.0084408972969423, "grad_norm": 0.7090455293655396, "learning_rate": 6.64009111617312e-05, "loss": 1.7271, "step": 3838 }, { "epoch": 1.008966400630604, "grad_norm": 0.5697006583213806, "learning_rate": 6.638338882074645e-05, "loss": 1.6869, "step": 3840 }, { "epoch": 1.0094919039642658, "grad_norm": 0.5765069127082825, "learning_rate": 6.63658664797617e-05, "loss": 1.7133, "step": 3842 }, { "epoch": 1.0100174072979275, "grad_norm": 0.7223833799362183, "learning_rate": 6.634834413877695e-05, "loss": 1.6566, "step": 3844 }, { "epoch": 1.0105429106315893, "grad_norm": 0.6132098436355591, "learning_rate": 6.633082179779218e-05, "loss": 1.7448, "step": 3846 }, { "epoch": 1.011068413965251, "grad_norm": 0.8053067922592163, "learning_rate": 6.631329945680743e-05, "loss": 1.6833, "step": 3848 }, { "epoch": 1.0115939172989128, "grad_norm": 0.6919856667518616, "learning_rate": 6.629577711582268e-05, "loss": 1.7138, "step": 3850 }, { "epoch": 1.0121194206325745, "grad_norm": 0.590743899345398, "learning_rate": 6.627825477483792e-05, "loss": 1.7058, "step": 3852 }, { "epoch": 1.0126449239662365, "grad_norm": 0.6320709586143494, "learning_rate": 6.626073243385317e-05, "loss": 1.6988, "step": 3854 }, { "epoch": 1.0131704272998983, "grad_norm": 0.5564618706703186, "learning_rate": 6.624321009286842e-05, 
"loss": 1.7098, "step": 3856 }, { "epoch": 1.01369593063356, "grad_norm": 0.5974157452583313, "learning_rate": 6.622568775188365e-05, "loss": 1.7168, "step": 3858 }, { "epoch": 1.0142214339672218, "grad_norm": 0.6708089709281921, "learning_rate": 6.62081654108989e-05, "loss": 1.6849, "step": 3860 }, { "epoch": 1.0147469373008835, "grad_norm": 0.6684040427207947, "learning_rate": 6.619064306991415e-05, "loss": 1.7061, "step": 3862 }, { "epoch": 1.0152724406345452, "grad_norm": 0.6342937350273132, "learning_rate": 6.617312072892938e-05, "loss": 1.689, "step": 3864 }, { "epoch": 1.015797943968207, "grad_norm": 0.5644361972808838, "learning_rate": 6.615559838794463e-05, "loss": 1.6784, "step": 3866 }, { "epoch": 1.0163234473018687, "grad_norm": 0.7777919173240662, "learning_rate": 6.613807604695988e-05, "loss": 1.6912, "step": 3868 }, { "epoch": 1.0168489506355305, "grad_norm": 0.7786663770675659, "learning_rate": 6.612055370597512e-05, "loss": 1.6968, "step": 3870 }, { "epoch": 1.0173744539691925, "grad_norm": 0.7163161039352417, "learning_rate": 6.610303136499036e-05, "loss": 1.7118, "step": 3872 }, { "epoch": 1.0178999573028542, "grad_norm": 0.731606662273407, "learning_rate": 6.60855090240056e-05, "loss": 1.6944, "step": 3874 }, { "epoch": 1.018425460636516, "grad_norm": 0.6335828900337219, "learning_rate": 6.606798668302085e-05, "loss": 1.7006, "step": 3876 }, { "epoch": 1.0189509639701777, "grad_norm": 0.7113467454910278, "learning_rate": 6.60504643420361e-05, "loss": 1.694, "step": 3878 }, { "epoch": 1.0194764673038395, "grad_norm": 0.84892338514328, "learning_rate": 6.603294200105135e-05, "loss": 1.7098, "step": 3880 }, { "epoch": 1.0200019706375012, "grad_norm": 0.6938359141349792, "learning_rate": 6.60154196600666e-05, "loss": 1.6912, "step": 3882 }, { "epoch": 1.020527473971163, "grad_norm": 0.6478989124298096, "learning_rate": 6.599789731908183e-05, "loss": 1.6992, "step": 3884 }, { "epoch": 1.0210529773048247, "grad_norm": 0.7370628118515015, 
"learning_rate": 6.598037497809708e-05, "loss": 1.6763, "step": 3886 }, { "epoch": 1.0215784806384864, "grad_norm": 0.6913176774978638, "learning_rate": 6.596285263711233e-05, "loss": 1.6808, "step": 3888 }, { "epoch": 1.0221039839721484, "grad_norm": 0.6637833118438721, "learning_rate": 6.594533029612756e-05, "loss": 1.679, "step": 3890 }, { "epoch": 1.0226294873058102, "grad_norm": 0.7522826194763184, "learning_rate": 6.59278079551428e-05, "loss": 1.6729, "step": 3892 }, { "epoch": 1.023154990639472, "grad_norm": 0.5955492258071899, "learning_rate": 6.591028561415805e-05, "loss": 1.6915, "step": 3894 }, { "epoch": 1.0236804939731337, "grad_norm": 0.6156378388404846, "learning_rate": 6.58927632731733e-05, "loss": 1.6887, "step": 3896 }, { "epoch": 1.0242059973067954, "grad_norm": 0.5954993963241577, "learning_rate": 6.587524093218854e-05, "loss": 1.7028, "step": 3898 }, { "epoch": 1.0247315006404571, "grad_norm": 0.6089223623275757, "learning_rate": 6.585771859120378e-05, "loss": 1.7172, "step": 3900 }, { "epoch": 1.025257003974119, "grad_norm": 0.6109156012535095, "learning_rate": 6.584019625021903e-05, "loss": 1.6883, "step": 3902 }, { "epoch": 1.0257825073077806, "grad_norm": 0.769751787185669, "learning_rate": 6.582267390923428e-05, "loss": 1.7029, "step": 3904 }, { "epoch": 1.0263080106414426, "grad_norm": 0.579433023929596, "learning_rate": 6.580515156824953e-05, "loss": 1.6728, "step": 3906 }, { "epoch": 1.0268335139751044, "grad_norm": 0.6194645166397095, "learning_rate": 6.578762922726477e-05, "loss": 1.6949, "step": 3908 }, { "epoch": 1.027359017308766, "grad_norm": 0.5495603680610657, "learning_rate": 6.577010688628001e-05, "loss": 1.7059, "step": 3910 }, { "epoch": 1.0278845206424279, "grad_norm": 0.7045862674713135, "learning_rate": 6.575258454529526e-05, "loss": 1.7098, "step": 3912 }, { "epoch": 1.0284100239760896, "grad_norm": 0.6708394885063171, "learning_rate": 6.573506220431049e-05, "loss": 1.6671, "step": 3914 }, { "epoch": 1.0289355273097514, 
"grad_norm": 0.6526671051979065, "learning_rate": 6.571753986332574e-05, "loss": 1.6645, "step": 3916 }, { "epoch": 1.029461030643413, "grad_norm": 0.5542386174201965, "learning_rate": 6.570001752234098e-05, "loss": 1.6694, "step": 3918 }, { "epoch": 1.0299865339770748, "grad_norm": 0.6871373057365417, "learning_rate": 6.568249518135623e-05, "loss": 1.7054, "step": 3920 }, { "epoch": 1.0305120373107366, "grad_norm": 0.6650441288948059, "learning_rate": 6.566497284037148e-05, "loss": 1.7034, "step": 3922 }, { "epoch": 1.0310375406443986, "grad_norm": 1.027212142944336, "learning_rate": 6.564745049938671e-05, "loss": 1.6684, "step": 3924 }, { "epoch": 1.0315630439780603, "grad_norm": 0.7262475490570068, "learning_rate": 6.562992815840196e-05, "loss": 1.6994, "step": 3926 }, { "epoch": 1.032088547311722, "grad_norm": 0.697229266166687, "learning_rate": 6.561240581741721e-05, "loss": 1.6627, "step": 3928 }, { "epoch": 1.0326140506453838, "grad_norm": 0.6965095400810242, "learning_rate": 6.559488347643246e-05, "loss": 1.7078, "step": 3930 }, { "epoch": 1.0331395539790456, "grad_norm": 0.6743383407592773, "learning_rate": 6.55773611354477e-05, "loss": 1.6954, "step": 3932 }, { "epoch": 1.0336650573127073, "grad_norm": 0.7088636159896851, "learning_rate": 6.555983879446295e-05, "loss": 1.7079, "step": 3934 }, { "epoch": 1.034190560646369, "grad_norm": 0.9612395763397217, "learning_rate": 6.554231645347819e-05, "loss": 1.7034, "step": 3936 }, { "epoch": 1.0347160639800308, "grad_norm": 0.644659161567688, "learning_rate": 6.552479411249343e-05, "loss": 1.6736, "step": 3938 }, { "epoch": 1.0352415673136925, "grad_norm": 0.6379082202911377, "learning_rate": 6.550727177150867e-05, "loss": 1.6999, "step": 3940 }, { "epoch": 1.0357670706473545, "grad_norm": 0.8248457908630371, "learning_rate": 6.548974943052391e-05, "loss": 1.7085, "step": 3942 }, { "epoch": 1.0362925739810163, "grad_norm": 0.9295015335083008, "learning_rate": 6.547222708953916e-05, "loss": 1.6797, "step": 3944 
}, { "epoch": 1.036818077314678, "grad_norm": 0.614661693572998, "learning_rate": 6.545470474855441e-05, "loss": 1.6928, "step": 3946 }, { "epoch": 1.0373435806483398, "grad_norm": 0.721056342124939, "learning_rate": 6.543718240756966e-05, "loss": 1.6783, "step": 3948 }, { "epoch": 1.0378690839820015, "grad_norm": 1.0733940601348877, "learning_rate": 6.54196600665849e-05, "loss": 1.7053, "step": 3950 }, { "epoch": 1.0383945873156633, "grad_norm": 0.6472097039222717, "learning_rate": 6.540213772560014e-05, "loss": 1.7327, "step": 3952 }, { "epoch": 1.038920090649325, "grad_norm": 0.7506822943687439, "learning_rate": 6.538461538461539e-05, "loss": 1.6703, "step": 3954 }, { "epoch": 1.0394455939829867, "grad_norm": 0.8442516326904297, "learning_rate": 6.536709304363063e-05, "loss": 1.695, "step": 3956 }, { "epoch": 1.0399710973166485, "grad_norm": 0.7090259790420532, "learning_rate": 6.534957070264588e-05, "loss": 1.6954, "step": 3958 }, { "epoch": 1.0404966006503105, "grad_norm": 0.8459334969520569, "learning_rate": 6.533204836166113e-05, "loss": 1.6729, "step": 3960 }, { "epoch": 1.0410221039839722, "grad_norm": 0.8890243172645569, "learning_rate": 6.531452602067638e-05, "loss": 1.6778, "step": 3962 }, { "epoch": 1.041547607317634, "grad_norm": 0.9002764821052551, "learning_rate": 6.529700367969161e-05, "loss": 1.6651, "step": 3964 }, { "epoch": 1.0420731106512957, "grad_norm": 0.7871319055557251, "learning_rate": 6.527948133870684e-05, "loss": 1.7182, "step": 3966 }, { "epoch": 1.0425986139849575, "grad_norm": 1.2501089572906494, "learning_rate": 6.526195899772209e-05, "loss": 1.7056, "step": 3968 }, { "epoch": 1.0431241173186192, "grad_norm": 0.8261802792549133, "learning_rate": 6.524443665673734e-05, "loss": 1.7063, "step": 3970 }, { "epoch": 1.043649620652281, "grad_norm": 0.8968937993049622, "learning_rate": 6.522691431575259e-05, "loss": 1.6993, "step": 3972 }, { "epoch": 1.0441751239859427, "grad_norm": 0.8691303730010986, "learning_rate": 
6.520939197476783e-05, "loss": 1.6731, "step": 3974 }, { "epoch": 1.0447006273196044, "grad_norm": 0.9870227575302124, "learning_rate": 6.519186963378308e-05, "loss": 1.6689, "step": 3976 }, { "epoch": 1.0452261306532664, "grad_norm": 0.6180405616760254, "learning_rate": 6.517434729279832e-05, "loss": 1.69, "step": 3978 }, { "epoch": 1.0457516339869282, "grad_norm": 0.6060590147972107, "learning_rate": 6.515682495181356e-05, "loss": 1.7048, "step": 3980 }, { "epoch": 1.04627713732059, "grad_norm": 0.6996809244155884, "learning_rate": 6.513930261082881e-05, "loss": 1.693, "step": 3982 }, { "epoch": 1.0468026406542517, "grad_norm": 0.8016669154167175, "learning_rate": 6.512178026984406e-05, "loss": 1.7072, "step": 3984 }, { "epoch": 1.0473281439879134, "grad_norm": 0.6650173664093018, "learning_rate": 6.51042579288593e-05, "loss": 1.6923, "step": 3986 }, { "epoch": 1.0478536473215752, "grad_norm": 0.921610951423645, "learning_rate": 6.508673558787455e-05, "loss": 1.6857, "step": 3988 }, { "epoch": 1.048379150655237, "grad_norm": 0.5630477666854858, "learning_rate": 6.506921324688979e-05, "loss": 1.6831, "step": 3990 }, { "epoch": 1.0489046539888986, "grad_norm": 0.7276068329811096, "learning_rate": 6.505169090590502e-05, "loss": 1.6785, "step": 3992 }, { "epoch": 1.0494301573225606, "grad_norm": 0.7698312997817993, "learning_rate": 6.503416856492027e-05, "loss": 1.6842, "step": 3994 }, { "epoch": 1.0499556606562224, "grad_norm": 0.8240602016448975, "learning_rate": 6.501664622393552e-05, "loss": 1.6815, "step": 3996 }, { "epoch": 1.0504811639898841, "grad_norm": 0.6555003523826599, "learning_rate": 6.499912388295076e-05, "loss": 1.6668, "step": 3998 }, { "epoch": 1.0510066673235459, "grad_norm": 0.7370006442070007, "learning_rate": 6.498160154196601e-05, "loss": 1.6993, "step": 4000 }, { "epoch": 1.0510066673235459, "eval_loss": 1.7162247896194458, "eval_runtime": 487.2234, "eval_samples_per_second": 249.965, "eval_steps_per_second": 31.246, "step": 4000 }, { 
"epoch": 1.0515321706572076, "grad_norm": 0.7542751431465149, "learning_rate": 6.496407920098126e-05, "loss": 1.6854, "step": 4002 }, { "epoch": 1.0520576739908694, "grad_norm": 0.7216155529022217, "learning_rate": 6.49465568599965e-05, "loss": 1.7008, "step": 4004 }, { "epoch": 1.052583177324531, "grad_norm": 0.6681018471717834, "learning_rate": 6.492903451901174e-05, "loss": 1.703, "step": 4006 }, { "epoch": 1.0531086806581929, "grad_norm": 0.8611218929290771, "learning_rate": 6.491151217802699e-05, "loss": 1.6686, "step": 4008 }, { "epoch": 1.0536341839918546, "grad_norm": 0.6838074922561646, "learning_rate": 6.489398983704224e-05, "loss": 1.7053, "step": 4010 }, { "epoch": 1.0541596873255166, "grad_norm": 0.6425184607505798, "learning_rate": 6.487646749605748e-05, "loss": 1.6621, "step": 4012 }, { "epoch": 1.0546851906591783, "grad_norm": 0.8689895272254944, "learning_rate": 6.485894515507273e-05, "loss": 1.6693, "step": 4014 }, { "epoch": 1.05521069399284, "grad_norm": 0.7122433185577393, "learning_rate": 6.484142281408797e-05, "loss": 1.6936, "step": 4016 }, { "epoch": 1.0557361973265018, "grad_norm": 0.7124624252319336, "learning_rate": 6.48239004731032e-05, "loss": 1.6915, "step": 4018 }, { "epoch": 1.0562617006601636, "grad_norm": 0.6151629686355591, "learning_rate": 6.480637813211845e-05, "loss": 1.6601, "step": 4020 }, { "epoch": 1.0567872039938253, "grad_norm": 0.5656692385673523, "learning_rate": 6.47888557911337e-05, "loss": 1.6651, "step": 4022 }, { "epoch": 1.057312707327487, "grad_norm": 0.6214647889137268, "learning_rate": 6.477133345014894e-05, "loss": 1.7278, "step": 4024 }, { "epoch": 1.0578382106611488, "grad_norm": 0.7187774777412415, "learning_rate": 6.475381110916419e-05, "loss": 1.682, "step": 4026 }, { "epoch": 1.0583637139948106, "grad_norm": 0.602172315120697, "learning_rate": 6.473628876817944e-05, "loss": 1.6712, "step": 4028 }, { "epoch": 1.0588892173284725, "grad_norm": 0.7032018303871155, "learning_rate": 6.471876642719467e-05, 
"loss": 1.7056, "step": 4030 }, { "epoch": 1.0594147206621343, "grad_norm": 0.720413088798523, "learning_rate": 6.470124408620992e-05, "loss": 1.6658, "step": 4032 }, { "epoch": 1.059940223995796, "grad_norm": 0.7536730170249939, "learning_rate": 6.468372174522517e-05, "loss": 1.6624, "step": 4034 }, { "epoch": 1.0604657273294578, "grad_norm": 0.6960994005203247, "learning_rate": 6.466619940424041e-05, "loss": 1.6805, "step": 4036 }, { "epoch": 1.0609912306631195, "grad_norm": 0.6354637145996094, "learning_rate": 6.464867706325566e-05, "loss": 1.6969, "step": 4038 }, { "epoch": 1.0615167339967813, "grad_norm": 0.659994900226593, "learning_rate": 6.463115472227091e-05, "loss": 1.7076, "step": 4040 }, { "epoch": 1.062042237330443, "grad_norm": 0.6840848326683044, "learning_rate": 6.461363238128614e-05, "loss": 1.7255, "step": 4042 }, { "epoch": 1.0625677406641048, "grad_norm": 0.8560720086097717, "learning_rate": 6.459611004030138e-05, "loss": 1.6797, "step": 4044 }, { "epoch": 1.0630932439977667, "grad_norm": 0.5914828181266785, "learning_rate": 6.457858769931663e-05, "loss": 1.6948, "step": 4046 }, { "epoch": 1.0636187473314285, "grad_norm": 0.9306698441505432, "learning_rate": 6.456106535833187e-05, "loss": 1.6749, "step": 4048 }, { "epoch": 1.0641442506650902, "grad_norm": 0.6383719444274902, "learning_rate": 6.454354301734712e-05, "loss": 1.7245, "step": 4050 }, { "epoch": 1.064669753998752, "grad_norm": 0.6287466287612915, "learning_rate": 6.452602067636237e-05, "loss": 1.6935, "step": 4052 }, { "epoch": 1.0651952573324137, "grad_norm": 0.7017025947570801, "learning_rate": 6.450849833537762e-05, "loss": 1.6792, "step": 4054 }, { "epoch": 1.0657207606660755, "grad_norm": 0.6088765859603882, "learning_rate": 6.449097599439285e-05, "loss": 1.6926, "step": 4056 }, { "epoch": 1.0662462639997372, "grad_norm": 0.6329763531684875, "learning_rate": 6.44734536534081e-05, "loss": 1.7088, "step": 4058 }, { "epoch": 1.066771767333399, "grad_norm": 0.6406139731407166, 
"learning_rate": 6.445593131242334e-05, "loss": 1.6847, "step": 4060 }, { "epoch": 1.0672972706670607, "grad_norm": 0.5512668490409851, "learning_rate": 6.443840897143859e-05, "loss": 1.6572, "step": 4062 }, { "epoch": 1.0678227740007227, "grad_norm": 0.6363273859024048, "learning_rate": 6.442088663045384e-05, "loss": 1.7079, "step": 4064 }, { "epoch": 1.0683482773343844, "grad_norm": 0.5485667586326599, "learning_rate": 6.440336428946909e-05, "loss": 1.7357, "step": 4066 }, { "epoch": 1.0688737806680462, "grad_norm": 0.6467545628547668, "learning_rate": 6.438584194848432e-05, "loss": 1.6527, "step": 4068 }, { "epoch": 1.069399284001708, "grad_norm": 0.6813017129898071, "learning_rate": 6.436831960749956e-05, "loss": 1.6907, "step": 4070 }, { "epoch": 1.0699247873353697, "grad_norm": 0.7942006587982178, "learning_rate": 6.43507972665148e-05, "loss": 1.6908, "step": 4072 }, { "epoch": 1.0704502906690314, "grad_norm": 0.7022045254707336, "learning_rate": 6.433327492553005e-05, "loss": 1.6829, "step": 4074 }, { "epoch": 1.0709757940026932, "grad_norm": 0.581889271736145, "learning_rate": 6.43157525845453e-05, "loss": 1.6625, "step": 4076 }, { "epoch": 1.071501297336355, "grad_norm": 0.979885458946228, "learning_rate": 6.429823024356055e-05, "loss": 1.6818, "step": 4078 }, { "epoch": 1.0720268006700167, "grad_norm": 0.6776348352432251, "learning_rate": 6.428070790257579e-05, "loss": 1.6806, "step": 4080 }, { "epoch": 1.0725523040036786, "grad_norm": 0.8267763257026672, "learning_rate": 6.426318556159103e-05, "loss": 1.6728, "step": 4082 }, { "epoch": 1.0730778073373404, "grad_norm": 0.6556562781333923, "learning_rate": 6.424566322060627e-05, "loss": 1.6894, "step": 4084 }, { "epoch": 1.0736033106710021, "grad_norm": 0.809785783290863, "learning_rate": 6.422814087962152e-05, "loss": 1.6918, "step": 4086 }, { "epoch": 1.0741288140046639, "grad_norm": 0.5993219017982483, "learning_rate": 6.421061853863677e-05, "loss": 1.673, "step": 4088 }, { "epoch": 1.0746543173383256, 
"grad_norm": 0.6594780087471008, "learning_rate": 6.419309619765202e-05, "loss": 1.7099, "step": 4090 }, { "epoch": 1.0751798206719874, "grad_norm": 0.8069416284561157, "learning_rate": 6.417557385666726e-05, "loss": 1.7032, "step": 4092 }, { "epoch": 1.075705324005649, "grad_norm": 0.6383103728294373, "learning_rate": 6.41580515156825e-05, "loss": 1.6763, "step": 4094 }, { "epoch": 1.0762308273393109, "grad_norm": 0.622296929359436, "learning_rate": 6.414052917469773e-05, "loss": 1.6717, "step": 4096 }, { "epoch": 1.0767563306729726, "grad_norm": 0.5699704885482788, "learning_rate": 6.412300683371298e-05, "loss": 1.6684, "step": 4098 }, { "epoch": 1.0772818340066346, "grad_norm": 0.653029203414917, "learning_rate": 6.410548449272823e-05, "loss": 1.6898, "step": 4100 }, { "epoch": 1.0778073373402963, "grad_norm": 0.5569880604743958, "learning_rate": 6.408796215174348e-05, "loss": 1.6897, "step": 4102 }, { "epoch": 1.078332840673958, "grad_norm": 0.8268890380859375, "learning_rate": 6.407043981075872e-05, "loss": 1.691, "step": 4104 }, { "epoch": 1.0788583440076198, "grad_norm": 0.6143434643745422, "learning_rate": 6.405291746977397e-05, "loss": 1.7033, "step": 4106 }, { "epoch": 1.0793838473412816, "grad_norm": 0.6650940775871277, "learning_rate": 6.40353951287892e-05, "loss": 1.7149, "step": 4108 }, { "epoch": 1.0799093506749433, "grad_norm": 0.9318827986717224, "learning_rate": 6.401787278780445e-05, "loss": 1.7007, "step": 4110 }, { "epoch": 1.080434854008605, "grad_norm": 0.870758056640625, "learning_rate": 6.40003504468197e-05, "loss": 1.6928, "step": 4112 }, { "epoch": 1.0809603573422668, "grad_norm": 0.5847840309143066, "learning_rate": 6.398282810583495e-05, "loss": 1.7171, "step": 4114 }, { "epoch": 1.0814858606759286, "grad_norm": 0.6585195064544678, "learning_rate": 6.39653057648502e-05, "loss": 1.6983, "step": 4116 }, { "epoch": 1.0820113640095905, "grad_norm": 0.6032067537307739, "learning_rate": 6.394778342386543e-05, "loss": 1.7266, "step": 4118 }, { 
"epoch": 1.0825368673432523, "grad_norm": 0.7615604996681213, "learning_rate": 6.393026108288068e-05, "loss": 1.6821, "step": 4120 }, { "epoch": 1.083062370676914, "grad_norm": 0.6143482327461243, "learning_rate": 6.391273874189591e-05, "loss": 1.6857, "step": 4122 }, { "epoch": 1.0835878740105758, "grad_norm": 0.7933993935585022, "learning_rate": 6.389521640091116e-05, "loss": 1.6717, "step": 4124 }, { "epoch": 1.0841133773442375, "grad_norm": 0.6543474793434143, "learning_rate": 6.38776940599264e-05, "loss": 1.6959, "step": 4126 }, { "epoch": 1.0846388806778993, "grad_norm": 0.6127591133117676, "learning_rate": 6.386017171894165e-05, "loss": 1.6885, "step": 4128 }, { "epoch": 1.085164384011561, "grad_norm": 0.8532068133354187, "learning_rate": 6.38426493779569e-05, "loss": 1.6763, "step": 4130 }, { "epoch": 1.0856898873452228, "grad_norm": 0.5886075496673584, "learning_rate": 6.382512703697215e-05, "loss": 1.6857, "step": 4132 }, { "epoch": 1.0862153906788845, "grad_norm": 0.6927480697631836, "learning_rate": 6.380760469598738e-05, "loss": 1.6978, "step": 4134 }, { "epoch": 1.0867408940125465, "grad_norm": 0.5445473790168762, "learning_rate": 6.379008235500263e-05, "loss": 1.6985, "step": 4136 }, { "epoch": 1.0872663973462082, "grad_norm": 0.6567670702934265, "learning_rate": 6.377256001401788e-05, "loss": 1.6897, "step": 4138 }, { "epoch": 1.08779190067987, "grad_norm": 0.6687731742858887, "learning_rate": 6.375503767303312e-05, "loss": 1.688, "step": 4140 }, { "epoch": 1.0883174040135317, "grad_norm": 0.575955331325531, "learning_rate": 6.373751533204837e-05, "loss": 1.7204, "step": 4142 }, { "epoch": 1.0888429073471935, "grad_norm": 0.6957133412361145, "learning_rate": 6.37199929910636e-05, "loss": 1.6945, "step": 4144 }, { "epoch": 1.0893684106808552, "grad_norm": 0.7448277473449707, "learning_rate": 6.370247065007885e-05, "loss": 1.6953, "step": 4146 }, { "epoch": 1.089893914014517, "grad_norm": 0.6629153490066528, "learning_rate": 6.368494830909409e-05, 
"loss": 1.703, "step": 4148 }, { "epoch": 1.0904194173481787, "grad_norm": 0.7909244298934937, "learning_rate": 6.366742596810934e-05, "loss": 1.6915, "step": 4150 }, { "epoch": 1.0909449206818405, "grad_norm": 0.5901594161987305, "learning_rate": 6.364990362712458e-05, "loss": 1.6747, "step": 4152 }, { "epoch": 1.0914704240155024, "grad_norm": 0.6351743340492249, "learning_rate": 6.363238128613983e-05, "loss": 1.6776, "step": 4154 }, { "epoch": 1.0919959273491642, "grad_norm": 0.7577309608459473, "learning_rate": 6.361485894515508e-05, "loss": 1.6941, "step": 4156 }, { "epoch": 1.092521430682826, "grad_norm": 0.8337988257408142, "learning_rate": 6.359733660417033e-05, "loss": 1.6985, "step": 4158 }, { "epoch": 1.0930469340164877, "grad_norm": 0.6406223177909851, "learning_rate": 6.357981426318556e-05, "loss": 1.6687, "step": 4160 }, { "epoch": 1.0935724373501494, "grad_norm": 0.5720377564430237, "learning_rate": 6.356229192220081e-05, "loss": 1.6844, "step": 4162 }, { "epoch": 1.0940979406838112, "grad_norm": 0.5843801498413086, "learning_rate": 6.354476958121606e-05, "loss": 1.7099, "step": 4164 }, { "epoch": 1.094623444017473, "grad_norm": 0.7225820422172546, "learning_rate": 6.35272472402313e-05, "loss": 1.6941, "step": 4166 }, { "epoch": 1.0951489473511347, "grad_norm": 0.6221319437026978, "learning_rate": 6.350972489924655e-05, "loss": 1.7102, "step": 4168 }, { "epoch": 1.0956744506847966, "grad_norm": 0.6108186841011047, "learning_rate": 6.349220255826178e-05, "loss": 1.688, "step": 4170 }, { "epoch": 1.0961999540184584, "grad_norm": 0.6956847906112671, "learning_rate": 6.347468021727703e-05, "loss": 1.67, "step": 4172 }, { "epoch": 1.0967254573521201, "grad_norm": 0.616248369216919, "learning_rate": 6.345715787629227e-05, "loss": 1.7036, "step": 4174 }, { "epoch": 1.0972509606857819, "grad_norm": 0.6128188371658325, "learning_rate": 6.343963553530751e-05, "loss": 1.6961, "step": 4176 }, { "epoch": 1.0977764640194436, "grad_norm": 0.6509534120559692, 
"learning_rate": 6.342211319432276e-05, "loss": 1.7298, "step": 4178 }, { "epoch": 1.0983019673531054, "grad_norm": 0.8290244340896606, "learning_rate": 6.340459085333801e-05, "loss": 1.7022, "step": 4180 }, { "epoch": 1.0988274706867671, "grad_norm": 0.8365241289138794, "learning_rate": 6.338706851235326e-05, "loss": 1.7145, "step": 4182 }, { "epoch": 1.0993529740204289, "grad_norm": 0.6763657927513123, "learning_rate": 6.33695461713685e-05, "loss": 1.7356, "step": 4184 }, { "epoch": 1.0998784773540906, "grad_norm": 0.6510267853736877, "learning_rate": 6.335202383038374e-05, "loss": 1.7102, "step": 4186 }, { "epoch": 1.1004039806877526, "grad_norm": 0.8549639582633972, "learning_rate": 6.333450148939899e-05, "loss": 1.7083, "step": 4188 }, { "epoch": 1.1009294840214143, "grad_norm": 0.5909300446510315, "learning_rate": 6.331697914841423e-05, "loss": 1.7224, "step": 4190 }, { "epoch": 1.101454987355076, "grad_norm": 0.5720754265785217, "learning_rate": 6.329945680742948e-05, "loss": 1.6936, "step": 4192 }, { "epoch": 1.1019804906887378, "grad_norm": 0.8893141150474548, "learning_rate": 6.328193446644473e-05, "loss": 1.6726, "step": 4194 }, { "epoch": 1.1025059940223996, "grad_norm": 0.5539205074310303, "learning_rate": 6.326441212545996e-05, "loss": 1.6461, "step": 4196 }, { "epoch": 1.1030314973560613, "grad_norm": 0.8009784817695618, "learning_rate": 6.324688978447521e-05, "loss": 1.7247, "step": 4198 }, { "epoch": 1.103557000689723, "grad_norm": 0.6527197957038879, "learning_rate": 6.322936744349046e-05, "loss": 1.7007, "step": 4200 }, { "epoch": 1.1040825040233848, "grad_norm": 0.6934798955917358, "learning_rate": 6.321184510250569e-05, "loss": 1.6944, "step": 4202 }, { "epoch": 1.1046080073570468, "grad_norm": 0.6967670917510986, "learning_rate": 6.319432276152094e-05, "loss": 1.6937, "step": 4204 }, { "epoch": 1.1051335106907085, "grad_norm": 0.7582138776779175, "learning_rate": 6.317680042053619e-05, "loss": 1.703, "step": 4206 }, { "epoch": 
1.1056590140243703, "grad_norm": 0.6267343759536743, "learning_rate": 6.315927807955143e-05, "loss": 1.6888, "step": 4208 }, { "epoch": 1.106184517358032, "grad_norm": 0.6019650101661682, "learning_rate": 6.314175573856668e-05, "loss": 1.7076, "step": 4210 }, { "epoch": 1.1067100206916938, "grad_norm": 0.5411569476127625, "learning_rate": 6.312423339758193e-05, "loss": 1.6816, "step": 4212 }, { "epoch": 1.1072355240253555, "grad_norm": 1.0304468870162964, "learning_rate": 6.310671105659716e-05, "loss": 1.7147, "step": 4214 }, { "epoch": 1.1077610273590173, "grad_norm": 0.6413879990577698, "learning_rate": 6.308918871561241e-05, "loss": 1.6949, "step": 4216 }, { "epoch": 1.108286530692679, "grad_norm": 0.6234250664710999, "learning_rate": 6.307166637462766e-05, "loss": 1.6998, "step": 4218 }, { "epoch": 1.1088120340263408, "grad_norm": 0.5468757152557373, "learning_rate": 6.305414403364289e-05, "loss": 1.7209, "step": 4220 }, { "epoch": 1.1093375373600027, "grad_norm": 0.7098813056945801, "learning_rate": 6.303662169265814e-05, "loss": 1.6913, "step": 4222 }, { "epoch": 1.1098630406936645, "grad_norm": 0.7469329237937927, "learning_rate": 6.301909935167339e-05, "loss": 1.6945, "step": 4224 }, { "epoch": 1.1103885440273262, "grad_norm": 0.634135901927948, "learning_rate": 6.300157701068863e-05, "loss": 1.6777, "step": 4226 }, { "epoch": 1.110914047360988, "grad_norm": 0.670148491859436, "learning_rate": 6.298405466970387e-05, "loss": 1.6921, "step": 4228 }, { "epoch": 1.1114395506946497, "grad_norm": 0.6443579792976379, "learning_rate": 6.296653232871912e-05, "loss": 1.7338, "step": 4230 }, { "epoch": 1.1119650540283115, "grad_norm": 0.6890257000923157, "learning_rate": 6.294900998773436e-05, "loss": 1.7463, "step": 4232 }, { "epoch": 1.1124905573619732, "grad_norm": 0.5529821515083313, "learning_rate": 6.293148764674961e-05, "loss": 1.6557, "step": 4234 }, { "epoch": 1.113016060695635, "grad_norm": 0.7622890472412109, "learning_rate": 6.291396530576486e-05, "loss": 
1.7009, "step": 4236 }, { "epoch": 1.1135415640292967, "grad_norm": 0.6255015730857849, "learning_rate": 6.28964429647801e-05, "loss": 1.6922, "step": 4238 }, { "epoch": 1.1140670673629587, "grad_norm": 0.5990403890609741, "learning_rate": 6.287892062379534e-05, "loss": 1.7081, "step": 4240 }, { "epoch": 1.1145925706966204, "grad_norm": 0.6255038976669312, "learning_rate": 6.286139828281059e-05, "loss": 1.6878, "step": 4242 }, { "epoch": 1.1151180740302822, "grad_norm": 0.6530934572219849, "learning_rate": 6.284387594182584e-05, "loss": 1.6948, "step": 4244 }, { "epoch": 1.115643577363944, "grad_norm": 0.6111620664596558, "learning_rate": 6.282635360084107e-05, "loss": 1.6752, "step": 4246 }, { "epoch": 1.1161690806976057, "grad_norm": 0.7106771469116211, "learning_rate": 6.280883125985632e-05, "loss": 1.6941, "step": 4248 }, { "epoch": 1.1166945840312674, "grad_norm": 0.6513094902038574, "learning_rate": 6.279130891887156e-05, "loss": 1.6937, "step": 4250 }, { "epoch": 1.1172200873649292, "grad_norm": 0.6989749670028687, "learning_rate": 6.277378657788681e-05, "loss": 1.7048, "step": 4252 }, { "epoch": 1.117745590698591, "grad_norm": 0.6800375580787659, "learning_rate": 6.275626423690205e-05, "loss": 1.6896, "step": 4254 }, { "epoch": 1.1182710940322527, "grad_norm": 0.5913258790969849, "learning_rate": 6.27387418959173e-05, "loss": 1.7218, "step": 4256 }, { "epoch": 1.1187965973659146, "grad_norm": 0.6066367030143738, "learning_rate": 6.272121955493254e-05, "loss": 1.7018, "step": 4258 }, { "epoch": 1.1193221006995764, "grad_norm": 0.6343132853507996, "learning_rate": 6.270369721394779e-05, "loss": 1.6942, "step": 4260 }, { "epoch": 1.1198476040332381, "grad_norm": 0.72137451171875, "learning_rate": 6.268617487296304e-05, "loss": 1.6996, "step": 4262 }, { "epoch": 1.1203731073668999, "grad_norm": 0.6029708981513977, "learning_rate": 6.266865253197828e-05, "loss": 1.7288, "step": 4264 }, { "epoch": 1.1208986107005616, "grad_norm": 0.5431277751922607, 
"learning_rate": 6.265113019099352e-05, "loss": 1.677, "step": 4266 }, { "epoch": 1.1214241140342234, "grad_norm": 0.5661784410476685, "learning_rate": 6.263360785000877e-05, "loss": 1.7043, "step": 4268 }, { "epoch": 1.1219496173678851, "grad_norm": 0.611405074596405, "learning_rate": 6.261608550902401e-05, "loss": 1.6822, "step": 4270 }, { "epoch": 1.1224751207015469, "grad_norm": 0.6432753205299377, "learning_rate": 6.259856316803925e-05, "loss": 1.7392, "step": 4272 }, { "epoch": 1.1230006240352086, "grad_norm": 0.6208961606025696, "learning_rate": 6.25810408270545e-05, "loss": 1.649, "step": 4274 }, { "epoch": 1.1235261273688706, "grad_norm": 0.628790020942688, "learning_rate": 6.256351848606974e-05, "loss": 1.6912, "step": 4276 }, { "epoch": 1.1240516307025323, "grad_norm": 0.6716508269309998, "learning_rate": 6.254599614508499e-05, "loss": 1.7001, "step": 4278 }, { "epoch": 1.124577134036194, "grad_norm": 0.7140381932258606, "learning_rate": 6.252847380410022e-05, "loss": 1.6913, "step": 4280 }, { "epoch": 1.1251026373698558, "grad_norm": 0.6927066445350647, "learning_rate": 6.251095146311547e-05, "loss": 1.7051, "step": 4282 }, { "epoch": 1.1256281407035176, "grad_norm": 0.6016809940338135, "learning_rate": 6.249342912213072e-05, "loss": 1.6734, "step": 4284 }, { "epoch": 1.1261536440371793, "grad_norm": 0.6632773876190186, "learning_rate": 6.247590678114597e-05, "loss": 1.6911, "step": 4286 }, { "epoch": 1.126679147370841, "grad_norm": 0.6406089663505554, "learning_rate": 6.245838444016121e-05, "loss": 1.6867, "step": 4288 }, { "epoch": 1.1272046507045028, "grad_norm": 0.6394585371017456, "learning_rate": 6.244086209917646e-05, "loss": 1.7197, "step": 4290 }, { "epoch": 1.1277301540381646, "grad_norm": 0.9273669719696045, "learning_rate": 6.24233397581917e-05, "loss": 1.6678, "step": 4292 }, { "epoch": 1.1282556573718265, "grad_norm": 0.703338086605072, "learning_rate": 6.240581741720694e-05, "loss": 1.6839, "step": 4294 }, { "epoch": 1.1287811607054883, 
"grad_norm": 0.788475751876831, "learning_rate": 6.238829507622218e-05, "loss": 1.6906, "step": 4296 }, { "epoch": 1.12930666403915, "grad_norm": 0.9603837728500366, "learning_rate": 6.237077273523742e-05, "loss": 1.6735, "step": 4298 }, { "epoch": 1.1298321673728118, "grad_norm": 0.7163801789283752, "learning_rate": 6.235325039425267e-05, "loss": 1.6773, "step": 4300 }, { "epoch": 1.1303576707064735, "grad_norm": 0.8321335911750793, "learning_rate": 6.233572805326792e-05, "loss": 1.6836, "step": 4302 }, { "epoch": 1.1308831740401353, "grad_norm": 0.6714113354682922, "learning_rate": 6.231820571228317e-05, "loss": 1.6949, "step": 4304 }, { "epoch": 1.131408677373797, "grad_norm": 0.9358461499214172, "learning_rate": 6.23006833712984e-05, "loss": 1.6965, "step": 4306 }, { "epoch": 1.1319341807074588, "grad_norm": 0.8378779292106628, "learning_rate": 6.228316103031365e-05, "loss": 1.697, "step": 4308 }, { "epoch": 1.1324596840411205, "grad_norm": 0.5909398198127747, "learning_rate": 6.22656386893289e-05, "loss": 1.7349, "step": 4310 }, { "epoch": 1.1329851873747825, "grad_norm": 0.5922428965568542, "learning_rate": 6.224811634834414e-05, "loss": 1.6598, "step": 4312 }, { "epoch": 1.1335106907084442, "grad_norm": 0.7783740162849426, "learning_rate": 6.223059400735939e-05, "loss": 1.6822, "step": 4314 }, { "epoch": 1.134036194042106, "grad_norm": 0.5940402746200562, "learning_rate": 6.221307166637464e-05, "loss": 1.7093, "step": 4316 }, { "epoch": 1.1345616973757677, "grad_norm": 0.7679793834686279, "learning_rate": 6.219554932538987e-05, "loss": 1.6886, "step": 4318 }, { "epoch": 1.1350872007094295, "grad_norm": 0.624814510345459, "learning_rate": 6.217802698440512e-05, "loss": 1.684, "step": 4320 }, { "epoch": 1.1356127040430912, "grad_norm": 0.87510085105896, "learning_rate": 6.216050464342036e-05, "loss": 1.7228, "step": 4322 }, { "epoch": 1.136138207376753, "grad_norm": 0.6720306277275085, "learning_rate": 6.21429823024356e-05, "loss": 1.6675, "step": 4324 }, { 
"epoch": 1.136663710710415, "grad_norm": 0.6201871037483215, "learning_rate": 6.212545996145085e-05, "loss": 1.708, "step": 4326 }, { "epoch": 1.1371892140440765, "grad_norm": 0.6067994832992554, "learning_rate": 6.21079376204661e-05, "loss": 1.6825, "step": 4328 }, { "epoch": 1.1377147173777384, "grad_norm": 0.5989435315132141, "learning_rate": 6.209041527948135e-05, "loss": 1.6796, "step": 4330 }, { "epoch": 1.1382402207114002, "grad_norm": 0.6668772101402283, "learning_rate": 6.207289293849658e-05, "loss": 1.6989, "step": 4332 }, { "epoch": 1.138765724045062, "grad_norm": 0.6205788850784302, "learning_rate": 6.205537059751183e-05, "loss": 1.6787, "step": 4334 }, { "epoch": 1.1392912273787237, "grad_norm": 0.7359316349029541, "learning_rate": 6.203784825652707e-05, "loss": 1.6806, "step": 4336 }, { "epoch": 1.1398167307123854, "grad_norm": 0.5981895923614502, "learning_rate": 6.202032591554232e-05, "loss": 1.6804, "step": 4338 }, { "epoch": 1.1403422340460472, "grad_norm": 0.6678347587585449, "learning_rate": 6.200280357455757e-05, "loss": 1.6966, "step": 4340 }, { "epoch": 1.140867737379709, "grad_norm": 0.691186249256134, "learning_rate": 6.198528123357282e-05, "loss": 1.6746, "step": 4342 }, { "epoch": 1.141393240713371, "grad_norm": 0.6158877611160278, "learning_rate": 6.196775889258805e-05, "loss": 1.6968, "step": 4344 }, { "epoch": 1.1419187440470326, "grad_norm": 0.6359262466430664, "learning_rate": 6.19502365516033e-05, "loss": 1.6768, "step": 4346 }, { "epoch": 1.1424442473806944, "grad_norm": 0.5825957655906677, "learning_rate": 6.193271421061853e-05, "loss": 1.6953, "step": 4348 }, { "epoch": 1.1429697507143561, "grad_norm": 0.6011431217193604, "learning_rate": 6.191519186963378e-05, "loss": 1.6895, "step": 4350 }, { "epoch": 1.1434952540480179, "grad_norm": 0.5862739086151123, "learning_rate": 6.189766952864903e-05, "loss": 1.6716, "step": 4352 }, { "epoch": 1.1440207573816796, "grad_norm": 0.6541095972061157, "learning_rate": 6.188014718766428e-05, 
"loss": 1.6748, "step": 4354 }, { "epoch": 1.1445462607153414, "grad_norm": 0.6046082973480225, "learning_rate": 6.186262484667952e-05, "loss": 1.6677, "step": 4356 }, { "epoch": 1.1450717640490031, "grad_norm": 0.5532662868499756, "learning_rate": 6.184510250569476e-05, "loss": 1.6567, "step": 4358 }, { "epoch": 1.1455972673826649, "grad_norm": 0.6711378693580627, "learning_rate": 6.182758016471e-05, "loss": 1.6731, "step": 4360 }, { "epoch": 1.1461227707163268, "grad_norm": 0.6071760654449463, "learning_rate": 6.181005782372525e-05, "loss": 1.6749, "step": 4362 }, { "epoch": 1.1466482740499886, "grad_norm": 0.6723074913024902, "learning_rate": 6.17925354827405e-05, "loss": 1.6757, "step": 4364 }, { "epoch": 1.1471737773836503, "grad_norm": 0.649804949760437, "learning_rate": 6.177501314175575e-05, "loss": 1.6809, "step": 4366 }, { "epoch": 1.147699280717312, "grad_norm": 0.6812226176261902, "learning_rate": 6.1757490800771e-05, "loss": 1.694, "step": 4368 }, { "epoch": 1.1482247840509738, "grad_norm": 0.6040016412734985, "learning_rate": 6.173996845978623e-05, "loss": 1.7262, "step": 4370 }, { "epoch": 1.1487502873846356, "grad_norm": 0.652336597442627, "learning_rate": 6.172244611880148e-05, "loss": 1.6702, "step": 4372 }, { "epoch": 1.1492757907182973, "grad_norm": 0.5667666792869568, "learning_rate": 6.170492377781671e-05, "loss": 1.6873, "step": 4374 }, { "epoch": 1.149801294051959, "grad_norm": 0.6907036304473877, "learning_rate": 6.168740143683196e-05, "loss": 1.7076, "step": 4376 }, { "epoch": 1.1503267973856208, "grad_norm": 0.5834628939628601, "learning_rate": 6.16698790958472e-05, "loss": 1.6956, "step": 4378 }, { "epoch": 1.1508523007192828, "grad_norm": 0.6255402565002441, "learning_rate": 6.165235675486245e-05, "loss": 1.7165, "step": 4380 }, { "epoch": 1.1513778040529445, "grad_norm": 0.6630857586860657, "learning_rate": 6.16348344138777e-05, "loss": 1.6898, "step": 4382 }, { "epoch": 1.1519033073866063, "grad_norm": 0.6802171468734741, 
"learning_rate": 6.161731207289293e-05, "loss": 1.6946, "step": 4384 }, { "epoch": 1.152428810720268, "grad_norm": 0.5508185029029846, "learning_rate": 6.159978973190818e-05, "loss": 1.676, "step": 4386 }, { "epoch": 1.1529543140539298, "grad_norm": 0.5373386740684509, "learning_rate": 6.158226739092343e-05, "loss": 1.6595, "step": 4388 }, { "epoch": 1.1534798173875915, "grad_norm": 0.5505905747413635, "learning_rate": 6.156474504993868e-05, "loss": 1.679, "step": 4390 }, { "epoch": 1.1540053207212533, "grad_norm": 0.5813027620315552, "learning_rate": 6.154722270895392e-05, "loss": 1.6824, "step": 4392 }, { "epoch": 1.154530824054915, "grad_norm": 0.6369917392730713, "learning_rate": 6.152970036796917e-05, "loss": 1.6704, "step": 4394 }, { "epoch": 1.1550563273885768, "grad_norm": 0.5745425224304199, "learning_rate": 6.15121780269844e-05, "loss": 1.6583, "step": 4396 }, { "epoch": 1.1555818307222387, "grad_norm": 0.6166395545005798, "learning_rate": 6.149465568599964e-05, "loss": 1.6866, "step": 4398 }, { "epoch": 1.1561073340559005, "grad_norm": 0.5606328248977661, "learning_rate": 6.147713334501489e-05, "loss": 1.6844, "step": 4400 }, { "epoch": 1.1561073340559005, "eval_loss": 1.7070621252059937, "eval_runtime": 487.2485, "eval_samples_per_second": 249.953, "eval_steps_per_second": 31.245, "step": 4400 }, { "epoch": 1.1566328373895622, "grad_norm": 0.6324471235275269, "learning_rate": 6.145961100403014e-05, "loss": 1.6756, "step": 4402 }, { "epoch": 1.157158340723224, "grad_norm": 0.66776043176651, "learning_rate": 6.144208866304538e-05, "loss": 1.7205, "step": 4404 }, { "epoch": 1.1576838440568857, "grad_norm": 0.5929023623466492, "learning_rate": 6.142456632206063e-05, "loss": 1.6974, "step": 4406 }, { "epoch": 1.1582093473905475, "grad_norm": 0.6727093458175659, "learning_rate": 6.140704398107588e-05, "loss": 1.6482, "step": 4408 }, { "epoch": 1.1587348507242092, "grad_norm": 0.5299540758132935, "learning_rate": 6.138952164009111e-05, "loss": 1.712, "step": 
4410 }, { "epoch": 1.159260354057871, "grad_norm": 0.6730796098709106, "learning_rate": 6.137199929910636e-05, "loss": 1.661, "step": 4412 }, { "epoch": 1.1597858573915327, "grad_norm": 0.6617345213890076, "learning_rate": 6.135447695812161e-05, "loss": 1.706, "step": 4414 }, { "epoch": 1.1603113607251947, "grad_norm": 0.5500732660293579, "learning_rate": 6.133695461713686e-05, "loss": 1.7226, "step": 4416 }, { "epoch": 1.1608368640588564, "grad_norm": 0.6082700490951538, "learning_rate": 6.13194322761521e-05, "loss": 1.6724, "step": 4418 }, { "epoch": 1.1613623673925182, "grad_norm": 0.6681337356567383, "learning_rate": 6.130190993516735e-05, "loss": 1.6395, "step": 4420 }, { "epoch": 1.16188787072618, "grad_norm": 0.6328011155128479, "learning_rate": 6.128438759418258e-05, "loss": 1.6802, "step": 4422 }, { "epoch": 1.1624133740598417, "grad_norm": 0.6065677404403687, "learning_rate": 6.126686525319782e-05, "loss": 1.6797, "step": 4424 }, { "epoch": 1.1629388773935034, "grad_norm": 0.5718259811401367, "learning_rate": 6.124934291221307e-05, "loss": 1.6686, "step": 4426 }, { "epoch": 1.1634643807271652, "grad_norm": 0.5725675821304321, "learning_rate": 6.123182057122831e-05, "loss": 1.7017, "step": 4428 }, { "epoch": 1.163989884060827, "grad_norm": 0.5983812808990479, "learning_rate": 6.121429823024356e-05, "loss": 1.6779, "step": 4430 }, { "epoch": 1.1645153873944887, "grad_norm": 0.5576106309890747, "learning_rate": 6.119677588925881e-05, "loss": 1.6843, "step": 4432 }, { "epoch": 1.1650408907281506, "grad_norm": 0.9009653329849243, "learning_rate": 6.117925354827406e-05, "loss": 1.6473, "step": 4434 }, { "epoch": 1.1655663940618124, "grad_norm": 0.7119090557098389, "learning_rate": 6.116173120728929e-05, "loss": 1.6738, "step": 4436 }, { "epoch": 1.1660918973954741, "grad_norm": 0.8985000848770142, "learning_rate": 6.114420886630454e-05, "loss": 1.6753, "step": 4438 }, { "epoch": 1.166617400729136, "grad_norm": 0.6928833723068237, "learning_rate": 
6.112668652531979e-05, "loss": 1.6922, "step": 4440 }, { "epoch": 1.1671429040627976, "grad_norm": 0.5745196342468262, "learning_rate": 6.110916418433503e-05, "loss": 1.6845, "step": 4442 }, { "epoch": 1.1676684073964594, "grad_norm": 0.6172153949737549, "learning_rate": 6.109164184335028e-05, "loss": 1.7065, "step": 4444 }, { "epoch": 1.1681939107301211, "grad_norm": 0.606926441192627, "learning_rate": 6.107411950236553e-05, "loss": 1.6673, "step": 4446 }, { "epoch": 1.1687194140637829, "grad_norm": 0.6731804609298706, "learning_rate": 6.105659716138076e-05, "loss": 1.7006, "step": 4448 }, { "epoch": 1.1692449173974446, "grad_norm": 0.8230140805244446, "learning_rate": 6.103907482039601e-05, "loss": 1.694, "step": 4450 }, { "epoch": 1.1697704207311066, "grad_norm": 0.809855580329895, "learning_rate": 6.102155247941125e-05, "loss": 1.699, "step": 4452 }, { "epoch": 1.1702959240647683, "grad_norm": 0.6014560461044312, "learning_rate": 6.100403013842649e-05, "loss": 1.7296, "step": 4454 }, { "epoch": 1.17082142739843, "grad_norm": 0.6129987835884094, "learning_rate": 6.098650779744174e-05, "loss": 1.6596, "step": 4456 }, { "epoch": 1.1713469307320918, "grad_norm": 0.8771330118179321, "learning_rate": 6.0968985456456986e-05, "loss": 1.6907, "step": 4458 }, { "epoch": 1.1718724340657536, "grad_norm": 0.5508044362068176, "learning_rate": 6.095146311547223e-05, "loss": 1.7086, "step": 4460 }, { "epoch": 1.1723979373994153, "grad_norm": 0.7334650158882141, "learning_rate": 6.0933940774487474e-05, "loss": 1.6956, "step": 4462 }, { "epoch": 1.172923440733077, "grad_norm": 0.7739654183387756, "learning_rate": 6.091641843350272e-05, "loss": 1.6785, "step": 4464 }, { "epoch": 1.1734489440667388, "grad_norm": 0.6686828136444092, "learning_rate": 6.089889609251796e-05, "loss": 1.6571, "step": 4466 }, { "epoch": 1.1739744474004006, "grad_norm": 0.6591079831123352, "learning_rate": 6.088137375153321e-05, "loss": 1.6661, "step": 4468 }, { "epoch": 1.1744999507340625, "grad_norm": 
0.520408570766449, "learning_rate": 6.086385141054846e-05, "loss": 1.7076, "step": 4470 }, { "epoch": 1.1750254540677243, "grad_norm": 0.7765664458274841, "learning_rate": 6.08463290695637e-05, "loss": 1.6801, "step": 4472 }, { "epoch": 1.175550957401386, "grad_norm": 0.6031415462493896, "learning_rate": 6.0828806728578946e-05, "loss": 1.6999, "step": 4474 }, { "epoch": 1.1760764607350478, "grad_norm": 0.694837749004364, "learning_rate": 6.081128438759418e-05, "loss": 1.7186, "step": 4476 }, { "epoch": 1.1766019640687095, "grad_norm": 0.5900973081588745, "learning_rate": 6.079376204660943e-05, "loss": 1.6832, "step": 4478 }, { "epoch": 1.1771274674023713, "grad_norm": 0.5364487171173096, "learning_rate": 6.077623970562467e-05, "loss": 1.6756, "step": 4480 }, { "epoch": 1.177652970736033, "grad_norm": 0.5482999086380005, "learning_rate": 6.0758717364639916e-05, "loss": 1.697, "step": 4482 }, { "epoch": 1.178178474069695, "grad_norm": 0.7080986499786377, "learning_rate": 6.0741195023655164e-05, "loss": 1.7187, "step": 4484 }, { "epoch": 1.1787039774033565, "grad_norm": 0.6446167230606079, "learning_rate": 6.0723672682670405e-05, "loss": 1.6753, "step": 4486 }, { "epoch": 1.1792294807370185, "grad_norm": 0.6965610384941101, "learning_rate": 6.070615034168565e-05, "loss": 1.689, "step": 4488 }, { "epoch": 1.1797549840706802, "grad_norm": 0.5724831819534302, "learning_rate": 6.06886280007009e-05, "loss": 1.6872, "step": 4490 }, { "epoch": 1.180280487404342, "grad_norm": 0.7344499230384827, "learning_rate": 6.067110565971614e-05, "loss": 1.6421, "step": 4492 }, { "epoch": 1.1808059907380037, "grad_norm": 0.663684070110321, "learning_rate": 6.065358331873139e-05, "loss": 1.6909, "step": 4494 }, { "epoch": 1.1813314940716655, "grad_norm": 0.5467018485069275, "learning_rate": 6.0636060977746636e-05, "loss": 1.6707, "step": 4496 }, { "epoch": 1.1818569974053272, "grad_norm": 0.5809999704360962, "learning_rate": 6.0618538636761876e-05, "loss": 1.692, "step": 4498 }, { 
"epoch": 1.182382500738989, "grad_norm": 0.6746166944503784, "learning_rate": 6.060101629577711e-05, "loss": 1.6944, "step": 4500 }, { "epoch": 1.182908004072651, "grad_norm": 0.5990844368934631, "learning_rate": 6.058349395479236e-05, "loss": 1.703, "step": 4502 }, { "epoch": 1.1834335074063127, "grad_norm": 0.7285276651382446, "learning_rate": 6.0565971613807606e-05, "loss": 1.7037, "step": 4504 }, { "epoch": 1.1839590107399744, "grad_norm": 0.5541601181030273, "learning_rate": 6.0548449272822846e-05, "loss": 1.712, "step": 4506 }, { "epoch": 1.1844845140736362, "grad_norm": 0.5341848134994507, "learning_rate": 6.0530926931838094e-05, "loss": 1.6848, "step": 4508 }, { "epoch": 1.185010017407298, "grad_norm": 0.5783855319023132, "learning_rate": 6.051340459085334e-05, "loss": 1.6941, "step": 4510 }, { "epoch": 1.1855355207409597, "grad_norm": 0.7894533276557922, "learning_rate": 6.049588224986858e-05, "loss": 1.699, "step": 4512 }, { "epoch": 1.1860610240746214, "grad_norm": 0.6631985306739807, "learning_rate": 6.047835990888383e-05, "loss": 1.7063, "step": 4514 }, { "epoch": 1.1865865274082832, "grad_norm": 0.5982935428619385, "learning_rate": 6.046083756789908e-05, "loss": 1.6921, "step": 4516 }, { "epoch": 1.187112030741945, "grad_norm": 0.5341594219207764, "learning_rate": 6.044331522691432e-05, "loss": 1.7252, "step": 4518 }, { "epoch": 1.187637534075607, "grad_norm": 0.5512247681617737, "learning_rate": 6.0425792885929566e-05, "loss": 1.7033, "step": 4520 }, { "epoch": 1.1881630374092687, "grad_norm": 0.5994440913200378, "learning_rate": 6.040827054494481e-05, "loss": 1.7003, "step": 4522 }, { "epoch": 1.1886885407429304, "grad_norm": 0.6756665706634521, "learning_rate": 6.0390748203960054e-05, "loss": 1.6817, "step": 4524 }, { "epoch": 1.1892140440765921, "grad_norm": 0.6200160980224609, "learning_rate": 6.037322586297529e-05, "loss": 1.7068, "step": 4526 }, { "epoch": 1.189739547410254, "grad_norm": 0.5417330861091614, "learning_rate": 
6.0355703521990536e-05, "loss": 1.7199, "step": 4528 }, { "epoch": 1.1902650507439156, "grad_norm": 0.6741480231285095, "learning_rate": 6.033818118100578e-05, "loss": 1.6577, "step": 4530 }, { "epoch": 1.1907905540775774, "grad_norm": 0.667620062828064, "learning_rate": 6.0320658840021024e-05, "loss": 1.6835, "step": 4532 }, { "epoch": 1.1913160574112391, "grad_norm": 0.783929705619812, "learning_rate": 6.030313649903627e-05, "loss": 1.7028, "step": 4534 }, { "epoch": 1.1918415607449009, "grad_norm": 0.6510748267173767, "learning_rate": 6.028561415805152e-05, "loss": 1.6668, "step": 4536 }, { "epoch": 1.1923670640785629, "grad_norm": 0.5531290769577026, "learning_rate": 6.026809181706676e-05, "loss": 1.6691, "step": 4538 }, { "epoch": 1.1928925674122246, "grad_norm": 0.5906832218170166, "learning_rate": 6.025056947608201e-05, "loss": 1.6689, "step": 4540 }, { "epoch": 1.1934180707458864, "grad_norm": 0.580287516117096, "learning_rate": 6.0233047135097255e-05, "loss": 1.7105, "step": 4542 }, { "epoch": 1.193943574079548, "grad_norm": 0.6183664798736572, "learning_rate": 6.0215524794112496e-05, "loss": 1.7059, "step": 4544 }, { "epoch": 1.1944690774132098, "grad_norm": 0.5283176898956299, "learning_rate": 6.019800245312774e-05, "loss": 1.701, "step": 4546 }, { "epoch": 1.1949945807468716, "grad_norm": 0.5322774052619934, "learning_rate": 6.018048011214299e-05, "loss": 1.6725, "step": 4548 }, { "epoch": 1.1955200840805333, "grad_norm": 0.6731706857681274, "learning_rate": 6.016295777115823e-05, "loss": 1.6892, "step": 4550 }, { "epoch": 1.196045587414195, "grad_norm": 0.5544896125793457, "learning_rate": 6.0145435430173466e-05, "loss": 1.6925, "step": 4552 }, { "epoch": 1.1965710907478568, "grad_norm": 0.6916193962097168, "learning_rate": 6.012791308918871e-05, "loss": 1.7058, "step": 4554 }, { "epoch": 1.1970965940815188, "grad_norm": 0.6361691355705261, "learning_rate": 6.011039074820396e-05, "loss": 1.6505, "step": 4556 }, { "epoch": 1.1976220974151806, 
"grad_norm": 0.5423670411109924, "learning_rate": 6.00928684072192e-05, "loss": 1.6935, "step": 4558 }, { "epoch": 1.1981476007488423, "grad_norm": 0.737524688243866, "learning_rate": 6.007534606623445e-05, "loss": 1.7217, "step": 4560 }, { "epoch": 1.198673104082504, "grad_norm": 0.5811054110527039, "learning_rate": 6.00578237252497e-05, "loss": 1.6936, "step": 4562 }, { "epoch": 1.1991986074161658, "grad_norm": 0.554607093334198, "learning_rate": 6.004030138426494e-05, "loss": 1.6963, "step": 4564 }, { "epoch": 1.1997241107498275, "grad_norm": 0.6123262643814087, "learning_rate": 6.0022779043280185e-05, "loss": 1.6998, "step": 4566 }, { "epoch": 1.2002496140834893, "grad_norm": 0.7441146969795227, "learning_rate": 6.000525670229543e-05, "loss": 1.6764, "step": 4568 }, { "epoch": 1.200775117417151, "grad_norm": 0.6230175495147705, "learning_rate": 5.9987734361310673e-05, "loss": 1.6832, "step": 4570 }, { "epoch": 1.2013006207508128, "grad_norm": 0.5986462831497192, "learning_rate": 5.997021202032592e-05, "loss": 1.6863, "step": 4572 }, { "epoch": 1.2018261240844748, "grad_norm": 0.7910165786743164, "learning_rate": 5.995268967934117e-05, "loss": 1.6787, "step": 4574 }, { "epoch": 1.2023516274181365, "grad_norm": 0.5927668213844299, "learning_rate": 5.993516733835641e-05, "loss": 1.6826, "step": 4576 }, { "epoch": 1.2028771307517983, "grad_norm": 0.6229565143585205, "learning_rate": 5.991764499737165e-05, "loss": 1.6938, "step": 4578 }, { "epoch": 1.20340263408546, "grad_norm": 0.5543463826179504, "learning_rate": 5.990012265638689e-05, "loss": 1.6557, "step": 4580 }, { "epoch": 1.2039281374191217, "grad_norm": 0.6359471678733826, "learning_rate": 5.988260031540214e-05, "loss": 1.6984, "step": 4582 }, { "epoch": 1.2044536407527835, "grad_norm": 0.6143317222595215, "learning_rate": 5.9865077974417386e-05, "loss": 1.6618, "step": 4584 }, { "epoch": 1.2049791440864452, "grad_norm": 0.5732699036598206, "learning_rate": 5.984755563343263e-05, "loss": 1.7093, "step": 
4586 }, { "epoch": 1.205504647420107, "grad_norm": 0.6090834140777588, "learning_rate": 5.9830033292447874e-05, "loss": 1.6769, "step": 4588 }, { "epoch": 1.2060301507537687, "grad_norm": 0.5685290098190308, "learning_rate": 5.981251095146312e-05, "loss": 1.689, "step": 4590 }, { "epoch": 1.2065556540874307, "grad_norm": 0.641834557056427, "learning_rate": 5.979498861047836e-05, "loss": 1.6734, "step": 4592 }, { "epoch": 1.2070811574210925, "grad_norm": 0.6050872206687927, "learning_rate": 5.977746626949361e-05, "loss": 1.6887, "step": 4594 }, { "epoch": 1.2076066607547542, "grad_norm": 0.8347049951553345, "learning_rate": 5.975994392850886e-05, "loss": 1.7161, "step": 4596 }, { "epoch": 1.208132164088416, "grad_norm": 0.5846490859985352, "learning_rate": 5.97424215875241e-05, "loss": 1.7121, "step": 4598 }, { "epoch": 1.2086576674220777, "grad_norm": 0.8007673621177673, "learning_rate": 5.9724899246539346e-05, "loss": 1.6825, "step": 4600 }, { "epoch": 1.2091831707557394, "grad_norm": 0.7300782203674316, "learning_rate": 5.970737690555458e-05, "loss": 1.6842, "step": 4602 }, { "epoch": 1.2097086740894012, "grad_norm": 0.5872364044189453, "learning_rate": 5.968985456456983e-05, "loss": 1.6854, "step": 4604 }, { "epoch": 1.210234177423063, "grad_norm": 0.6040078997612, "learning_rate": 5.967233222358507e-05, "loss": 1.7254, "step": 4606 }, { "epoch": 1.2107596807567247, "grad_norm": 0.6352037787437439, "learning_rate": 5.9654809882600316e-05, "loss": 1.6559, "step": 4608 }, { "epoch": 1.2112851840903867, "grad_norm": 0.8045578002929688, "learning_rate": 5.9637287541615564e-05, "loss": 1.6984, "step": 4610 }, { "epoch": 1.2118106874240484, "grad_norm": 0.5676342248916626, "learning_rate": 5.9619765200630805e-05, "loss": 1.6881, "step": 4612 }, { "epoch": 1.2123361907577102, "grad_norm": 0.5795140862464905, "learning_rate": 5.960224285964605e-05, "loss": 1.6856, "step": 4614 }, { "epoch": 1.212861694091372, "grad_norm": 1.0082522630691528, "learning_rate": 
5.95847205186613e-05, "loss": 1.7146, "step": 4616 }, { "epoch": 1.2133871974250336, "grad_norm": 0.5609422922134399, "learning_rate": 5.956719817767654e-05, "loss": 1.708, "step": 4618 }, { "epoch": 1.2139127007586954, "grad_norm": 0.6195114850997925, "learning_rate": 5.954967583669179e-05, "loss": 1.6505, "step": 4620 }, { "epoch": 1.2144382040923571, "grad_norm": 0.7452388405799866, "learning_rate": 5.9532153495707036e-05, "loss": 1.6737, "step": 4622 }, { "epoch": 1.214963707426019, "grad_norm": 0.624179482460022, "learning_rate": 5.9514631154722276e-05, "loss": 1.6642, "step": 4624 }, { "epoch": 1.2154892107596806, "grad_norm": 0.5954496264457703, "learning_rate": 5.9497108813737524e-05, "loss": 1.6683, "step": 4626 }, { "epoch": 1.2160147140933426, "grad_norm": 0.6502910852432251, "learning_rate": 5.947958647275276e-05, "loss": 1.6515, "step": 4628 }, { "epoch": 1.2165402174270044, "grad_norm": 0.6815327405929565, "learning_rate": 5.9462064131768005e-05, "loss": 1.7016, "step": 4630 }, { "epoch": 1.217065720760666, "grad_norm": 0.6848923563957214, "learning_rate": 5.9444541790783246e-05, "loss": 1.6792, "step": 4632 }, { "epoch": 1.2175912240943279, "grad_norm": 0.5819631218910217, "learning_rate": 5.9427019449798494e-05, "loss": 1.6822, "step": 4634 }, { "epoch": 1.2181167274279896, "grad_norm": 0.5571796894073486, "learning_rate": 5.940949710881374e-05, "loss": 1.6894, "step": 4636 }, { "epoch": 1.2186422307616513, "grad_norm": 0.7528451681137085, "learning_rate": 5.939197476782898e-05, "loss": 1.6863, "step": 4638 }, { "epoch": 1.219167734095313, "grad_norm": 0.5486443042755127, "learning_rate": 5.937445242684423e-05, "loss": 1.7017, "step": 4640 }, { "epoch": 1.219693237428975, "grad_norm": 0.7798599004745483, "learning_rate": 5.935693008585948e-05, "loss": 1.7108, "step": 4642 }, { "epoch": 1.2202187407626366, "grad_norm": 0.6270305514335632, "learning_rate": 5.933940774487472e-05, "loss": 1.6979, "step": 4644 }, { "epoch": 1.2207442440962986, 
"grad_norm": 0.5500916838645935, "learning_rate": 5.9321885403889966e-05, "loss": 1.6832, "step": 4646 }, { "epoch": 1.2212697474299603, "grad_norm": 0.6205405592918396, "learning_rate": 5.930436306290521e-05, "loss": 1.7544, "step": 4648 }, { "epoch": 1.221795250763622, "grad_norm": 0.6169667840003967, "learning_rate": 5.9286840721920454e-05, "loss": 1.7024, "step": 4650 }, { "epoch": 1.2223207540972838, "grad_norm": 0.7890437841415405, "learning_rate": 5.92693183809357e-05, "loss": 1.6707, "step": 4652 }, { "epoch": 1.2228462574309455, "grad_norm": 0.6405115723609924, "learning_rate": 5.9251796039950936e-05, "loss": 1.6503, "step": 4654 }, { "epoch": 1.2233717607646073, "grad_norm": 0.6368988156318665, "learning_rate": 5.923427369896618e-05, "loss": 1.6963, "step": 4656 }, { "epoch": 1.223897264098269, "grad_norm": 0.7032740712165833, "learning_rate": 5.9216751357981424e-05, "loss": 1.6675, "step": 4658 }, { "epoch": 1.224422767431931, "grad_norm": 0.6669613718986511, "learning_rate": 5.919922901699667e-05, "loss": 1.7002, "step": 4660 }, { "epoch": 1.2249482707655928, "grad_norm": 0.6690141558647156, "learning_rate": 5.918170667601192e-05, "loss": 1.6499, "step": 4662 }, { "epoch": 1.2254737740992545, "grad_norm": 0.5619009733200073, "learning_rate": 5.916418433502716e-05, "loss": 1.7039, "step": 4664 }, { "epoch": 1.2259992774329163, "grad_norm": 0.6794602274894714, "learning_rate": 5.914666199404241e-05, "loss": 1.6732, "step": 4666 }, { "epoch": 1.226524780766578, "grad_norm": 0.8924415707588196, "learning_rate": 5.9129139653057655e-05, "loss": 1.6833, "step": 4668 }, { "epoch": 1.2270502841002398, "grad_norm": 0.6746704578399658, "learning_rate": 5.9111617312072896e-05, "loss": 1.6816, "step": 4670 }, { "epoch": 1.2275757874339015, "grad_norm": 0.5335586071014404, "learning_rate": 5.909409497108814e-05, "loss": 1.7214, "step": 4672 }, { "epoch": 1.2281012907675632, "grad_norm": 0.6099651455879211, "learning_rate": 5.907657263010339e-05, "loss": 1.6652, 
"step": 4674 }, { "epoch": 1.228626794101225, "grad_norm": 0.6068762540817261, "learning_rate": 5.905905028911863e-05, "loss": 1.6563, "step": 4676 }, { "epoch": 1.229152297434887, "grad_norm": 0.561000645160675, "learning_rate": 5.904152794813388e-05, "loss": 1.6927, "step": 4678 }, { "epoch": 1.2296778007685487, "grad_norm": 0.5757449269294739, "learning_rate": 5.902400560714911e-05, "loss": 1.7042, "step": 4680 }, { "epoch": 1.2302033041022105, "grad_norm": 0.5956955552101135, "learning_rate": 5.900648326616436e-05, "loss": 1.6785, "step": 4682 }, { "epoch": 1.2307288074358722, "grad_norm": 0.5795134902000427, "learning_rate": 5.89889609251796e-05, "loss": 1.6774, "step": 4684 }, { "epoch": 1.231254310769534, "grad_norm": 0.8627724051475525, "learning_rate": 5.897143858419485e-05, "loss": 1.7131, "step": 4686 }, { "epoch": 1.2317798141031957, "grad_norm": 0.5390210747718811, "learning_rate": 5.89539162432101e-05, "loss": 1.7153, "step": 4688 }, { "epoch": 1.2323053174368575, "grad_norm": 0.8949792981147766, "learning_rate": 5.893639390222534e-05, "loss": 1.689, "step": 4690 }, { "epoch": 1.2328308207705192, "grad_norm": 0.6706136465072632, "learning_rate": 5.8918871561240585e-05, "loss": 1.71, "step": 4692 }, { "epoch": 1.233356324104181, "grad_norm": 0.5577556490898132, "learning_rate": 5.890134922025583e-05, "loss": 1.6462, "step": 4694 }, { "epoch": 1.233881827437843, "grad_norm": 0.6497912406921387, "learning_rate": 5.8883826879271073e-05, "loss": 1.6806, "step": 4696 }, { "epoch": 1.2344073307715047, "grad_norm": 0.5242089629173279, "learning_rate": 5.886630453828632e-05, "loss": 1.7029, "step": 4698 }, { "epoch": 1.2349328341051664, "grad_norm": 0.9607106447219849, "learning_rate": 5.884878219730157e-05, "loss": 1.6864, "step": 4700 }, { "epoch": 1.2354583374388282, "grad_norm": 0.9370645880699158, "learning_rate": 5.883125985631681e-05, "loss": 1.6973, "step": 4702 }, { "epoch": 1.23598384077249, "grad_norm": 0.7093036770820618, "learning_rate": 
5.881373751533204e-05, "loss": 1.7237, "step": 4704 }, { "epoch": 1.2365093441061517, "grad_norm": 0.6128284931182861, "learning_rate": 5.879621517434729e-05, "loss": 1.6673, "step": 4706 }, { "epoch": 1.2370348474398134, "grad_norm": 1.1151705980300903, "learning_rate": 5.877869283336254e-05, "loss": 1.6668, "step": 4708 }, { "epoch": 1.2375603507734751, "grad_norm": 0.7388045191764832, "learning_rate": 5.876117049237778e-05, "loss": 1.6923, "step": 4710 }, { "epoch": 1.238085854107137, "grad_norm": 0.539486289024353, "learning_rate": 5.874364815139303e-05, "loss": 1.6779, "step": 4712 }, { "epoch": 1.2386113574407989, "grad_norm": 0.7663931250572205, "learning_rate": 5.8726125810408274e-05, "loss": 1.686, "step": 4714 }, { "epoch": 1.2391368607744606, "grad_norm": 0.6578073501586914, "learning_rate": 5.8708603469423515e-05, "loss": 1.7054, "step": 4716 }, { "epoch": 1.2396623641081224, "grad_norm": 0.5744708776473999, "learning_rate": 5.869108112843876e-05, "loss": 1.6785, "step": 4718 }, { "epoch": 1.240187867441784, "grad_norm": 0.7532958388328552, "learning_rate": 5.867355878745401e-05, "loss": 1.6743, "step": 4720 }, { "epoch": 1.2407133707754459, "grad_norm": 0.5337387323379517, "learning_rate": 5.865603644646925e-05, "loss": 1.7303, "step": 4722 }, { "epoch": 1.2412388741091076, "grad_norm": 0.5469831228256226, "learning_rate": 5.86385141054845e-05, "loss": 1.6566, "step": 4724 }, { "epoch": 1.2417643774427694, "grad_norm": 0.5634649395942688, "learning_rate": 5.8620991764499746e-05, "loss": 1.6868, "step": 4726 }, { "epoch": 1.242289880776431, "grad_norm": 0.7761363983154297, "learning_rate": 5.860346942351499e-05, "loss": 1.6792, "step": 4728 }, { "epoch": 1.2428153841100928, "grad_norm": 0.6965597867965698, "learning_rate": 5.858594708253022e-05, "loss": 1.715, "step": 4730 }, { "epoch": 1.2433408874437548, "grad_norm": 0.6957322359085083, "learning_rate": 5.856842474154547e-05, "loss": 1.6739, "step": 4732 }, { "epoch": 1.2438663907774166, "grad_norm": 
0.5814897418022156, "learning_rate": 5.8550902400560716e-05, "loss": 1.6988, "step": 4734 }, { "epoch": 1.2443918941110783, "grad_norm": 0.9491192698478699, "learning_rate": 5.853338005957596e-05, "loss": 1.6994, "step": 4736 }, { "epoch": 1.24491739744474, "grad_norm": 0.7140454649925232, "learning_rate": 5.8515857718591205e-05, "loss": 1.6665, "step": 4738 }, { "epoch": 1.2454429007784018, "grad_norm": 0.5758277773857117, "learning_rate": 5.849833537760645e-05, "loss": 1.6842, "step": 4740 }, { "epoch": 1.2459684041120636, "grad_norm": 0.5660161972045898, "learning_rate": 5.848081303662169e-05, "loss": 1.6844, "step": 4742 }, { "epoch": 1.2464939074457253, "grad_norm": 0.7871302962303162, "learning_rate": 5.846329069563694e-05, "loss": 1.6983, "step": 4744 }, { "epoch": 1.247019410779387, "grad_norm": 0.5468810200691223, "learning_rate": 5.844576835465219e-05, "loss": 1.6561, "step": 4746 }, { "epoch": 1.2475449141130488, "grad_norm": 0.545019268989563, "learning_rate": 5.842824601366743e-05, "loss": 1.6704, "step": 4748 }, { "epoch": 1.2480704174467108, "grad_norm": 0.5376728177070618, "learning_rate": 5.8410723672682676e-05, "loss": 1.6373, "step": 4750 }, { "epoch": 1.2485959207803725, "grad_norm": 0.6872471570968628, "learning_rate": 5.8393201331697924e-05, "loss": 1.6629, "step": 4752 }, { "epoch": 1.2491214241140343, "grad_norm": 0.6468409299850464, "learning_rate": 5.8375678990713165e-05, "loss": 1.6785, "step": 4754 }, { "epoch": 1.249646927447696, "grad_norm": 0.7774249315261841, "learning_rate": 5.83581566497284e-05, "loss": 1.642, "step": 4756 }, { "epoch": 1.2501724307813578, "grad_norm": 0.6062743067741394, "learning_rate": 5.8340634308743646e-05, "loss": 1.7039, "step": 4758 }, { "epoch": 1.2506979341150195, "grad_norm": 0.6170024871826172, "learning_rate": 5.8323111967758894e-05, "loss": 1.7296, "step": 4760 }, { "epoch": 1.2512234374486813, "grad_norm": 0.5989388823509216, "learning_rate": 5.8305589626774135e-05, "loss": 1.6776, "step": 4762 }, { 
"epoch": 1.2517489407823432, "grad_norm": 0.6566706895828247, "learning_rate": 5.828806728578938e-05, "loss": 1.7016, "step": 4764 }, { "epoch": 1.2522744441160047, "grad_norm": 0.6715628504753113, "learning_rate": 5.827054494480463e-05, "loss": 1.6946, "step": 4766 }, { "epoch": 1.2527999474496667, "grad_norm": 0.7730180621147156, "learning_rate": 5.825302260381987e-05, "loss": 1.7043, "step": 4768 }, { "epoch": 1.2533254507833285, "grad_norm": 0.5837029814720154, "learning_rate": 5.823550026283512e-05, "loss": 1.7019, "step": 4770 }, { "epoch": 1.2538509541169902, "grad_norm": 0.6363146901130676, "learning_rate": 5.8217977921850366e-05, "loss": 1.6965, "step": 4772 }, { "epoch": 1.254376457450652, "grad_norm": 0.6856462359428406, "learning_rate": 5.8200455580865606e-05, "loss": 1.6899, "step": 4774 }, { "epoch": 1.2549019607843137, "grad_norm": 0.7007546424865723, "learning_rate": 5.8182933239880854e-05, "loss": 1.6673, "step": 4776 }, { "epoch": 1.2554274641179755, "grad_norm": 0.6396967172622681, "learning_rate": 5.81654108988961e-05, "loss": 1.6826, "step": 4778 }, { "epoch": 1.2559529674516372, "grad_norm": 0.6191834211349487, "learning_rate": 5.814788855791134e-05, "loss": 1.6848, "step": 4780 }, { "epoch": 1.2564784707852992, "grad_norm": 0.7049083709716797, "learning_rate": 5.8130366216926576e-05, "loss": 1.6785, "step": 4782 }, { "epoch": 1.2570039741189607, "grad_norm": 0.5608694553375244, "learning_rate": 5.8112843875941824e-05, "loss": 1.7138, "step": 4784 }, { "epoch": 1.2575294774526227, "grad_norm": 0.5731740593910217, "learning_rate": 5.809532153495707e-05, "loss": 1.6912, "step": 4786 }, { "epoch": 1.2580549807862844, "grad_norm": 0.5545026063919067, "learning_rate": 5.807779919397231e-05, "loss": 1.6621, "step": 4788 }, { "epoch": 1.2585804841199462, "grad_norm": 0.5389024019241333, "learning_rate": 5.806027685298756e-05, "loss": 1.6956, "step": 4790 }, { "epoch": 1.259105987453608, "grad_norm": 0.5764096975326538, "learning_rate": 
5.804275451200281e-05, "loss": 1.6556, "step": 4792 }, { "epoch": 1.2596314907872697, "grad_norm": 0.5584897398948669, "learning_rate": 5.802523217101805e-05, "loss": 1.6837, "step": 4794 }, { "epoch": 1.2601569941209314, "grad_norm": 0.6685154438018799, "learning_rate": 5.8007709830033296e-05, "loss": 1.6343, "step": 4796 }, { "epoch": 1.2606824974545932, "grad_norm": 0.6008186936378479, "learning_rate": 5.799018748904854e-05, "loss": 1.6674, "step": 4798 }, { "epoch": 1.2612080007882551, "grad_norm": 0.6519007086753845, "learning_rate": 5.7972665148063784e-05, "loss": 1.6898, "step": 4800 }, { "epoch": 1.2612080007882551, "eval_loss": 1.700656533241272, "eval_runtime": 487.1424, "eval_samples_per_second": 250.007, "eval_steps_per_second": 31.252, "step": 4800 }, { "epoch": 1.2617335041219166, "grad_norm": 0.5863731503486633, "learning_rate": 5.795514280707903e-05, "loss": 1.6239, "step": 4802 }, { "epoch": 1.2622590074555786, "grad_norm": 0.6118308901786804, "learning_rate": 5.793762046609428e-05, "loss": 1.6997, "step": 4804 }, { "epoch": 1.2627845107892404, "grad_norm": 0.7632036209106445, "learning_rate": 5.792009812510951e-05, "loss": 1.6626, "step": 4806 }, { "epoch": 1.2633100141229021, "grad_norm": 0.593360960483551, "learning_rate": 5.7902575784124754e-05, "loss": 1.6919, "step": 4808 }, { "epoch": 1.2638355174565639, "grad_norm": 0.8928617238998413, "learning_rate": 5.788505344314e-05, "loss": 1.7171, "step": 4810 }, { "epoch": 1.2643610207902256, "grad_norm": 0.6598185896873474, "learning_rate": 5.786753110215525e-05, "loss": 1.6581, "step": 4812 }, { "epoch": 1.2648865241238874, "grad_norm": 0.6565474271774292, "learning_rate": 5.785000876117049e-05, "loss": 1.6763, "step": 4814 }, { "epoch": 1.265412027457549, "grad_norm": 0.796642541885376, "learning_rate": 5.783248642018574e-05, "loss": 1.6925, "step": 4816 }, { "epoch": 1.265937530791211, "grad_norm": 0.517112135887146, "learning_rate": 5.7814964079200985e-05, "loss": 1.7095, "step": 4818 }, { 
"epoch": 1.2664630341248726, "grad_norm": 0.6834176182746887, "learning_rate": 5.7797441738216226e-05, "loss": 1.6919, "step": 4820 }, { "epoch": 1.2669885374585346, "grad_norm": 0.5728265643119812, "learning_rate": 5.7779919397231473e-05, "loss": 1.6796, "step": 4822 }, { "epoch": 1.2675140407921963, "grad_norm": 0.8350217342376709, "learning_rate": 5.776239705624672e-05, "loss": 1.6946, "step": 4824 }, { "epoch": 1.268039544125858, "grad_norm": 0.7911032438278198, "learning_rate": 5.774487471526196e-05, "loss": 1.705, "step": 4826 }, { "epoch": 1.2685650474595198, "grad_norm": 0.6383318305015564, "learning_rate": 5.772735237427721e-05, "loss": 1.6945, "step": 4828 }, { "epoch": 1.2690905507931816, "grad_norm": 0.7389723062515259, "learning_rate": 5.770983003329246e-05, "loss": 1.6889, "step": 4830 }, { "epoch": 1.2696160541268433, "grad_norm": 0.5114458203315735, "learning_rate": 5.769230769230769e-05, "loss": 1.69, "step": 4832 }, { "epoch": 1.270141557460505, "grad_norm": 0.7528431415557861, "learning_rate": 5.767478535132294e-05, "loss": 1.6829, "step": 4834 }, { "epoch": 1.270667060794167, "grad_norm": 0.6436894536018372, "learning_rate": 5.765726301033818e-05, "loss": 1.6968, "step": 4836 }, { "epoch": 1.2711925641278286, "grad_norm": 0.601234495639801, "learning_rate": 5.763974066935343e-05, "loss": 1.6826, "step": 4838 }, { "epoch": 1.2717180674614905, "grad_norm": 0.6918678283691406, "learning_rate": 5.7622218328368674e-05, "loss": 1.698, "step": 4840 }, { "epoch": 1.2722435707951523, "grad_norm": 0.5951560139656067, "learning_rate": 5.7604695987383915e-05, "loss": 1.7462, "step": 4842 }, { "epoch": 1.272769074128814, "grad_norm": 0.8118217587471008, "learning_rate": 5.758717364639916e-05, "loss": 1.6818, "step": 4844 }, { "epoch": 1.2732945774624758, "grad_norm": 0.5995089411735535, "learning_rate": 5.756965130541441e-05, "loss": 1.7061, "step": 4846 }, { "epoch": 1.2738200807961375, "grad_norm": 0.5712124705314636, "learning_rate": 
5.755212896442965e-05, "loss": 1.6635, "step": 4848 }, { "epoch": 1.2743455841297993, "grad_norm": 0.975643515586853, "learning_rate": 5.75346066234449e-05, "loss": 1.6871, "step": 4850 }, { "epoch": 1.274871087463461, "grad_norm": 0.6375470161437988, "learning_rate": 5.7517084282460146e-05, "loss": 1.6946, "step": 4852 }, { "epoch": 1.275396590797123, "grad_norm": 0.6874341368675232, "learning_rate": 5.749956194147539e-05, "loss": 1.6987, "step": 4854 }, { "epoch": 1.2759220941307847, "grad_norm": 0.5625795125961304, "learning_rate": 5.7482039600490635e-05, "loss": 1.6946, "step": 4856 }, { "epoch": 1.2764475974644465, "grad_norm": 0.7773122191429138, "learning_rate": 5.746451725950587e-05, "loss": 1.6922, "step": 4858 }, { "epoch": 1.2769731007981082, "grad_norm": 0.6410396099090576, "learning_rate": 5.7446994918521116e-05, "loss": 1.676, "step": 4860 }, { "epoch": 1.27749860413177, "grad_norm": 0.5476670861244202, "learning_rate": 5.742947257753636e-05, "loss": 1.7063, "step": 4862 }, { "epoch": 1.2780241074654317, "grad_norm": 0.5646496415138245, "learning_rate": 5.7411950236551604e-05, "loss": 1.6657, "step": 4864 }, { "epoch": 1.2785496107990935, "grad_norm": 0.7577764987945557, "learning_rate": 5.739442789556685e-05, "loss": 1.6869, "step": 4866 }, { "epoch": 1.2790751141327552, "grad_norm": 0.5627464652061462, "learning_rate": 5.737690555458209e-05, "loss": 1.6608, "step": 4868 }, { "epoch": 1.279600617466417, "grad_norm": 0.5386861562728882, "learning_rate": 5.735938321359734e-05, "loss": 1.6826, "step": 4870 }, { "epoch": 1.280126120800079, "grad_norm": 0.8496121168136597, "learning_rate": 5.734186087261259e-05, "loss": 1.6771, "step": 4872 }, { "epoch": 1.2806516241337407, "grad_norm": 0.6117552518844604, "learning_rate": 5.732433853162783e-05, "loss": 1.6957, "step": 4874 }, { "epoch": 1.2811771274674024, "grad_norm": 0.7759193181991577, "learning_rate": 5.7306816190643076e-05, "loss": 1.6396, "step": 4876 }, { "epoch": 1.2817026308010642, "grad_norm": 
0.6560829281806946, "learning_rate": 5.7289293849658324e-05, "loss": 1.6983, "step": 4878 }, { "epoch": 1.282228134134726, "grad_norm": 0.6807093024253845, "learning_rate": 5.7271771508673565e-05, "loss": 1.6545, "step": 4880 }, { "epoch": 1.2827536374683877, "grad_norm": 0.6546826958656311, "learning_rate": 5.72542491676888e-05, "loss": 1.7138, "step": 4882 }, { "epoch": 1.2832791408020494, "grad_norm": 0.5959972739219666, "learning_rate": 5.7236726826704046e-05, "loss": 1.6567, "step": 4884 }, { "epoch": 1.2838046441357112, "grad_norm": 0.6360346078872681, "learning_rate": 5.7219204485719294e-05, "loss": 1.6764, "step": 4886 }, { "epoch": 1.284330147469373, "grad_norm": 0.7171315550804138, "learning_rate": 5.7201682144734535e-05, "loss": 1.6489, "step": 4888 }, { "epoch": 1.2848556508030349, "grad_norm": 0.6501976251602173, "learning_rate": 5.718415980374978e-05, "loss": 1.7136, "step": 4890 }, { "epoch": 1.2853811541366966, "grad_norm": 0.7134879231452942, "learning_rate": 5.716663746276503e-05, "loss": 1.6789, "step": 4892 }, { "epoch": 1.2859066574703584, "grad_norm": 0.6386318206787109, "learning_rate": 5.714911512178027e-05, "loss": 1.6877, "step": 4894 }, { "epoch": 1.2864321608040201, "grad_norm": 0.5816686749458313, "learning_rate": 5.713159278079552e-05, "loss": 1.7148, "step": 4896 }, { "epoch": 1.2869576641376819, "grad_norm": 0.49736329913139343, "learning_rate": 5.7114070439810766e-05, "loss": 1.6591, "step": 4898 }, { "epoch": 1.2874831674713436, "grad_norm": 0.5454695820808411, "learning_rate": 5.7096548098826006e-05, "loss": 1.7062, "step": 4900 }, { "epoch": 1.2880086708050054, "grad_norm": 0.5578910708427429, "learning_rate": 5.7079025757841254e-05, "loss": 1.6773, "step": 4902 }, { "epoch": 1.288534174138667, "grad_norm": 0.7455135583877563, "learning_rate": 5.70615034168565e-05, "loss": 1.6605, "step": 4904 }, { "epoch": 1.2890596774723289, "grad_norm": 0.6031019687652588, "learning_rate": 5.704398107587174e-05, "loss": 1.706, "step": 4906 }, 
{ "epoch": 1.2895851808059908, "grad_norm": 0.6043726801872253, "learning_rate": 5.7026458734886976e-05, "loss": 1.6919, "step": 4908 }, { "epoch": 1.2901106841396526, "grad_norm": 0.6083821654319763, "learning_rate": 5.7008936393902224e-05, "loss": 1.6752, "step": 4910 }, { "epoch": 1.2906361874733143, "grad_norm": 0.5966817736625671, "learning_rate": 5.699141405291747e-05, "loss": 1.6838, "step": 4912 }, { "epoch": 1.291161690806976, "grad_norm": 0.5941465497016907, "learning_rate": 5.697389171193271e-05, "loss": 1.6899, "step": 4914 }, { "epoch": 1.2916871941406378, "grad_norm": 0.6362673044204712, "learning_rate": 5.695636937094796e-05, "loss": 1.684, "step": 4916 }, { "epoch": 1.2922126974742996, "grad_norm": 0.6817765831947327, "learning_rate": 5.693884702996321e-05, "loss": 1.668, "step": 4918 }, { "epoch": 1.2927382008079613, "grad_norm": 0.6063424944877625, "learning_rate": 5.692132468897845e-05, "loss": 1.6628, "step": 4920 }, { "epoch": 1.2932637041416233, "grad_norm": 0.595859169960022, "learning_rate": 5.6903802347993696e-05, "loss": 1.6989, "step": 4922 }, { "epoch": 1.2937892074752848, "grad_norm": 0.5616616606712341, "learning_rate": 5.688628000700894e-05, "loss": 1.7036, "step": 4924 }, { "epoch": 1.2943147108089468, "grad_norm": 0.5995229482650757, "learning_rate": 5.6868757666024184e-05, "loss": 1.6894, "step": 4926 }, { "epoch": 1.2948402141426085, "grad_norm": 0.5990728735923767, "learning_rate": 5.685123532503943e-05, "loss": 1.6612, "step": 4928 }, { "epoch": 1.2953657174762703, "grad_norm": 0.5425363183021545, "learning_rate": 5.683371298405468e-05, "loss": 1.6523, "step": 4930 }, { "epoch": 1.295891220809932, "grad_norm": 0.5370776653289795, "learning_rate": 5.681619064306992e-05, "loss": 1.6977, "step": 4932 }, { "epoch": 1.2964167241435938, "grad_norm": 0.6286599636077881, "learning_rate": 5.6798668302085154e-05, "loss": 1.7288, "step": 4934 }, { "epoch": 1.2969422274772555, "grad_norm": 0.575813889503479, "learning_rate": 
5.67811459611004e-05, "loss": 1.6914, "step": 4936 }, { "epoch": 1.2974677308109173, "grad_norm": 0.5949034690856934, "learning_rate": 5.676362362011565e-05, "loss": 1.6733, "step": 4938 }, { "epoch": 1.2979932341445792, "grad_norm": 0.6935321688652039, "learning_rate": 5.674610127913089e-05, "loss": 1.6886, "step": 4940 }, { "epoch": 1.2985187374782408, "grad_norm": 0.6900460124015808, "learning_rate": 5.672857893814614e-05, "loss": 1.6985, "step": 4942 }, { "epoch": 1.2990442408119027, "grad_norm": 0.5892564654350281, "learning_rate": 5.6711056597161385e-05, "loss": 1.6837, "step": 4944 }, { "epoch": 1.2995697441455645, "grad_norm": 0.6516983509063721, "learning_rate": 5.6693534256176626e-05, "loss": 1.6797, "step": 4946 }, { "epoch": 1.3000952474792262, "grad_norm": 0.8544719219207764, "learning_rate": 5.6676011915191873e-05, "loss": 1.6911, "step": 4948 }, { "epoch": 1.300620750812888, "grad_norm": 0.7540829181671143, "learning_rate": 5.665848957420712e-05, "loss": 1.7333, "step": 4950 }, { "epoch": 1.3011462541465497, "grad_norm": 0.8198524117469788, "learning_rate": 5.664096723322236e-05, "loss": 1.7286, "step": 4952 }, { "epoch": 1.3016717574802115, "grad_norm": 0.6572164297103882, "learning_rate": 5.662344489223761e-05, "loss": 1.7171, "step": 4954 }, { "epoch": 1.3021972608138732, "grad_norm": 0.6047423481941223, "learning_rate": 5.660592255125286e-05, "loss": 1.6762, "step": 4956 }, { "epoch": 1.3027227641475352, "grad_norm": 0.6690229177474976, "learning_rate": 5.65884002102681e-05, "loss": 1.6802, "step": 4958 }, { "epoch": 1.3032482674811967, "grad_norm": 0.57923823595047, "learning_rate": 5.657087786928333e-05, "loss": 1.6864, "step": 4960 }, { "epoch": 1.3037737708148587, "grad_norm": 0.6738536357879639, "learning_rate": 5.655335552829858e-05, "loss": 1.6883, "step": 4962 }, { "epoch": 1.3042992741485204, "grad_norm": 0.5950745344161987, "learning_rate": 5.653583318731383e-05, "loss": 1.6688, "step": 4964 }, { "epoch": 1.3048247774821822, 
"grad_norm": 0.5781100988388062, "learning_rate": 5.651831084632907e-05, "loss": 1.699, "step": 4966 }, { "epoch": 1.305350280815844, "grad_norm": 0.535622775554657, "learning_rate": 5.6500788505344315e-05, "loss": 1.6881, "step": 4968 }, { "epoch": 1.3058757841495057, "grad_norm": 0.6280853152275085, "learning_rate": 5.648326616435956e-05, "loss": 1.6802, "step": 4970 }, { "epoch": 1.3064012874831674, "grad_norm": 0.538373589515686, "learning_rate": 5.6465743823374803e-05, "loss": 1.6446, "step": 4972 }, { "epoch": 1.3069267908168292, "grad_norm": 0.5861368775367737, "learning_rate": 5.644822148239005e-05, "loss": 1.6667, "step": 4974 }, { "epoch": 1.3074522941504911, "grad_norm": 0.6625964045524597, "learning_rate": 5.64306991414053e-05, "loss": 1.6879, "step": 4976 }, { "epoch": 1.3079777974841527, "grad_norm": 0.6457346081733704, "learning_rate": 5.641317680042054e-05, "loss": 1.688, "step": 4978 }, { "epoch": 1.3085033008178146, "grad_norm": 0.6461663246154785, "learning_rate": 5.639565445943579e-05, "loss": 1.6974, "step": 4980 }, { "epoch": 1.3090288041514764, "grad_norm": 0.6430699825286865, "learning_rate": 5.6378132118451035e-05, "loss": 1.7059, "step": 4982 }, { "epoch": 1.3095543074851381, "grad_norm": 0.6152409315109253, "learning_rate": 5.636060977746627e-05, "loss": 1.649, "step": 4984 }, { "epoch": 1.3100798108187999, "grad_norm": 0.5380004644393921, "learning_rate": 5.634308743648151e-05, "loss": 1.6893, "step": 4986 }, { "epoch": 1.3106053141524616, "grad_norm": 0.6044327020645142, "learning_rate": 5.632556509549676e-05, "loss": 1.714, "step": 4988 }, { "epoch": 1.3111308174861234, "grad_norm": 0.5950967073440552, "learning_rate": 5.6308042754512004e-05, "loss": 1.695, "step": 4990 }, { "epoch": 1.3116563208197851, "grad_norm": 0.5887439846992493, "learning_rate": 5.6290520413527245e-05, "loss": 1.6914, "step": 4992 }, { "epoch": 1.312181824153447, "grad_norm": 0.6020603775978088, "learning_rate": 5.627299807254249e-05, "loss": 1.6847, "step": 
4994 }, { "epoch": 1.3127073274871086, "grad_norm": 0.6895347237586975, "learning_rate": 5.625547573155774e-05, "loss": 1.6888, "step": 4996 }, { "epoch": 1.3132328308207706, "grad_norm": 0.7459006309509277, "learning_rate": 5.623795339057298e-05, "loss": 1.6828, "step": 4998 }, { "epoch": 1.3137583341544323, "grad_norm": 0.528291642665863, "learning_rate": 5.622043104958823e-05, "loss": 1.6612, "step": 5000 }, { "epoch": 1.314283837488094, "grad_norm": 0.5710020065307617, "learning_rate": 5.6202908708603476e-05, "loss": 1.7157, "step": 5002 }, { "epoch": 1.3148093408217558, "grad_norm": 0.7354927062988281, "learning_rate": 5.618538636761872e-05, "loss": 1.6948, "step": 5004 }, { "epoch": 1.3153348441554176, "grad_norm": 0.5487669110298157, "learning_rate": 5.6167864026633965e-05, "loss": 1.6546, "step": 5006 }, { "epoch": 1.3158603474890793, "grad_norm": 0.5558237433433533, "learning_rate": 5.615034168564921e-05, "loss": 1.6729, "step": 5008 }, { "epoch": 1.316385850822741, "grad_norm": 0.6180663704872131, "learning_rate": 5.6132819344664446e-05, "loss": 1.672, "step": 5010 }, { "epoch": 1.316911354156403, "grad_norm": 0.5914913415908813, "learning_rate": 5.611529700367969e-05, "loss": 1.6727, "step": 5012 }, { "epoch": 1.3174368574900648, "grad_norm": 0.6253445148468018, "learning_rate": 5.6097774662694935e-05, "loss": 1.6805, "step": 5014 }, { "epoch": 1.3179623608237265, "grad_norm": 0.755477249622345, "learning_rate": 5.608025232171018e-05, "loss": 1.6646, "step": 5016 }, { "epoch": 1.3184878641573883, "grad_norm": 0.5822807550430298, "learning_rate": 5.606272998072542e-05, "loss": 1.6779, "step": 5018 }, { "epoch": 1.31901336749105, "grad_norm": 0.5383006930351257, "learning_rate": 5.604520763974067e-05, "loss": 1.6942, "step": 5020 }, { "epoch": 1.3195388708247118, "grad_norm": 0.5826961398124695, "learning_rate": 5.602768529875592e-05, "loss": 1.6874, "step": 5022 }, { "epoch": 1.3200643741583735, "grad_norm": 0.5648834705352783, "learning_rate": 
5.601016295777116e-05, "loss": 1.6948, "step": 5024 }, { "epoch": 1.3205898774920353, "grad_norm": 0.6096558570861816, "learning_rate": 5.5992640616786406e-05, "loss": 1.6924, "step": 5026 }, { "epoch": 1.321115380825697, "grad_norm": 0.5826833844184875, "learning_rate": 5.5975118275801654e-05, "loss": 1.7116, "step": 5028 }, { "epoch": 1.321640884159359, "grad_norm": 0.5931800007820129, "learning_rate": 5.5957595934816895e-05, "loss": 1.6996, "step": 5030 }, { "epoch": 1.3221663874930207, "grad_norm": 0.6546478271484375, "learning_rate": 5.594007359383214e-05, "loss": 1.6639, "step": 5032 }, { "epoch": 1.3226918908266825, "grad_norm": 0.6511355042457581, "learning_rate": 5.592255125284739e-05, "loss": 1.6912, "step": 5034 }, { "epoch": 1.3232173941603442, "grad_norm": 0.57513028383255, "learning_rate": 5.5905028911862624e-05, "loss": 1.6479, "step": 5036 }, { "epoch": 1.323742897494006, "grad_norm": 0.5553883910179138, "learning_rate": 5.5887506570877865e-05, "loss": 1.6877, "step": 5038 }, { "epoch": 1.3242684008276677, "grad_norm": 0.6477178931236267, "learning_rate": 5.586998422989311e-05, "loss": 1.7209, "step": 5040 }, { "epoch": 1.3247939041613295, "grad_norm": 0.6500737071037292, "learning_rate": 5.585246188890836e-05, "loss": 1.6838, "step": 5042 }, { "epoch": 1.3253194074949912, "grad_norm": 0.499759703874588, "learning_rate": 5.58349395479236e-05, "loss": 1.6858, "step": 5044 }, { "epoch": 1.325844910828653, "grad_norm": 0.6388137340545654, "learning_rate": 5.581741720693885e-05, "loss": 1.6836, "step": 5046 }, { "epoch": 1.326370414162315, "grad_norm": 0.7066943049430847, "learning_rate": 5.5799894865954096e-05, "loss": 1.7018, "step": 5048 }, { "epoch": 1.3268959174959767, "grad_norm": 0.5318599343299866, "learning_rate": 5.5782372524969336e-05, "loss": 1.676, "step": 5050 }, { "epoch": 1.3274214208296384, "grad_norm": 0.6772581338882446, "learning_rate": 5.5764850183984584e-05, "loss": 1.6909, "step": 5052 }, { "epoch": 1.3279469241633002, 
"grad_norm": 0.6018180847167969, "learning_rate": 5.574732784299983e-05, "loss": 1.6817, "step": 5054 }, { "epoch": 1.328472427496962, "grad_norm": 0.5764731168746948, "learning_rate": 5.572980550201507e-05, "loss": 1.6866, "step": 5056 }, { "epoch": 1.3289979308306237, "grad_norm": 0.613559365272522, "learning_rate": 5.571228316103032e-05, "loss": 1.6798, "step": 5058 }, { "epoch": 1.3295234341642854, "grad_norm": 0.6392707228660583, "learning_rate": 5.569476082004557e-05, "loss": 1.6794, "step": 5060 }, { "epoch": 1.3300489374979472, "grad_norm": 0.6719961762428284, "learning_rate": 5.56772384790608e-05, "loss": 1.6773, "step": 5062 }, { "epoch": 1.330574440831609, "grad_norm": 0.6991645693778992, "learning_rate": 5.565971613807604e-05, "loss": 1.7275, "step": 5064 }, { "epoch": 1.3310999441652709, "grad_norm": 0.54388827085495, "learning_rate": 5.564219379709129e-05, "loss": 1.6641, "step": 5066 }, { "epoch": 1.3316254474989326, "grad_norm": 0.8866559267044067, "learning_rate": 5.562467145610654e-05, "loss": 1.666, "step": 5068 }, { "epoch": 1.3321509508325944, "grad_norm": 0.5608734488487244, "learning_rate": 5.560714911512178e-05, "loss": 1.6793, "step": 5070 }, { "epoch": 1.3326764541662561, "grad_norm": 0.5888034701347351, "learning_rate": 5.5589626774137026e-05, "loss": 1.6902, "step": 5072 }, { "epoch": 1.3332019574999179, "grad_norm": 0.5383687019348145, "learning_rate": 5.557210443315227e-05, "loss": 1.6738, "step": 5074 }, { "epoch": 1.3337274608335796, "grad_norm": 0.681666374206543, "learning_rate": 5.5554582092167514e-05, "loss": 1.6709, "step": 5076 }, { "epoch": 1.3342529641672414, "grad_norm": 0.6175423264503479, "learning_rate": 5.553705975118276e-05, "loss": 1.6617, "step": 5078 }, { "epoch": 1.3347784675009033, "grad_norm": 0.5964341163635254, "learning_rate": 5.551953741019801e-05, "loss": 1.6484, "step": 5080 }, { "epoch": 1.3353039708345649, "grad_norm": 0.5749648809432983, "learning_rate": 5.550201506921325e-05, "loss": 1.6819, "step": 5082 
}, { "epoch": 1.3358294741682268, "grad_norm": 0.6597902774810791, "learning_rate": 5.54844927282285e-05, "loss": 1.7234, "step": 5084 }, { "epoch": 1.3363549775018886, "grad_norm": 0.6115815043449402, "learning_rate": 5.546697038724373e-05, "loss": 1.683, "step": 5086 }, { "epoch": 1.3368804808355503, "grad_norm": 0.63287353515625, "learning_rate": 5.544944804625898e-05, "loss": 1.6759, "step": 5088 }, { "epoch": 1.337405984169212, "grad_norm": 0.6917816400527954, "learning_rate": 5.543192570527423e-05, "loss": 1.6938, "step": 5090 }, { "epoch": 1.3379314875028738, "grad_norm": 0.5193148255348206, "learning_rate": 5.541440336428947e-05, "loss": 1.6918, "step": 5092 }, { "epoch": 1.3384569908365356, "grad_norm": 0.5470016002655029, "learning_rate": 5.5396881023304715e-05, "loss": 1.6733, "step": 5094 }, { "epoch": 1.3389824941701973, "grad_norm": 0.6966432332992554, "learning_rate": 5.537935868231996e-05, "loss": 1.6883, "step": 5096 }, { "epoch": 1.3395079975038593, "grad_norm": 0.6289670467376709, "learning_rate": 5.5361836341335203e-05, "loss": 1.6666, "step": 5098 }, { "epoch": 1.3400335008375208, "grad_norm": 0.5251250863075256, "learning_rate": 5.534431400035045e-05, "loss": 1.6477, "step": 5100 }, { "epoch": 1.3405590041711828, "grad_norm": 0.663750171661377, "learning_rate": 5.53267916593657e-05, "loss": 1.6713, "step": 5102 }, { "epoch": 1.3410845075048445, "grad_norm": 0.5892371535301208, "learning_rate": 5.530926931838094e-05, "loss": 1.6935, "step": 5104 }, { "epoch": 1.3416100108385063, "grad_norm": 0.7290005683898926, "learning_rate": 5.529174697739619e-05, "loss": 1.6685, "step": 5106 }, { "epoch": 1.342135514172168, "grad_norm": 0.6136685013771057, "learning_rate": 5.5274224636411435e-05, "loss": 1.6441, "step": 5108 }, { "epoch": 1.3426610175058298, "grad_norm": 0.648530900478363, "learning_rate": 5.5256702295426675e-05, "loss": 1.6874, "step": 5110 }, { "epoch": 1.3431865208394915, "grad_norm": 0.556920051574707, "learning_rate": 
5.523917995444191e-05, "loss": 1.6945, "step": 5112 }, { "epoch": 1.3437120241731533, "grad_norm": 0.6408731937408447, "learning_rate": 5.522165761345716e-05, "loss": 1.6719, "step": 5114 }, { "epoch": 1.3442375275068152, "grad_norm": 0.6126033663749695, "learning_rate": 5.5204135272472404e-05, "loss": 1.6851, "step": 5116 }, { "epoch": 1.3447630308404768, "grad_norm": 0.7299725413322449, "learning_rate": 5.5186612931487645e-05, "loss": 1.6442, "step": 5118 }, { "epoch": 1.3452885341741387, "grad_norm": 0.5226702690124512, "learning_rate": 5.516909059050289e-05, "loss": 1.6779, "step": 5120 }, { "epoch": 1.3458140375078005, "grad_norm": 0.5271426439285278, "learning_rate": 5.515156824951814e-05, "loss": 1.6756, "step": 5122 }, { "epoch": 1.3463395408414622, "grad_norm": 0.5417149066925049, "learning_rate": 5.513404590853338e-05, "loss": 1.7041, "step": 5124 }, { "epoch": 1.346865044175124, "grad_norm": 0.639298677444458, "learning_rate": 5.511652356754863e-05, "loss": 1.7372, "step": 5126 }, { "epoch": 1.3473905475087857, "grad_norm": 0.7180479764938354, "learning_rate": 5.5099001226563876e-05, "loss": 1.7078, "step": 5128 }, { "epoch": 1.3479160508424475, "grad_norm": 0.6249936819076538, "learning_rate": 5.508147888557912e-05, "loss": 1.672, "step": 5130 }, { "epoch": 1.3484415541761092, "grad_norm": 0.6733811497688293, "learning_rate": 5.5063956544594365e-05, "loss": 1.6648, "step": 5132 }, { "epoch": 1.3489670575097712, "grad_norm": 0.628979504108429, "learning_rate": 5.504643420360961e-05, "loss": 1.6724, "step": 5134 }, { "epoch": 1.3494925608434327, "grad_norm": 0.6797083020210266, "learning_rate": 5.502891186262485e-05, "loss": 1.6681, "step": 5136 }, { "epoch": 1.3500180641770947, "grad_norm": 0.5499573945999146, "learning_rate": 5.501138952164009e-05, "loss": 1.6877, "step": 5138 }, { "epoch": 1.3505435675107564, "grad_norm": 0.6924422383308411, "learning_rate": 5.4993867180655335e-05, "loss": 1.6836, "step": 5140 }, { "epoch": 1.3510690708444182, 
"grad_norm": 0.5735469460487366, "learning_rate": 5.497634483967058e-05, "loss": 1.6953, "step": 5142 }, { "epoch": 1.35159457417808, "grad_norm": 0.6161128282546997, "learning_rate": 5.495882249868582e-05, "loss": 1.6663, "step": 5144 }, { "epoch": 1.3521200775117417, "grad_norm": 0.5718648433685303, "learning_rate": 5.494130015770107e-05, "loss": 1.6785, "step": 5146 }, { "epoch": 1.3526455808454034, "grad_norm": 0.6534953713417053, "learning_rate": 5.492377781671632e-05, "loss": 1.6859, "step": 5148 }, { "epoch": 1.3531710841790652, "grad_norm": 0.7436297535896301, "learning_rate": 5.490625547573156e-05, "loss": 1.6833, "step": 5150 }, { "epoch": 1.3536965875127271, "grad_norm": 0.5223855972290039, "learning_rate": 5.4888733134746806e-05, "loss": 1.7009, "step": 5152 }, { "epoch": 1.3542220908463887, "grad_norm": 0.6421153545379639, "learning_rate": 5.4871210793762054e-05, "loss": 1.6884, "step": 5154 }, { "epoch": 1.3547475941800506, "grad_norm": 0.5627347826957703, "learning_rate": 5.4853688452777295e-05, "loss": 1.6761, "step": 5156 }, { "epoch": 1.3552730975137124, "grad_norm": 0.5863370299339294, "learning_rate": 5.483616611179254e-05, "loss": 1.7144, "step": 5158 }, { "epoch": 1.3557986008473741, "grad_norm": 0.6079491376876831, "learning_rate": 5.481864377080779e-05, "loss": 1.6679, "step": 5160 }, { "epoch": 1.3563241041810359, "grad_norm": 0.603889524936676, "learning_rate": 5.480112142982303e-05, "loss": 1.6858, "step": 5162 }, { "epoch": 1.3568496075146976, "grad_norm": 0.7292947173118591, "learning_rate": 5.4783599088838265e-05, "loss": 1.6597, "step": 5164 }, { "epoch": 1.3573751108483594, "grad_norm": 0.6873249411582947, "learning_rate": 5.476607674785351e-05, "loss": 1.6614, "step": 5166 }, { "epoch": 1.3579006141820211, "grad_norm": 0.6474335789680481, "learning_rate": 5.474855440686876e-05, "loss": 1.6651, "step": 5168 }, { "epoch": 1.358426117515683, "grad_norm": 0.6388329863548279, "learning_rate": 5.4731032065884e-05, "loss": 1.7278, "step": 
5170 }, { "epoch": 1.3589516208493448, "grad_norm": 0.6011612415313721, "learning_rate": 5.471350972489925e-05, "loss": 1.6685, "step": 5172 }, { "epoch": 1.3594771241830066, "grad_norm": 0.5934230089187622, "learning_rate": 5.4695987383914496e-05, "loss": 1.6956, "step": 5174 }, { "epoch": 1.3600026275166683, "grad_norm": 0.5996755361557007, "learning_rate": 5.4678465042929736e-05, "loss": 1.6717, "step": 5176 }, { "epoch": 1.36052813085033, "grad_norm": 0.5604281425476074, "learning_rate": 5.4660942701944984e-05, "loss": 1.6535, "step": 5178 }, { "epoch": 1.3610536341839918, "grad_norm": 0.7021388411521912, "learning_rate": 5.464342036096023e-05, "loss": 1.6387, "step": 5180 }, { "epoch": 1.3615791375176536, "grad_norm": 0.599388599395752, "learning_rate": 5.462589801997547e-05, "loss": 1.7095, "step": 5182 }, { "epoch": 1.3621046408513153, "grad_norm": 0.6312151551246643, "learning_rate": 5.460837567899072e-05, "loss": 1.7024, "step": 5184 }, { "epoch": 1.362630144184977, "grad_norm": 0.6004204154014587, "learning_rate": 5.459085333800597e-05, "loss": 1.6731, "step": 5186 }, { "epoch": 1.363155647518639, "grad_norm": 0.5637236833572388, "learning_rate": 5.45733309970212e-05, "loss": 1.6846, "step": 5188 }, { "epoch": 1.3636811508523008, "grad_norm": 0.5435346961021423, "learning_rate": 5.455580865603644e-05, "loss": 1.697, "step": 5190 }, { "epoch": 1.3642066541859625, "grad_norm": 0.7027901411056519, "learning_rate": 5.453828631505169e-05, "loss": 1.6658, "step": 5192 }, { "epoch": 1.3647321575196243, "grad_norm": 0.5427135229110718, "learning_rate": 5.452076397406694e-05, "loss": 1.6802, "step": 5194 }, { "epoch": 1.365257660853286, "grad_norm": 0.9551213979721069, "learning_rate": 5.450324163308218e-05, "loss": 1.6675, "step": 5196 }, { "epoch": 1.3657831641869478, "grad_norm": 0.6727270483970642, "learning_rate": 5.4485719292097426e-05, "loss": 1.6909, "step": 5198 }, { "epoch": 1.3663086675206095, "grad_norm": 0.5825324058532715, "learning_rate": 
5.446819695111267e-05, "loss": 1.6678, "step": 5200 }, { "epoch": 1.3663086675206095, "eval_loss": 1.6925063133239746, "eval_runtime": 487.2699, "eval_samples_per_second": 249.942, "eval_steps_per_second": 31.243, "step": 5200 }, { "epoch": 1.3668341708542713, "grad_norm": 0.5803776979446411, "learning_rate": 5.4450674610127914e-05, "loss": 1.687, "step": 5202 }, { "epoch": 1.367359674187933, "grad_norm": 0.5400435328483582, "learning_rate": 5.443315226914316e-05, "loss": 1.7248, "step": 5204 }, { "epoch": 1.367885177521595, "grad_norm": 0.5866122245788574, "learning_rate": 5.441562992815841e-05, "loss": 1.6612, "step": 5206 }, { "epoch": 1.3684106808552567, "grad_norm": 0.6416072249412537, "learning_rate": 5.439810758717365e-05, "loss": 1.6948, "step": 5208 }, { "epoch": 1.3689361841889185, "grad_norm": 0.5607174634933472, "learning_rate": 5.43805852461889e-05, "loss": 1.7079, "step": 5210 }, { "epoch": 1.3694616875225802, "grad_norm": 0.5060856938362122, "learning_rate": 5.4363062905204145e-05, "loss": 1.6811, "step": 5212 }, { "epoch": 1.369987190856242, "grad_norm": 0.5724950432777405, "learning_rate": 5.434554056421938e-05, "loss": 1.6251, "step": 5214 }, { "epoch": 1.3705126941899037, "grad_norm": 0.595393717288971, "learning_rate": 5.432801822323462e-05, "loss": 1.6867, "step": 5216 }, { "epoch": 1.3710381975235655, "grad_norm": 0.5957738161087036, "learning_rate": 5.431049588224987e-05, "loss": 1.6971, "step": 5218 }, { "epoch": 1.3715637008572272, "grad_norm": 0.6889095306396484, "learning_rate": 5.4292973541265115e-05, "loss": 1.6999, "step": 5220 }, { "epoch": 1.372089204190889, "grad_norm": 0.6807567477226257, "learning_rate": 5.4275451200280356e-05, "loss": 1.6674, "step": 5222 }, { "epoch": 1.372614707524551, "grad_norm": 0.6046966314315796, "learning_rate": 5.4257928859295603e-05, "loss": 1.6622, "step": 5224 }, { "epoch": 1.3731402108582127, "grad_norm": 0.6781222224235535, "learning_rate": 5.424040651831085e-05, "loss": 1.6974, "step": 5226 }, { 
"epoch": 1.3736657141918744, "grad_norm": 0.5710211992263794, "learning_rate": 5.422288417732609e-05, "loss": 1.6769, "step": 5228 }, { "epoch": 1.3741912175255362, "grad_norm": 0.6032019257545471, "learning_rate": 5.420536183634134e-05, "loss": 1.6615, "step": 5230 }, { "epoch": 1.374716720859198, "grad_norm": 0.6502026915550232, "learning_rate": 5.418783949535659e-05, "loss": 1.6965, "step": 5232 }, { "epoch": 1.3752422241928597, "grad_norm": 0.7333769798278809, "learning_rate": 5.417031715437183e-05, "loss": 1.689, "step": 5234 }, { "epoch": 1.3757677275265214, "grad_norm": 0.5849090218544006, "learning_rate": 5.4152794813387075e-05, "loss": 1.6696, "step": 5236 }, { "epoch": 1.3762932308601834, "grad_norm": 0.7076939940452576, "learning_rate": 5.413527247240232e-05, "loss": 1.6674, "step": 5238 }, { "epoch": 1.376818734193845, "grad_norm": 0.686165988445282, "learning_rate": 5.411775013141756e-05, "loss": 1.6729, "step": 5240 }, { "epoch": 1.377344237527507, "grad_norm": 0.7064648270606995, "learning_rate": 5.41002277904328e-05, "loss": 1.6532, "step": 5242 }, { "epoch": 1.3778697408611686, "grad_norm": 0.707788348197937, "learning_rate": 5.4082705449448045e-05, "loss": 1.635, "step": 5244 }, { "epoch": 1.3783952441948304, "grad_norm": 0.711150586605072, "learning_rate": 5.406518310846329e-05, "loss": 1.6671, "step": 5246 }, { "epoch": 1.3789207475284921, "grad_norm": 0.627653181552887, "learning_rate": 5.4047660767478534e-05, "loss": 1.6887, "step": 5248 }, { "epoch": 1.3794462508621539, "grad_norm": 0.5618575215339661, "learning_rate": 5.403013842649378e-05, "loss": 1.6773, "step": 5250 }, { "epoch": 1.3799717541958156, "grad_norm": 0.6045578718185425, "learning_rate": 5.401261608550903e-05, "loss": 1.7151, "step": 5252 }, { "epoch": 1.3804972575294774, "grad_norm": 0.6123912930488586, "learning_rate": 5.399509374452427e-05, "loss": 1.6428, "step": 5254 }, { "epoch": 1.3810227608631394, "grad_norm": 0.7001442909240723, "learning_rate": 5.397757140353952e-05, 
"loss": 1.7063, "step": 5256 }, { "epoch": 1.3815482641968009, "grad_norm": 0.5598956346511841, "learning_rate": 5.3960049062554765e-05, "loss": 1.671, "step": 5258 }, { "epoch": 1.3820737675304628, "grad_norm": 0.5916640758514404, "learning_rate": 5.3942526721570005e-05, "loss": 1.6685, "step": 5260 }, { "epoch": 1.3825992708641246, "grad_norm": 0.5688573718070984, "learning_rate": 5.392500438058525e-05, "loss": 1.6357, "step": 5262 }, { "epoch": 1.3831247741977863, "grad_norm": 0.6486282348632812, "learning_rate": 5.39074820396005e-05, "loss": 1.6803, "step": 5264 }, { "epoch": 1.383650277531448, "grad_norm": 0.6464768052101135, "learning_rate": 5.3889959698615735e-05, "loss": 1.6781, "step": 5266 }, { "epoch": 1.3841757808651098, "grad_norm": 0.5832222104072571, "learning_rate": 5.3872437357630975e-05, "loss": 1.7009, "step": 5268 }, { "epoch": 1.3847012841987716, "grad_norm": 0.6113573908805847, "learning_rate": 5.385491501664622e-05, "loss": 1.6855, "step": 5270 }, { "epoch": 1.3852267875324333, "grad_norm": 0.6312457323074341, "learning_rate": 5.383739267566147e-05, "loss": 1.7076, "step": 5272 }, { "epoch": 1.3857522908660953, "grad_norm": 0.6757798790931702, "learning_rate": 5.381987033467671e-05, "loss": 1.6796, "step": 5274 }, { "epoch": 1.3862777941997568, "grad_norm": 0.7711691856384277, "learning_rate": 5.380234799369196e-05, "loss": 1.656, "step": 5276 }, { "epoch": 1.3868032975334188, "grad_norm": 0.6918832659721375, "learning_rate": 5.3784825652707206e-05, "loss": 1.6723, "step": 5278 }, { "epoch": 1.3873288008670805, "grad_norm": 0.5812153220176697, "learning_rate": 5.376730331172245e-05, "loss": 1.6769, "step": 5280 }, { "epoch": 1.3878543042007423, "grad_norm": 0.642285168170929, "learning_rate": 5.3749780970737695e-05, "loss": 1.6962, "step": 5282 }, { "epoch": 1.388379807534404, "grad_norm": 0.5948686003684998, "learning_rate": 5.373225862975294e-05, "loss": 1.6874, "step": 5284 }, { "epoch": 1.3889053108680658, "grad_norm": 0.8630838394165039, 
"learning_rate": 5.371473628876818e-05, "loss": 1.6602, "step": 5286 }, { "epoch": 1.3894308142017275, "grad_norm": 0.624987006187439, "learning_rate": 5.369721394778343e-05, "loss": 1.6674, "step": 5288 }, { "epoch": 1.3899563175353893, "grad_norm": 0.5534707307815552, "learning_rate": 5.3679691606798665e-05, "loss": 1.7101, "step": 5290 }, { "epoch": 1.3904818208690513, "grad_norm": 0.547791063785553, "learning_rate": 5.366216926581391e-05, "loss": 1.7103, "step": 5292 }, { "epoch": 1.3910073242027128, "grad_norm": 0.7196782827377319, "learning_rate": 5.364464692482915e-05, "loss": 1.7016, "step": 5294 }, { "epoch": 1.3915328275363748, "grad_norm": 0.5943964123725891, "learning_rate": 5.36271245838444e-05, "loss": 1.6937, "step": 5296 }, { "epoch": 1.3920583308700365, "grad_norm": 0.5824291706085205, "learning_rate": 5.360960224285965e-05, "loss": 1.6606, "step": 5298 }, { "epoch": 1.3925838342036982, "grad_norm": 0.6096850633621216, "learning_rate": 5.359207990187489e-05, "loss": 1.6732, "step": 5300 }, { "epoch": 1.39310933753736, "grad_norm": 0.5567105412483215, "learning_rate": 5.3574557560890136e-05, "loss": 1.6917, "step": 5302 }, { "epoch": 1.3936348408710217, "grad_norm": 0.808890700340271, "learning_rate": 5.3557035219905384e-05, "loss": 1.6916, "step": 5304 }, { "epoch": 1.3941603442046835, "grad_norm": 0.602245032787323, "learning_rate": 5.3539512878920625e-05, "loss": 1.6565, "step": 5306 }, { "epoch": 1.3946858475383452, "grad_norm": 0.5515073537826538, "learning_rate": 5.352199053793587e-05, "loss": 1.6927, "step": 5308 }, { "epoch": 1.3952113508720072, "grad_norm": 0.6020869016647339, "learning_rate": 5.350446819695112e-05, "loss": 1.6813, "step": 5310 }, { "epoch": 1.3957368542056687, "grad_norm": 0.6068871021270752, "learning_rate": 5.348694585596636e-05, "loss": 1.699, "step": 5312 }, { "epoch": 1.3962623575393307, "grad_norm": 0.8839384913444519, "learning_rate": 5.346942351498161e-05, "loss": 1.7296, "step": 5314 }, { "epoch": 
1.3967878608729924, "grad_norm": 0.6876543164253235, "learning_rate": 5.345190117399684e-05, "loss": 1.6827, "step": 5316 }, { "epoch": 1.3973133642066542, "grad_norm": 0.9712225198745728, "learning_rate": 5.343437883301209e-05, "loss": 1.6821, "step": 5318 }, { "epoch": 1.397838867540316, "grad_norm": 0.644990861415863, "learning_rate": 5.341685649202733e-05, "loss": 1.671, "step": 5320 }, { "epoch": 1.3983643708739777, "grad_norm": 0.7299576997756958, "learning_rate": 5.339933415104258e-05, "loss": 1.7054, "step": 5322 }, { "epoch": 1.3988898742076394, "grad_norm": 0.5608115196228027, "learning_rate": 5.3381811810057826e-05, "loss": 1.6713, "step": 5324 }, { "epoch": 1.3994153775413012, "grad_norm": 0.6044563055038452, "learning_rate": 5.3364289469073067e-05, "loss": 1.6838, "step": 5326 }, { "epoch": 1.3999408808749632, "grad_norm": 0.5304017663002014, "learning_rate": 5.3346767128088314e-05, "loss": 1.6351, "step": 5328 }, { "epoch": 1.400466384208625, "grad_norm": 0.5934461355209351, "learning_rate": 5.332924478710356e-05, "loss": 1.6485, "step": 5330 }, { "epoch": 1.4009918875422867, "grad_norm": 0.5502288937568665, "learning_rate": 5.33117224461188e-05, "loss": 1.683, "step": 5332 }, { "epoch": 1.4015173908759484, "grad_norm": 0.6545527577400208, "learning_rate": 5.329420010513405e-05, "loss": 1.6823, "step": 5334 }, { "epoch": 1.4020428942096101, "grad_norm": 0.5551027655601501, "learning_rate": 5.32766777641493e-05, "loss": 1.6867, "step": 5336 }, { "epoch": 1.402568397543272, "grad_norm": 0.6732555627822876, "learning_rate": 5.325915542316454e-05, "loss": 1.7031, "step": 5338 }, { "epoch": 1.4030939008769336, "grad_norm": 0.5857707858085632, "learning_rate": 5.3241633082179786e-05, "loss": 1.6607, "step": 5340 }, { "epoch": 1.4036194042105954, "grad_norm": 0.6332495212554932, "learning_rate": 5.322411074119502e-05, "loss": 1.6979, "step": 5342 }, { "epoch": 1.4041449075442571, "grad_norm": 0.5486958026885986, "learning_rate": 5.320658840021027e-05, 
"loss": 1.6582, "step": 5344 }, { "epoch": 1.404670410877919, "grad_norm": 0.5463687181472778, "learning_rate": 5.3189066059225515e-05, "loss": 1.7192, "step": 5346 }, { "epoch": 1.4051959142115809, "grad_norm": 0.4964401125907898, "learning_rate": 5.3171543718240756e-05, "loss": 1.6486, "step": 5348 }, { "epoch": 1.4057214175452426, "grad_norm": 0.5375365018844604, "learning_rate": 5.3154021377256003e-05, "loss": 1.714, "step": 5350 }, { "epoch": 1.4062469208789043, "grad_norm": 0.5706174373626709, "learning_rate": 5.313649903627125e-05, "loss": 1.702, "step": 5352 }, { "epoch": 1.406772424212566, "grad_norm": 0.6177610158920288, "learning_rate": 5.311897669528649e-05, "loss": 1.6439, "step": 5354 }, { "epoch": 1.4072979275462278, "grad_norm": 0.5553119778633118, "learning_rate": 5.310145435430174e-05, "loss": 1.69, "step": 5356 }, { "epoch": 1.4078234308798896, "grad_norm": 0.6558260917663574, "learning_rate": 5.308393201331699e-05, "loss": 1.7068, "step": 5358 }, { "epoch": 1.4083489342135513, "grad_norm": 0.7940452098846436, "learning_rate": 5.306640967233223e-05, "loss": 1.6912, "step": 5360 }, { "epoch": 1.408874437547213, "grad_norm": 0.577286958694458, "learning_rate": 5.3048887331347475e-05, "loss": 1.6755, "step": 5362 }, { "epoch": 1.409399940880875, "grad_norm": 0.6552362442016602, "learning_rate": 5.303136499036272e-05, "loss": 1.6952, "step": 5364 }, { "epoch": 1.4099254442145368, "grad_norm": 0.6791645884513855, "learning_rate": 5.3013842649377964e-05, "loss": 1.6979, "step": 5366 }, { "epoch": 1.4104509475481986, "grad_norm": 0.5956308841705322, "learning_rate": 5.29963203083932e-05, "loss": 1.6742, "step": 5368 }, { "epoch": 1.4109764508818603, "grad_norm": 0.6051899790763855, "learning_rate": 5.2978797967408445e-05, "loss": 1.6704, "step": 5370 }, { "epoch": 1.411501954215522, "grad_norm": 0.6038895845413208, "learning_rate": 5.296127562642369e-05, "loss": 1.6894, "step": 5372 }, { "epoch": 1.4120274575491838, "grad_norm": 0.5314784049987793, 
"learning_rate": 5.2943753285438934e-05, "loss": 1.6656, "step": 5374 }, { "epoch": 1.4125529608828455, "grad_norm": 0.5296756029129028, "learning_rate": 5.292623094445418e-05, "loss": 1.6957, "step": 5376 }, { "epoch": 1.4130784642165073, "grad_norm": 0.5364149808883667, "learning_rate": 5.290870860346943e-05, "loss": 1.6585, "step": 5378 }, { "epoch": 1.413603967550169, "grad_norm": 0.5761370062828064, "learning_rate": 5.289118626248467e-05, "loss": 1.6692, "step": 5380 }, { "epoch": 1.414129470883831, "grad_norm": 0.6723489165306091, "learning_rate": 5.287366392149992e-05, "loss": 1.6826, "step": 5382 }, { "epoch": 1.4146549742174928, "grad_norm": 0.5427255034446716, "learning_rate": 5.2856141580515165e-05, "loss": 1.6612, "step": 5384 }, { "epoch": 1.4151804775511545, "grad_norm": 0.5773686170578003, "learning_rate": 5.2838619239530405e-05, "loss": 1.6751, "step": 5386 }, { "epoch": 1.4157059808848163, "grad_norm": 0.9336940050125122, "learning_rate": 5.282109689854565e-05, "loss": 1.6976, "step": 5388 }, { "epoch": 1.416231484218478, "grad_norm": 0.5543003082275391, "learning_rate": 5.28035745575609e-05, "loss": 1.6991, "step": 5390 }, { "epoch": 1.4167569875521397, "grad_norm": 0.5574566125869751, "learning_rate": 5.2786052216576134e-05, "loss": 1.6575, "step": 5392 }, { "epoch": 1.4172824908858015, "grad_norm": 0.6602552533149719, "learning_rate": 5.2768529875591375e-05, "loss": 1.6748, "step": 5394 }, { "epoch": 1.4178079942194635, "grad_norm": 0.644187331199646, "learning_rate": 5.275100753460662e-05, "loss": 1.6565, "step": 5396 }, { "epoch": 1.418333497553125, "grad_norm": 0.5814189314842224, "learning_rate": 5.273348519362187e-05, "loss": 1.6965, "step": 5398 }, { "epoch": 1.418859000886787, "grad_norm": 0.5712095499038696, "learning_rate": 5.271596285263711e-05, "loss": 1.6593, "step": 5400 }, { "epoch": 1.4193845042204487, "grad_norm": 0.6919686198234558, "learning_rate": 5.269844051165236e-05, "loss": 1.692, "step": 5402 }, { "epoch": 
1.4199100075541105, "grad_norm": 0.6162757277488708, "learning_rate": 5.2680918170667606e-05, "loss": 1.6726, "step": 5404 }, { "epoch": 1.4204355108877722, "grad_norm": 0.5998090505599976, "learning_rate": 5.266339582968285e-05, "loss": 1.7206, "step": 5406 }, { "epoch": 1.420961014221434, "grad_norm": 0.6242619156837463, "learning_rate": 5.2645873488698095e-05, "loss": 1.6647, "step": 5408 }, { "epoch": 1.4214865175550957, "grad_norm": 0.5882295966148376, "learning_rate": 5.262835114771334e-05, "loss": 1.6562, "step": 5410 }, { "epoch": 1.4220120208887574, "grad_norm": 0.5960384607315063, "learning_rate": 5.261082880672858e-05, "loss": 1.6877, "step": 5412 }, { "epoch": 1.4225375242224194, "grad_norm": 0.6366286873817444, "learning_rate": 5.259330646574383e-05, "loss": 1.7009, "step": 5414 }, { "epoch": 1.423063027556081, "grad_norm": 0.581294059753418, "learning_rate": 5.257578412475908e-05, "loss": 1.6918, "step": 5416 }, { "epoch": 1.423588530889743, "grad_norm": 0.6820783615112305, "learning_rate": 5.255826178377431e-05, "loss": 1.6602, "step": 5418 }, { "epoch": 1.4241140342234047, "grad_norm": 0.7352914810180664, "learning_rate": 5.254073944278955e-05, "loss": 1.6837, "step": 5420 }, { "epoch": 1.4246395375570664, "grad_norm": 0.5740265250205994, "learning_rate": 5.25232171018048e-05, "loss": 1.6741, "step": 5422 }, { "epoch": 1.4251650408907282, "grad_norm": 0.5686757564544678, "learning_rate": 5.250569476082005e-05, "loss": 1.6908, "step": 5424 }, { "epoch": 1.42569054422439, "grad_norm": 0.765792191028595, "learning_rate": 5.248817241983529e-05, "loss": 1.6928, "step": 5426 }, { "epoch": 1.4262160475580516, "grad_norm": 0.605812668800354, "learning_rate": 5.2470650078850536e-05, "loss": 1.7009, "step": 5428 }, { "epoch": 1.4267415508917134, "grad_norm": 0.5373610258102417, "learning_rate": 5.2453127737865784e-05, "loss": 1.6906, "step": 5430 }, { "epoch": 1.4272670542253754, "grad_norm": 0.6103860139846802, "learning_rate": 5.2435605396881025e-05, 
"loss": 1.683, "step": 5432 }, { "epoch": 1.427792557559037, "grad_norm": 0.5279322862625122, "learning_rate": 5.241808305589627e-05, "loss": 1.6679, "step": 5434 }, { "epoch": 1.4283180608926989, "grad_norm": 0.6445699334144592, "learning_rate": 5.240056071491152e-05, "loss": 1.6908, "step": 5436 }, { "epoch": 1.4288435642263606, "grad_norm": 0.5819953680038452, "learning_rate": 5.238303837392676e-05, "loss": 1.6849, "step": 5438 }, { "epoch": 1.4293690675600224, "grad_norm": 0.564022421836853, "learning_rate": 5.236551603294201e-05, "loss": 1.6799, "step": 5440 }, { "epoch": 1.429894570893684, "grad_norm": 0.5687031149864197, "learning_rate": 5.2347993691957256e-05, "loss": 1.6905, "step": 5442 }, { "epoch": 1.4304200742273459, "grad_norm": 0.6546675562858582, "learning_rate": 5.233047135097249e-05, "loss": 1.6752, "step": 5444 }, { "epoch": 1.4309455775610076, "grad_norm": 0.6557585597038269, "learning_rate": 5.231294900998773e-05, "loss": 1.6959, "step": 5446 }, { "epoch": 1.4314710808946693, "grad_norm": 0.6117187142372131, "learning_rate": 5.229542666900298e-05, "loss": 1.6983, "step": 5448 }, { "epoch": 1.4319965842283313, "grad_norm": 0.6230733394622803, "learning_rate": 5.2277904328018226e-05, "loss": 1.7063, "step": 5450 }, { "epoch": 1.4325220875619928, "grad_norm": 0.5881343483924866, "learning_rate": 5.2260381987033467e-05, "loss": 1.6718, "step": 5452 }, { "epoch": 1.4330475908956548, "grad_norm": 0.606519341468811, "learning_rate": 5.2242859646048714e-05, "loss": 1.6876, "step": 5454 }, { "epoch": 1.4335730942293166, "grad_norm": 0.6653869152069092, "learning_rate": 5.222533730506396e-05, "loss": 1.6707, "step": 5456 }, { "epoch": 1.4340985975629783, "grad_norm": 0.5549430251121521, "learning_rate": 5.22078149640792e-05, "loss": 1.6994, "step": 5458 }, { "epoch": 1.43462410089664, "grad_norm": 0.7070857882499695, "learning_rate": 5.219029262309445e-05, "loss": 1.7043, "step": 5460 }, { "epoch": 1.4351496042303018, "grad_norm": 0.4982629716396332, 
"learning_rate": 5.21727702821097e-05, "loss": 1.6567, "step": 5462 }, { "epoch": 1.4356751075639635, "grad_norm": 0.6242676973342896, "learning_rate": 5.215524794112494e-05, "loss": 1.6898, "step": 5464 }, { "epoch": 1.4362006108976253, "grad_norm": 0.5926311016082764, "learning_rate": 5.2137725600140186e-05, "loss": 1.7012, "step": 5466 }, { "epoch": 1.4367261142312873, "grad_norm": 0.7169223427772522, "learning_rate": 5.212020325915542e-05, "loss": 1.6782, "step": 5468 }, { "epoch": 1.4372516175649488, "grad_norm": 0.7072371244430542, "learning_rate": 5.210268091817067e-05, "loss": 1.6442, "step": 5470 }, { "epoch": 1.4377771208986108, "grad_norm": 0.5502269268035889, "learning_rate": 5.208515857718591e-05, "loss": 1.6743, "step": 5472 }, { "epoch": 1.4383026242322725, "grad_norm": 0.7821040749549866, "learning_rate": 5.2067636236201156e-05, "loss": 1.7106, "step": 5474 }, { "epoch": 1.4388281275659343, "grad_norm": 0.6043164730072021, "learning_rate": 5.2050113895216403e-05, "loss": 1.708, "step": 5476 }, { "epoch": 1.439353630899596, "grad_norm": 0.5633584856987, "learning_rate": 5.2032591554231644e-05, "loss": 1.6696, "step": 5478 }, { "epoch": 1.4398791342332578, "grad_norm": 0.5389137864112854, "learning_rate": 5.201506921324689e-05, "loss": 1.669, "step": 5480 }, { "epoch": 1.4404046375669195, "grad_norm": 0.6789732575416565, "learning_rate": 5.199754687226214e-05, "loss": 1.6899, "step": 5482 }, { "epoch": 1.4409301409005812, "grad_norm": 0.6296239495277405, "learning_rate": 5.198002453127738e-05, "loss": 1.6631, "step": 5484 }, { "epoch": 1.4414556442342432, "grad_norm": 0.700206458568573, "learning_rate": 5.196250219029263e-05, "loss": 1.6616, "step": 5486 }, { "epoch": 1.441981147567905, "grad_norm": 0.6670316457748413, "learning_rate": 5.1944979849307875e-05, "loss": 1.6699, "step": 5488 }, { "epoch": 1.4425066509015667, "grad_norm": 0.6687952876091003, "learning_rate": 5.1927457508323116e-05, "loss": 1.7015, "step": 5490 }, { "epoch": 
1.4430321542352285, "grad_norm": 0.6341938972473145, "learning_rate": 5.1909935167338364e-05, "loss": 1.6713, "step": 5492 }, { "epoch": 1.4435576575688902, "grad_norm": 0.5659570693969727, "learning_rate": 5.18924128263536e-05, "loss": 1.6662, "step": 5494 }, { "epoch": 1.444083160902552, "grad_norm": 0.559827983379364, "learning_rate": 5.1874890485368845e-05, "loss": 1.6901, "step": 5496 }, { "epoch": 1.4446086642362137, "grad_norm": 0.5669519901275635, "learning_rate": 5.1857368144384086e-05, "loss": 1.6966, "step": 5498 }, { "epoch": 1.4451341675698754, "grad_norm": 0.544907808303833, "learning_rate": 5.1839845803399334e-05, "loss": 1.6837, "step": 5500 }, { "epoch": 1.4456596709035372, "grad_norm": 0.510985255241394, "learning_rate": 5.182232346241458e-05, "loss": 1.7191, "step": 5502 }, { "epoch": 1.4461851742371992, "grad_norm": 0.5953521728515625, "learning_rate": 5.180480112142982e-05, "loss": 1.6899, "step": 5504 }, { "epoch": 1.446710677570861, "grad_norm": 0.5401633381843567, "learning_rate": 5.178727878044507e-05, "loss": 1.6602, "step": 5506 }, { "epoch": 1.4472361809045227, "grad_norm": 0.6041406393051147, "learning_rate": 5.176975643946032e-05, "loss": 1.6573, "step": 5508 }, { "epoch": 1.4477616842381844, "grad_norm": 0.5650386214256287, "learning_rate": 5.175223409847556e-05, "loss": 1.6747, "step": 5510 }, { "epoch": 1.4482871875718462, "grad_norm": 0.6061777472496033, "learning_rate": 5.1734711757490805e-05, "loss": 1.677, "step": 5512 }, { "epoch": 1.448812690905508, "grad_norm": 0.5969398617744446, "learning_rate": 5.171718941650605e-05, "loss": 1.6579, "step": 5514 }, { "epoch": 1.4493381942391697, "grad_norm": 0.5931181311607361, "learning_rate": 5.1699667075521294e-05, "loss": 1.6772, "step": 5516 }, { "epoch": 1.4498636975728314, "grad_norm": 0.5793447494506836, "learning_rate": 5.168214473453654e-05, "loss": 1.6302, "step": 5518 }, { "epoch": 1.4503892009064931, "grad_norm": 0.5775421857833862, "learning_rate": 5.1664622393551775e-05, 
"loss": 1.6691, "step": 5520 }, { "epoch": 1.4509147042401551, "grad_norm": 0.606317937374115, "learning_rate": 5.164710005256702e-05, "loss": 1.6876, "step": 5522 }, { "epoch": 1.4514402075738169, "grad_norm": 0.5722896456718445, "learning_rate": 5.1629577711582264e-05, "loss": 1.643, "step": 5524 }, { "epoch": 1.4519657109074786, "grad_norm": 0.7023299932479858, "learning_rate": 5.161205537059751e-05, "loss": 1.6527, "step": 5526 }, { "epoch": 1.4524912142411404, "grad_norm": 0.5290958285331726, "learning_rate": 5.159453302961276e-05, "loss": 1.6556, "step": 5528 }, { "epoch": 1.453016717574802, "grad_norm": 0.6962506771087646, "learning_rate": 5.1577010688628e-05, "loss": 1.6781, "step": 5530 }, { "epoch": 1.4535422209084639, "grad_norm": 0.5442536473274231, "learning_rate": 5.155948834764325e-05, "loss": 1.6712, "step": 5532 }, { "epoch": 1.4540677242421256, "grad_norm": 0.6162612438201904, "learning_rate": 5.1541966006658495e-05, "loss": 1.7003, "step": 5534 }, { "epoch": 1.4545932275757874, "grad_norm": 0.5466321110725403, "learning_rate": 5.1524443665673735e-05, "loss": 1.6908, "step": 5536 }, { "epoch": 1.455118730909449, "grad_norm": 0.5761128067970276, "learning_rate": 5.150692132468898e-05, "loss": 1.6745, "step": 5538 }, { "epoch": 1.455644234243111, "grad_norm": 0.5764265656471252, "learning_rate": 5.148939898370423e-05, "loss": 1.6308, "step": 5540 }, { "epoch": 1.4561697375767728, "grad_norm": 0.5794236063957214, "learning_rate": 5.147187664271947e-05, "loss": 1.7178, "step": 5542 }, { "epoch": 1.4566952409104346, "grad_norm": 0.8042659759521484, "learning_rate": 5.145435430173472e-05, "loss": 1.6589, "step": 5544 }, { "epoch": 1.4572207442440963, "grad_norm": 0.63565993309021, "learning_rate": 5.143683196074995e-05, "loss": 1.6601, "step": 5546 }, { "epoch": 1.457746247577758, "grad_norm": 0.5769701600074768, "learning_rate": 5.14193096197652e-05, "loss": 1.6565, "step": 5548 }, { "epoch": 1.4582717509114198, "grad_norm": 0.6259598135948181, 
"learning_rate": 5.140178727878044e-05, "loss": 1.6664, "step": 5550 }, { "epoch": 1.4587972542450816, "grad_norm": 1.0551050901412964, "learning_rate": 5.138426493779569e-05, "loss": 1.7123, "step": 5552 }, { "epoch": 1.4593227575787435, "grad_norm": 0.5595609545707703, "learning_rate": 5.1366742596810936e-05, "loss": 1.6494, "step": 5554 }, { "epoch": 1.459848260912405, "grad_norm": 0.607704222202301, "learning_rate": 5.134922025582618e-05, "loss": 1.6585, "step": 5556 }, { "epoch": 1.460373764246067, "grad_norm": 0.7237640023231506, "learning_rate": 5.1331697914841425e-05, "loss": 1.6797, "step": 5558 }, { "epoch": 1.4608992675797288, "grad_norm": 0.7079179286956787, "learning_rate": 5.131417557385667e-05, "loss": 1.6882, "step": 5560 }, { "epoch": 1.4614247709133905, "grad_norm": 0.5847229361534119, "learning_rate": 5.129665323287191e-05, "loss": 1.6694, "step": 5562 }, { "epoch": 1.4619502742470523, "grad_norm": 0.5582857728004456, "learning_rate": 5.127913089188716e-05, "loss": 1.6723, "step": 5564 }, { "epoch": 1.462475777580714, "grad_norm": 0.5761253833770752, "learning_rate": 5.126160855090241e-05, "loss": 1.6953, "step": 5566 }, { "epoch": 1.4630012809143758, "grad_norm": 0.6386553645133972, "learning_rate": 5.124408620991765e-05, "loss": 1.6842, "step": 5568 }, { "epoch": 1.4635267842480375, "grad_norm": 0.6150875687599182, "learning_rate": 5.122656386893288e-05, "loss": 1.6365, "step": 5570 }, { "epoch": 1.4640522875816995, "grad_norm": 0.632836639881134, "learning_rate": 5.120904152794813e-05, "loss": 1.6497, "step": 5572 }, { "epoch": 1.464577790915361, "grad_norm": 0.713758111000061, "learning_rate": 5.119151918696338e-05, "loss": 1.6505, "step": 5574 }, { "epoch": 1.465103294249023, "grad_norm": 0.5495973825454712, "learning_rate": 5.117399684597862e-05, "loss": 1.6852, "step": 5576 }, { "epoch": 1.4656287975826847, "grad_norm": 0.6123538613319397, "learning_rate": 5.1156474504993867e-05, "loss": 1.6334, "step": 5578 }, { "epoch": 
1.4661543009163465, "grad_norm": 0.6316397786140442, "learning_rate": 5.1138952164009114e-05, "loss": 1.6877, "step": 5580 }, { "epoch": 1.4666798042500082, "grad_norm": 0.6072596311569214, "learning_rate": 5.1121429823024355e-05, "loss": 1.6586, "step": 5582 }, { "epoch": 1.46720530758367, "grad_norm": 0.6176102757453918, "learning_rate": 5.11039074820396e-05, "loss": 1.6752, "step": 5584 }, { "epoch": 1.4677308109173317, "grad_norm": 0.6328986883163452, "learning_rate": 5.108638514105485e-05, "loss": 1.6886, "step": 5586 }, { "epoch": 1.4682563142509935, "grad_norm": 0.6120469570159912, "learning_rate": 5.10688628000701e-05, "loss": 1.6834, "step": 5588 }, { "epoch": 1.4687818175846554, "grad_norm": 0.6649283170700073, "learning_rate": 5.105134045908534e-05, "loss": 1.6399, "step": 5590 }, { "epoch": 1.469307320918317, "grad_norm": 0.6802614331245422, "learning_rate": 5.1033818118100586e-05, "loss": 1.7099, "step": 5592 }, { "epoch": 1.469832824251979, "grad_norm": 0.5907096862792969, "learning_rate": 5.1016295777115833e-05, "loss": 1.6647, "step": 5594 }, { "epoch": 1.4703583275856407, "grad_norm": 0.5924010276794434, "learning_rate": 5.099877343613107e-05, "loss": 1.6655, "step": 5596 }, { "epoch": 1.4708838309193024, "grad_norm": 0.5676859617233276, "learning_rate": 5.098125109514631e-05, "loss": 1.6651, "step": 5598 }, { "epoch": 1.4714093342529642, "grad_norm": 0.5564827919006348, "learning_rate": 5.0963728754161556e-05, "loss": 1.7036, "step": 5600 }, { "epoch": 1.4714093342529642, "eval_loss": 1.6887091398239136, "eval_runtime": 487.1901, "eval_samples_per_second": 249.982, "eval_steps_per_second": 31.249, "step": 5600 }, { "epoch": 1.471934837586626, "grad_norm": 0.6894435286521912, "learning_rate": 5.09462064131768e-05, "loss": 1.6565, "step": 5602 }, { "epoch": 1.4724603409202877, "grad_norm": 0.5259155035018921, "learning_rate": 5.0928684072192044e-05, "loss": 1.6428, "step": 5604 }, { "epoch": 1.4729858442539494, "grad_norm": 0.6473843455314636, 
"learning_rate": 5.091116173120729e-05, "loss": 1.6773, "step": 5606 }, { "epoch": 1.4735113475876114, "grad_norm": 0.5902252793312073, "learning_rate": 5.089363939022254e-05, "loss": 1.683, "step": 5608 }, { "epoch": 1.474036850921273, "grad_norm": 0.5241686105728149, "learning_rate": 5.087611704923778e-05, "loss": 1.7068, "step": 5610 }, { "epoch": 1.4745623542549349, "grad_norm": 0.5719613432884216, "learning_rate": 5.085859470825303e-05, "loss": 1.6997, "step": 5612 }, { "epoch": 1.4750878575885966, "grad_norm": 0.7695503234863281, "learning_rate": 5.0841072367268275e-05, "loss": 1.7057, "step": 5614 }, { "epoch": 1.4756133609222584, "grad_norm": 0.5506905913352966, "learning_rate": 5.0823550026283516e-05, "loss": 1.6853, "step": 5616 }, { "epoch": 1.4761388642559201, "grad_norm": 0.5574632883071899, "learning_rate": 5.0806027685298764e-05, "loss": 1.7207, "step": 5618 }, { "epoch": 1.4766643675895819, "grad_norm": 0.5256906747817993, "learning_rate": 5.078850534431401e-05, "loss": 1.6743, "step": 5620 }, { "epoch": 1.4771898709232436, "grad_norm": 1.0742486715316772, "learning_rate": 5.0770983003329245e-05, "loss": 1.6613, "step": 5622 }, { "epoch": 1.4777153742569054, "grad_norm": 0.6463772654533386, "learning_rate": 5.0753460662344486e-05, "loss": 1.6872, "step": 5624 }, { "epoch": 1.4782408775905673, "grad_norm": 0.5381261706352234, "learning_rate": 5.0735938321359733e-05, "loss": 1.6418, "step": 5626 }, { "epoch": 1.4787663809242289, "grad_norm": 0.7155745029449463, "learning_rate": 5.071841598037498e-05, "loss": 1.7034, "step": 5628 }, { "epoch": 1.4792918842578908, "grad_norm": 0.8163059949874878, "learning_rate": 5.070089363939022e-05, "loss": 1.6595, "step": 5630 }, { "epoch": 1.4798173875915526, "grad_norm": 0.6401390433311462, "learning_rate": 5.068337129840547e-05, "loss": 1.6658, "step": 5632 }, { "epoch": 1.4803428909252143, "grad_norm": 0.5915647149085999, "learning_rate": 5.066584895742072e-05, "loss": 1.6495, "step": 5634 }, { "epoch": 
1.480868394258876, "grad_norm": 0.5638352632522583, "learning_rate": 5.064832661643596e-05, "loss": 1.6619, "step": 5636 }, { "epoch": 1.4813938975925378, "grad_norm": 0.6860741376876831, "learning_rate": 5.0630804275451205e-05, "loss": 1.6827, "step": 5638 }, { "epoch": 1.4819194009261996, "grad_norm": 0.5409403443336487, "learning_rate": 5.061328193446645e-05, "loss": 1.6391, "step": 5640 }, { "epoch": 1.4824449042598613, "grad_norm": 0.6866961717605591, "learning_rate": 5.0595759593481694e-05, "loss": 1.6657, "step": 5642 }, { "epoch": 1.4829704075935233, "grad_norm": 0.6035512089729309, "learning_rate": 5.057823725249694e-05, "loss": 1.6962, "step": 5644 }, { "epoch": 1.483495910927185, "grad_norm": 0.539889931678772, "learning_rate": 5.056071491151219e-05, "loss": 1.676, "step": 5646 }, { "epoch": 1.4840214142608468, "grad_norm": 0.5658326148986816, "learning_rate": 5.054319257052742e-05, "loss": 1.6876, "step": 5648 }, { "epoch": 1.4845469175945085, "grad_norm": 0.9115592241287231, "learning_rate": 5.0525670229542664e-05, "loss": 1.6691, "step": 5650 }, { "epoch": 1.4850724209281703, "grad_norm": 0.6758273839950562, "learning_rate": 5.050814788855791e-05, "loss": 1.6405, "step": 5652 }, { "epoch": 1.485597924261832, "grad_norm": 0.6089572310447693, "learning_rate": 5.049062554757316e-05, "loss": 1.6954, "step": 5654 }, { "epoch": 1.4861234275954938, "grad_norm": 0.5882745981216431, "learning_rate": 5.04731032065884e-05, "loss": 1.6768, "step": 5656 }, { "epoch": 1.4866489309291555, "grad_norm": 0.6646392345428467, "learning_rate": 5.045558086560365e-05, "loss": 1.6832, "step": 5658 }, { "epoch": 1.4871744342628173, "grad_norm": 0.7076845765113831, "learning_rate": 5.0438058524618895e-05, "loss": 1.6734, "step": 5660 }, { "epoch": 1.4876999375964792, "grad_norm": 0.6591199636459351, "learning_rate": 5.0420536183634135e-05, "loss": 1.7321, "step": 5662 }, { "epoch": 1.488225440930141, "grad_norm": 0.5403279066085815, "learning_rate": 5.040301384264938e-05, 
"loss": 1.6665, "step": 5664 }, { "epoch": 1.4887509442638027, "grad_norm": 0.5921054482460022, "learning_rate": 5.038549150166463e-05, "loss": 1.6738, "step": 5666 }, { "epoch": 1.4892764475974645, "grad_norm": 0.6299567818641663, "learning_rate": 5.036796916067987e-05, "loss": 1.6627, "step": 5668 }, { "epoch": 1.4898019509311262, "grad_norm": 0.6079999208450317, "learning_rate": 5.035044681969512e-05, "loss": 1.6921, "step": 5670 }, { "epoch": 1.490327454264788, "grad_norm": 0.5412185788154602, "learning_rate": 5.033292447871035e-05, "loss": 1.6779, "step": 5672 }, { "epoch": 1.4908529575984497, "grad_norm": 0.603542685508728, "learning_rate": 5.03154021377256e-05, "loss": 1.6948, "step": 5674 }, { "epoch": 1.4913784609321115, "grad_norm": 0.818874716758728, "learning_rate": 5.029787979674084e-05, "loss": 1.6824, "step": 5676 }, { "epoch": 1.4919039642657732, "grad_norm": 0.5792141556739807, "learning_rate": 5.028035745575609e-05, "loss": 1.679, "step": 5678 }, { "epoch": 1.4924294675994352, "grad_norm": 0.576353907585144, "learning_rate": 5.0262835114771336e-05, "loss": 1.6835, "step": 5680 }, { "epoch": 1.492954970933097, "grad_norm": 0.5527434349060059, "learning_rate": 5.024531277378658e-05, "loss": 1.7095, "step": 5682 }, { "epoch": 1.4934804742667587, "grad_norm": 0.5520890355110168, "learning_rate": 5.0227790432801825e-05, "loss": 1.6672, "step": 5684 }, { "epoch": 1.4940059776004204, "grad_norm": 0.5043492317199707, "learning_rate": 5.021026809181707e-05, "loss": 1.6717, "step": 5686 }, { "epoch": 1.4945314809340822, "grad_norm": 0.6247971653938293, "learning_rate": 5.019274575083231e-05, "loss": 1.6792, "step": 5688 }, { "epoch": 1.495056984267744, "grad_norm": 0.6311819553375244, "learning_rate": 5.017522340984756e-05, "loss": 1.6628, "step": 5690 }, { "epoch": 1.4955824876014057, "grad_norm": 0.5443778038024902, "learning_rate": 5.015770106886281e-05, "loss": 1.6362, "step": 5692 }, { "epoch": 1.4961079909350674, "grad_norm": 0.6705971360206604, 
"learning_rate": 5.014017872787805e-05, "loss": 1.6975, "step": 5694 }, { "epoch": 1.4966334942687292, "grad_norm": 0.6342272758483887, "learning_rate": 5.0122656386893297e-05, "loss": 1.6652, "step": 5696 }, { "epoch": 1.4971589976023911, "grad_norm": 0.615489661693573, "learning_rate": 5.010513404590853e-05, "loss": 1.6503, "step": 5698 }, { "epoch": 1.4976845009360529, "grad_norm": 0.5811960697174072, "learning_rate": 5.008761170492378e-05, "loss": 1.6369, "step": 5700 }, { "epoch": 1.4982100042697146, "grad_norm": 0.5534473061561584, "learning_rate": 5.007008936393902e-05, "loss": 1.6311, "step": 5702 }, { "epoch": 1.4987355076033764, "grad_norm": 0.6678853034973145, "learning_rate": 5.0052567022954266e-05, "loss": 1.6513, "step": 5704 }, { "epoch": 1.4992610109370381, "grad_norm": 0.5876911878585815, "learning_rate": 5.0035044681969514e-05, "loss": 1.6847, "step": 5706 }, { "epoch": 1.4997865142706999, "grad_norm": 0.6282883286476135, "learning_rate": 5.0017522340984755e-05, "loss": 1.6909, "step": 5708 }, { "epoch": 1.5003120176043616, "grad_norm": 0.5211758613586426, "learning_rate": 5e-05, "loss": 1.6478, "step": 5710 }, { "epoch": 1.5008375209380236, "grad_norm": 0.5276235938072205, "learning_rate": 4.998247765901525e-05, "loss": 1.6706, "step": 5712 }, { "epoch": 1.501363024271685, "grad_norm": 0.6381866335868835, "learning_rate": 4.996495531803049e-05, "loss": 1.672, "step": 5714 }, { "epoch": 1.501888527605347, "grad_norm": 0.5875132083892822, "learning_rate": 4.994743297704574e-05, "loss": 1.6611, "step": 5716 }, { "epoch": 1.5024140309390086, "grad_norm": 0.5340953469276428, "learning_rate": 4.992991063606098e-05, "loss": 1.6612, "step": 5718 }, { "epoch": 1.5029395342726706, "grad_norm": 0.6308703422546387, "learning_rate": 4.991238829507623e-05, "loss": 1.6988, "step": 5720 }, { "epoch": 1.5034650376063323, "grad_norm": 0.6118327975273132, "learning_rate": 4.989486595409147e-05, "loss": 1.6677, "step": 5722 }, { "epoch": 1.503990540939994, 
"grad_norm": 0.6005641222000122, "learning_rate": 4.9877343613106715e-05, "loss": 1.6629, "step": 5724 }, { "epoch": 1.5045160442736558, "grad_norm": 0.6117488145828247, "learning_rate": 4.985982127212196e-05, "loss": 1.6429, "step": 5726 }, { "epoch": 1.5050415476073176, "grad_norm": 0.7005740404129028, "learning_rate": 4.98422989311372e-05, "loss": 1.6896, "step": 5728 }, { "epoch": 1.5055670509409795, "grad_norm": 0.5488820672035217, "learning_rate": 4.9824776590152444e-05, "loss": 1.6603, "step": 5730 }, { "epoch": 1.506092554274641, "grad_norm": 0.6293522119522095, "learning_rate": 4.980725424916769e-05, "loss": 1.6465, "step": 5732 }, { "epoch": 1.506618057608303, "grad_norm": 0.5625261664390564, "learning_rate": 4.978973190818293e-05, "loss": 1.6572, "step": 5734 }, { "epoch": 1.5071435609419648, "grad_norm": 0.5663225054740906, "learning_rate": 4.977220956719818e-05, "loss": 1.6674, "step": 5736 }, { "epoch": 1.5076690642756265, "grad_norm": 0.5049892663955688, "learning_rate": 4.975468722621343e-05, "loss": 1.6529, "step": 5738 }, { "epoch": 1.5081945676092883, "grad_norm": 0.5977827906608582, "learning_rate": 4.973716488522867e-05, "loss": 1.6644, "step": 5740 }, { "epoch": 1.50872007094295, "grad_norm": 0.5831950306892395, "learning_rate": 4.971964254424391e-05, "loss": 1.6659, "step": 5742 }, { "epoch": 1.5092455742766118, "grad_norm": 0.6863638162612915, "learning_rate": 4.970212020325916e-05, "loss": 1.6686, "step": 5744 }, { "epoch": 1.5097710776102735, "grad_norm": 0.5951880812644958, "learning_rate": 4.9684597862274404e-05, "loss": 1.6547, "step": 5746 }, { "epoch": 1.5102965809439355, "grad_norm": 0.6561357975006104, "learning_rate": 4.9667075521289645e-05, "loss": 1.6984, "step": 5748 }, { "epoch": 1.510822084277597, "grad_norm": 0.5892223715782166, "learning_rate": 4.964955318030489e-05, "loss": 1.6927, "step": 5750 }, { "epoch": 1.511347587611259, "grad_norm": 0.5931475162506104, "learning_rate": 4.963203083932014e-05, "loss": 1.6643, "step": 
5752 }, { "epoch": 1.5118730909449207, "grad_norm": 0.5545600652694702, "learning_rate": 4.961450849833538e-05, "loss": 1.6818, "step": 5754 }, { "epoch": 1.5123985942785825, "grad_norm": 0.606566309928894, "learning_rate": 4.959698615735062e-05, "loss": 1.6676, "step": 5756 }, { "epoch": 1.5129240976122442, "grad_norm": 0.5394220352172852, "learning_rate": 4.957946381636587e-05, "loss": 1.6563, "step": 5758 }, { "epoch": 1.513449600945906, "grad_norm": 0.5019289255142212, "learning_rate": 4.956194147538111e-05, "loss": 1.6536, "step": 5760 }, { "epoch": 1.5139751042795677, "grad_norm": 0.5706535577774048, "learning_rate": 4.954441913439636e-05, "loss": 1.6781, "step": 5762 }, { "epoch": 1.5145006076132295, "grad_norm": 0.7017900347709656, "learning_rate": 4.9526896793411605e-05, "loss": 1.7036, "step": 5764 }, { "epoch": 1.5150261109468914, "grad_norm": 0.627980649471283, "learning_rate": 4.9509374452426846e-05, "loss": 1.7071, "step": 5766 }, { "epoch": 1.515551614280553, "grad_norm": 0.5035885572433472, "learning_rate": 4.949185211144209e-05, "loss": 1.6837, "step": 5768 }, { "epoch": 1.516077117614215, "grad_norm": 0.5663711428642273, "learning_rate": 4.9474329770457334e-05, "loss": 1.6622, "step": 5770 }, { "epoch": 1.5166026209478767, "grad_norm": 0.5828589797019958, "learning_rate": 4.945680742947258e-05, "loss": 1.7211, "step": 5772 }, { "epoch": 1.5171281242815384, "grad_norm": 0.644883930683136, "learning_rate": 4.943928508848782e-05, "loss": 1.6592, "step": 5774 }, { "epoch": 1.5176536276152002, "grad_norm": 0.5231825113296509, "learning_rate": 4.942176274750307e-05, "loss": 1.6896, "step": 5776 }, { "epoch": 1.518179130948862, "grad_norm": 0.6508246064186096, "learning_rate": 4.940424040651832e-05, "loss": 1.709, "step": 5778 }, { "epoch": 1.518704634282524, "grad_norm": 0.7368707656860352, "learning_rate": 4.938671806553356e-05, "loss": 1.6449, "step": 5780 }, { "epoch": 1.5192301376161854, "grad_norm": 0.5632103085517883, "learning_rate": 
4.93691957245488e-05, "loss": 1.6599, "step": 5782 }, { "epoch": 1.5197556409498474, "grad_norm": 0.5499205589294434, "learning_rate": 4.935167338356405e-05, "loss": 1.6798, "step": 5784 }, { "epoch": 1.520281144283509, "grad_norm": 0.6031380295753479, "learning_rate": 4.933415104257929e-05, "loss": 1.7011, "step": 5786 }, { "epoch": 1.5208066476171709, "grad_norm": 0.6366339325904846, "learning_rate": 4.9316628701594535e-05, "loss": 1.6722, "step": 5788 }, { "epoch": 1.5213321509508326, "grad_norm": 0.6513427495956421, "learning_rate": 4.929910636060978e-05, "loss": 1.6998, "step": 5790 }, { "epoch": 1.5218576542844944, "grad_norm": 0.5762157440185547, "learning_rate": 4.9281584019625024e-05, "loss": 1.7176, "step": 5792 }, { "epoch": 1.5223831576181561, "grad_norm": 0.5817427635192871, "learning_rate": 4.9264061678640265e-05, "loss": 1.6686, "step": 5794 }, { "epoch": 1.5229086609518179, "grad_norm": 0.6498470902442932, "learning_rate": 4.924653933765551e-05, "loss": 1.6536, "step": 5796 }, { "epoch": 1.5234341642854798, "grad_norm": 0.5209649205207825, "learning_rate": 4.922901699667076e-05, "loss": 1.6711, "step": 5798 }, { "epoch": 1.5239596676191414, "grad_norm": 0.5542787909507751, "learning_rate": 4.9211494655686e-05, "loss": 1.6931, "step": 5800 }, { "epoch": 1.5244851709528033, "grad_norm": 0.5448528528213501, "learning_rate": 4.919397231470125e-05, "loss": 1.6952, "step": 5802 }, { "epoch": 1.5250106742864649, "grad_norm": 0.6163997054100037, "learning_rate": 4.9176449973716496e-05, "loss": 1.638, "step": 5804 }, { "epoch": 1.5255361776201268, "grad_norm": 0.5389247536659241, "learning_rate": 4.915892763273173e-05, "loss": 1.7243, "step": 5806 }, { "epoch": 1.5260616809537886, "grad_norm": 0.5348747372627258, "learning_rate": 4.914140529174698e-05, "loss": 1.699, "step": 5808 }, { "epoch": 1.5265871842874503, "grad_norm": 0.5715484023094177, "learning_rate": 4.9123882950762225e-05, "loss": 1.6672, "step": 5810 }, { "epoch": 1.527112687621112, 
"grad_norm": 0.64728844165802, "learning_rate": 4.9106360609777465e-05, "loss": 1.66, "step": 5812 }, { "epoch": 1.5276381909547738, "grad_norm": 0.5770621299743652, "learning_rate": 4.908883826879271e-05, "loss": 1.6614, "step": 5814 }, { "epoch": 1.5281636942884358, "grad_norm": 0.9658989310264587, "learning_rate": 4.907131592780796e-05, "loss": 1.6941, "step": 5816 }, { "epoch": 1.5286891976220973, "grad_norm": 0.6207730174064636, "learning_rate": 4.90537935868232e-05, "loss": 1.6654, "step": 5818 }, { "epoch": 1.5292147009557593, "grad_norm": 0.5695872902870178, "learning_rate": 4.903627124583844e-05, "loss": 1.6503, "step": 5820 }, { "epoch": 1.5297402042894208, "grad_norm": 0.7392182946205139, "learning_rate": 4.901874890485369e-05, "loss": 1.6731, "step": 5822 }, { "epoch": 1.5302657076230828, "grad_norm": 0.5596084594726562, "learning_rate": 4.900122656386894e-05, "loss": 1.6816, "step": 5824 }, { "epoch": 1.5307912109567445, "grad_norm": 0.5417515635490417, "learning_rate": 4.898370422288418e-05, "loss": 1.6556, "step": 5826 }, { "epoch": 1.5313167142904063, "grad_norm": 0.6508921980857849, "learning_rate": 4.8966181881899426e-05, "loss": 1.684, "step": 5828 }, { "epoch": 1.531842217624068, "grad_norm": 0.562555193901062, "learning_rate": 4.894865954091467e-05, "loss": 1.6706, "step": 5830 }, { "epoch": 1.5323677209577298, "grad_norm": 0.6852477788925171, "learning_rate": 4.893113719992991e-05, "loss": 1.6604, "step": 5832 }, { "epoch": 1.5328932242913917, "grad_norm": 0.8392126560211182, "learning_rate": 4.8913614858945155e-05, "loss": 1.6496, "step": 5834 }, { "epoch": 1.5334187276250533, "grad_norm": 0.635260283946991, "learning_rate": 4.88960925179604e-05, "loss": 1.6715, "step": 5836 }, { "epoch": 1.5339442309587152, "grad_norm": 0.5033748149871826, "learning_rate": 4.887857017697565e-05, "loss": 1.6391, "step": 5838 }, { "epoch": 1.5344697342923768, "grad_norm": 0.6123384237289429, "learning_rate": 4.886104783599089e-05, "loss": 1.6689, "step": 5840 
}, { "epoch": 1.5349952376260387, "grad_norm": 0.8817549347877502, "learning_rate": 4.884352549500614e-05, "loss": 1.7004, "step": 5842 }, { "epoch": 1.5355207409597005, "grad_norm": 0.5635519027709961, "learning_rate": 4.882600315402138e-05, "loss": 1.6854, "step": 5844 }, { "epoch": 1.5360462442933622, "grad_norm": 0.5673317313194275, "learning_rate": 4.880848081303662e-05, "loss": 1.6538, "step": 5846 }, { "epoch": 1.536571747627024, "grad_norm": 0.5565547347068787, "learning_rate": 4.879095847205187e-05, "loss": 1.6524, "step": 5848 }, { "epoch": 1.5370972509606857, "grad_norm": 0.5782384276390076, "learning_rate": 4.8773436131067115e-05, "loss": 1.6929, "step": 5850 }, { "epoch": 1.5376227542943477, "grad_norm": 0.6354172229766846, "learning_rate": 4.8755913790082356e-05, "loss": 1.6959, "step": 5852 }, { "epoch": 1.5381482576280092, "grad_norm": 0.8500383496284485, "learning_rate": 4.87383914490976e-05, "loss": 1.6716, "step": 5854 }, { "epoch": 1.5386737609616712, "grad_norm": 0.6829351782798767, "learning_rate": 4.872086910811285e-05, "loss": 1.619, "step": 5856 }, { "epoch": 1.5391992642953327, "grad_norm": 0.5141264200210571, "learning_rate": 4.870334676712809e-05, "loss": 1.6637, "step": 5858 }, { "epoch": 1.5397247676289947, "grad_norm": 0.5253787636756897, "learning_rate": 4.868582442614333e-05, "loss": 1.6297, "step": 5860 }, { "epoch": 1.5402502709626564, "grad_norm": 0.5017191171646118, "learning_rate": 4.866830208515858e-05, "loss": 1.6502, "step": 5862 }, { "epoch": 1.5407757742963182, "grad_norm": 0.5245093703269958, "learning_rate": 4.865077974417383e-05, "loss": 1.6876, "step": 5864 }, { "epoch": 1.54130127762998, "grad_norm": 0.6714510321617126, "learning_rate": 4.863325740318907e-05, "loss": 1.6635, "step": 5866 }, { "epoch": 1.5418267809636417, "grad_norm": 0.47801029682159424, "learning_rate": 4.8615735062204316e-05, "loss": 1.673, "step": 5868 }, { "epoch": 1.5423522842973036, "grad_norm": 0.5477654337882996, "learning_rate": 
4.859821272121956e-05, "loss": 1.7127, "step": 5870 }, { "epoch": 1.5428777876309652, "grad_norm": 0.6024754643440247, "learning_rate": 4.85806903802348e-05, "loss": 1.6402, "step": 5872 }, { "epoch": 1.5434032909646271, "grad_norm": 0.5506448149681091, "learning_rate": 4.8563168039250045e-05, "loss": 1.6795, "step": 5874 }, { "epoch": 1.5439287942982887, "grad_norm": 0.6219335794448853, "learning_rate": 4.854564569826529e-05, "loss": 1.7043, "step": 5876 }, { "epoch": 1.5444542976319506, "grad_norm": 0.5376720428466797, "learning_rate": 4.8528123357280533e-05, "loss": 1.6416, "step": 5878 }, { "epoch": 1.5449798009656124, "grad_norm": 0.5363356471061707, "learning_rate": 4.851060101629578e-05, "loss": 1.6787, "step": 5880 }, { "epoch": 1.5455053042992741, "grad_norm": 0.6602938175201416, "learning_rate": 4.849307867531102e-05, "loss": 1.6648, "step": 5882 }, { "epoch": 1.5460308076329359, "grad_norm": 0.5136982798576355, "learning_rate": 4.847555633432627e-05, "loss": 1.6636, "step": 5884 }, { "epoch": 1.5465563109665976, "grad_norm": 0.5922480225563049, "learning_rate": 4.845803399334151e-05, "loss": 1.645, "step": 5886 }, { "epoch": 1.5470818143002596, "grad_norm": 0.6566057205200195, "learning_rate": 4.844051165235676e-05, "loss": 1.7023, "step": 5888 }, { "epoch": 1.5476073176339211, "grad_norm": 0.5478838682174683, "learning_rate": 4.8422989311372005e-05, "loss": 1.6359, "step": 5890 }, { "epoch": 1.548132820967583, "grad_norm": 0.5223366618156433, "learning_rate": 4.8405466970387246e-05, "loss": 1.6827, "step": 5892 }, { "epoch": 1.5486583243012448, "grad_norm": 0.5406147837638855, "learning_rate": 4.8387944629402494e-05, "loss": 1.6776, "step": 5894 }, { "epoch": 1.5491838276349066, "grad_norm": 0.5778117179870605, "learning_rate": 4.8370422288417734e-05, "loss": 1.6862, "step": 5896 }, { "epoch": 1.5497093309685683, "grad_norm": 0.7566986083984375, "learning_rate": 4.8352899947432975e-05, "loss": 1.6777, "step": 5898 }, { "epoch": 1.55023483430223, 
"grad_norm": 0.5816596150398254, "learning_rate": 4.833537760644822e-05, "loss": 1.6782, "step": 5900 }, { "epoch": 1.5507603376358918, "grad_norm": 0.6017654538154602, "learning_rate": 4.831785526546347e-05, "loss": 1.6683, "step": 5902 }, { "epoch": 1.5512858409695536, "grad_norm": 0.6033945679664612, "learning_rate": 4.830033292447871e-05, "loss": 1.6734, "step": 5904 }, { "epoch": 1.5518113443032155, "grad_norm": 0.6159767508506775, "learning_rate": 4.828281058349396e-05, "loss": 1.6499, "step": 5906 }, { "epoch": 1.552336847636877, "grad_norm": 0.5407187938690186, "learning_rate": 4.82652882425092e-05, "loss": 1.6808, "step": 5908 }, { "epoch": 1.552862350970539, "grad_norm": 0.6374317407608032, "learning_rate": 4.824776590152445e-05, "loss": 1.6735, "step": 5910 }, { "epoch": 1.5533878543042008, "grad_norm": 0.6518111228942871, "learning_rate": 4.823024356053969e-05, "loss": 1.6818, "step": 5912 }, { "epoch": 1.5539133576378625, "grad_norm": 0.5397405028343201, "learning_rate": 4.8212721219554935e-05, "loss": 1.6484, "step": 5914 }, { "epoch": 1.5544388609715243, "grad_norm": 0.5633911490440369, "learning_rate": 4.819519887857018e-05, "loss": 1.6438, "step": 5916 }, { "epoch": 1.554964364305186, "grad_norm": 0.5151371359825134, "learning_rate": 4.8177676537585424e-05, "loss": 1.6492, "step": 5918 }, { "epoch": 1.5554898676388478, "grad_norm": 0.5781606435775757, "learning_rate": 4.816015419660067e-05, "loss": 1.6881, "step": 5920 }, { "epoch": 1.5560153709725095, "grad_norm": 0.6180148720741272, "learning_rate": 4.814263185561591e-05, "loss": 1.6847, "step": 5922 }, { "epoch": 1.5565408743061715, "grad_norm": 0.5826863646507263, "learning_rate": 4.812510951463115e-05, "loss": 1.6514, "step": 5924 }, { "epoch": 1.557066377639833, "grad_norm": 0.5639335513114929, "learning_rate": 4.81075871736464e-05, "loss": 1.6146, "step": 5926 }, { "epoch": 1.557591880973495, "grad_norm": 0.5149716734886169, "learning_rate": 4.809006483266165e-05, "loss": 1.7064, "step": 
5928 }, { "epoch": 1.5581173843071567, "grad_norm": 0.5117688179016113, "learning_rate": 4.807254249167689e-05, "loss": 1.6757, "step": 5930 }, { "epoch": 1.5586428876408185, "grad_norm": 0.6055058836936951, "learning_rate": 4.8055020150692136e-05, "loss": 1.6777, "step": 5932 }, { "epoch": 1.5591683909744802, "grad_norm": 0.5205301642417908, "learning_rate": 4.803749780970738e-05, "loss": 1.6599, "step": 5934 }, { "epoch": 1.559693894308142, "grad_norm": 0.5878111124038696, "learning_rate": 4.8019975468722625e-05, "loss": 1.6491, "step": 5936 }, { "epoch": 1.560219397641804, "grad_norm": 0.6508564949035645, "learning_rate": 4.8002453127737865e-05, "loss": 1.7085, "step": 5938 }, { "epoch": 1.5607449009754655, "grad_norm": 0.7473523020744324, "learning_rate": 4.798493078675311e-05, "loss": 1.6765, "step": 5940 }, { "epoch": 1.5612704043091274, "grad_norm": 0.5754930377006531, "learning_rate": 4.796740844576836e-05, "loss": 1.6825, "step": 5942 }, { "epoch": 1.561795907642789, "grad_norm": 0.5699328780174255, "learning_rate": 4.79498861047836e-05, "loss": 1.6851, "step": 5944 }, { "epoch": 1.562321410976451, "grad_norm": 0.5697868466377258, "learning_rate": 4.793236376379884e-05, "loss": 1.644, "step": 5946 }, { "epoch": 1.5628469143101127, "grad_norm": 0.5635419487953186, "learning_rate": 4.791484142281409e-05, "loss": 1.6765, "step": 5948 }, { "epoch": 1.5633724176437744, "grad_norm": 0.7371823191642761, "learning_rate": 4.789731908182933e-05, "loss": 1.6876, "step": 5950 }, { "epoch": 1.5638979209774362, "grad_norm": 0.4995562434196472, "learning_rate": 4.787979674084458e-05, "loss": 1.6593, "step": 5952 }, { "epoch": 1.564423424311098, "grad_norm": 0.5490265488624573, "learning_rate": 4.7862274399859826e-05, "loss": 1.6424, "step": 5954 }, { "epoch": 1.56494892764476, "grad_norm": 0.5627312064170837, "learning_rate": 4.7844752058875066e-05, "loss": 1.6849, "step": 5956 }, { "epoch": 1.5654744309784214, "grad_norm": 0.5825894474983215, "learning_rate": 
4.7827229717890314e-05, "loss": 1.6714, "step": 5958 }, { "epoch": 1.5659999343120834, "grad_norm": 0.6863036751747131, "learning_rate": 4.7809707376905555e-05, "loss": 1.69, "step": 5960 }, { "epoch": 1.566525437645745, "grad_norm": 0.6272795796394348, "learning_rate": 4.77921850359208e-05, "loss": 1.6536, "step": 5962 }, { "epoch": 1.567050940979407, "grad_norm": 0.6180011630058289, "learning_rate": 4.777466269493604e-05, "loss": 1.6975, "step": 5964 }, { "epoch": 1.5675764443130686, "grad_norm": 0.5767374038696289, "learning_rate": 4.775714035395129e-05, "loss": 1.6504, "step": 5966 }, { "epoch": 1.5681019476467304, "grad_norm": 0.5190562009811401, "learning_rate": 4.773961801296654e-05, "loss": 1.6131, "step": 5968 }, { "epoch": 1.5686274509803921, "grad_norm": 0.6157041788101196, "learning_rate": 4.772209567198178e-05, "loss": 1.6916, "step": 5970 }, { "epoch": 1.5691529543140539, "grad_norm": 0.5237650275230408, "learning_rate": 4.770457333099702e-05, "loss": 1.6746, "step": 5972 }, { "epoch": 1.5696784576477159, "grad_norm": 0.5459091663360596, "learning_rate": 4.768705099001227e-05, "loss": 1.6643, "step": 5974 }, { "epoch": 1.5702039609813774, "grad_norm": 0.5838450193405151, "learning_rate": 4.7669528649027515e-05, "loss": 1.6745, "step": 5976 }, { "epoch": 1.5707294643150393, "grad_norm": 0.5081990361213684, "learning_rate": 4.7652006308042756e-05, "loss": 1.6876, "step": 5978 }, { "epoch": 1.5712549676487009, "grad_norm": 0.6588307619094849, "learning_rate": 4.7634483967058e-05, "loss": 1.7017, "step": 5980 }, { "epoch": 1.5717804709823628, "grad_norm": 0.8295912146568298, "learning_rate": 4.761696162607325e-05, "loss": 1.6709, "step": 5982 }, { "epoch": 1.5723059743160246, "grad_norm": 0.6136393547058105, "learning_rate": 4.7599439285088485e-05, "loss": 1.6989, "step": 5984 }, { "epoch": 1.5728314776496863, "grad_norm": 0.5983635187149048, "learning_rate": 4.758191694410373e-05, "loss": 1.6786, "step": 5986 }, { "epoch": 1.573356980983348, "grad_norm": 
0.6584506630897522, "learning_rate": 4.756439460311898e-05, "loss": 1.6549, "step": 5988 }, { "epoch": 1.5738824843170098, "grad_norm": 0.513239860534668, "learning_rate": 4.754687226213422e-05, "loss": 1.683, "step": 5990 }, { "epoch": 1.5744079876506718, "grad_norm": 0.5512750744819641, "learning_rate": 4.752934992114947e-05, "loss": 1.703, "step": 5992 }, { "epoch": 1.5749334909843333, "grad_norm": 0.7697680592536926, "learning_rate": 4.7511827580164716e-05, "loss": 1.6361, "step": 5994 }, { "epoch": 1.5754589943179953, "grad_norm": 0.5221796035766602, "learning_rate": 4.749430523917996e-05, "loss": 1.6525, "step": 5996 }, { "epoch": 1.5759844976516568, "grad_norm": 0.5528659820556641, "learning_rate": 4.74767828981952e-05, "loss": 1.6904, "step": 5998 }, { "epoch": 1.5765100009853188, "grad_norm": 0.6376633644104004, "learning_rate": 4.7459260557210445e-05, "loss": 1.6849, "step": 6000 }, { "epoch": 1.5765100009853188, "eval_loss": 1.6817389726638794, "eval_runtime": 487.2585, "eval_samples_per_second": 249.947, "eval_steps_per_second": 31.244, "step": 6000 }, { "epoch": 1.5770355043189805, "grad_norm": 0.5638622641563416, "learning_rate": 4.744173821622569e-05, "loss": 1.6566, "step": 6002 }, { "epoch": 1.5775610076526423, "grad_norm": 0.6327787637710571, "learning_rate": 4.7424215875240933e-05, "loss": 1.6806, "step": 6004 }, { "epoch": 1.578086510986304, "grad_norm": 0.6774953007698059, "learning_rate": 4.740669353425618e-05, "loss": 1.6806, "step": 6006 }, { "epoch": 1.5786120143199658, "grad_norm": 0.6806586980819702, "learning_rate": 4.738917119327143e-05, "loss": 1.6435, "step": 6008 }, { "epoch": 1.5791375176536278, "grad_norm": 0.5648463368415833, "learning_rate": 4.737164885228666e-05, "loss": 1.7014, "step": 6010 }, { "epoch": 1.5796630209872893, "grad_norm": 0.5950681567192078, "learning_rate": 4.735412651130191e-05, "loss": 1.6408, "step": 6012 }, { "epoch": 1.5801885243209512, "grad_norm": 0.6674718856811523, "learning_rate": 
4.733660417031716e-05, "loss": 1.6451, "step": 6014 }, { "epoch": 1.5807140276546128, "grad_norm": 0.8182973265647888, "learning_rate": 4.73190818293324e-05, "loss": 1.6913, "step": 6016 }, { "epoch": 1.5812395309882747, "grad_norm": 0.5322865843772888, "learning_rate": 4.7301559488347646e-05, "loss": 1.7063, "step": 6018 }, { "epoch": 1.5817650343219365, "grad_norm": 0.5284510850906372, "learning_rate": 4.7284037147362894e-05, "loss": 1.6556, "step": 6020 }, { "epoch": 1.5822905376555982, "grad_norm": 0.6167967915534973, "learning_rate": 4.7266514806378134e-05, "loss": 1.6455, "step": 6022 }, { "epoch": 1.58281604098926, "grad_norm": 0.5635978579521179, "learning_rate": 4.7248992465393375e-05, "loss": 1.6768, "step": 6024 }, { "epoch": 1.5833415443229217, "grad_norm": 0.6442639231681824, "learning_rate": 4.723147012440862e-05, "loss": 1.6707, "step": 6026 }, { "epoch": 1.5838670476565837, "grad_norm": 0.5623191595077515, "learning_rate": 4.721394778342387e-05, "loss": 1.711, "step": 6028 }, { "epoch": 1.5843925509902452, "grad_norm": 0.5593612194061279, "learning_rate": 4.719642544243911e-05, "loss": 1.6713, "step": 6030 }, { "epoch": 1.5849180543239072, "grad_norm": 0.7203925251960754, "learning_rate": 4.717890310145436e-05, "loss": 1.6842, "step": 6032 }, { "epoch": 1.5854435576575687, "grad_norm": 0.5466020703315735, "learning_rate": 4.7161380760469606e-05, "loss": 1.6921, "step": 6034 }, { "epoch": 1.5859690609912307, "grad_norm": 0.5434938669204712, "learning_rate": 4.714385841948484e-05, "loss": 1.6605, "step": 6036 }, { "epoch": 1.5864945643248924, "grad_norm": 0.5552278757095337, "learning_rate": 4.712633607850009e-05, "loss": 1.707, "step": 6038 }, { "epoch": 1.5870200676585542, "grad_norm": 0.6083989143371582, "learning_rate": 4.7108813737515335e-05, "loss": 1.6888, "step": 6040 }, { "epoch": 1.587545570992216, "grad_norm": 0.5356237292289734, "learning_rate": 4.7091291396530576e-05, "loss": 1.6478, "step": 6042 }, { "epoch": 1.5880710743258777, 
"grad_norm": 0.4878697097301483, "learning_rate": 4.7073769055545824e-05, "loss": 1.6558, "step": 6044 }, { "epoch": 1.5885965776595397, "grad_norm": 0.5412904620170593, "learning_rate": 4.705624671456107e-05, "loss": 1.6763, "step": 6046 }, { "epoch": 1.5891220809932012, "grad_norm": 0.5351582765579224, "learning_rate": 4.703872437357631e-05, "loss": 1.6891, "step": 6048 }, { "epoch": 1.5896475843268632, "grad_norm": 0.5699527859687805, "learning_rate": 4.702120203259155e-05, "loss": 1.6841, "step": 6050 }, { "epoch": 1.590173087660525, "grad_norm": 0.5373657941818237, "learning_rate": 4.70036796916068e-05, "loss": 1.6589, "step": 6052 }, { "epoch": 1.5906985909941866, "grad_norm": 0.5621985197067261, "learning_rate": 4.698615735062205e-05, "loss": 1.6845, "step": 6054 }, { "epoch": 1.5912240943278484, "grad_norm": 0.5501397848129272, "learning_rate": 4.696863500963729e-05, "loss": 1.6752, "step": 6056 }, { "epoch": 1.5917495976615101, "grad_norm": 0.6588435173034668, "learning_rate": 4.6951112668652536e-05, "loss": 1.6727, "step": 6058 }, { "epoch": 1.5922751009951719, "grad_norm": 0.5301753282546997, "learning_rate": 4.6933590327667784e-05, "loss": 1.6811, "step": 6060 }, { "epoch": 1.5928006043288336, "grad_norm": 0.5546782612800598, "learning_rate": 4.691606798668302e-05, "loss": 1.6298, "step": 6062 }, { "epoch": 1.5933261076624956, "grad_norm": 0.6335771679878235, "learning_rate": 4.6898545645698265e-05, "loss": 1.6606, "step": 6064 }, { "epoch": 1.5938516109961571, "grad_norm": 0.5431744456291199, "learning_rate": 4.688102330471351e-05, "loss": 1.6444, "step": 6066 }, { "epoch": 1.594377114329819, "grad_norm": 0.6626846790313721, "learning_rate": 4.6863500963728754e-05, "loss": 1.6857, "step": 6068 }, { "epoch": 1.5949026176634808, "grad_norm": 0.5748207569122314, "learning_rate": 4.6845978622744e-05, "loss": 1.6811, "step": 6070 }, { "epoch": 1.5954281209971426, "grad_norm": 0.6388061046600342, "learning_rate": 4.682845628175925e-05, "loss": 1.6696, 
"step": 6072 }, { "epoch": 1.5959536243308043, "grad_norm": 0.5904421210289001, "learning_rate": 4.681093394077449e-05, "loss": 1.6872, "step": 6074 }, { "epoch": 1.596479127664466, "grad_norm": 0.5612810254096985, "learning_rate": 4.679341159978973e-05, "loss": 1.7049, "step": 6076 }, { "epoch": 1.597004630998128, "grad_norm": 0.5887944102287292, "learning_rate": 4.677588925880498e-05, "loss": 1.6556, "step": 6078 }, { "epoch": 1.5975301343317896, "grad_norm": 0.5331344604492188, "learning_rate": 4.6758366917820226e-05, "loss": 1.6706, "step": 6080 }, { "epoch": 1.5980556376654516, "grad_norm": 0.6311489939689636, "learning_rate": 4.6740844576835466e-05, "loss": 1.6893, "step": 6082 }, { "epoch": 1.598581140999113, "grad_norm": 0.5310975313186646, "learning_rate": 4.6723322235850714e-05, "loss": 1.6057, "step": 6084 }, { "epoch": 1.599106644332775, "grad_norm": 0.5268464684486389, "learning_rate": 4.6705799894865955e-05, "loss": 1.6649, "step": 6086 }, { "epoch": 1.5996321476664368, "grad_norm": 0.5862019658088684, "learning_rate": 4.66882775538812e-05, "loss": 1.6631, "step": 6088 }, { "epoch": 1.6001576510000985, "grad_norm": 0.5458804368972778, "learning_rate": 4.667075521289644e-05, "loss": 1.671, "step": 6090 }, { "epoch": 1.6006831543337603, "grad_norm": 0.5199334025382996, "learning_rate": 4.665323287191169e-05, "loss": 1.6498, "step": 6092 }, { "epoch": 1.601208657667422, "grad_norm": 0.5879623293876648, "learning_rate": 4.663571053092694e-05, "loss": 1.667, "step": 6094 }, { "epoch": 1.601734161001084, "grad_norm": 0.6247827410697937, "learning_rate": 4.661818818994218e-05, "loss": 1.7004, "step": 6096 }, { "epoch": 1.6022596643347455, "grad_norm": 0.6340193152427673, "learning_rate": 4.6600665848957427e-05, "loss": 1.6622, "step": 6098 }, { "epoch": 1.6027851676684075, "grad_norm": 0.6243396401405334, "learning_rate": 4.658314350797267e-05, "loss": 1.6983, "step": 6100 }, { "epoch": 1.603310671002069, "grad_norm": 0.584221601486206, "learning_rate": 
4.656562116698791e-05, "loss": 1.6548, "step": 6102 }, { "epoch": 1.603836174335731, "grad_norm": 0.5587270259857178, "learning_rate": 4.6548098826003156e-05, "loss": 1.6482, "step": 6104 }, { "epoch": 1.6043616776693927, "grad_norm": 0.8525761365890503, "learning_rate": 4.65305764850184e-05, "loss": 1.6601, "step": 6106 }, { "epoch": 1.6048871810030545, "grad_norm": 0.5316641330718994, "learning_rate": 4.6513054144033644e-05, "loss": 1.6809, "step": 6108 }, { "epoch": 1.6054126843367162, "grad_norm": 0.5327313542366028, "learning_rate": 4.649553180304889e-05, "loss": 1.6303, "step": 6110 }, { "epoch": 1.605938187670378, "grad_norm": 0.5327521562576294, "learning_rate": 4.647800946206413e-05, "loss": 1.6875, "step": 6112 }, { "epoch": 1.60646369100404, "grad_norm": 0.7889205813407898, "learning_rate": 4.646048712107938e-05, "loss": 1.7043, "step": 6114 }, { "epoch": 1.6069891943377015, "grad_norm": 0.6641364097595215, "learning_rate": 4.644296478009462e-05, "loss": 1.7062, "step": 6116 }, { "epoch": 1.6075146976713635, "grad_norm": 0.675348162651062, "learning_rate": 4.642544243910987e-05, "loss": 1.6911, "step": 6118 }, { "epoch": 1.608040201005025, "grad_norm": 0.541476845741272, "learning_rate": 4.6407920098125116e-05, "loss": 1.6667, "step": 6120 }, { "epoch": 1.608565704338687, "grad_norm": 0.5840083360671997, "learning_rate": 4.639039775714036e-05, "loss": 1.652, "step": 6122 }, { "epoch": 1.6090912076723487, "grad_norm": 0.5409373641014099, "learning_rate": 4.6372875416155604e-05, "loss": 1.7031, "step": 6124 }, { "epoch": 1.6096167110060104, "grad_norm": 0.5161097049713135, "learning_rate": 4.6355353075170845e-05, "loss": 1.6873, "step": 6126 }, { "epoch": 1.6101422143396722, "grad_norm": 0.6245579719543457, "learning_rate": 4.6337830734186086e-05, "loss": 1.6652, "step": 6128 }, { "epoch": 1.610667717673334, "grad_norm": 0.60563725233078, "learning_rate": 4.6320308393201333e-05, "loss": 1.6783, "step": 6130 }, { "epoch": 1.611193221006996, "grad_norm": 
0.667809009552002, "learning_rate": 4.630278605221658e-05, "loss": 1.6492, "step": 6132 }, { "epoch": 1.6117187243406574, "grad_norm": 0.5152156352996826, "learning_rate": 4.628526371123182e-05, "loss": 1.6757, "step": 6134 }, { "epoch": 1.6122442276743194, "grad_norm": 0.5245918035507202, "learning_rate": 4.626774137024707e-05, "loss": 1.6368, "step": 6136 }, { "epoch": 1.612769731007981, "grad_norm": 0.5406076908111572, "learning_rate": 4.625021902926231e-05, "loss": 1.6817, "step": 6138 }, { "epoch": 1.613295234341643, "grad_norm": 0.6298016905784607, "learning_rate": 4.623269668827756e-05, "loss": 1.6542, "step": 6140 }, { "epoch": 1.6138207376753047, "grad_norm": 0.5466469526290894, "learning_rate": 4.62151743472928e-05, "loss": 1.6367, "step": 6142 }, { "epoch": 1.6143462410089664, "grad_norm": 0.7066838145256042, "learning_rate": 4.6197652006308046e-05, "loss": 1.6461, "step": 6144 }, { "epoch": 1.6148717443426281, "grad_norm": 0.5102798938751221, "learning_rate": 4.6180129665323294e-05, "loss": 1.6628, "step": 6146 }, { "epoch": 1.61539724767629, "grad_norm": 0.72835773229599, "learning_rate": 4.6162607324338534e-05, "loss": 1.7141, "step": 6148 }, { "epoch": 1.6159227510099519, "grad_norm": 0.5149089097976685, "learning_rate": 4.6145084983353775e-05, "loss": 1.6539, "step": 6150 }, { "epoch": 1.6164482543436134, "grad_norm": 0.5791059732437134, "learning_rate": 4.612756264236902e-05, "loss": 1.6791, "step": 6152 }, { "epoch": 1.6169737576772754, "grad_norm": 0.5340200662612915, "learning_rate": 4.6110040301384263e-05, "loss": 1.6807, "step": 6154 }, { "epoch": 1.6174992610109369, "grad_norm": 0.5943130254745483, "learning_rate": 4.609251796039951e-05, "loss": 1.662, "step": 6156 }, { "epoch": 1.6180247643445989, "grad_norm": 0.5829207897186279, "learning_rate": 4.607499561941476e-05, "loss": 1.67, "step": 6158 }, { "epoch": 1.6185502676782606, "grad_norm": 0.5549664497375488, "learning_rate": 4.605747327843e-05, "loss": 1.6686, "step": 6160 }, { "epoch": 
1.6190757710119223, "grad_norm": 0.6359058618545532, "learning_rate": 4.603995093744525e-05, "loss": 1.6694, "step": 6162 }, { "epoch": 1.619601274345584, "grad_norm": 0.5652121305465698, "learning_rate": 4.602242859646049e-05, "loss": 1.6678, "step": 6164 }, { "epoch": 1.6201267776792458, "grad_norm": 0.5643856525421143, "learning_rate": 4.6004906255475735e-05, "loss": 1.6915, "step": 6166 }, { "epoch": 1.6206522810129078, "grad_norm": 0.5165106058120728, "learning_rate": 4.5987383914490976e-05, "loss": 1.6643, "step": 6168 }, { "epoch": 1.6211777843465693, "grad_norm": 0.6091976761817932, "learning_rate": 4.5969861573506224e-05, "loss": 1.6878, "step": 6170 }, { "epoch": 1.6217032876802313, "grad_norm": 0.5565605759620667, "learning_rate": 4.595233923252147e-05, "loss": 1.6843, "step": 6172 }, { "epoch": 1.6222287910138928, "grad_norm": 0.635827362537384, "learning_rate": 4.593481689153671e-05, "loss": 1.6554, "step": 6174 }, { "epoch": 1.6227542943475548, "grad_norm": 0.6421988010406494, "learning_rate": 4.591729455055195e-05, "loss": 1.6298, "step": 6176 }, { "epoch": 1.6232797976812166, "grad_norm": 0.7460888624191284, "learning_rate": 4.58997722095672e-05, "loss": 1.7074, "step": 6178 }, { "epoch": 1.6238053010148783, "grad_norm": 0.5302199721336365, "learning_rate": 4.588224986858244e-05, "loss": 1.6546, "step": 6180 }, { "epoch": 1.62433080434854, "grad_norm": 0.5736710429191589, "learning_rate": 4.586472752759769e-05, "loss": 1.6816, "step": 6182 }, { "epoch": 1.6248563076822018, "grad_norm": 0.6789550185203552, "learning_rate": 4.5847205186612936e-05, "loss": 1.6643, "step": 6184 }, { "epoch": 1.6253818110158638, "grad_norm": 0.5486941337585449, "learning_rate": 4.582968284562818e-05, "loss": 1.68, "step": 6186 }, { "epoch": 1.6259073143495253, "grad_norm": 0.8008251786231995, "learning_rate": 4.581216050464342e-05, "loss": 1.6386, "step": 6188 }, { "epoch": 1.6264328176831873, "grad_norm": 0.6284978985786438, "learning_rate": 4.5794638163658665e-05, 
"loss": 1.6497, "step": 6190 }, { "epoch": 1.6269583210168488, "grad_norm": 0.5751000642776489, "learning_rate": 4.577711582267391e-05, "loss": 1.6897, "step": 6192 }, { "epoch": 1.6274838243505108, "grad_norm": 0.5420372486114502, "learning_rate": 4.5759593481689154e-05, "loss": 1.6936, "step": 6194 }, { "epoch": 1.6280093276841725, "grad_norm": 0.5770832896232605, "learning_rate": 4.57420711407044e-05, "loss": 1.6682, "step": 6196 }, { "epoch": 1.6285348310178343, "grad_norm": 0.5784463286399841, "learning_rate": 4.572454879971965e-05, "loss": 1.6884, "step": 6198 }, { "epoch": 1.629060334351496, "grad_norm": 0.5765862464904785, "learning_rate": 4.570702645873489e-05, "loss": 1.6228, "step": 6200 }, { "epoch": 1.6295858376851577, "grad_norm": 0.6257441639900208, "learning_rate": 4.568950411775013e-05, "loss": 1.6559, "step": 6202 }, { "epoch": 1.6301113410188197, "grad_norm": 0.5480515360832214, "learning_rate": 4.567198177676538e-05, "loss": 1.6524, "step": 6204 }, { "epoch": 1.6306368443524812, "grad_norm": 0.5308948755264282, "learning_rate": 4.565445943578062e-05, "loss": 1.6954, "step": 6206 }, { "epoch": 1.6311623476861432, "grad_norm": 0.640443742275238, "learning_rate": 4.5636937094795866e-05, "loss": 1.6434, "step": 6208 }, { "epoch": 1.631687851019805, "grad_norm": 0.5399090647697449, "learning_rate": 4.5619414753811114e-05, "loss": 1.6317, "step": 6210 }, { "epoch": 1.6322133543534667, "grad_norm": 0.5155448913574219, "learning_rate": 4.5601892412826355e-05, "loss": 1.6258, "step": 6212 }, { "epoch": 1.6327388576871285, "grad_norm": 0.540034830570221, "learning_rate": 4.5584370071841596e-05, "loss": 1.6592, "step": 6214 }, { "epoch": 1.6332643610207902, "grad_norm": 0.6191601157188416, "learning_rate": 4.556684773085684e-05, "loss": 1.6188, "step": 6216 }, { "epoch": 1.633789864354452, "grad_norm": 0.5206524133682251, "learning_rate": 4.554932538987209e-05, "loss": 1.667, "step": 6218 }, { "epoch": 1.6343153676881137, "grad_norm": 0.5997523069381714, 
"learning_rate": 4.553180304888733e-05, "loss": 1.7165, "step": 6220 }, { "epoch": 1.6348408710217757, "grad_norm": 0.6088078618049622, "learning_rate": 4.551428070790258e-05, "loss": 1.65, "step": 6222 }, { "epoch": 1.6353663743554372, "grad_norm": 0.5121122598648071, "learning_rate": 4.5496758366917827e-05, "loss": 1.6661, "step": 6224 }, { "epoch": 1.6358918776890992, "grad_norm": 0.5405161380767822, "learning_rate": 4.547923602593307e-05, "loss": 1.6812, "step": 6226 }, { "epoch": 1.636417381022761, "grad_norm": 0.5181125402450562, "learning_rate": 4.546171368494831e-05, "loss": 1.6888, "step": 6228 }, { "epoch": 1.6369428843564227, "grad_norm": 0.7036988735198975, "learning_rate": 4.5444191343963556e-05, "loss": 1.7048, "step": 6230 }, { "epoch": 1.6374683876900844, "grad_norm": 0.559990644454956, "learning_rate": 4.54266690029788e-05, "loss": 1.6745, "step": 6232 }, { "epoch": 1.6379938910237462, "grad_norm": 0.5458931922912598, "learning_rate": 4.5409146661994044e-05, "loss": 1.6668, "step": 6234 }, { "epoch": 1.6385193943574081, "grad_norm": 0.6637448072433472, "learning_rate": 4.539162432100929e-05, "loss": 1.7194, "step": 6236 }, { "epoch": 1.6390448976910696, "grad_norm": 0.5551019310951233, "learning_rate": 4.537410198002454e-05, "loss": 1.6846, "step": 6238 }, { "epoch": 1.6395704010247316, "grad_norm": 0.6413047909736633, "learning_rate": 4.535657963903977e-05, "loss": 1.6505, "step": 6240 }, { "epoch": 1.6400959043583931, "grad_norm": 0.5083743929862976, "learning_rate": 4.533905729805502e-05, "loss": 1.6929, "step": 6242 }, { "epoch": 1.6406214076920551, "grad_norm": 0.608717679977417, "learning_rate": 4.532153495707027e-05, "loss": 1.6767, "step": 6244 }, { "epoch": 1.6411469110257169, "grad_norm": 0.5961493253707886, "learning_rate": 4.530401261608551e-05, "loss": 1.6434, "step": 6246 }, { "epoch": 1.6416724143593786, "grad_norm": 0.5599090456962585, "learning_rate": 4.528649027510076e-05, "loss": 1.6774, "step": 6248 }, { "epoch": 
1.6421979176930404, "grad_norm": 0.5643408298492432, "learning_rate": 4.5268967934116004e-05, "loss": 1.6631, "step": 6250 }, { "epoch": 1.642723421026702, "grad_norm": 0.5354277491569519, "learning_rate": 4.5251445593131245e-05, "loss": 1.6685, "step": 6252 }, { "epoch": 1.643248924360364, "grad_norm": 0.6432374119758606, "learning_rate": 4.5233923252146486e-05, "loss": 1.6746, "step": 6254 }, { "epoch": 1.6437744276940256, "grad_norm": 0.5566967725753784, "learning_rate": 4.521640091116173e-05, "loss": 1.6512, "step": 6256 }, { "epoch": 1.6442999310276876, "grad_norm": 0.517951488494873, "learning_rate": 4.519887857017698e-05, "loss": 1.6485, "step": 6258 }, { "epoch": 1.644825434361349, "grad_norm": 0.5770967602729797, "learning_rate": 4.518135622919222e-05, "loss": 1.6786, "step": 6260 }, { "epoch": 1.645350937695011, "grad_norm": 0.5681661367416382, "learning_rate": 4.516383388820747e-05, "loss": 1.6464, "step": 6262 }, { "epoch": 1.6458764410286728, "grad_norm": 0.6962856650352478, "learning_rate": 4.514631154722272e-05, "loss": 1.6435, "step": 6264 }, { "epoch": 1.6464019443623346, "grad_norm": 0.5683899521827698, "learning_rate": 4.512878920623795e-05, "loss": 1.6862, "step": 6266 }, { "epoch": 1.6469274476959963, "grad_norm": 0.6042145490646362, "learning_rate": 4.51112668652532e-05, "loss": 1.6574, "step": 6268 }, { "epoch": 1.647452951029658, "grad_norm": 0.5737462639808655, "learning_rate": 4.5093744524268446e-05, "loss": 1.6785, "step": 6270 }, { "epoch": 1.64797845436332, "grad_norm": 0.6028643846511841, "learning_rate": 4.507622218328369e-05, "loss": 1.6568, "step": 6272 }, { "epoch": 1.6485039576969815, "grad_norm": 0.6065294742584229, "learning_rate": 4.5058699842298934e-05, "loss": 1.6882, "step": 6274 }, { "epoch": 1.6490294610306435, "grad_norm": 0.5672723054885864, "learning_rate": 4.504117750131418e-05, "loss": 1.619, "step": 6276 }, { "epoch": 1.649554964364305, "grad_norm": 0.727794885635376, "learning_rate": 4.502365516032942e-05, "loss": 
1.6583, "step": 6278 }, { "epoch": 1.650080467697967, "grad_norm": 0.6044321060180664, "learning_rate": 4.5006132819344663e-05, "loss": 1.6795, "step": 6280 }, { "epoch": 1.6506059710316288, "grad_norm": 0.535193145275116, "learning_rate": 4.498861047835991e-05, "loss": 1.6772, "step": 6282 }, { "epoch": 1.6511314743652905, "grad_norm": 0.651655375957489, "learning_rate": 4.497108813737516e-05, "loss": 1.6993, "step": 6284 }, { "epoch": 1.6516569776989523, "grad_norm": 0.6530981659889221, "learning_rate": 4.49535657963904e-05, "loss": 1.6768, "step": 6286 }, { "epoch": 1.652182481032614, "grad_norm": 0.6580101251602173, "learning_rate": 4.493604345540565e-05, "loss": 1.695, "step": 6288 }, { "epoch": 1.652707984366276, "grad_norm": 0.6548781991004944, "learning_rate": 4.491852111442089e-05, "loss": 1.682, "step": 6290 }, { "epoch": 1.6532334876999375, "grad_norm": 0.5895609259605408, "learning_rate": 4.490099877343613e-05, "loss": 1.703, "step": 6292 }, { "epoch": 1.6537589910335995, "grad_norm": 0.5485368371009827, "learning_rate": 4.4883476432451376e-05, "loss": 1.686, "step": 6294 }, { "epoch": 1.654284494367261, "grad_norm": 0.6156832575798035, "learning_rate": 4.4865954091466624e-05, "loss": 1.68, "step": 6296 }, { "epoch": 1.654809997700923, "grad_norm": 0.5617910027503967, "learning_rate": 4.4848431750481864e-05, "loss": 1.6528, "step": 6298 }, { "epoch": 1.6553355010345847, "grad_norm": 0.568001925945282, "learning_rate": 4.483090940949711e-05, "loss": 1.6656, "step": 6300 }, { "epoch": 1.6558610043682465, "grad_norm": 0.5492426156997681, "learning_rate": 4.481338706851236e-05, "loss": 1.669, "step": 6302 }, { "epoch": 1.6563865077019082, "grad_norm": 0.5638085007667542, "learning_rate": 4.47958647275276e-05, "loss": 1.7048, "step": 6304 }, { "epoch": 1.65691201103557, "grad_norm": 0.5807180404663086, "learning_rate": 4.477834238654284e-05, "loss": 1.6495, "step": 6306 }, { "epoch": 1.657437514369232, "grad_norm": 0.6146125197410583, "learning_rate": 
4.476082004555809e-05, "loss": 1.7158, "step": 6308 }, { "epoch": 1.6579630177028934, "grad_norm": 0.5622976422309875, "learning_rate": 4.4743297704573336e-05, "loss": 1.6907, "step": 6310 }, { "epoch": 1.6584885210365554, "grad_norm": 0.49633076786994934, "learning_rate": 4.472577536358858e-05, "loss": 1.6695, "step": 6312 }, { "epoch": 1.659014024370217, "grad_norm": 0.5975070595741272, "learning_rate": 4.4708253022603825e-05, "loss": 1.7011, "step": 6314 }, { "epoch": 1.659539527703879, "grad_norm": 0.6197476983070374, "learning_rate": 4.4690730681619065e-05, "loss": 1.6522, "step": 6316 }, { "epoch": 1.6600650310375407, "grad_norm": 0.6124839782714844, "learning_rate": 4.4673208340634306e-05, "loss": 1.7181, "step": 6318 }, { "epoch": 1.6605905343712024, "grad_norm": 0.5166561007499695, "learning_rate": 4.4655685999649554e-05, "loss": 1.6536, "step": 6320 }, { "epoch": 1.6611160377048642, "grad_norm": 0.5568446516990662, "learning_rate": 4.46381636586648e-05, "loss": 1.682, "step": 6322 }, { "epoch": 1.661641541038526, "grad_norm": 0.5822721719741821, "learning_rate": 4.462064131768004e-05, "loss": 1.6657, "step": 6324 }, { "epoch": 1.6621670443721879, "grad_norm": 0.6438559293746948, "learning_rate": 4.460311897669529e-05, "loss": 1.6795, "step": 6326 }, { "epoch": 1.6626925477058494, "grad_norm": 0.7065990567207336, "learning_rate": 4.458559663571053e-05, "loss": 1.6618, "step": 6328 }, { "epoch": 1.6632180510395114, "grad_norm": 0.512935996055603, "learning_rate": 4.456807429472578e-05, "loss": 1.6742, "step": 6330 }, { "epoch": 1.663743554373173, "grad_norm": 0.7639873027801514, "learning_rate": 4.455055195374102e-05, "loss": 1.6593, "step": 6332 }, { "epoch": 1.6642690577068349, "grad_norm": 0.5977439880371094, "learning_rate": 4.4533029612756266e-05, "loss": 1.6691, "step": 6334 }, { "epoch": 1.6647945610404966, "grad_norm": 0.5824545621871948, "learning_rate": 4.4515507271771514e-05, "loss": 1.6571, "step": 6336 }, { "epoch": 1.6653200643741584, 
"grad_norm": 0.5170328617095947, "learning_rate": 4.4497984930786755e-05, "loss": 1.676, "step": 6338 }, { "epoch": 1.66584556770782, "grad_norm": 0.6465065479278564, "learning_rate": 4.4480462589802e-05, "loss": 1.6516, "step": 6340 }, { "epoch": 1.6663710710414819, "grad_norm": 0.639741837978363, "learning_rate": 4.446294024881724e-05, "loss": 1.6696, "step": 6342 }, { "epoch": 1.6668965743751438, "grad_norm": 0.5170881152153015, "learning_rate": 4.444541790783249e-05, "loss": 1.6353, "step": 6344 }, { "epoch": 1.6674220777088054, "grad_norm": 0.5460655689239502, "learning_rate": 4.442789556684773e-05, "loss": 1.6869, "step": 6346 }, { "epoch": 1.6679475810424673, "grad_norm": 0.5527986288070679, "learning_rate": 4.441037322586298e-05, "loss": 1.6261, "step": 6348 }, { "epoch": 1.6684730843761288, "grad_norm": 0.5400204062461853, "learning_rate": 4.4392850884878227e-05, "loss": 1.6559, "step": 6350 }, { "epoch": 1.6689985877097908, "grad_norm": 0.5666835904121399, "learning_rate": 4.437532854389347e-05, "loss": 1.7025, "step": 6352 }, { "epoch": 1.6695240910434526, "grad_norm": 0.5993382930755615, "learning_rate": 4.435780620290871e-05, "loss": 1.676, "step": 6354 }, { "epoch": 1.6700495943771143, "grad_norm": 0.5372394919395447, "learning_rate": 4.4340283861923956e-05, "loss": 1.6549, "step": 6356 }, { "epoch": 1.670575097710776, "grad_norm": 0.6082696914672852, "learning_rate": 4.4322761520939196e-05, "loss": 1.6643, "step": 6358 }, { "epoch": 1.6711006010444378, "grad_norm": 0.6554064154624939, "learning_rate": 4.4305239179954444e-05, "loss": 1.692, "step": 6360 }, { "epoch": 1.6716261043780998, "grad_norm": 0.4961806833744049, "learning_rate": 4.428771683896969e-05, "loss": 1.6921, "step": 6362 }, { "epoch": 1.6721516077117613, "grad_norm": 0.5501666069030762, "learning_rate": 4.427019449798493e-05, "loss": 1.678, "step": 6364 }, { "epoch": 1.6726771110454233, "grad_norm": 0.5967716574668884, "learning_rate": 4.425267215700018e-05, "loss": 1.6548, "step": 
6366 }, { "epoch": 1.673202614379085, "grad_norm": 0.654115617275238, "learning_rate": 4.423514981601542e-05, "loss": 1.6859, "step": 6368 }, { "epoch": 1.6737281177127468, "grad_norm": 0.5566443204879761, "learning_rate": 4.421762747503067e-05, "loss": 1.653, "step": 6370 }, { "epoch": 1.6742536210464085, "grad_norm": 0.6134412884712219, "learning_rate": 4.420010513404591e-05, "loss": 1.6876, "step": 6372 }, { "epoch": 1.6747791243800703, "grad_norm": 0.7075713276863098, "learning_rate": 4.418258279306116e-05, "loss": 1.6738, "step": 6374 }, { "epoch": 1.675304627713732, "grad_norm": 0.5378595590591431, "learning_rate": 4.4165060452076404e-05, "loss": 1.6696, "step": 6376 }, { "epoch": 1.6758301310473938, "grad_norm": 0.5113415718078613, "learning_rate": 4.4147538111091645e-05, "loss": 1.6662, "step": 6378 }, { "epoch": 1.6763556343810557, "grad_norm": 0.6252776384353638, "learning_rate": 4.4130015770106886e-05, "loss": 1.6517, "step": 6380 }, { "epoch": 1.6768811377147173, "grad_norm": 0.5247318744659424, "learning_rate": 4.411249342912213e-05, "loss": 1.6769, "step": 6382 }, { "epoch": 1.6774066410483792, "grad_norm": 0.5468015670776367, "learning_rate": 4.4094971088137374e-05, "loss": 1.6827, "step": 6384 }, { "epoch": 1.677932144382041, "grad_norm": 0.5248594880104065, "learning_rate": 4.407744874715262e-05, "loss": 1.691, "step": 6386 }, { "epoch": 1.6784576477157027, "grad_norm": 0.5515632629394531, "learning_rate": 4.405992640616787e-05, "loss": 1.664, "step": 6388 }, { "epoch": 1.6789831510493645, "grad_norm": 0.565548300743103, "learning_rate": 4.404240406518311e-05, "loss": 1.6677, "step": 6390 }, { "epoch": 1.6795086543830262, "grad_norm": 0.5626314878463745, "learning_rate": 4.402488172419835e-05, "loss": 1.6778, "step": 6392 }, { "epoch": 1.6800341577166882, "grad_norm": 0.5197250247001648, "learning_rate": 4.40073593832136e-05, "loss": 1.6824, "step": 6394 }, { "epoch": 1.6805596610503497, "grad_norm": 0.5577792525291443, "learning_rate": 
4.3989837042228846e-05, "loss": 1.6798, "step": 6396 }, { "epoch": 1.6810851643840117, "grad_norm": 0.5743740200996399, "learning_rate": 4.397231470124409e-05, "loss": 1.6328, "step": 6398 }, { "epoch": 1.6816106677176732, "grad_norm": 0.4996255934238434, "learning_rate": 4.3954792360259334e-05, "loss": 1.6781, "step": 6400 }, { "epoch": 1.6816106677176732, "eval_loss": 1.6763501167297363, "eval_runtime": 487.3304, "eval_samples_per_second": 249.911, "eval_steps_per_second": 31.24, "step": 6400 }, { "epoch": 1.6821361710513352, "grad_norm": 0.6712773442268372, "learning_rate": 4.393727001927458e-05, "loss": 1.679, "step": 6402 }, { "epoch": 1.682661674384997, "grad_norm": 0.5399149656295776, "learning_rate": 4.391974767828982e-05, "loss": 1.7156, "step": 6404 }, { "epoch": 1.6831871777186587, "grad_norm": 0.5838908553123474, "learning_rate": 4.3902225337305063e-05, "loss": 1.6559, "step": 6406 }, { "epoch": 1.6837126810523204, "grad_norm": 0.6989631056785583, "learning_rate": 4.388470299632031e-05, "loss": 1.6536, "step": 6408 }, { "epoch": 1.6842381843859822, "grad_norm": 0.6087121963500977, "learning_rate": 4.386718065533555e-05, "loss": 1.68, "step": 6410 }, { "epoch": 1.6847636877196441, "grad_norm": 0.595737099647522, "learning_rate": 4.38496583143508e-05, "loss": 1.6431, "step": 6412 }, { "epoch": 1.6852891910533057, "grad_norm": 0.5878545045852661, "learning_rate": 4.383213597336605e-05, "loss": 1.6524, "step": 6414 }, { "epoch": 1.6858146943869676, "grad_norm": 0.5520877242088318, "learning_rate": 4.381461363238129e-05, "loss": 1.6234, "step": 6416 }, { "epoch": 1.6863401977206292, "grad_norm": 0.530946671962738, "learning_rate": 4.379709129139653e-05, "loss": 1.6478, "step": 6418 }, { "epoch": 1.6868657010542911, "grad_norm": 0.5743231177330017, "learning_rate": 4.3779568950411776e-05, "loss": 1.6882, "step": 6420 }, { "epoch": 1.6873912043879529, "grad_norm": 0.5853824019432068, "learning_rate": 4.3762046609427024e-05, "loss": 1.6798, "step": 6422 }, { 
"epoch": 1.6879167077216146, "grad_norm": 0.5864454507827759, "learning_rate": 4.3744524268442264e-05, "loss": 1.656, "step": 6424 }, { "epoch": 1.6884422110552764, "grad_norm": 0.5097535848617554, "learning_rate": 4.372700192745751e-05, "loss": 1.6534, "step": 6426 }, { "epoch": 1.6889677143889381, "grad_norm": 0.5435791015625, "learning_rate": 4.370947958647276e-05, "loss": 1.6782, "step": 6428 }, { "epoch": 1.6894932177226, "grad_norm": 0.6465846300125122, "learning_rate": 4.3691957245487994e-05, "loss": 1.6816, "step": 6430 }, { "epoch": 1.6900187210562616, "grad_norm": 0.561132550239563, "learning_rate": 4.367443490450324e-05, "loss": 1.6838, "step": 6432 }, { "epoch": 1.6905442243899236, "grad_norm": 0.530135452747345, "learning_rate": 4.365691256351849e-05, "loss": 1.6541, "step": 6434 }, { "epoch": 1.691069727723585, "grad_norm": 0.6319286823272705, "learning_rate": 4.363939022253373e-05, "loss": 1.651, "step": 6436 }, { "epoch": 1.691595231057247, "grad_norm": 0.4940394461154938, "learning_rate": 4.362186788154898e-05, "loss": 1.6613, "step": 6438 }, { "epoch": 1.6921207343909088, "grad_norm": 0.4995363652706146, "learning_rate": 4.3604345540564225e-05, "loss": 1.6572, "step": 6440 }, { "epoch": 1.6926462377245706, "grad_norm": 0.5799241662025452, "learning_rate": 4.3586823199579465e-05, "loss": 1.6753, "step": 6442 }, { "epoch": 1.6931717410582323, "grad_norm": 0.5875564813613892, "learning_rate": 4.3569300858594706e-05, "loss": 1.6322, "step": 6444 }, { "epoch": 1.693697244391894, "grad_norm": 0.5701809525489807, "learning_rate": 4.3551778517609954e-05, "loss": 1.7094, "step": 6446 }, { "epoch": 1.694222747725556, "grad_norm": 0.576756477355957, "learning_rate": 4.35342561766252e-05, "loss": 1.667, "step": 6448 }, { "epoch": 1.6947482510592176, "grad_norm": 0.5512332320213318, "learning_rate": 4.351673383564044e-05, "loss": 1.6592, "step": 6450 }, { "epoch": 1.6952737543928795, "grad_norm": 0.5287933349609375, "learning_rate": 4.349921149465569e-05, 
"loss": 1.6677, "step": 6452 }, { "epoch": 1.695799257726541, "grad_norm": 0.623125433921814, "learning_rate": 4.348168915367094e-05, "loss": 1.6633, "step": 6454 }, { "epoch": 1.696324761060203, "grad_norm": 0.613586962223053, "learning_rate": 4.346416681268617e-05, "loss": 1.66, "step": 6456 }, { "epoch": 1.6968502643938648, "grad_norm": 0.5631827116012573, "learning_rate": 4.344664447170142e-05, "loss": 1.6701, "step": 6458 }, { "epoch": 1.6973757677275265, "grad_norm": 0.4900628328323364, "learning_rate": 4.3429122130716666e-05, "loss": 1.6603, "step": 6460 }, { "epoch": 1.6979012710611883, "grad_norm": 0.5031628608703613, "learning_rate": 4.341159978973191e-05, "loss": 1.6554, "step": 6462 }, { "epoch": 1.69842677439485, "grad_norm": 0.5051286816596985, "learning_rate": 4.3394077448747155e-05, "loss": 1.6324, "step": 6464 }, { "epoch": 1.698952277728512, "grad_norm": 0.5495415925979614, "learning_rate": 4.33765551077624e-05, "loss": 1.679, "step": 6466 }, { "epoch": 1.6994777810621735, "grad_norm": 0.7252418398857117, "learning_rate": 4.335903276677764e-05, "loss": 1.6898, "step": 6468 }, { "epoch": 1.7000032843958355, "grad_norm": 0.5228211879730225, "learning_rate": 4.3341510425792884e-05, "loss": 1.644, "step": 6470 }, { "epoch": 1.700528787729497, "grad_norm": 0.5848027467727661, "learning_rate": 4.332398808480813e-05, "loss": 1.656, "step": 6472 }, { "epoch": 1.701054291063159, "grad_norm": 0.6442865133285522, "learning_rate": 4.330646574382338e-05, "loss": 1.6261, "step": 6474 }, { "epoch": 1.7015797943968207, "grad_norm": 0.5885564684867859, "learning_rate": 4.328894340283862e-05, "loss": 1.6406, "step": 6476 }, { "epoch": 1.7021052977304825, "grad_norm": 0.5726144909858704, "learning_rate": 4.327142106185387e-05, "loss": 1.6843, "step": 6478 }, { "epoch": 1.7026308010641442, "grad_norm": 0.6747820377349854, "learning_rate": 4.3253898720869115e-05, "loss": 1.6851, "step": 6480 }, { "epoch": 1.703156304397806, "grad_norm": 0.5923687815666199, 
"learning_rate": 4.3236376379884356e-05, "loss": 1.6318, "step": 6482 }, { "epoch": 1.703681807731468, "grad_norm": 0.5728587508201599, "learning_rate": 4.3218854038899596e-05, "loss": 1.682, "step": 6484 }, { "epoch": 1.7042073110651295, "grad_norm": 0.5209431648254395, "learning_rate": 4.3201331697914844e-05, "loss": 1.662, "step": 6486 }, { "epoch": 1.7047328143987914, "grad_norm": 0.5561770796775818, "learning_rate": 4.318380935693009e-05, "loss": 1.7016, "step": 6488 }, { "epoch": 1.705258317732453, "grad_norm": 0.5122435688972473, "learning_rate": 4.316628701594533e-05, "loss": 1.6682, "step": 6490 }, { "epoch": 1.705783821066115, "grad_norm": 0.6389492750167847, "learning_rate": 4.314876467496058e-05, "loss": 1.6379, "step": 6492 }, { "epoch": 1.7063093243997767, "grad_norm": 0.5349413752555847, "learning_rate": 4.313124233397582e-05, "loss": 1.6343, "step": 6494 }, { "epoch": 1.7068348277334384, "grad_norm": 0.5826820731163025, "learning_rate": 4.311371999299106e-05, "loss": 1.6775, "step": 6496 }, { "epoch": 1.7073603310671002, "grad_norm": 0.5609052181243896, "learning_rate": 4.309619765200631e-05, "loss": 1.7076, "step": 6498 }, { "epoch": 1.707885834400762, "grad_norm": 0.5599504113197327, "learning_rate": 4.307867531102156e-05, "loss": 1.6997, "step": 6500 }, { "epoch": 1.7084113377344239, "grad_norm": 0.5563739538192749, "learning_rate": 4.30611529700368e-05, "loss": 1.6595, "step": 6502 }, { "epoch": 1.7089368410680854, "grad_norm": 0.5203465819358826, "learning_rate": 4.3043630629052045e-05, "loss": 1.6452, "step": 6504 }, { "epoch": 1.7094623444017474, "grad_norm": 0.59616619348526, "learning_rate": 4.302610828806729e-05, "loss": 1.6495, "step": 6506 }, { "epoch": 1.709987847735409, "grad_norm": 0.7246098518371582, "learning_rate": 4.300858594708253e-05, "loss": 1.6682, "step": 6508 }, { "epoch": 1.7105133510690709, "grad_norm": 0.5792231559753418, "learning_rate": 4.2991063606097774e-05, "loss": 1.6474, "step": 6510 }, { "epoch": 
1.7110388544027326, "grad_norm": 0.5333255529403687, "learning_rate": 4.297354126511302e-05, "loss": 1.6795, "step": 6512 }, { "epoch": 1.7115643577363944, "grad_norm": 0.674374520778656, "learning_rate": 4.295601892412827e-05, "loss": 1.6514, "step": 6514 }, { "epoch": 1.7120898610700561, "grad_norm": 0.5450239181518555, "learning_rate": 4.293849658314351e-05, "loss": 1.6606, "step": 6516 }, { "epoch": 1.7126153644037179, "grad_norm": 0.6590548157691956, "learning_rate": 4.292097424215876e-05, "loss": 1.6337, "step": 6518 }, { "epoch": 1.7131408677373798, "grad_norm": 0.5265931487083435, "learning_rate": 4.2903451901174e-05, "loss": 1.6842, "step": 6520 }, { "epoch": 1.7136663710710414, "grad_norm": 0.6718656420707703, "learning_rate": 4.288592956018924e-05, "loss": 1.696, "step": 6522 }, { "epoch": 1.7141918744047033, "grad_norm": 0.6330803632736206, "learning_rate": 4.286840721920449e-05, "loss": 1.671, "step": 6524 }, { "epoch": 1.714717377738365, "grad_norm": 0.4948212206363678, "learning_rate": 4.2850884878219734e-05, "loss": 1.6464, "step": 6526 }, { "epoch": 1.7152428810720268, "grad_norm": 0.5330238342285156, "learning_rate": 4.2833362537234975e-05, "loss": 1.6661, "step": 6528 }, { "epoch": 1.7157683844056886, "grad_norm": 0.5928429961204529, "learning_rate": 4.281584019625022e-05, "loss": 1.645, "step": 6530 }, { "epoch": 1.7162938877393503, "grad_norm": 0.535369336605072, "learning_rate": 4.2798317855265463e-05, "loss": 1.6554, "step": 6532 }, { "epoch": 1.716819391073012, "grad_norm": 0.5079066157341003, "learning_rate": 4.278079551428071e-05, "loss": 1.6498, "step": 6534 }, { "epoch": 1.7173448944066738, "grad_norm": 0.5394106507301331, "learning_rate": 4.276327317329595e-05, "loss": 1.6538, "step": 6536 }, { "epoch": 1.7178703977403358, "grad_norm": 0.5476702451705933, "learning_rate": 4.27457508323112e-05, "loss": 1.6568, "step": 6538 }, { "epoch": 1.7183959010739973, "grad_norm": 0.6342707276344299, "learning_rate": 4.272822849132645e-05, "loss": 
1.6829, "step": 6540 }, { "epoch": 1.7189214044076593, "grad_norm": 0.6379374265670776, "learning_rate": 4.271070615034169e-05, "loss": 1.6742, "step": 6542 }, { "epoch": 1.719446907741321, "grad_norm": 0.6817846894264221, "learning_rate": 4.2693183809356935e-05, "loss": 1.6354, "step": 6544 }, { "epoch": 1.7199724110749828, "grad_norm": 0.6458591222763062, "learning_rate": 4.2675661468372176e-05, "loss": 1.6599, "step": 6546 }, { "epoch": 1.7204979144086445, "grad_norm": 0.5006933808326721, "learning_rate": 4.265813912738742e-05, "loss": 1.6438, "step": 6548 }, { "epoch": 1.7210234177423063, "grad_norm": 0.4796706736087799, "learning_rate": 4.2640616786402664e-05, "loss": 1.6738, "step": 6550 }, { "epoch": 1.7215489210759682, "grad_norm": 0.5186893939971924, "learning_rate": 4.262309444541791e-05, "loss": 1.691, "step": 6552 }, { "epoch": 1.7220744244096298, "grad_norm": 0.5311626195907593, "learning_rate": 4.260557210443315e-05, "loss": 1.6739, "step": 6554 }, { "epoch": 1.7225999277432917, "grad_norm": 0.5143429040908813, "learning_rate": 4.25880497634484e-05, "loss": 1.6533, "step": 6556 }, { "epoch": 1.7231254310769533, "grad_norm": 0.5430511236190796, "learning_rate": 4.257052742246364e-05, "loss": 1.6758, "step": 6558 }, { "epoch": 1.7236509344106152, "grad_norm": 0.6149283647537231, "learning_rate": 4.255300508147889e-05, "loss": 1.6581, "step": 6560 }, { "epoch": 1.724176437744277, "grad_norm": 0.5292539000511169, "learning_rate": 4.253548274049413e-05, "loss": 1.6494, "step": 6562 }, { "epoch": 1.7247019410779387, "grad_norm": 0.615308403968811, "learning_rate": 4.251796039950938e-05, "loss": 1.6732, "step": 6564 }, { "epoch": 1.7252274444116005, "grad_norm": 0.562659740447998, "learning_rate": 4.2500438058524625e-05, "loss": 1.6881, "step": 6566 }, { "epoch": 1.7257529477452622, "grad_norm": 0.6096563339233398, "learning_rate": 4.2482915717539865e-05, "loss": 1.6344, "step": 6568 }, { "epoch": 1.7262784510789242, "grad_norm": 0.568242073059082, 
"learning_rate": 4.2465393376555106e-05, "loss": 1.6506, "step": 6570 }, { "epoch": 1.7268039544125857, "grad_norm": 0.6250702738761902, "learning_rate": 4.2447871035570354e-05, "loss": 1.6647, "step": 6572 }, { "epoch": 1.7273294577462477, "grad_norm": 0.6344661116600037, "learning_rate": 4.2430348694585594e-05, "loss": 1.6733, "step": 6574 }, { "epoch": 1.7278549610799092, "grad_norm": 0.5727905631065369, "learning_rate": 4.241282635360084e-05, "loss": 1.6757, "step": 6576 }, { "epoch": 1.7283804644135712, "grad_norm": 0.5363614559173584, "learning_rate": 4.239530401261609e-05, "loss": 1.6418, "step": 6578 }, { "epoch": 1.728905967747233, "grad_norm": 0.5695384740829468, "learning_rate": 4.237778167163133e-05, "loss": 1.6424, "step": 6580 }, { "epoch": 1.7294314710808947, "grad_norm": 0.6201584935188293, "learning_rate": 4.236025933064658e-05, "loss": 1.6648, "step": 6582 }, { "epoch": 1.7299569744145564, "grad_norm": 0.4974352717399597, "learning_rate": 4.234273698966182e-05, "loss": 1.6666, "step": 6584 }, { "epoch": 1.7304824777482182, "grad_norm": 0.5813178420066833, "learning_rate": 4.2325214648677066e-05, "loss": 1.6307, "step": 6586 }, { "epoch": 1.7310079810818801, "grad_norm": 0.5724592804908752, "learning_rate": 4.230769230769231e-05, "loss": 1.6467, "step": 6588 }, { "epoch": 1.7315334844155417, "grad_norm": 0.5860669612884521, "learning_rate": 4.2290169966707555e-05, "loss": 1.6765, "step": 6590 }, { "epoch": 1.7320589877492036, "grad_norm": 0.5303966999053955, "learning_rate": 4.22726476257228e-05, "loss": 1.6575, "step": 6592 }, { "epoch": 1.7325844910828652, "grad_norm": 0.5227830410003662, "learning_rate": 4.225512528473804e-05, "loss": 1.6884, "step": 6594 }, { "epoch": 1.7331099944165271, "grad_norm": 0.5740933418273926, "learning_rate": 4.2237602943753284e-05, "loss": 1.6472, "step": 6596 }, { "epoch": 1.7336354977501889, "grad_norm": 0.5894073843955994, "learning_rate": 4.222008060276853e-05, "loss": 1.6755, "step": 6598 }, { "epoch": 
1.7341610010838506, "grad_norm": 0.5687074065208435, "learning_rate": 4.220255826178378e-05, "loss": 1.675, "step": 6600 }, { "epoch": 1.7346865044175124, "grad_norm": 0.6006156206130981, "learning_rate": 4.218503592079902e-05, "loss": 1.6651, "step": 6602 }, { "epoch": 1.7352120077511741, "grad_norm": 0.564089834690094, "learning_rate": 4.216751357981427e-05, "loss": 1.6789, "step": 6604 }, { "epoch": 1.735737511084836, "grad_norm": 0.7533421516418457, "learning_rate": 4.2149991238829515e-05, "loss": 1.6557, "step": 6606 }, { "epoch": 1.7362630144184976, "grad_norm": 0.5872588753700256, "learning_rate": 4.2132468897844756e-05, "loss": 1.6713, "step": 6608 }, { "epoch": 1.7367885177521596, "grad_norm": 0.608405590057373, "learning_rate": 4.2114946556859996e-05, "loss": 1.6648, "step": 6610 }, { "epoch": 1.7373140210858211, "grad_norm": 0.5417534112930298, "learning_rate": 4.2097424215875244e-05, "loss": 1.6667, "step": 6612 }, { "epoch": 1.737839524419483, "grad_norm": 0.5255427956581116, "learning_rate": 4.2079901874890485e-05, "loss": 1.6847, "step": 6614 }, { "epoch": 1.7383650277531448, "grad_norm": 0.49976515769958496, "learning_rate": 4.206237953390573e-05, "loss": 1.6426, "step": 6616 }, { "epoch": 1.7388905310868066, "grad_norm": 0.5245642066001892, "learning_rate": 4.204485719292098e-05, "loss": 1.6557, "step": 6618 }, { "epoch": 1.7394160344204683, "grad_norm": 0.5113621354103088, "learning_rate": 4.202733485193622e-05, "loss": 1.6725, "step": 6620 }, { "epoch": 1.73994153775413, "grad_norm": 0.6118736863136292, "learning_rate": 4.200981251095146e-05, "loss": 1.6524, "step": 6622 }, { "epoch": 1.740467041087792, "grad_norm": 0.6329546570777893, "learning_rate": 4.199229016996671e-05, "loss": 1.7146, "step": 6624 }, { "epoch": 1.7409925444214536, "grad_norm": 0.5709455013275146, "learning_rate": 4.1974767828981957e-05, "loss": 1.6481, "step": 6626 }, { "epoch": 1.7415180477551155, "grad_norm": 0.5557751655578613, "learning_rate": 4.19572454879972e-05, 
"loss": 1.6298, "step": 6628 }, { "epoch": 1.742043551088777, "grad_norm": 0.5406216979026794, "learning_rate": 4.1939723147012445e-05, "loss": 1.7127, "step": 6630 }, { "epoch": 1.742569054422439, "grad_norm": 0.6411069631576538, "learning_rate": 4.192220080602769e-05, "loss": 1.674, "step": 6632 }, { "epoch": 1.7430945577561008, "grad_norm": 0.5912994742393494, "learning_rate": 4.1904678465042927e-05, "loss": 1.7002, "step": 6634 }, { "epoch": 1.7436200610897625, "grad_norm": 0.5379955172538757, "learning_rate": 4.1887156124058174e-05, "loss": 1.6702, "step": 6636 }, { "epoch": 1.7441455644234243, "grad_norm": 0.5569443702697754, "learning_rate": 4.186963378307342e-05, "loss": 1.6848, "step": 6638 }, { "epoch": 1.744671067757086, "grad_norm": 0.5895394086837769, "learning_rate": 4.185211144208866e-05, "loss": 1.6775, "step": 6640 }, { "epoch": 1.745196571090748, "grad_norm": 0.6112500429153442, "learning_rate": 4.183458910110391e-05, "loss": 1.6713, "step": 6642 }, { "epoch": 1.7457220744244095, "grad_norm": 0.762654185295105, "learning_rate": 4.181706676011916e-05, "loss": 1.6848, "step": 6644 }, { "epoch": 1.7462475777580715, "grad_norm": 0.6181445717811584, "learning_rate": 4.17995444191344e-05, "loss": 1.6735, "step": 6646 }, { "epoch": 1.746773081091733, "grad_norm": 0.5101475119590759, "learning_rate": 4.178202207814964e-05, "loss": 1.6639, "step": 6648 }, { "epoch": 1.747298584425395, "grad_norm": 0.5235376358032227, "learning_rate": 4.176449973716489e-05, "loss": 1.6485, "step": 6650 }, { "epoch": 1.7478240877590567, "grad_norm": 0.5604961514472961, "learning_rate": 4.1746977396180134e-05, "loss": 1.6501, "step": 6652 }, { "epoch": 1.7483495910927185, "grad_norm": 0.759231448173523, "learning_rate": 4.1729455055195375e-05, "loss": 1.6752, "step": 6654 }, { "epoch": 1.7488750944263802, "grad_norm": 0.5132787823677063, "learning_rate": 4.171193271421062e-05, "loss": 1.672, "step": 6656 }, { "epoch": 1.749400597760042, "grad_norm": 0.5898250341415405, 
"learning_rate": 4.169441037322587e-05, "loss": 1.6543, "step": 6658 }, { "epoch": 1.749926101093704, "grad_norm": 0.5514124631881714, "learning_rate": 4.1676888032241104e-05, "loss": 1.6879, "step": 6660 }, { "epoch": 1.7504516044273655, "grad_norm": 0.7601991295814514, "learning_rate": 4.165936569125635e-05, "loss": 1.7259, "step": 6662 }, { "epoch": 1.7509771077610274, "grad_norm": 0.6521760821342468, "learning_rate": 4.16418433502716e-05, "loss": 1.6488, "step": 6664 }, { "epoch": 1.751502611094689, "grad_norm": 0.6008849143981934, "learning_rate": 4.162432100928684e-05, "loss": 1.6577, "step": 6666 }, { "epoch": 1.752028114428351, "grad_norm": 0.6038839221000671, "learning_rate": 4.160679866830209e-05, "loss": 1.6503, "step": 6668 }, { "epoch": 1.7525536177620127, "grad_norm": 0.6234827041625977, "learning_rate": 4.1589276327317335e-05, "loss": 1.6779, "step": 6670 }, { "epoch": 1.7530791210956744, "grad_norm": 0.5239622592926025, "learning_rate": 4.1571753986332576e-05, "loss": 1.6431, "step": 6672 }, { "epoch": 1.7536046244293362, "grad_norm": 0.6171594262123108, "learning_rate": 4.155423164534782e-05, "loss": 1.6436, "step": 6674 }, { "epoch": 1.754130127762998, "grad_norm": 0.651139497756958, "learning_rate": 4.1536709304363064e-05, "loss": 1.6492, "step": 6676 }, { "epoch": 1.75465563109666, "grad_norm": 0.5682376027107239, "learning_rate": 4.151918696337831e-05, "loss": 1.6417, "step": 6678 }, { "epoch": 1.7551811344303214, "grad_norm": 0.6295192837715149, "learning_rate": 4.150166462239355e-05, "loss": 1.6839, "step": 6680 }, { "epoch": 1.7557066377639834, "grad_norm": 0.6096534132957458, "learning_rate": 4.14841422814088e-05, "loss": 1.7125, "step": 6682 }, { "epoch": 1.7562321410976451, "grad_norm": 0.5466519594192505, "learning_rate": 4.146661994042405e-05, "loss": 1.6566, "step": 6684 }, { "epoch": 1.7567576444313069, "grad_norm": 0.5740132331848145, "learning_rate": 4.144909759943928e-05, "loss": 1.6564, "step": 6686 }, { "epoch": 
1.7572831477649686, "grad_norm": 0.6351927518844604, "learning_rate": 4.143157525845453e-05, "loss": 1.6752, "step": 6688 }, { "epoch": 1.7578086510986304, "grad_norm": 0.5739028453826904, "learning_rate": 4.141405291746978e-05, "loss": 1.6834, "step": 6690 }, { "epoch": 1.7583341544322921, "grad_norm": 0.5718164443969727, "learning_rate": 4.139653057648502e-05, "loss": 1.6433, "step": 6692 }, { "epoch": 1.7588596577659539, "grad_norm": 0.738676905632019, "learning_rate": 4.1379008235500265e-05, "loss": 1.6528, "step": 6694 }, { "epoch": 1.7593851610996158, "grad_norm": 0.596734881401062, "learning_rate": 4.136148589451551e-05, "loss": 1.6431, "step": 6696 }, { "epoch": 1.7599106644332774, "grad_norm": 0.5336854457855225, "learning_rate": 4.1343963553530754e-05, "loss": 1.6692, "step": 6698 }, { "epoch": 1.7604361677669393, "grad_norm": 0.4876728653907776, "learning_rate": 4.1326441212545994e-05, "loss": 1.6595, "step": 6700 }, { "epoch": 1.760961671100601, "grad_norm": 0.5300989747047424, "learning_rate": 4.130891887156124e-05, "loss": 1.6767, "step": 6702 }, { "epoch": 1.7614871744342628, "grad_norm": 0.54608154296875, "learning_rate": 4.129139653057649e-05, "loss": 1.6279, "step": 6704 }, { "epoch": 1.7620126777679246, "grad_norm": 0.5571487545967102, "learning_rate": 4.127387418959173e-05, "loss": 1.6963, "step": 6706 }, { "epoch": 1.7625381811015863, "grad_norm": 0.5999481081962585, "learning_rate": 4.125635184860698e-05, "loss": 1.6807, "step": 6708 }, { "epoch": 1.7630636844352483, "grad_norm": 0.5582924485206604, "learning_rate": 4.123882950762222e-05, "loss": 1.6503, "step": 6710 }, { "epoch": 1.7635891877689098, "grad_norm": 0.5172569751739502, "learning_rate": 4.122130716663746e-05, "loss": 1.6502, "step": 6712 }, { "epoch": 1.7641146911025718, "grad_norm": 0.5434536933898926, "learning_rate": 4.120378482565271e-05, "loss": 1.624, "step": 6714 }, { "epoch": 1.7646401944362333, "grad_norm": 0.5931615233421326, "learning_rate": 4.1186262484667955e-05, 
"loss": 1.6881, "step": 6716 }, { "epoch": 1.7651656977698953, "grad_norm": 0.5632887482643127, "learning_rate": 4.1168740143683195e-05, "loss": 1.66, "step": 6718 }, { "epoch": 1.765691201103557, "grad_norm": 0.6316903233528137, "learning_rate": 4.115121780269844e-05, "loss": 1.6821, "step": 6720 }, { "epoch": 1.7662167044372188, "grad_norm": 0.5220393538475037, "learning_rate": 4.113369546171369e-05, "loss": 1.6585, "step": 6722 }, { "epoch": 1.7667422077708805, "grad_norm": 0.5338044166564941, "learning_rate": 4.111617312072893e-05, "loss": 1.6359, "step": 6724 }, { "epoch": 1.7672677111045423, "grad_norm": 0.5751186013221741, "learning_rate": 4.109865077974417e-05, "loss": 1.6678, "step": 6726 }, { "epoch": 1.7677932144382043, "grad_norm": 0.5516241788864136, "learning_rate": 4.108112843875942e-05, "loss": 1.6589, "step": 6728 }, { "epoch": 1.7683187177718658, "grad_norm": 0.5440977811813354, "learning_rate": 4.106360609777467e-05, "loss": 1.6496, "step": 6730 }, { "epoch": 1.7688442211055277, "grad_norm": 0.5160251259803772, "learning_rate": 4.104608375678991e-05, "loss": 1.6586, "step": 6732 }, { "epoch": 1.7693697244391893, "grad_norm": 0.6195341348648071, "learning_rate": 4.1028561415805156e-05, "loss": 1.6621, "step": 6734 }, { "epoch": 1.7698952277728512, "grad_norm": 0.5011487007141113, "learning_rate": 4.1011039074820396e-05, "loss": 1.7067, "step": 6736 }, { "epoch": 1.770420731106513, "grad_norm": 0.5898102521896362, "learning_rate": 4.0993516733835644e-05, "loss": 1.6775, "step": 6738 }, { "epoch": 1.7709462344401747, "grad_norm": 0.6446313261985779, "learning_rate": 4.0975994392850885e-05, "loss": 1.6751, "step": 6740 }, { "epoch": 1.7714717377738365, "grad_norm": 0.5387564301490784, "learning_rate": 4.095847205186613e-05, "loss": 1.6861, "step": 6742 }, { "epoch": 1.7719972411074982, "grad_norm": 0.6098289489746094, "learning_rate": 4.094094971088138e-05, "loss": 1.6653, "step": 6744 }, { "epoch": 1.7725227444411602, "grad_norm": 
0.5589563846588135, "learning_rate": 4.092342736989662e-05, "loss": 1.6878, "step": 6746 }, { "epoch": 1.7730482477748217, "grad_norm": 0.6051377058029175, "learning_rate": 4.090590502891187e-05, "loss": 1.6779, "step": 6748 }, { "epoch": 1.7735737511084837, "grad_norm": 0.5657187104225159, "learning_rate": 4.088838268792711e-05, "loss": 1.6585, "step": 6750 }, { "epoch": 1.7740992544421452, "grad_norm": 0.9947826862335205, "learning_rate": 4.087086034694235e-05, "loss": 1.6364, "step": 6752 }, { "epoch": 1.7746247577758072, "grad_norm": 0.5528366565704346, "learning_rate": 4.08533380059576e-05, "loss": 1.6796, "step": 6754 }, { "epoch": 1.775150261109469, "grad_norm": 0.7105492949485779, "learning_rate": 4.0835815664972845e-05, "loss": 1.6359, "step": 6756 }, { "epoch": 1.7756757644431307, "grad_norm": 0.5398980975151062, "learning_rate": 4.0818293323988086e-05, "loss": 1.6358, "step": 6758 }, { "epoch": 1.7762012677767924, "grad_norm": 0.518286943435669, "learning_rate": 4.080077098300333e-05, "loss": 1.6691, "step": 6760 }, { "epoch": 1.7767267711104542, "grad_norm": 0.5343197584152222, "learning_rate": 4.0783248642018574e-05, "loss": 1.6604, "step": 6762 }, { "epoch": 1.7772522744441162, "grad_norm": 0.6191185116767883, "learning_rate": 4.076572630103382e-05, "loss": 1.6693, "step": 6764 }, { "epoch": 1.7777777777777777, "grad_norm": 0.6115537285804749, "learning_rate": 4.074820396004906e-05, "loss": 1.6533, "step": 6766 }, { "epoch": 1.7783032811114396, "grad_norm": 0.6350643634796143, "learning_rate": 4.073068161906431e-05, "loss": 1.6717, "step": 6768 }, { "epoch": 1.7788287844451012, "grad_norm": 0.5143194198608398, "learning_rate": 4.071315927807956e-05, "loss": 1.6684, "step": 6770 }, { "epoch": 1.7793542877787631, "grad_norm": 0.6066332459449768, "learning_rate": 4.06956369370948e-05, "loss": 1.6722, "step": 6772 }, { "epoch": 1.779879791112425, "grad_norm": 0.5524691939353943, "learning_rate": 4.067811459611004e-05, "loss": 1.6423, "step": 6774 }, { 
"epoch": 1.7804052944460866, "grad_norm": 0.6012272834777832, "learning_rate": 4.066059225512529e-05, "loss": 1.6931, "step": 6776 }, { "epoch": 1.7809307977797484, "grad_norm": 0.58221834897995, "learning_rate": 4.064306991414053e-05, "loss": 1.6591, "step": 6778 }, { "epoch": 1.7814563011134101, "grad_norm": 0.5327997803688049, "learning_rate": 4.0625547573155775e-05, "loss": 1.6903, "step": 6780 }, { "epoch": 1.781981804447072, "grad_norm": 0.5887238383293152, "learning_rate": 4.060802523217102e-05, "loss": 1.6571, "step": 6782 }, { "epoch": 1.7825073077807336, "grad_norm": 0.5583620071411133, "learning_rate": 4.059050289118626e-05, "loss": 1.6552, "step": 6784 }, { "epoch": 1.7830328111143956, "grad_norm": 0.6321818232536316, "learning_rate": 4.057298055020151e-05, "loss": 1.6565, "step": 6786 }, { "epoch": 1.7835583144480571, "grad_norm": 0.496971070766449, "learning_rate": 4.055545820921675e-05, "loss": 1.6311, "step": 6788 }, { "epoch": 1.784083817781719, "grad_norm": 0.5267062783241272, "learning_rate": 4.0537935868232e-05, "loss": 1.6598, "step": 6790 }, { "epoch": 1.7846093211153808, "grad_norm": 0.502678394317627, "learning_rate": 4.052041352724724e-05, "loss": 1.6493, "step": 6792 }, { "epoch": 1.7851348244490426, "grad_norm": 0.6034113764762878, "learning_rate": 4.050289118626249e-05, "loss": 1.6914, "step": 6794 }, { "epoch": 1.7856603277827043, "grad_norm": 0.7164289355278015, "learning_rate": 4.0485368845277735e-05, "loss": 1.7111, "step": 6796 }, { "epoch": 1.786185831116366, "grad_norm": 0.5487950444221497, "learning_rate": 4.0467846504292976e-05, "loss": 1.6346, "step": 6798 }, { "epoch": 1.786711334450028, "grad_norm": 0.6355817914009094, "learning_rate": 4.045032416330822e-05, "loss": 1.6228, "step": 6800 }, { "epoch": 1.786711334450028, "eval_loss": 1.6711539030075073, "eval_runtime": 487.2898, "eval_samples_per_second": 249.931, "eval_steps_per_second": 31.242, "step": 6800 }, { "epoch": 1.7872368377836896, "grad_norm": 0.5209816694259644, 
"learning_rate": 4.0432801822323464e-05, "loss": 1.6396, "step": 6802 }, { "epoch": 1.7877623411173515, "grad_norm": 0.5979394912719727, "learning_rate": 4.0415279481338705e-05, "loss": 1.6686, "step": 6804 }, { "epoch": 1.788287844451013, "grad_norm": 0.5141789317131042, "learning_rate": 4.039775714035395e-05, "loss": 1.6538, "step": 6806 }, { "epoch": 1.788813347784675, "grad_norm": 0.5531857013702393, "learning_rate": 4.03802347993692e-05, "loss": 1.6612, "step": 6808 }, { "epoch": 1.7893388511183368, "grad_norm": 0.5284379720687866, "learning_rate": 4.036271245838444e-05, "loss": 1.6403, "step": 6810 }, { "epoch": 1.7898643544519985, "grad_norm": 0.5298596620559692, "learning_rate": 4.034519011739968e-05, "loss": 1.6494, "step": 6812 }, { "epoch": 1.7903898577856603, "grad_norm": 0.5482889413833618, "learning_rate": 4.032766777641493e-05, "loss": 1.6516, "step": 6814 }, { "epoch": 1.790915361119322, "grad_norm": 0.5631160736083984, "learning_rate": 4.031014543543018e-05, "loss": 1.6419, "step": 6816 }, { "epoch": 1.791440864452984, "grad_norm": 0.5150030851364136, "learning_rate": 4.029262309444542e-05, "loss": 1.6502, "step": 6818 }, { "epoch": 1.7919663677866455, "grad_norm": 0.5491872429847717, "learning_rate": 4.0275100753460665e-05, "loss": 1.6489, "step": 6820 }, { "epoch": 1.7924918711203075, "grad_norm": 0.5404025912284851, "learning_rate": 4.025757841247591e-05, "loss": 1.6455, "step": 6822 }, { "epoch": 1.793017374453969, "grad_norm": 0.5373459458351135, "learning_rate": 4.0240056071491154e-05, "loss": 1.6487, "step": 6824 }, { "epoch": 1.793542877787631, "grad_norm": 0.6558994054794312, "learning_rate": 4.0222533730506394e-05, "loss": 1.6603, "step": 6826 }, { "epoch": 1.7940683811212927, "grad_norm": 0.5680163502693176, "learning_rate": 4.020501138952164e-05, "loss": 1.6607, "step": 6828 }, { "epoch": 1.7945938844549545, "grad_norm": 0.5356695652008057, "learning_rate": 4.018748904853688e-05, "loss": 1.6526, "step": 6830 }, { "epoch": 
1.7951193877886162, "grad_norm": 0.5755831003189087, "learning_rate": 4.016996670755213e-05, "loss": 1.6748, "step": 6832 }, { "epoch": 1.795644891122278, "grad_norm": 0.6050164103507996, "learning_rate": 4.015244436656738e-05, "loss": 1.6366, "step": 6834 }, { "epoch": 1.79617039445594, "grad_norm": 0.5978443026542664, "learning_rate": 4.013492202558262e-05, "loss": 1.6549, "step": 6836 }, { "epoch": 1.7966958977896015, "grad_norm": 0.806139349937439, "learning_rate": 4.011739968459786e-05, "loss": 1.6486, "step": 6838 }, { "epoch": 1.7972214011232635, "grad_norm": 0.665317714214325, "learning_rate": 4.009987734361311e-05, "loss": 1.6468, "step": 6840 }, { "epoch": 1.7977469044569252, "grad_norm": 0.5707154870033264, "learning_rate": 4.0082355002628355e-05, "loss": 1.6592, "step": 6842 }, { "epoch": 1.798272407790587, "grad_norm": 0.5100306868553162, "learning_rate": 4.0064832661643595e-05, "loss": 1.6732, "step": 6844 }, { "epoch": 1.7987979111242487, "grad_norm": 0.4903377294540405, "learning_rate": 4.004731032065884e-05, "loss": 1.6383, "step": 6846 }, { "epoch": 1.7993234144579104, "grad_norm": 0.5019045472145081, "learning_rate": 4.002978797967409e-05, "loss": 1.6668, "step": 6848 }, { "epoch": 1.7998489177915722, "grad_norm": 0.5553399324417114, "learning_rate": 4.001226563868933e-05, "loss": 1.6666, "step": 6850 }, { "epoch": 1.800374421125234, "grad_norm": 0.5196052193641663, "learning_rate": 3.999474329770457e-05, "loss": 1.6928, "step": 6852 }, { "epoch": 1.800899924458896, "grad_norm": 0.5712267756462097, "learning_rate": 3.997722095671982e-05, "loss": 1.6228, "step": 6854 }, { "epoch": 1.8014254277925574, "grad_norm": 0.6430991291999817, "learning_rate": 3.995969861573507e-05, "loss": 1.6583, "step": 6856 }, { "epoch": 1.8019509311262194, "grad_norm": 0.5435091257095337, "learning_rate": 3.994217627475031e-05, "loss": 1.6458, "step": 6858 }, { "epoch": 1.8024764344598811, "grad_norm": 0.5334445238113403, "learning_rate": 3.9924653933765556e-05, "loss": 
1.6534, "step": 6860 }, { "epoch": 1.803001937793543, "grad_norm": 0.6035925149917603, "learning_rate": 3.99071315927808e-05, "loss": 1.6449, "step": 6862 }, { "epoch": 1.8035274411272046, "grad_norm": 0.6789858341217041, "learning_rate": 3.988960925179604e-05, "loss": 1.6599, "step": 6864 }, { "epoch": 1.8040529444608664, "grad_norm": 0.6741647720336914, "learning_rate": 3.9872086910811285e-05, "loss": 1.6474, "step": 6866 }, { "epoch": 1.8045784477945284, "grad_norm": 0.601939857006073, "learning_rate": 3.985456456982653e-05, "loss": 1.657, "step": 6868 }, { "epoch": 1.8051039511281899, "grad_norm": 0.5084041357040405, "learning_rate": 3.983704222884177e-05, "loss": 1.6707, "step": 6870 }, { "epoch": 1.8056294544618519, "grad_norm": 0.6498923897743225, "learning_rate": 3.981951988785702e-05, "loss": 1.6738, "step": 6872 }, { "epoch": 1.8061549577955134, "grad_norm": 0.6679303050041199, "learning_rate": 3.980199754687227e-05, "loss": 1.6565, "step": 6874 }, { "epoch": 1.8066804611291754, "grad_norm": 0.642929196357727, "learning_rate": 3.978447520588751e-05, "loss": 1.6437, "step": 6876 }, { "epoch": 1.807205964462837, "grad_norm": 0.5809937119483948, "learning_rate": 3.976695286490275e-05, "loss": 1.6754, "step": 6878 }, { "epoch": 1.8077314677964988, "grad_norm": 0.5564219355583191, "learning_rate": 3.9749430523918e-05, "loss": 1.6823, "step": 6880 }, { "epoch": 1.8082569711301606, "grad_norm": 0.5706532001495361, "learning_rate": 3.9731908182933245e-05, "loss": 1.6705, "step": 6882 }, { "epoch": 1.8087824744638223, "grad_norm": 0.5469827651977539, "learning_rate": 3.9714385841948486e-05, "loss": 1.6463, "step": 6884 }, { "epoch": 1.8093079777974843, "grad_norm": 0.5436684489250183, "learning_rate": 3.969686350096373e-05, "loss": 1.6658, "step": 6886 }, { "epoch": 1.8098334811311458, "grad_norm": 0.5233422517776489, "learning_rate": 3.967934115997898e-05, "loss": 1.6741, "step": 6888 }, { "epoch": 1.8103589844648078, "grad_norm": 0.5667338967323303, 
"learning_rate": 3.9661818818994215e-05, "loss": 1.6323, "step": 6890 }, { "epoch": 1.8108844877984693, "grad_norm": 0.5260610580444336, "learning_rate": 3.964429647800946e-05, "loss": 1.6492, "step": 6892 }, { "epoch": 1.8114099911321313, "grad_norm": 0.554559051990509, "learning_rate": 3.962677413702471e-05, "loss": 1.6686, "step": 6894 }, { "epoch": 1.811935494465793, "grad_norm": 0.6630009412765503, "learning_rate": 3.960925179603995e-05, "loss": 1.6507, "step": 6896 }, { "epoch": 1.8124609977994548, "grad_norm": 0.5093562006950378, "learning_rate": 3.95917294550552e-05, "loss": 1.6631, "step": 6898 }, { "epoch": 1.8129865011331165, "grad_norm": 0.5125998854637146, "learning_rate": 3.9574207114070446e-05, "loss": 1.6141, "step": 6900 }, { "epoch": 1.8135120044667783, "grad_norm": 0.5436182022094727, "learning_rate": 3.955668477308569e-05, "loss": 1.6298, "step": 6902 }, { "epoch": 1.8140375078004403, "grad_norm": 0.5174747705459595, "learning_rate": 3.953916243210093e-05, "loss": 1.6381, "step": 6904 }, { "epoch": 1.8145630111341018, "grad_norm": 0.48531410098075867, "learning_rate": 3.9521640091116175e-05, "loss": 1.6465, "step": 6906 }, { "epoch": 1.8150885144677638, "grad_norm": 0.5112138390541077, "learning_rate": 3.950411775013142e-05, "loss": 1.6469, "step": 6908 }, { "epoch": 1.8156140178014253, "grad_norm": 0.578628420829773, "learning_rate": 3.948659540914666e-05, "loss": 1.6874, "step": 6910 }, { "epoch": 1.8161395211350873, "grad_norm": 0.5760912895202637, "learning_rate": 3.946907306816191e-05, "loss": 1.6661, "step": 6912 }, { "epoch": 1.816665024468749, "grad_norm": 0.5140530467033386, "learning_rate": 3.945155072717715e-05, "loss": 1.6359, "step": 6914 }, { "epoch": 1.8171905278024107, "grad_norm": 0.5136293172836304, "learning_rate": 3.943402838619239e-05, "loss": 1.6839, "step": 6916 }, { "epoch": 1.8177160311360725, "grad_norm": 0.6095285415649414, "learning_rate": 3.941650604520764e-05, "loss": 1.6552, "step": 6918 }, { "epoch": 
1.8182415344697342, "grad_norm": 0.5882896780967712, "learning_rate": 3.939898370422289e-05, "loss": 1.6532, "step": 6920 }, { "epoch": 1.8187670378033962, "grad_norm": 0.5088122487068176, "learning_rate": 3.938146136323813e-05, "loss": 1.6719, "step": 6922 }, { "epoch": 1.8192925411370577, "grad_norm": 0.5035478472709656, "learning_rate": 3.9363939022253376e-05, "loss": 1.641, "step": 6924 }, { "epoch": 1.8198180444707197, "grad_norm": 0.559293270111084, "learning_rate": 3.9346416681268624e-05, "loss": 1.6971, "step": 6926 }, { "epoch": 1.8203435478043812, "grad_norm": 0.5681344866752625, "learning_rate": 3.9328894340283864e-05, "loss": 1.6569, "step": 6928 }, { "epoch": 1.8208690511380432, "grad_norm": 0.6714912056922913, "learning_rate": 3.9311371999299105e-05, "loss": 1.7089, "step": 6930 }, { "epoch": 1.821394554471705, "grad_norm": 0.5596345067024231, "learning_rate": 3.929384965831435e-05, "loss": 1.6888, "step": 6932 }, { "epoch": 1.8219200578053667, "grad_norm": 0.5444710850715637, "learning_rate": 3.92763273173296e-05, "loss": 1.6772, "step": 6934 }, { "epoch": 1.8224455611390284, "grad_norm": 0.5050802230834961, "learning_rate": 3.925880497634484e-05, "loss": 1.6613, "step": 6936 }, { "epoch": 1.8229710644726902, "grad_norm": 0.5749676823616028, "learning_rate": 3.924128263536009e-05, "loss": 1.6448, "step": 6938 }, { "epoch": 1.8234965678063522, "grad_norm": 0.577195405960083, "learning_rate": 3.922376029437533e-05, "loss": 1.6614, "step": 6940 }, { "epoch": 1.8240220711400137, "grad_norm": 0.5361042022705078, "learning_rate": 3.920623795339057e-05, "loss": 1.6764, "step": 6942 }, { "epoch": 1.8245475744736757, "grad_norm": 0.5485758781433105, "learning_rate": 3.918871561240582e-05, "loss": 1.6486, "step": 6944 }, { "epoch": 1.8250730778073372, "grad_norm": 0.7427318096160889, "learning_rate": 3.9171193271421065e-05, "loss": 1.6755, "step": 6946 }, { "epoch": 1.8255985811409992, "grad_norm": 0.5816475749015808, "learning_rate": 3.9153670930436306e-05, 
"loss": 1.6518, "step": 6948 }, { "epoch": 1.826124084474661, "grad_norm": 0.48851844668388367, "learning_rate": 3.9136148589451554e-05, "loss": 1.6655, "step": 6950 }, { "epoch": 1.8266495878083227, "grad_norm": 0.5446299910545349, "learning_rate": 3.91186262484668e-05, "loss": 1.6374, "step": 6952 }, { "epoch": 1.8271750911419844, "grad_norm": 0.514245331287384, "learning_rate": 3.910110390748204e-05, "loss": 1.6463, "step": 6954 }, { "epoch": 1.8277005944756461, "grad_norm": 0.5527070760726929, "learning_rate": 3.908358156649728e-05, "loss": 1.6874, "step": 6956 }, { "epoch": 1.8282260978093081, "grad_norm": 0.5745643377304077, "learning_rate": 3.906605922551253e-05, "loss": 1.6334, "step": 6958 }, { "epoch": 1.8287516011429696, "grad_norm": 0.5475890636444092, "learning_rate": 3.904853688452778e-05, "loss": 1.6336, "step": 6960 }, { "epoch": 1.8292771044766316, "grad_norm": 0.5883870124816895, "learning_rate": 3.903101454354302e-05, "loss": 1.6226, "step": 6962 }, { "epoch": 1.8298026078102931, "grad_norm": 0.5167810916900635, "learning_rate": 3.9013492202558266e-05, "loss": 1.6612, "step": 6964 }, { "epoch": 1.830328111143955, "grad_norm": 0.5100778937339783, "learning_rate": 3.899596986157351e-05, "loss": 1.6666, "step": 6966 }, { "epoch": 1.8308536144776169, "grad_norm": 0.5465711355209351, "learning_rate": 3.897844752058875e-05, "loss": 1.6586, "step": 6968 }, { "epoch": 1.8313791178112786, "grad_norm": 0.5632458329200745, "learning_rate": 3.8960925179603995e-05, "loss": 1.6729, "step": 6970 }, { "epoch": 1.8319046211449403, "grad_norm": 0.7153643369674683, "learning_rate": 3.894340283861924e-05, "loss": 1.6767, "step": 6972 }, { "epoch": 1.832430124478602, "grad_norm": 0.5567420721054077, "learning_rate": 3.8925880497634484e-05, "loss": 1.6836, "step": 6974 }, { "epoch": 1.832955627812264, "grad_norm": 0.6071416735649109, "learning_rate": 3.890835815664973e-05, "loss": 1.6403, "step": 6976 }, { "epoch": 1.8334811311459256, "grad_norm": 0.5392048954963684, 
"learning_rate": 3.889083581566497e-05, "loss": 1.6667, "step": 6978 }, { "epoch": 1.8340066344795876, "grad_norm": 0.7555952668190002, "learning_rate": 3.887331347468022e-05, "loss": 1.6623, "step": 6980 }, { "epoch": 1.834532137813249, "grad_norm": 0.6968433856964111, "learning_rate": 3.885579113369546e-05, "loss": 1.6556, "step": 6982 }, { "epoch": 1.835057641146911, "grad_norm": 0.6319105625152588, "learning_rate": 3.883826879271071e-05, "loss": 1.7025, "step": 6984 }, { "epoch": 1.8355831444805728, "grad_norm": 0.5621939301490784, "learning_rate": 3.8820746451725956e-05, "loss": 1.639, "step": 6986 }, { "epoch": 1.8361086478142346, "grad_norm": 0.5498519539833069, "learning_rate": 3.8803224110741196e-05, "loss": 1.6726, "step": 6988 }, { "epoch": 1.8366341511478963, "grad_norm": 0.6178312301635742, "learning_rate": 3.8785701769756444e-05, "loss": 1.6525, "step": 6990 }, { "epoch": 1.837159654481558, "grad_norm": 0.4725324511528015, "learning_rate": 3.8768179428771685e-05, "loss": 1.6423, "step": 6992 }, { "epoch": 1.83768515781522, "grad_norm": 0.5918049812316895, "learning_rate": 3.875065708778693e-05, "loss": 1.6728, "step": 6994 }, { "epoch": 1.8382106611488815, "grad_norm": 0.6157652735710144, "learning_rate": 3.873313474680217e-05, "loss": 1.6549, "step": 6996 }, { "epoch": 1.8387361644825435, "grad_norm": 0.596299409866333, "learning_rate": 3.871561240581742e-05, "loss": 1.6488, "step": 6998 }, { "epoch": 1.8392616678162053, "grad_norm": 0.4981100559234619, "learning_rate": 3.869809006483267e-05, "loss": 1.6163, "step": 7000 }, { "epoch": 1.839787171149867, "grad_norm": 0.5667177438735962, "learning_rate": 3.868056772384791e-05, "loss": 1.7114, "step": 7002 }, { "epoch": 1.8403126744835288, "grad_norm": 0.5904120206832886, "learning_rate": 3.866304538286315e-05, "loss": 1.644, "step": 7004 }, { "epoch": 1.8408381778171905, "grad_norm": 0.5185456871986389, "learning_rate": 3.86455230418784e-05, "loss": 1.6537, "step": 7006 }, { "epoch": 
1.8413636811508522, "grad_norm": 0.5449099540710449, "learning_rate": 3.862800070089364e-05, "loss": 1.6807, "step": 7008 }, { "epoch": 1.841889184484514, "grad_norm": 0.7209144234657288, "learning_rate": 3.8610478359908886e-05, "loss": 1.6648, "step": 7010 }, { "epoch": 1.842414687818176, "grad_norm": 0.6890124082565308, "learning_rate": 3.859295601892413e-05, "loss": 1.647, "step": 7012 }, { "epoch": 1.8429401911518375, "grad_norm": 0.5385224223136902, "learning_rate": 3.8575433677939374e-05, "loss": 1.6475, "step": 7014 }, { "epoch": 1.8434656944854995, "grad_norm": 0.5653911232948303, "learning_rate": 3.8557911336954615e-05, "loss": 1.6752, "step": 7016 }, { "epoch": 1.8439911978191612, "grad_norm": 0.5241896510124207, "learning_rate": 3.854038899596986e-05, "loss": 1.6747, "step": 7018 }, { "epoch": 1.844516701152823, "grad_norm": 0.7143029570579529, "learning_rate": 3.852286665498511e-05, "loss": 1.6538, "step": 7020 }, { "epoch": 1.8450422044864847, "grad_norm": 0.5827885270118713, "learning_rate": 3.850534431400035e-05, "loss": 1.6987, "step": 7022 }, { "epoch": 1.8455677078201465, "grad_norm": 0.6297259330749512, "learning_rate": 3.84878219730156e-05, "loss": 1.6628, "step": 7024 }, { "epoch": 1.8460932111538084, "grad_norm": 0.5605854392051697, "learning_rate": 3.8470299632030846e-05, "loss": 1.6216, "step": 7026 }, { "epoch": 1.84661871448747, "grad_norm": 0.6488915085792542, "learning_rate": 3.845277729104609e-05, "loss": 1.7036, "step": 7028 }, { "epoch": 1.847144217821132, "grad_norm": 0.6576531529426575, "learning_rate": 3.843525495006133e-05, "loss": 1.6846, "step": 7030 }, { "epoch": 1.8476697211547934, "grad_norm": 0.49783241748809814, "learning_rate": 3.8417732609076575e-05, "loss": 1.6566, "step": 7032 }, { "epoch": 1.8481952244884554, "grad_norm": 0.5974870324134827, "learning_rate": 3.8400210268091816e-05, "loss": 1.6787, "step": 7034 }, { "epoch": 1.8487207278221172, "grad_norm": 0.5442907810211182, "learning_rate": 3.838268792710706e-05, 
"loss": 1.6414, "step": 7036 }, { "epoch": 1.849246231155779, "grad_norm": 0.4982667565345764, "learning_rate": 3.836516558612231e-05, "loss": 1.664, "step": 7038 }, { "epoch": 1.8497717344894407, "grad_norm": 0.5198219418525696, "learning_rate": 3.834764324513755e-05, "loss": 1.6708, "step": 7040 }, { "epoch": 1.8502972378231024, "grad_norm": 0.6068596839904785, "learning_rate": 3.833012090415279e-05, "loss": 1.6467, "step": 7042 }, { "epoch": 1.8508227411567644, "grad_norm": 0.5471599102020264, "learning_rate": 3.831259856316804e-05, "loss": 1.6371, "step": 7044 }, { "epoch": 1.851348244490426, "grad_norm": 0.7527357339859009, "learning_rate": 3.829507622218329e-05, "loss": 1.6562, "step": 7046 }, { "epoch": 1.8518737478240879, "grad_norm": 0.5454807877540588, "learning_rate": 3.827755388119853e-05, "loss": 1.6923, "step": 7048 }, { "epoch": 1.8523992511577494, "grad_norm": 0.5361452102661133, "learning_rate": 3.8260031540213776e-05, "loss": 1.6479, "step": 7050 }, { "epoch": 1.8529247544914114, "grad_norm": 0.5446274876594543, "learning_rate": 3.8242509199229024e-05, "loss": 1.6576, "step": 7052 }, { "epoch": 1.853450257825073, "grad_norm": 0.6107861995697021, "learning_rate": 3.822498685824426e-05, "loss": 1.6777, "step": 7054 }, { "epoch": 1.8539757611587349, "grad_norm": 0.526027262210846, "learning_rate": 3.8207464517259505e-05, "loss": 1.6423, "step": 7056 }, { "epoch": 1.8545012644923966, "grad_norm": 0.5729789137840271, "learning_rate": 3.818994217627475e-05, "loss": 1.6364, "step": 7058 }, { "epoch": 1.8550267678260584, "grad_norm": 0.5303583145141602, "learning_rate": 3.8172419835289993e-05, "loss": 1.6984, "step": 7060 }, { "epoch": 1.8555522711597203, "grad_norm": 0.6077755093574524, "learning_rate": 3.815489749430524e-05, "loss": 1.6934, "step": 7062 }, { "epoch": 1.8560777744933818, "grad_norm": 0.5593380331993103, "learning_rate": 3.813737515332049e-05, "loss": 1.6546, "step": 7064 }, { "epoch": 1.8566032778270438, "grad_norm": 0.5528572797775269, 
"learning_rate": 3.811985281233573e-05, "loss": 1.6722, "step": 7066 }, { "epoch": 1.8571287811607053, "grad_norm": 0.5037624835968018, "learning_rate": 3.810233047135097e-05, "loss": 1.6701, "step": 7068 }, { "epoch": 1.8576542844943673, "grad_norm": 0.5588459968566895, "learning_rate": 3.808480813036622e-05, "loss": 1.6871, "step": 7070 }, { "epoch": 1.858179787828029, "grad_norm": 0.5192500948905945, "learning_rate": 3.8067285789381465e-05, "loss": 1.6539, "step": 7072 }, { "epoch": 1.8587052911616908, "grad_norm": 0.5383935570716858, "learning_rate": 3.8049763448396706e-05, "loss": 1.672, "step": 7074 }, { "epoch": 1.8592307944953526, "grad_norm": 0.6632468104362488, "learning_rate": 3.8032241107411954e-05, "loss": 1.6735, "step": 7076 }, { "epoch": 1.8597562978290143, "grad_norm": 0.5410235524177551, "learning_rate": 3.80147187664272e-05, "loss": 1.6374, "step": 7078 }, { "epoch": 1.8602818011626763, "grad_norm": 0.7134387493133545, "learning_rate": 3.7997196425442435e-05, "loss": 1.6661, "step": 7080 }, { "epoch": 1.8608073044963378, "grad_norm": 0.54608154296875, "learning_rate": 3.797967408445768e-05, "loss": 1.6297, "step": 7082 }, { "epoch": 1.8613328078299998, "grad_norm": 0.6386792659759521, "learning_rate": 3.796215174347293e-05, "loss": 1.6489, "step": 7084 }, { "epoch": 1.8618583111636613, "grad_norm": 0.5409175157546997, "learning_rate": 3.794462940248817e-05, "loss": 1.6609, "step": 7086 }, { "epoch": 1.8623838144973233, "grad_norm": 0.611273467540741, "learning_rate": 3.792710706150342e-05, "loss": 1.6685, "step": 7088 }, { "epoch": 1.862909317830985, "grad_norm": 0.6014897227287292, "learning_rate": 3.7909584720518666e-05, "loss": 1.6068, "step": 7090 }, { "epoch": 1.8634348211646468, "grad_norm": 0.5631469488143921, "learning_rate": 3.789206237953391e-05, "loss": 1.6478, "step": 7092 }, { "epoch": 1.8639603244983085, "grad_norm": 0.566448450088501, "learning_rate": 3.787454003854915e-05, "loss": 1.6861, "step": 7094 }, { "epoch": 
1.8644858278319703, "grad_norm": 0.5702322721481323, "learning_rate": 3.7857017697564395e-05, "loss": 1.6507, "step": 7096 }, { "epoch": 1.8650113311656322, "grad_norm": 0.5720840096473694, "learning_rate": 3.783949535657964e-05, "loss": 1.6615, "step": 7098 }, { "epoch": 1.8655368344992938, "grad_norm": 0.586739718914032, "learning_rate": 3.7821973015594884e-05, "loss": 1.6837, "step": 7100 }, { "epoch": 1.8660623378329557, "grad_norm": 0.5152260065078735, "learning_rate": 3.780445067461013e-05, "loss": 1.6565, "step": 7102 }, { "epoch": 1.8665878411666172, "grad_norm": 0.5608677864074707, "learning_rate": 3.778692833362538e-05, "loss": 1.6448, "step": 7104 }, { "epoch": 1.8671133445002792, "grad_norm": 0.5977829694747925, "learning_rate": 3.776940599264062e-05, "loss": 1.6415, "step": 7106 }, { "epoch": 1.867638847833941, "grad_norm": 0.6419270038604736, "learning_rate": 3.775188365165586e-05, "loss": 1.6032, "step": 7108 }, { "epoch": 1.8681643511676027, "grad_norm": 0.6474549770355225, "learning_rate": 3.773436131067111e-05, "loss": 1.6717, "step": 7110 }, { "epoch": 1.8686898545012645, "grad_norm": 0.6148583889007568, "learning_rate": 3.7716838969686356e-05, "loss": 1.6476, "step": 7112 }, { "epoch": 1.8692153578349262, "grad_norm": 0.5739107728004456, "learning_rate": 3.7699316628701596e-05, "loss": 1.652, "step": 7114 }, { "epoch": 1.8697408611685882, "grad_norm": 0.5666532516479492, "learning_rate": 3.7681794287716844e-05, "loss": 1.6611, "step": 7116 }, { "epoch": 1.8702663645022497, "grad_norm": 0.6231557726860046, "learning_rate": 3.7664271946732085e-05, "loss": 1.6285, "step": 7118 }, { "epoch": 1.8707918678359117, "grad_norm": 0.5692910552024841, "learning_rate": 3.7646749605747325e-05, "loss": 1.6195, "step": 7120 }, { "epoch": 1.8713173711695732, "grad_norm": 0.5744662880897522, "learning_rate": 3.762922726476257e-05, "loss": 1.6603, "step": 7122 }, { "epoch": 1.8718428745032352, "grad_norm": 0.5582786798477173, "learning_rate": 
3.761170492377782e-05, "loss": 1.6719, "step": 7124 }, { "epoch": 1.872368377836897, "grad_norm": 0.5148811340332031, "learning_rate": 3.759418258279306e-05, "loss": 1.6329, "step": 7126 }, { "epoch": 1.8728938811705587, "grad_norm": 0.5253287553787231, "learning_rate": 3.757666024180831e-05, "loss": 1.6576, "step": 7128 }, { "epoch": 1.8734193845042204, "grad_norm": 0.5456867814064026, "learning_rate": 3.7559137900823557e-05, "loss": 1.6682, "step": 7130 }, { "epoch": 1.8739448878378822, "grad_norm": 0.5340244770050049, "learning_rate": 3.75416155598388e-05, "loss": 1.6758, "step": 7132 }, { "epoch": 1.8744703911715441, "grad_norm": 0.5905424356460571, "learning_rate": 3.752409321885404e-05, "loss": 1.6707, "step": 7134 }, { "epoch": 1.8749958945052057, "grad_norm": 0.5440637469291687, "learning_rate": 3.7506570877869286e-05, "loss": 1.6507, "step": 7136 }, { "epoch": 1.8755213978388676, "grad_norm": 0.751801073551178, "learning_rate": 3.748904853688453e-05, "loss": 1.6656, "step": 7138 }, { "epoch": 1.8760469011725294, "grad_norm": 0.5282323956489563, "learning_rate": 3.7471526195899774e-05, "loss": 1.6808, "step": 7140 }, { "epoch": 1.8765724045061911, "grad_norm": 0.5995266437530518, "learning_rate": 3.745400385491502e-05, "loss": 1.6374, "step": 7142 }, { "epoch": 1.8770979078398529, "grad_norm": 0.5131629705429077, "learning_rate": 3.743648151393026e-05, "loss": 1.631, "step": 7144 }, { "epoch": 1.8776234111735146, "grad_norm": 0.5343523025512695, "learning_rate": 3.74189591729455e-05, "loss": 1.6253, "step": 7146 }, { "epoch": 1.8781489145071764, "grad_norm": 0.6353945136070251, "learning_rate": 3.740143683196075e-05, "loss": 1.6843, "step": 7148 }, { "epoch": 1.878674417840838, "grad_norm": 0.5876971483230591, "learning_rate": 3.7383914490976e-05, "loss": 1.6692, "step": 7150 }, { "epoch": 1.8791999211745, "grad_norm": 0.5077223181724548, "learning_rate": 3.736639214999124e-05, "loss": 1.6095, "step": 7152 }, { "epoch": 1.8797254245081616, "grad_norm": 
0.5225921273231506, "learning_rate": 3.734886980900649e-05, "loss": 1.6475, "step": 7154 }, { "epoch": 1.8802509278418236, "grad_norm": 0.549243688583374, "learning_rate": 3.733134746802173e-05, "loss": 1.6529, "step": 7156 }, { "epoch": 1.8807764311754853, "grad_norm": 0.5567914843559265, "learning_rate": 3.7313825127036975e-05, "loss": 1.6621, "step": 7158 }, { "epoch": 1.881301934509147, "grad_norm": 0.5527283549308777, "learning_rate": 3.7296302786052216e-05, "loss": 1.6705, "step": 7160 }, { "epoch": 1.8818274378428088, "grad_norm": 0.5174548625946045, "learning_rate": 3.727878044506746e-05, "loss": 1.6606, "step": 7162 }, { "epoch": 1.8823529411764706, "grad_norm": 0.527554988861084, "learning_rate": 3.726125810408271e-05, "loss": 1.6543, "step": 7164 }, { "epoch": 1.8828784445101323, "grad_norm": 0.6432197093963623, "learning_rate": 3.724373576309795e-05, "loss": 1.6551, "step": 7166 }, { "epoch": 1.883403947843794, "grad_norm": 0.6546508073806763, "learning_rate": 3.72262134221132e-05, "loss": 1.644, "step": 7168 }, { "epoch": 1.883929451177456, "grad_norm": 0.5889551043510437, "learning_rate": 3.720869108112844e-05, "loss": 1.6767, "step": 7170 }, { "epoch": 1.8844549545111176, "grad_norm": 0.5616403222084045, "learning_rate": 3.719116874014368e-05, "loss": 1.661, "step": 7172 }, { "epoch": 1.8849804578447795, "grad_norm": 0.7210134267807007, "learning_rate": 3.717364639915893e-05, "loss": 1.6527, "step": 7174 }, { "epoch": 1.8855059611784413, "grad_norm": 0.5026001930236816, "learning_rate": 3.7156124058174176e-05, "loss": 1.6589, "step": 7176 }, { "epoch": 1.886031464512103, "grad_norm": 0.5873162746429443, "learning_rate": 3.713860171718942e-05, "loss": 1.622, "step": 7178 }, { "epoch": 1.8865569678457648, "grad_norm": 0.729069709777832, "learning_rate": 3.7121079376204664e-05, "loss": 1.6229, "step": 7180 }, { "epoch": 1.8870824711794265, "grad_norm": 0.5042127966880798, "learning_rate": 3.7103557035219905e-05, "loss": 1.6329, "step": 7182 }, { 
"epoch": 1.8876079745130885, "grad_norm": 0.5304409861564636, "learning_rate": 3.708603469423515e-05, "loss": 1.6427, "step": 7184 }, { "epoch": 1.88813347784675, "grad_norm": 0.6537191271781921, "learning_rate": 3.7068512353250393e-05, "loss": 1.662, "step": 7186 }, { "epoch": 1.888658981180412, "grad_norm": 0.5206599235534668, "learning_rate": 3.705099001226564e-05, "loss": 1.6375, "step": 7188 }, { "epoch": 1.8891844845140735, "grad_norm": 0.6516169905662537, "learning_rate": 3.703346767128089e-05, "loss": 1.6364, "step": 7190 }, { "epoch": 1.8897099878477355, "grad_norm": 0.5258002877235413, "learning_rate": 3.701594533029613e-05, "loss": 1.6826, "step": 7192 }, { "epoch": 1.8902354911813972, "grad_norm": 0.5022348761558533, "learning_rate": 3.699842298931138e-05, "loss": 1.64, "step": 7194 }, { "epoch": 1.890760994515059, "grad_norm": 0.585818350315094, "learning_rate": 3.698090064832662e-05, "loss": 1.6968, "step": 7196 }, { "epoch": 1.8912864978487207, "grad_norm": 0.5346968770027161, "learning_rate": 3.696337830734186e-05, "loss": 1.6514, "step": 7198 }, { "epoch": 1.8918120011823825, "grad_norm": 0.6125622391700745, "learning_rate": 3.6945855966357106e-05, "loss": 1.6467, "step": 7200 }, { "epoch": 1.8918120011823825, "eval_loss": 1.66790771484375, "eval_runtime": 487.3046, "eval_samples_per_second": 249.924, "eval_steps_per_second": 31.241, "step": 7200 }, { "epoch": 1.8923375045160444, "grad_norm": 0.5719324350357056, "learning_rate": 3.6928333625372354e-05, "loss": 1.6776, "step": 7202 }, { "epoch": 1.892863007849706, "grad_norm": 0.5245485901832581, "learning_rate": 3.6910811284387594e-05, "loss": 1.6324, "step": 7204 }, { "epoch": 1.893388511183368, "grad_norm": 0.5099211931228638, "learning_rate": 3.689328894340284e-05, "loss": 1.6764, "step": 7206 }, { "epoch": 1.8939140145170295, "grad_norm": 0.7997536659240723, "learning_rate": 3.687576660241808e-05, "loss": 1.6619, "step": 7208 }, { "epoch": 1.8944395178506914, "grad_norm": 0.5983949899673462, 
"learning_rate": 3.685824426143333e-05, "loss": 1.6557, "step": 7210 }, { "epoch": 1.8949650211843532, "grad_norm": 0.6213306784629822, "learning_rate": 3.684072192044857e-05, "loss": 1.6559, "step": 7212 }, { "epoch": 1.895490524518015, "grad_norm": 0.5697503685951233, "learning_rate": 3.682319957946382e-05, "loss": 1.6754, "step": 7214 }, { "epoch": 1.8960160278516767, "grad_norm": 0.507168710231781, "learning_rate": 3.6805677238479066e-05, "loss": 1.6618, "step": 7216 }, { "epoch": 1.8965415311853384, "grad_norm": 0.6169989109039307, "learning_rate": 3.678815489749431e-05, "loss": 1.6329, "step": 7218 }, { "epoch": 1.8970670345190004, "grad_norm": 0.516326904296875, "learning_rate": 3.677063255650955e-05, "loss": 1.65, "step": 7220 }, { "epoch": 1.897592537852662, "grad_norm": 0.5588873624801636, "learning_rate": 3.6753110215524795e-05, "loss": 1.6707, "step": 7222 }, { "epoch": 1.8981180411863239, "grad_norm": 0.7022035717964172, "learning_rate": 3.6735587874540036e-05, "loss": 1.6935, "step": 7224 }, { "epoch": 1.8986435445199854, "grad_norm": 0.5556198954582214, "learning_rate": 3.6718065533555284e-05, "loss": 1.6955, "step": 7226 }, { "epoch": 1.8991690478536474, "grad_norm": 0.6451961398124695, "learning_rate": 3.670054319257053e-05, "loss": 1.6607, "step": 7228 }, { "epoch": 1.8996945511873091, "grad_norm": 0.603360116481781, "learning_rate": 3.668302085158577e-05, "loss": 1.626, "step": 7230 }, { "epoch": 1.9002200545209709, "grad_norm": 0.7140105962753296, "learning_rate": 3.666549851060102e-05, "loss": 1.6587, "step": 7232 }, { "epoch": 1.9007455578546326, "grad_norm": 0.5646045804023743, "learning_rate": 3.664797616961626e-05, "loss": 1.6681, "step": 7234 }, { "epoch": 1.9012710611882944, "grad_norm": 0.5183248519897461, "learning_rate": 3.663045382863151e-05, "loss": 1.6912, "step": 7236 }, { "epoch": 1.9017965645219563, "grad_norm": 0.6256281137466431, "learning_rate": 3.661293148764675e-05, "loss": 1.6699, "step": 7238 }, { "epoch": 
1.9023220678556179, "grad_norm": 0.5462591052055359, "learning_rate": 3.6595409146661996e-05, "loss": 1.6628, "step": 7240 }, { "epoch": 1.9028475711892798, "grad_norm": 0.715267539024353, "learning_rate": 3.6577886805677244e-05, "loss": 1.6907, "step": 7242 }, { "epoch": 1.9033730745229414, "grad_norm": 0.8691731095314026, "learning_rate": 3.6560364464692485e-05, "loss": 1.6982, "step": 7244 }, { "epoch": 1.9038985778566033, "grad_norm": 0.55055171251297, "learning_rate": 3.6542842123707725e-05, "loss": 1.6757, "step": 7246 }, { "epoch": 1.904424081190265, "grad_norm": 0.5819158554077148, "learning_rate": 3.652531978272297e-05, "loss": 1.6571, "step": 7248 }, { "epoch": 1.9049495845239268, "grad_norm": 0.6203599572181702, "learning_rate": 3.650779744173822e-05, "loss": 1.7292, "step": 7250 }, { "epoch": 1.9054750878575886, "grad_norm": 0.5759249925613403, "learning_rate": 3.649027510075346e-05, "loss": 1.6227, "step": 7252 }, { "epoch": 1.9060005911912503, "grad_norm": 0.581551730632782, "learning_rate": 3.647275275976871e-05, "loss": 1.6741, "step": 7254 }, { "epoch": 1.9065260945249123, "grad_norm": 0.6072301864624023, "learning_rate": 3.6455230418783956e-05, "loss": 1.6663, "step": 7256 }, { "epoch": 1.9070515978585738, "grad_norm": 0.5655650496482849, "learning_rate": 3.643770807779919e-05, "loss": 1.6729, "step": 7258 }, { "epoch": 1.9075771011922358, "grad_norm": 0.5449069738388062, "learning_rate": 3.642018573681444e-05, "loss": 1.6514, "step": 7260 }, { "epoch": 1.9081026045258973, "grad_norm": 0.573408842086792, "learning_rate": 3.6402663395829686e-05, "loss": 1.6718, "step": 7262 }, { "epoch": 1.9086281078595593, "grad_norm": 0.7997104525566101, "learning_rate": 3.6385141054844926e-05, "loss": 1.702, "step": 7264 }, { "epoch": 1.909153611193221, "grad_norm": 0.565268337726593, "learning_rate": 3.6367618713860174e-05, "loss": 1.6244, "step": 7266 }, { "epoch": 1.9096791145268828, "grad_norm": 0.5995902419090271, "learning_rate": 3.635009637287542e-05, 
"loss": 1.6694, "step": 7268 }, { "epoch": 1.9102046178605445, "grad_norm": 0.6907638907432556, "learning_rate": 3.633257403189066e-05, "loss": 1.6425, "step": 7270 }, { "epoch": 1.9107301211942063, "grad_norm": 0.49709975719451904, "learning_rate": 3.63150516909059e-05, "loss": 1.6564, "step": 7272 }, { "epoch": 1.9112556245278682, "grad_norm": 0.5460817813873291, "learning_rate": 3.629752934992115e-05, "loss": 1.6822, "step": 7274 }, { "epoch": 1.9117811278615298, "grad_norm": 0.6267193555831909, "learning_rate": 3.62800070089364e-05, "loss": 1.6325, "step": 7276 }, { "epoch": 1.9123066311951917, "grad_norm": 0.5849470496177673, "learning_rate": 3.626248466795164e-05, "loss": 1.6596, "step": 7278 }, { "epoch": 1.9128321345288533, "grad_norm": 0.5416064262390137, "learning_rate": 3.6244962326966887e-05, "loss": 1.6786, "step": 7280 }, { "epoch": 1.9133576378625152, "grad_norm": 0.6678909063339233, "learning_rate": 3.6227439985982134e-05, "loss": 1.6392, "step": 7282 }, { "epoch": 1.913883141196177, "grad_norm": 0.5513505935668945, "learning_rate": 3.620991764499737e-05, "loss": 1.6942, "step": 7284 }, { "epoch": 1.9144086445298387, "grad_norm": 0.7723992466926575, "learning_rate": 3.6192395304012616e-05, "loss": 1.6853, "step": 7286 }, { "epoch": 1.9149341478635005, "grad_norm": 0.610593318939209, "learning_rate": 3.617487296302786e-05, "loss": 1.6927, "step": 7288 }, { "epoch": 1.9154596511971622, "grad_norm": 0.6118401288986206, "learning_rate": 3.6157350622043104e-05, "loss": 1.6397, "step": 7290 }, { "epoch": 1.9159851545308242, "grad_norm": 0.5796778798103333, "learning_rate": 3.613982828105835e-05, "loss": 1.6625, "step": 7292 }, { "epoch": 1.9165106578644857, "grad_norm": 0.520003080368042, "learning_rate": 3.61223059400736e-05, "loss": 1.6902, "step": 7294 }, { "epoch": 1.9170361611981477, "grad_norm": 0.6326977610588074, "learning_rate": 3.610478359908884e-05, "loss": 1.6767, "step": 7296 }, { "epoch": 1.9175616645318094, "grad_norm": 0.4994673728942871, 
"learning_rate": 3.608726125810408e-05, "loss": 1.625, "step": 7298 }, { "epoch": 1.9180871678654712, "grad_norm": 0.4929233193397522, "learning_rate": 3.606973891711933e-05, "loss": 1.7034, "step": 7300 }, { "epoch": 1.918612671199133, "grad_norm": 0.5794808268547058, "learning_rate": 3.6052216576134576e-05, "loss": 1.6522, "step": 7302 }, { "epoch": 1.9191381745327947, "grad_norm": 0.5688311457633972, "learning_rate": 3.603469423514982e-05, "loss": 1.7031, "step": 7304 }, { "epoch": 1.9196636778664564, "grad_norm": 0.5700064897537231, "learning_rate": 3.6017171894165064e-05, "loss": 1.6469, "step": 7306 }, { "epoch": 1.9201891812001182, "grad_norm": 0.4965689480304718, "learning_rate": 3.599964955318031e-05, "loss": 1.6539, "step": 7308 }, { "epoch": 1.9207146845337801, "grad_norm": 0.6212711334228516, "learning_rate": 3.5982127212195546e-05, "loss": 1.6558, "step": 7310 }, { "epoch": 1.9212401878674417, "grad_norm": 0.5985603928565979, "learning_rate": 3.5964604871210793e-05, "loss": 1.6462, "step": 7312 }, { "epoch": 1.9217656912011036, "grad_norm": 0.6384910345077515, "learning_rate": 3.594708253022604e-05, "loss": 1.6279, "step": 7314 }, { "epoch": 1.9222911945347654, "grad_norm": 0.5548694133758545, "learning_rate": 3.592956018924128e-05, "loss": 1.6488, "step": 7316 }, { "epoch": 1.9228166978684271, "grad_norm": 0.5115952491760254, "learning_rate": 3.591203784825653e-05, "loss": 1.6557, "step": 7318 }, { "epoch": 1.9233422012020889, "grad_norm": 0.564540684223175, "learning_rate": 3.589451550727178e-05, "loss": 1.6506, "step": 7320 }, { "epoch": 1.9238677045357506, "grad_norm": 0.5424622297286987, "learning_rate": 3.587699316628702e-05, "loss": 1.6837, "step": 7322 }, { "epoch": 1.9243932078694124, "grad_norm": 0.6576436758041382, "learning_rate": 3.585947082530226e-05, "loss": 1.6663, "step": 7324 }, { "epoch": 1.9249187112030741, "grad_norm": 0.6056269407272339, "learning_rate": 3.5841948484317506e-05, "loss": 1.6427, "step": 7326 }, { "epoch": 
1.925444214536736, "grad_norm": 0.5156649947166443, "learning_rate": 3.5824426143332754e-05, "loss": 1.649, "step": 7328 }, { "epoch": 1.9259697178703976, "grad_norm": 0.5065081119537354, "learning_rate": 3.5806903802347994e-05, "loss": 1.6422, "step": 7330 }, { "epoch": 1.9264952212040596, "grad_norm": 0.5199773907661438, "learning_rate": 3.578938146136324e-05, "loss": 1.634, "step": 7332 }, { "epoch": 1.9270207245377213, "grad_norm": 0.5321483016014099, "learning_rate": 3.577185912037849e-05, "loss": 1.6501, "step": 7334 }, { "epoch": 1.927546227871383, "grad_norm": 0.5092171430587769, "learning_rate": 3.5754336779393724e-05, "loss": 1.6337, "step": 7336 }, { "epoch": 1.9280717312050448, "grad_norm": 0.4993257224559784, "learning_rate": 3.573681443840897e-05, "loss": 1.647, "step": 7338 }, { "epoch": 1.9285972345387066, "grad_norm": 0.7736045718193054, "learning_rate": 3.571929209742422e-05, "loss": 1.645, "step": 7340 }, { "epoch": 1.9291227378723685, "grad_norm": 0.5889518857002258, "learning_rate": 3.570176975643946e-05, "loss": 1.6412, "step": 7342 }, { "epoch": 1.92964824120603, "grad_norm": 0.5440062880516052, "learning_rate": 3.568424741545471e-05, "loss": 1.6537, "step": 7344 }, { "epoch": 1.930173744539692, "grad_norm": 0.5857016444206238, "learning_rate": 3.5666725074469955e-05, "loss": 1.6627, "step": 7346 }, { "epoch": 1.9306992478733536, "grad_norm": 0.5349956750869751, "learning_rate": 3.5649202733485195e-05, "loss": 1.6621, "step": 7348 }, { "epoch": 1.9312247512070155, "grad_norm": 0.6116125583648682, "learning_rate": 3.5631680392500436e-05, "loss": 1.6555, "step": 7350 }, { "epoch": 1.9317502545406773, "grad_norm": 0.5522750020027161, "learning_rate": 3.5614158051515684e-05, "loss": 1.6551, "step": 7352 }, { "epoch": 1.932275757874339, "grad_norm": 0.503121554851532, "learning_rate": 3.559663571053093e-05, "loss": 1.6105, "step": 7354 }, { "epoch": 1.9328012612080008, "grad_norm": 0.7201147079467773, "learning_rate": 3.557911336954617e-05, 
"loss": 1.6763, "step": 7356 }, { "epoch": 1.9333267645416625, "grad_norm": 0.507443904876709, "learning_rate": 3.556159102856142e-05, "loss": 1.6476, "step": 7358 }, { "epoch": 1.9338522678753245, "grad_norm": 0.5974195003509521, "learning_rate": 3.554406868757666e-05, "loss": 1.6446, "step": 7360 }, { "epoch": 1.934377771208986, "grad_norm": 0.5196303725242615, "learning_rate": 3.552654634659191e-05, "loss": 1.638, "step": 7362 }, { "epoch": 1.934903274542648, "grad_norm": 0.5367706418037415, "learning_rate": 3.550902400560715e-05, "loss": 1.6712, "step": 7364 }, { "epoch": 1.9354287778763095, "grad_norm": 0.5725764632225037, "learning_rate": 3.5491501664622396e-05, "loss": 1.6764, "step": 7366 }, { "epoch": 1.9359542812099715, "grad_norm": 0.6095370054244995, "learning_rate": 3.5473979323637644e-05, "loss": 1.6385, "step": 7368 }, { "epoch": 1.9364797845436332, "grad_norm": 0.6413910388946533, "learning_rate": 3.5456456982652885e-05, "loss": 1.6665, "step": 7370 }, { "epoch": 1.937005287877295, "grad_norm": 0.5054334998130798, "learning_rate": 3.543893464166813e-05, "loss": 1.6639, "step": 7372 }, { "epoch": 1.9375307912109567, "grad_norm": 0.5909721255302429, "learning_rate": 3.542141230068337e-05, "loss": 1.6658, "step": 7374 }, { "epoch": 1.9380562945446185, "grad_norm": 0.5415735244750977, "learning_rate": 3.5403889959698614e-05, "loss": 1.6872, "step": 7376 }, { "epoch": 1.9385817978782804, "grad_norm": 0.5185040831565857, "learning_rate": 3.538636761871386e-05, "loss": 1.6371, "step": 7378 }, { "epoch": 1.939107301211942, "grad_norm": 0.4956663250923157, "learning_rate": 3.536884527772911e-05, "loss": 1.6036, "step": 7380 }, { "epoch": 1.939632804545604, "grad_norm": 0.5410330891609192, "learning_rate": 3.535132293674435e-05, "loss": 1.6482, "step": 7382 }, { "epoch": 1.9401583078792655, "grad_norm": 0.6475924253463745, "learning_rate": 3.53338005957596e-05, "loss": 1.6297, "step": 7384 }, { "epoch": 1.9406838112129274, "grad_norm": 0.5302926301956177, 
"learning_rate": 3.531627825477484e-05, "loss": 1.6652, "step": 7386 }, { "epoch": 1.9412093145465892, "grad_norm": 0.6099222898483276, "learning_rate": 3.5298755913790086e-05, "loss": 1.6463, "step": 7388 }, { "epoch": 1.941734817880251, "grad_norm": 0.6674348711967468, "learning_rate": 3.5281233572805326e-05, "loss": 1.6421, "step": 7390 }, { "epoch": 1.9422603212139127, "grad_norm": 0.5395660400390625, "learning_rate": 3.5263711231820574e-05, "loss": 1.6465, "step": 7392 }, { "epoch": 1.9427858245475744, "grad_norm": 0.629122257232666, "learning_rate": 3.524618889083582e-05, "loss": 1.6914, "step": 7394 }, { "epoch": 1.9433113278812364, "grad_norm": 0.5263876914978027, "learning_rate": 3.522866654985106e-05, "loss": 1.7023, "step": 7396 }, { "epoch": 1.943836831214898, "grad_norm": 0.5672745108604431, "learning_rate": 3.52111442088663e-05, "loss": 1.652, "step": 7398 }, { "epoch": 1.94436233454856, "grad_norm": 0.551514744758606, "learning_rate": 3.519362186788155e-05, "loss": 1.6294, "step": 7400 }, { "epoch": 1.9448878378822214, "grad_norm": 0.593605101108551, "learning_rate": 3.517609952689679e-05, "loss": 1.6561, "step": 7402 }, { "epoch": 1.9454133412158834, "grad_norm": 0.5020228028297424, "learning_rate": 3.515857718591204e-05, "loss": 1.6724, "step": 7404 }, { "epoch": 1.9459388445495451, "grad_norm": 0.5694274306297302, "learning_rate": 3.5141054844927287e-05, "loss": 1.6771, "step": 7406 }, { "epoch": 1.9464643478832069, "grad_norm": 0.6046936511993408, "learning_rate": 3.512353250394253e-05, "loss": 1.6627, "step": 7408 }, { "epoch": 1.9469898512168686, "grad_norm": 0.5441648364067078, "learning_rate": 3.5106010162957775e-05, "loss": 1.638, "step": 7410 }, { "epoch": 1.9475153545505304, "grad_norm": 0.5168318748474121, "learning_rate": 3.5088487821973016e-05, "loss": 1.6782, "step": 7412 }, { "epoch": 1.9480408578841923, "grad_norm": 0.5330861210823059, "learning_rate": 3.507096548098826e-05, "loss": 1.6427, "step": 7414 }, { "epoch": 
1.9485663612178539, "grad_norm": 0.6046448945999146, "learning_rate": 3.5053443140003504e-05, "loss": 1.644, "step": 7416 }, { "epoch": 1.9490918645515158, "grad_norm": 0.5944949388504028, "learning_rate": 3.503592079901875e-05, "loss": 1.6684, "step": 7418 }, { "epoch": 1.9496173678851774, "grad_norm": 0.5597866773605347, "learning_rate": 3.5018398458034e-05, "loss": 1.646, "step": 7420 }, { "epoch": 1.9501428712188393, "grad_norm": 0.6573653817176819, "learning_rate": 3.500087611704924e-05, "loss": 1.6601, "step": 7422 }, { "epoch": 1.950668374552501, "grad_norm": 0.5286207795143127, "learning_rate": 3.498335377606448e-05, "loss": 1.6259, "step": 7424 }, { "epoch": 1.9511938778861628, "grad_norm": 0.5400940179824829, "learning_rate": 3.496583143507973e-05, "loss": 1.6581, "step": 7426 }, { "epoch": 1.9517193812198246, "grad_norm": 0.6813338994979858, "learning_rate": 3.494830909409497e-05, "loss": 1.6552, "step": 7428 }, { "epoch": 1.9522448845534863, "grad_norm": 0.5815770030021667, "learning_rate": 3.493078675311022e-05, "loss": 1.6845, "step": 7430 }, { "epoch": 1.9527703878871483, "grad_norm": 0.5522500872612, "learning_rate": 3.4913264412125464e-05, "loss": 1.6749, "step": 7432 }, { "epoch": 1.9532958912208098, "grad_norm": 0.5736134052276611, "learning_rate": 3.4895742071140705e-05, "loss": 1.6413, "step": 7434 }, { "epoch": 1.9538213945544718, "grad_norm": 0.4932587742805481, "learning_rate": 3.487821973015595e-05, "loss": 1.6416, "step": 7436 }, { "epoch": 1.9543468978881333, "grad_norm": 0.5147337913513184, "learning_rate": 3.486069738917119e-05, "loss": 1.6626, "step": 7438 }, { "epoch": 1.9548724012217953, "grad_norm": 0.7022672891616821, "learning_rate": 3.484317504818644e-05, "loss": 1.6723, "step": 7440 }, { "epoch": 1.955397904555457, "grad_norm": 0.6867913007736206, "learning_rate": 3.482565270720168e-05, "loss": 1.6431, "step": 7442 }, { "epoch": 1.9559234078891188, "grad_norm": 0.5425695180892944, "learning_rate": 3.480813036621693e-05, "loss": 
1.6566, "step": 7444 }, { "epoch": 1.9564489112227805, "grad_norm": 0.6468666195869446, "learning_rate": 3.479060802523218e-05, "loss": 1.6638, "step": 7446 }, { "epoch": 1.9569744145564423, "grad_norm": 0.5925945043563843, "learning_rate": 3.477308568424742e-05, "loss": 1.6443, "step": 7448 }, { "epoch": 1.9574999178901042, "grad_norm": 0.6654757857322693, "learning_rate": 3.475556334326266e-05, "loss": 1.6064, "step": 7450 }, { "epoch": 1.9580254212237658, "grad_norm": 0.5577012300491333, "learning_rate": 3.4738041002277906e-05, "loss": 1.6556, "step": 7452 }, { "epoch": 1.9585509245574277, "grad_norm": 0.7134981751441956, "learning_rate": 3.472051866129315e-05, "loss": 1.6739, "step": 7454 }, { "epoch": 1.9590764278910895, "grad_norm": 0.5233617424964905, "learning_rate": 3.4702996320308394e-05, "loss": 1.6466, "step": 7456 }, { "epoch": 1.9596019312247512, "grad_norm": 0.5530445575714111, "learning_rate": 3.468547397932364e-05, "loss": 1.6994, "step": 7458 }, { "epoch": 1.960127434558413, "grad_norm": 0.5197651982307434, "learning_rate": 3.466795163833888e-05, "loss": 1.6614, "step": 7460 }, { "epoch": 1.9606529378920747, "grad_norm": 0.48130765557289124, "learning_rate": 3.4650429297354123e-05, "loss": 1.6449, "step": 7462 }, { "epoch": 1.9611784412257365, "grad_norm": 0.49137425422668457, "learning_rate": 3.463290695636937e-05, "loss": 1.6527, "step": 7464 }, { "epoch": 1.9617039445593982, "grad_norm": 0.5547472238540649, "learning_rate": 3.461538461538462e-05, "loss": 1.6491, "step": 7466 }, { "epoch": 1.9622294478930602, "grad_norm": 0.5760181546211243, "learning_rate": 3.459786227439986e-05, "loss": 1.6794, "step": 7468 }, { "epoch": 1.9627549512267217, "grad_norm": 0.5289864540100098, "learning_rate": 3.458033993341511e-05, "loss": 1.6548, "step": 7470 }, { "epoch": 1.9632804545603837, "grad_norm": 0.5183879733085632, "learning_rate": 3.4562817592430355e-05, "loss": 1.6629, "step": 7472 }, { "epoch": 1.9638059578940454, "grad_norm": 0.5636990666389465, 
"learning_rate": 3.4545295251445595e-05, "loss": 1.6591, "step": 7474 }, { "epoch": 1.9643314612277072, "grad_norm": 0.5583269596099854, "learning_rate": 3.4527772910460836e-05, "loss": 1.6576, "step": 7476 }, { "epoch": 1.964856964561369, "grad_norm": 0.5348683595657349, "learning_rate": 3.4510250569476084e-05, "loss": 1.6148, "step": 7478 }, { "epoch": 1.9653824678950307, "grad_norm": 0.5336459875106812, "learning_rate": 3.4492728228491324e-05, "loss": 1.6704, "step": 7480 }, { "epoch": 1.9659079712286924, "grad_norm": 0.5364224314689636, "learning_rate": 3.447520588750657e-05, "loss": 1.6391, "step": 7482 }, { "epoch": 1.9664334745623542, "grad_norm": 0.5437600612640381, "learning_rate": 3.445768354652182e-05, "loss": 1.6817, "step": 7484 }, { "epoch": 1.9669589778960161, "grad_norm": 0.6408882737159729, "learning_rate": 3.444016120553706e-05, "loss": 1.6505, "step": 7486 }, { "epoch": 1.9674844812296777, "grad_norm": 0.5990426540374756, "learning_rate": 3.44226388645523e-05, "loss": 1.6634, "step": 7488 }, { "epoch": 1.9680099845633396, "grad_norm": 0.5437130928039551, "learning_rate": 3.440511652356755e-05, "loss": 1.6716, "step": 7490 }, { "epoch": 1.9685354878970014, "grad_norm": 0.6260076761245728, "learning_rate": 3.4387594182582796e-05, "loss": 1.6569, "step": 7492 }, { "epoch": 1.9690609912306631, "grad_norm": 0.5225280523300171, "learning_rate": 3.437007184159804e-05, "loss": 1.6649, "step": 7494 }, { "epoch": 1.9695864945643249, "grad_norm": 0.5389422178268433, "learning_rate": 3.4352549500613285e-05, "loss": 1.6723, "step": 7496 }, { "epoch": 1.9701119978979866, "grad_norm": 0.6174726486206055, "learning_rate": 3.433502715962853e-05, "loss": 1.669, "step": 7498 }, { "epoch": 1.9706375012316486, "grad_norm": 0.5065149664878845, "learning_rate": 3.431750481864377e-05, "loss": 1.6638, "step": 7500 }, { "epoch": 1.9711630045653101, "grad_norm": 0.6950640678405762, "learning_rate": 3.4299982477659014e-05, "loss": 1.6713, "step": 7502 }, { "epoch": 
1.971688507898972, "grad_norm": 0.507963240146637, "learning_rate": 3.428246013667426e-05, "loss": 1.6332, "step": 7504 }, { "epoch": 1.9722140112326336, "grad_norm": 0.6077314019203186, "learning_rate": 3.426493779568951e-05, "loss": 1.6641, "step": 7506 }, { "epoch": 1.9727395145662956, "grad_norm": 0.5328965783119202, "learning_rate": 3.424741545470475e-05, "loss": 1.6593, "step": 7508 }, { "epoch": 1.9732650178999573, "grad_norm": 0.6714265942573547, "learning_rate": 3.422989311372e-05, "loss": 1.6444, "step": 7510 }, { "epoch": 1.973790521233619, "grad_norm": 0.5329544544219971, "learning_rate": 3.4212370772735245e-05, "loss": 1.6337, "step": 7512 }, { "epoch": 1.9743160245672808, "grad_norm": 0.5360748767852783, "learning_rate": 3.419484843175048e-05, "loss": 1.6535, "step": 7514 }, { "epoch": 1.9748415279009426, "grad_norm": 0.5412916541099548, "learning_rate": 3.4177326090765726e-05, "loss": 1.6504, "step": 7516 }, { "epoch": 1.9753670312346046, "grad_norm": 0.6592147946357727, "learning_rate": 3.4159803749780974e-05, "loss": 1.6766, "step": 7518 }, { "epoch": 1.975892534568266, "grad_norm": 0.5632504820823669, "learning_rate": 3.4142281408796215e-05, "loss": 1.6587, "step": 7520 }, { "epoch": 1.976418037901928, "grad_norm": 0.5235944390296936, "learning_rate": 3.412475906781146e-05, "loss": 1.7091, "step": 7522 }, { "epoch": 1.9769435412355896, "grad_norm": 0.7552381753921509, "learning_rate": 3.410723672682671e-05, "loss": 1.6765, "step": 7524 }, { "epoch": 1.9774690445692515, "grad_norm": 0.5463106036186218, "learning_rate": 3.408971438584195e-05, "loss": 1.7028, "step": 7526 }, { "epoch": 1.9779945479029133, "grad_norm": 0.5811282396316528, "learning_rate": 3.407219204485719e-05, "loss": 1.626, "step": 7528 }, { "epoch": 1.978520051236575, "grad_norm": 0.5381944179534912, "learning_rate": 3.405466970387244e-05, "loss": 1.6557, "step": 7530 }, { "epoch": 1.9790455545702368, "grad_norm": 0.5382105112075806, "learning_rate": 3.4037147362887687e-05, "loss": 
1.6717, "step": 7532 }, { "epoch": 1.9795710579038985, "grad_norm": 0.5555400252342224, "learning_rate": 3.401962502190293e-05, "loss": 1.6546, "step": 7534 }, { "epoch": 1.9800965612375605, "grad_norm": 0.5068405270576477, "learning_rate": 3.4002102680918175e-05, "loss": 1.6378, "step": 7536 }, { "epoch": 1.980622064571222, "grad_norm": 0.5872784852981567, "learning_rate": 3.398458033993342e-05, "loss": 1.6615, "step": 7538 }, { "epoch": 1.981147567904884, "grad_norm": 0.5751393437385559, "learning_rate": 3.3967057998948656e-05, "loss": 1.6712, "step": 7540 }, { "epoch": 1.9816730712385455, "grad_norm": 0.5746015310287476, "learning_rate": 3.3949535657963904e-05, "loss": 1.667, "step": 7542 }, { "epoch": 1.9821985745722075, "grad_norm": 0.6387732028961182, "learning_rate": 3.393201331697915e-05, "loss": 1.6253, "step": 7544 }, { "epoch": 1.9827240779058692, "grad_norm": 0.5271580815315247, "learning_rate": 3.391449097599439e-05, "loss": 1.6442, "step": 7546 }, { "epoch": 1.983249581239531, "grad_norm": 0.5445979237556458, "learning_rate": 3.389696863500964e-05, "loss": 1.6363, "step": 7548 }, { "epoch": 1.9837750845731927, "grad_norm": 0.6095319986343384, "learning_rate": 3.387944629402489e-05, "loss": 1.6486, "step": 7550 }, { "epoch": 1.9843005879068545, "grad_norm": 0.6398605704307556, "learning_rate": 3.386192395304013e-05, "loss": 1.6427, "step": 7552 }, { "epoch": 1.9848260912405165, "grad_norm": 0.6042175889015198, "learning_rate": 3.384440161205537e-05, "loss": 1.6522, "step": 7554 }, { "epoch": 1.985351594574178, "grad_norm": 0.550961971282959, "learning_rate": 3.382687927107062e-05, "loss": 1.7006, "step": 7556 }, { "epoch": 1.98587709790784, "grad_norm": 0.5935929417610168, "learning_rate": 3.3809356930085864e-05, "loss": 1.6537, "step": 7558 }, { "epoch": 1.9864026012415015, "grad_norm": 0.6039239764213562, "learning_rate": 3.3791834589101105e-05, "loss": 1.6623, "step": 7560 }, { "epoch": 1.9869281045751634, "grad_norm": 0.5280980467796326, 
"learning_rate": 3.377431224811635e-05, "loss": 1.6695, "step": 7562 }, { "epoch": 1.9874536079088252, "grad_norm": 0.5845438838005066, "learning_rate": 3.375678990713159e-05, "loss": 1.6387, "step": 7564 }, { "epoch": 1.987979111242487, "grad_norm": 0.5099235773086548, "learning_rate": 3.3739267566146834e-05, "loss": 1.644, "step": 7566 }, { "epoch": 1.9885046145761487, "grad_norm": 0.6559625864028931, "learning_rate": 3.372174522516208e-05, "loss": 1.6683, "step": 7568 }, { "epoch": 1.9890301179098104, "grad_norm": 0.5141841173171997, "learning_rate": 3.370422288417733e-05, "loss": 1.6487, "step": 7570 }, { "epoch": 1.9895556212434724, "grad_norm": 0.5415523052215576, "learning_rate": 3.368670054319257e-05, "loss": 1.6654, "step": 7572 }, { "epoch": 1.990081124577134, "grad_norm": 0.5395537614822388, "learning_rate": 3.366917820220782e-05, "loss": 1.6397, "step": 7574 }, { "epoch": 1.990606627910796, "grad_norm": 0.7312270402908325, "learning_rate": 3.3651655861223065e-05, "loss": 1.628, "step": 7576 }, { "epoch": 1.9911321312444574, "grad_norm": 0.7088197469711304, "learning_rate": 3.3634133520238306e-05, "loss": 1.6725, "step": 7578 }, { "epoch": 1.9916576345781194, "grad_norm": 0.5116075277328491, "learning_rate": 3.361661117925355e-05, "loss": 1.6307, "step": 7580 }, { "epoch": 1.9921831379117811, "grad_norm": 0.5469351410865784, "learning_rate": 3.3599088838268794e-05, "loss": 1.6465, "step": 7582 }, { "epoch": 1.992708641245443, "grad_norm": 0.5921772718429565, "learning_rate": 3.358156649728404e-05, "loss": 1.6809, "step": 7584 }, { "epoch": 1.9932341445791046, "grad_norm": 0.5668914318084717, "learning_rate": 3.356404415629928e-05, "loss": 1.639, "step": 7586 }, { "epoch": 1.9937596479127664, "grad_norm": 0.5449174642562866, "learning_rate": 3.354652181531453e-05, "loss": 1.6501, "step": 7588 }, { "epoch": 1.9942851512464284, "grad_norm": 0.6408663988113403, "learning_rate": 3.352899947432977e-05, "loss": 1.6949, "step": 7590 }, { "epoch": 
1.9948106545800899, "grad_norm": 0.6358173489570618, "learning_rate": 3.351147713334501e-05, "loss": 1.6808, "step": 7592 }, { "epoch": 1.9953361579137519, "grad_norm": 0.5143889784812927, "learning_rate": 3.349395479236026e-05, "loss": 1.6659, "step": 7594 }, { "epoch": 1.9958616612474134, "grad_norm": 0.5499483346939087, "learning_rate": 3.347643245137551e-05, "loss": 1.6649, "step": 7596 }, { "epoch": 1.9963871645810753, "grad_norm": 0.5296257138252258, "learning_rate": 3.345891011039075e-05, "loss": 1.6784, "step": 7598 }, { "epoch": 1.996912667914737, "grad_norm": 0.6414445638656616, "learning_rate": 3.3441387769405995e-05, "loss": 1.6672, "step": 7600 }, { "epoch": 1.996912667914737, "eval_loss": 1.6618915796279907, "eval_runtime": 487.2163, "eval_samples_per_second": 249.969, "eval_steps_per_second": 31.247, "step": 7600 }, { "epoch": 1.9974381712483988, "grad_norm": 0.5835072994232178, "learning_rate": 3.3423865428421236e-05, "loss": 1.6681, "step": 7602 }, { "epoch": 1.9979636745820606, "grad_norm": 0.5567203164100647, "learning_rate": 3.3406343087436484e-05, "loss": 1.6592, "step": 7604 }, { "epoch": 1.9984891779157223, "grad_norm": 0.6631194949150085, "learning_rate": 3.3388820746451724e-05, "loss": 1.6551, "step": 7606 }, { "epoch": 1.9990146812493843, "grad_norm": 0.6589709520339966, "learning_rate": 3.337129840546697e-05, "loss": 1.6358, "step": 7608 }, { "epoch": 1.9995401845830458, "grad_norm": 0.5504697561264038, "learning_rate": 3.335377606448222e-05, "loss": 1.6477, "step": 7610 }, { "epoch": 2.000065687916708, "grad_norm": 0.4931085705757141, "learning_rate": 3.333625372349746e-05, "loss": 1.6487, "step": 7612 }, { "epoch": 2.0005911912503693, "grad_norm": 0.6642125844955444, "learning_rate": 3.331873138251271e-05, "loss": 1.6306, "step": 7614 }, { "epoch": 2.0011166945840313, "grad_norm": 0.5301389098167419, "learning_rate": 3.330120904152795e-05, "loss": 1.574, "step": 7616 }, { "epoch": 2.001642197917693, "grad_norm": 0.5445417761802673, 
"learning_rate": 3.3283686700543196e-05, "loss": 1.6205, "step": 7618 }, { "epoch": 2.002167701251355, "grad_norm": 0.5667280554771423, "learning_rate": 3.326616435955844e-05, "loss": 1.6149, "step": 7620 }, { "epoch": 2.0026932045850168, "grad_norm": 0.5716310739517212, "learning_rate": 3.3248642018573685e-05, "loss": 1.6333, "step": 7622 }, { "epoch": 2.0032187079186783, "grad_norm": 0.7009231448173523, "learning_rate": 3.323111967758893e-05, "loss": 1.6281, "step": 7624 }, { "epoch": 2.0037442112523403, "grad_norm": 0.5482553839683533, "learning_rate": 3.321359733660417e-05, "loss": 1.6054, "step": 7626 }, { "epoch": 2.004269714586002, "grad_norm": 0.48852571845054626, "learning_rate": 3.3196074995619414e-05, "loss": 1.636, "step": 7628 }, { "epoch": 2.0047952179196638, "grad_norm": 0.5163108110427856, "learning_rate": 3.317855265463466e-05, "loss": 1.6036, "step": 7630 }, { "epoch": 2.0053207212533253, "grad_norm": 0.5240321755409241, "learning_rate": 3.31610303136499e-05, "loss": 1.5722, "step": 7632 }, { "epoch": 2.0058462245869872, "grad_norm": 0.5390346646308899, "learning_rate": 3.314350797266515e-05, "loss": 1.5939, "step": 7634 }, { "epoch": 2.006371727920649, "grad_norm": 0.5812144875526428, "learning_rate": 3.31259856316804e-05, "loss": 1.6445, "step": 7636 }, { "epoch": 2.0068972312543107, "grad_norm": 0.6792988777160645, "learning_rate": 3.310846329069564e-05, "loss": 1.6003, "step": 7638 }, { "epoch": 2.0074227345879727, "grad_norm": 0.6539336442947388, "learning_rate": 3.309094094971088e-05, "loss": 1.632, "step": 7640 }, { "epoch": 2.0079482379216342, "grad_norm": 0.6297439336776733, "learning_rate": 3.3073418608726126e-05, "loss": 1.6148, "step": 7642 }, { "epoch": 2.008473741255296, "grad_norm": 0.6377638578414917, "learning_rate": 3.3055896267741374e-05, "loss": 1.6097, "step": 7644 }, { "epoch": 2.0089992445889577, "grad_norm": 0.5924256443977356, "learning_rate": 3.3038373926756615e-05, "loss": 1.6239, "step": 7646 }, { "epoch": 
2.0095247479226197, "grad_norm": 0.541309118270874, "learning_rate": 3.302085158577186e-05, "loss": 1.6145, "step": 7648 }, { "epoch": 2.0100502512562812, "grad_norm": 0.708676278591156, "learning_rate": 3.300332924478711e-05, "loss": 1.6037, "step": 7650 }, { "epoch": 2.010575754589943, "grad_norm": 0.6031850576400757, "learning_rate": 3.298580690380235e-05, "loss": 1.6071, "step": 7652 }, { "epoch": 2.011101257923605, "grad_norm": 0.6296379566192627, "learning_rate": 3.296828456281759e-05, "loss": 1.6348, "step": 7654 }, { "epoch": 2.0116267612572667, "grad_norm": 0.5863556861877441, "learning_rate": 3.295076222183284e-05, "loss": 1.6446, "step": 7656 }, { "epoch": 2.0121522645909287, "grad_norm": 0.5582921504974365, "learning_rate": 3.293323988084808e-05, "loss": 1.62, "step": 7658 }, { "epoch": 2.01267776792459, "grad_norm": 0.5858394503593445, "learning_rate": 3.291571753986333e-05, "loss": 1.5987, "step": 7660 }, { "epoch": 2.013203271258252, "grad_norm": 0.6063563227653503, "learning_rate": 3.2898195198878575e-05, "loss": 1.6469, "step": 7662 }, { "epoch": 2.0137287745919137, "grad_norm": 0.536972165107727, "learning_rate": 3.2880672857893816e-05, "loss": 1.6042, "step": 7664 }, { "epoch": 2.0142542779255757, "grad_norm": 0.6241976618766785, "learning_rate": 3.2863150516909056e-05, "loss": 1.5904, "step": 7666 }, { "epoch": 2.014779781259237, "grad_norm": 0.5789350271224976, "learning_rate": 3.2845628175924304e-05, "loss": 1.5959, "step": 7668 }, { "epoch": 2.015305284592899, "grad_norm": 0.6051028370857239, "learning_rate": 3.282810583493955e-05, "loss": 1.628, "step": 7670 }, { "epoch": 2.015830787926561, "grad_norm": 0.658258855342865, "learning_rate": 3.281058349395479e-05, "loss": 1.6064, "step": 7672 }, { "epoch": 2.0163562912602226, "grad_norm": 0.5936440825462341, "learning_rate": 3.279306115297004e-05, "loss": 1.6102, "step": 7674 }, { "epoch": 2.0168817945938846, "grad_norm": 0.5593782663345337, "learning_rate": 3.277553881198529e-05, "loss": 
1.6103, "step": 7676 }, { "epoch": 2.017407297927546, "grad_norm": 0.5200657248497009, "learning_rate": 3.275801647100053e-05, "loss": 1.5972, "step": 7678 }, { "epoch": 2.017932801261208, "grad_norm": 0.6176472902297974, "learning_rate": 3.274049413001577e-05, "loss": 1.5623, "step": 7680 }, { "epoch": 2.0184583045948696, "grad_norm": 0.7342426776885986, "learning_rate": 3.272297178903102e-05, "loss": 1.6424, "step": 7682 }, { "epoch": 2.0189838079285316, "grad_norm": 0.5766749978065491, "learning_rate": 3.270544944804626e-05, "loss": 1.6093, "step": 7684 }, { "epoch": 2.019509311262193, "grad_norm": 0.5769773125648499, "learning_rate": 3.2687927107061505e-05, "loss": 1.6372, "step": 7686 }, { "epoch": 2.020034814595855, "grad_norm": 0.6358572244644165, "learning_rate": 3.267040476607675e-05, "loss": 1.6274, "step": 7688 }, { "epoch": 2.020560317929517, "grad_norm": 0.5427133440971375, "learning_rate": 3.265288242509199e-05, "loss": 1.594, "step": 7690 }, { "epoch": 2.0210858212631786, "grad_norm": 0.5329941511154175, "learning_rate": 3.2635360084107234e-05, "loss": 1.606, "step": 7692 }, { "epoch": 2.0216113245968406, "grad_norm": 0.5538175106048584, "learning_rate": 3.261783774312248e-05, "loss": 1.5938, "step": 7694 }, { "epoch": 2.022136827930502, "grad_norm": 0.5610730051994324, "learning_rate": 3.260031540213773e-05, "loss": 1.604, "step": 7696 }, { "epoch": 2.022662331264164, "grad_norm": 0.6616785526275635, "learning_rate": 3.258279306115297e-05, "loss": 1.6107, "step": 7698 }, { "epoch": 2.0231878345978256, "grad_norm": 0.5210354924201965, "learning_rate": 3.256527072016822e-05, "loss": 1.6338, "step": 7700 }, { "epoch": 2.0237133379314876, "grad_norm": 0.7099502682685852, "learning_rate": 3.2547748379183465e-05, "loss": 1.632, "step": 7702 }, { "epoch": 2.024238841265149, "grad_norm": 0.5911880731582642, "learning_rate": 3.25302260381987e-05, "loss": 1.5943, "step": 7704 }, { "epoch": 2.024764344598811, "grad_norm": 0.5507931709289551, "learning_rate": 
3.251270369721395e-05, "loss": 1.605, "step": 7706 }, { "epoch": 2.025289847932473, "grad_norm": 0.5524379014968872, "learning_rate": 3.2495181356229194e-05, "loss": 1.5664, "step": 7708 }, { "epoch": 2.0258153512661345, "grad_norm": 0.5344357490539551, "learning_rate": 3.2477659015244435e-05, "loss": 1.6136, "step": 7710 }, { "epoch": 2.0263408545997965, "grad_norm": 0.6390090584754944, "learning_rate": 3.246013667425968e-05, "loss": 1.6033, "step": 7712 }, { "epoch": 2.026866357933458, "grad_norm": 0.5854797959327698, "learning_rate": 3.244261433327493e-05, "loss": 1.5812, "step": 7714 }, { "epoch": 2.02739186126712, "grad_norm": 0.6743679642677307, "learning_rate": 3.242509199229017e-05, "loss": 1.577, "step": 7716 }, { "epoch": 2.0279173646007815, "grad_norm": 0.6346087455749512, "learning_rate": 3.240756965130541e-05, "loss": 1.6094, "step": 7718 }, { "epoch": 2.0284428679344435, "grad_norm": 0.6077213883399963, "learning_rate": 3.239004731032066e-05, "loss": 1.6264, "step": 7720 }, { "epoch": 2.028968371268105, "grad_norm": 0.6114009618759155, "learning_rate": 3.237252496933591e-05, "loss": 1.6323, "step": 7722 }, { "epoch": 2.029493874601767, "grad_norm": 0.5229764580726624, "learning_rate": 3.235500262835115e-05, "loss": 1.6128, "step": 7724 }, { "epoch": 2.030019377935429, "grad_norm": 0.7623997330665588, "learning_rate": 3.2337480287366395e-05, "loss": 1.5912, "step": 7726 }, { "epoch": 2.0305448812690905, "grad_norm": 0.7716650366783142, "learning_rate": 3.231995794638164e-05, "loss": 1.6001, "step": 7728 }, { "epoch": 2.0310703846027525, "grad_norm": 0.5649428963661194, "learning_rate": 3.230243560539688e-05, "loss": 1.6171, "step": 7730 }, { "epoch": 2.031595887936414, "grad_norm": 0.689694344997406, "learning_rate": 3.2284913264412124e-05, "loss": 1.6245, "step": 7732 }, { "epoch": 2.032121391270076, "grad_norm": 0.6371480226516724, "learning_rate": 3.226739092342737e-05, "loss": 1.6262, "step": 7734 }, { "epoch": 2.0326468946037375, "grad_norm": 
0.5378861427307129, "learning_rate": 3.224986858244261e-05, "loss": 1.6057, "step": 7736 }, { "epoch": 2.0331723979373995, "grad_norm": 0.633033037185669, "learning_rate": 3.223234624145786e-05, "loss": 1.6442, "step": 7738 }, { "epoch": 2.033697901271061, "grad_norm": 0.6772140860557556, "learning_rate": 3.221482390047311e-05, "loss": 1.5639, "step": 7740 }, { "epoch": 2.034223404604723, "grad_norm": 0.5355550050735474, "learning_rate": 3.219730155948835e-05, "loss": 1.5942, "step": 7742 }, { "epoch": 2.034748907938385, "grad_norm": 0.5276470184326172, "learning_rate": 3.217977921850359e-05, "loss": 1.6259, "step": 7744 }, { "epoch": 2.0352744112720464, "grad_norm": 0.5952677726745605, "learning_rate": 3.216225687751884e-05, "loss": 1.6073, "step": 7746 }, { "epoch": 2.0357999146057084, "grad_norm": 0.5393653512001038, "learning_rate": 3.2144734536534085e-05, "loss": 1.612, "step": 7748 }, { "epoch": 2.03632541793937, "grad_norm": 0.5733909010887146, "learning_rate": 3.2127212195549325e-05, "loss": 1.6513, "step": 7750 }, { "epoch": 2.036850921273032, "grad_norm": 0.6015291810035706, "learning_rate": 3.210968985456457e-05, "loss": 1.613, "step": 7752 }, { "epoch": 2.0373764246066934, "grad_norm": 0.6741379499435425, "learning_rate": 3.209216751357982e-05, "loss": 1.6277, "step": 7754 }, { "epoch": 2.0379019279403554, "grad_norm": 0.6242867112159729, "learning_rate": 3.207464517259506e-05, "loss": 1.6299, "step": 7756 }, { "epoch": 2.038427431274017, "grad_norm": 0.6203280091285706, "learning_rate": 3.20571228316103e-05, "loss": 1.624, "step": 7758 }, { "epoch": 2.038952934607679, "grad_norm": 0.5922704339027405, "learning_rate": 3.203960049062555e-05, "loss": 1.6176, "step": 7760 }, { "epoch": 2.039478437941341, "grad_norm": 0.5987135767936707, "learning_rate": 3.20220781496408e-05, "loss": 1.631, "step": 7762 }, { "epoch": 2.0400039412750024, "grad_norm": 0.658090353012085, "learning_rate": 3.200455580865604e-05, "loss": 1.6455, "step": 7764 }, { "epoch": 
2.0405294446086644, "grad_norm": 0.5447620749473572, "learning_rate": 3.1987033467671286e-05, "loss": 1.6257, "step": 7766 }, { "epoch": 2.041054947942326, "grad_norm": 0.5644233226776123, "learning_rate": 3.1969511126686526e-05, "loss": 1.6187, "step": 7768 }, { "epoch": 2.041580451275988, "grad_norm": 0.6040834188461304, "learning_rate": 3.195198878570177e-05, "loss": 1.651, "step": 7770 }, { "epoch": 2.0421059546096494, "grad_norm": 0.5864139199256897, "learning_rate": 3.1934466444717015e-05, "loss": 1.6151, "step": 7772 }, { "epoch": 2.0426314579433114, "grad_norm": 0.5407016277313232, "learning_rate": 3.191694410373226e-05, "loss": 1.6075, "step": 7774 }, { "epoch": 2.043156961276973, "grad_norm": 0.6198828816413879, "learning_rate": 3.18994217627475e-05, "loss": 1.6075, "step": 7776 }, { "epoch": 2.043682464610635, "grad_norm": 0.5452790856361389, "learning_rate": 3.188189942176275e-05, "loss": 1.5942, "step": 7778 }, { "epoch": 2.044207967944297, "grad_norm": 0.5661458969116211, "learning_rate": 3.1864377080778e-05, "loss": 1.64, "step": 7780 }, { "epoch": 2.0447334712779583, "grad_norm": 0.5775606036186218, "learning_rate": 3.184685473979324e-05, "loss": 1.6221, "step": 7782 }, { "epoch": 2.0452589746116203, "grad_norm": 0.6352986693382263, "learning_rate": 3.182933239880848e-05, "loss": 1.5736, "step": 7784 }, { "epoch": 2.045784477945282, "grad_norm": 0.6562779545783997, "learning_rate": 3.181181005782373e-05, "loss": 1.659, "step": 7786 }, { "epoch": 2.046309981278944, "grad_norm": 0.6051666140556335, "learning_rate": 3.1794287716838975e-05, "loss": 1.6418, "step": 7788 }, { "epoch": 2.0468354846126053, "grad_norm": 0.5075708031654358, "learning_rate": 3.1776765375854216e-05, "loss": 1.6209, "step": 7790 }, { "epoch": 2.0473609879462673, "grad_norm": 0.646185576915741, "learning_rate": 3.175924303486946e-05, "loss": 1.6081, "step": 7792 }, { "epoch": 2.047886491279929, "grad_norm": 0.5304019451141357, "learning_rate": 3.1741720693884704e-05, "loss": 
1.6065, "step": 7794 }, { "epoch": 2.048411994613591, "grad_norm": 0.6288559436798096, "learning_rate": 3.1724198352899945e-05, "loss": 1.63, "step": 7796 }, { "epoch": 2.0489374979472528, "grad_norm": 0.5334764122962952, "learning_rate": 3.170667601191519e-05, "loss": 1.6161, "step": 7798 }, { "epoch": 2.0494630012809143, "grad_norm": 0.5528808236122131, "learning_rate": 3.168915367093044e-05, "loss": 1.5925, "step": 7800 }, { "epoch": 2.0499885046145763, "grad_norm": 0.5791876912117004, "learning_rate": 3.167163132994568e-05, "loss": 1.6322, "step": 7802 }, { "epoch": 2.050514007948238, "grad_norm": 0.6923157572746277, "learning_rate": 3.165410898896093e-05, "loss": 1.6301, "step": 7804 }, { "epoch": 2.0510395112818998, "grad_norm": 0.5527494549751282, "learning_rate": 3.163658664797617e-05, "loss": 1.6363, "step": 7806 }, { "epoch": 2.0515650146155613, "grad_norm": 0.554539144039154, "learning_rate": 3.1619064306991417e-05, "loss": 1.6324, "step": 7808 }, { "epoch": 2.0520905179492233, "grad_norm": 0.6409008502960205, "learning_rate": 3.160154196600666e-05, "loss": 1.6378, "step": 7810 }, { "epoch": 2.0526160212828852, "grad_norm": 0.6073037385940552, "learning_rate": 3.1584019625021905e-05, "loss": 1.5801, "step": 7812 }, { "epoch": 2.0531415246165468, "grad_norm": 0.6152986288070679, "learning_rate": 3.156649728403715e-05, "loss": 1.6128, "step": 7814 }, { "epoch": 2.0536670279502087, "grad_norm": 0.5760658383369446, "learning_rate": 3.154897494305239e-05, "loss": 1.5979, "step": 7816 }, { "epoch": 2.0541925312838702, "grad_norm": 0.5909692645072937, "learning_rate": 3.153145260206764e-05, "loss": 1.5931, "step": 7818 }, { "epoch": 2.054718034617532, "grad_norm": 0.5456188917160034, "learning_rate": 3.151393026108288e-05, "loss": 1.6557, "step": 7820 }, { "epoch": 2.0552435379511937, "grad_norm": 0.6995749473571777, "learning_rate": 3.149640792009812e-05, "loss": 1.5974, "step": 7822 }, { "epoch": 2.0557690412848557, "grad_norm": 0.7031815648078918, 
"learning_rate": 3.147888557911337e-05, "loss": 1.6331, "step": 7824 }, { "epoch": 2.0562945446185172, "grad_norm": 0.5628200769424438, "learning_rate": 3.146136323812862e-05, "loss": 1.6148, "step": 7826 }, { "epoch": 2.056820047952179, "grad_norm": 0.5670493841171265, "learning_rate": 3.144384089714386e-05, "loss": 1.6436, "step": 7828 }, { "epoch": 2.057345551285841, "grad_norm": 0.5815064311027527, "learning_rate": 3.1426318556159106e-05, "loss": 1.6107, "step": 7830 }, { "epoch": 2.0578710546195027, "grad_norm": 0.4898243844509125, "learning_rate": 3.140879621517435e-05, "loss": 1.5852, "step": 7832 }, { "epoch": 2.0583965579531647, "grad_norm": 0.71734219789505, "learning_rate": 3.1391273874189594e-05, "loss": 1.6009, "step": 7834 }, { "epoch": 2.058922061286826, "grad_norm": 0.5565480589866638, "learning_rate": 3.1373751533204835e-05, "loss": 1.611, "step": 7836 }, { "epoch": 2.059447564620488, "grad_norm": 0.5884687900543213, "learning_rate": 3.135622919222008e-05, "loss": 1.5922, "step": 7838 }, { "epoch": 2.0599730679541497, "grad_norm": 0.5969564318656921, "learning_rate": 3.133870685123533e-05, "loss": 1.6237, "step": 7840 }, { "epoch": 2.0604985712878117, "grad_norm": 0.6498041152954102, "learning_rate": 3.132118451025057e-05, "loss": 1.6127, "step": 7842 }, { "epoch": 2.061024074621473, "grad_norm": 0.6092948913574219, "learning_rate": 3.130366216926581e-05, "loss": 1.6377, "step": 7844 }, { "epoch": 2.061549577955135, "grad_norm": 0.5632656812667847, "learning_rate": 3.128613982828106e-05, "loss": 1.5976, "step": 7846 }, { "epoch": 2.062075081288797, "grad_norm": 0.6170439720153809, "learning_rate": 3.12686174872963e-05, "loss": 1.5962, "step": 7848 }, { "epoch": 2.0626005846224587, "grad_norm": 0.5882241725921631, "learning_rate": 3.125109514631155e-05, "loss": 1.5918, "step": 7850 }, { "epoch": 2.0631260879561206, "grad_norm": 0.5418891906738281, "learning_rate": 3.1233572805326795e-05, "loss": 1.6289, "step": 7852 }, { "epoch": 2.063651591289782, 
"grad_norm": 0.6917753219604492, "learning_rate": 3.1216050464342036e-05, "loss": 1.6565, "step": 7854 }, { "epoch": 2.064177094623444, "grad_norm": 0.596343457698822, "learning_rate": 3.1198528123357284e-05, "loss": 1.6087, "step": 7856 }, { "epoch": 2.0647025979571056, "grad_norm": 0.5705437660217285, "learning_rate": 3.1181005782372524e-05, "loss": 1.607, "step": 7858 }, { "epoch": 2.0652281012907676, "grad_norm": 0.5608698129653931, "learning_rate": 3.116348344138777e-05, "loss": 1.5935, "step": 7860 }, { "epoch": 2.065753604624429, "grad_norm": 0.5956361293792725, "learning_rate": 3.114596110040301e-05, "loss": 1.5784, "step": 7862 }, { "epoch": 2.066279107958091, "grad_norm": 0.5428637862205505, "learning_rate": 3.112843875941826e-05, "loss": 1.6384, "step": 7864 }, { "epoch": 2.066804611291753, "grad_norm": 0.5758678317070007, "learning_rate": 3.111091641843351e-05, "loss": 1.6397, "step": 7866 }, { "epoch": 2.0673301146254146, "grad_norm": 0.605904221534729, "learning_rate": 3.109339407744875e-05, "loss": 1.6312, "step": 7868 }, { "epoch": 2.0678556179590766, "grad_norm": 0.5100085735321045, "learning_rate": 3.107587173646399e-05, "loss": 1.583, "step": 7870 }, { "epoch": 2.068381121292738, "grad_norm": 0.5583624839782715, "learning_rate": 3.105834939547924e-05, "loss": 1.6319, "step": 7872 }, { "epoch": 2.0689066246264, "grad_norm": 0.7279824018478394, "learning_rate": 3.1040827054494485e-05, "loss": 1.6142, "step": 7874 }, { "epoch": 2.0694321279600616, "grad_norm": 0.5635095238685608, "learning_rate": 3.1023304713509725e-05, "loss": 1.6363, "step": 7876 }, { "epoch": 2.0699576312937236, "grad_norm": 0.6370921730995178, "learning_rate": 3.100578237252497e-05, "loss": 1.6132, "step": 7878 }, { "epoch": 2.070483134627385, "grad_norm": 0.6772926449775696, "learning_rate": 3.098826003154022e-05, "loss": 1.6033, "step": 7880 }, { "epoch": 2.071008637961047, "grad_norm": 0.7656962275505066, "learning_rate": 3.0970737690555454e-05, "loss": 1.5948, "step": 7882 
}, { "epoch": 2.071534141294709, "grad_norm": 0.6695175766944885, "learning_rate": 3.09532153495707e-05, "loss": 1.6419, "step": 7884 }, { "epoch": 2.0720596446283706, "grad_norm": 0.7357918620109558, "learning_rate": 3.093569300858595e-05, "loss": 1.5987, "step": 7886 }, { "epoch": 2.0725851479620325, "grad_norm": 0.5544195175170898, "learning_rate": 3.091817066760119e-05, "loss": 1.6389, "step": 7888 }, { "epoch": 2.073110651295694, "grad_norm": 0.674032986164093, "learning_rate": 3.090064832661644e-05, "loss": 1.6046, "step": 7890 }, { "epoch": 2.073636154629356, "grad_norm": 0.5453786253929138, "learning_rate": 3.0883125985631686e-05, "loss": 1.6194, "step": 7892 }, { "epoch": 2.0741616579630175, "grad_norm": 0.6969361901283264, "learning_rate": 3.0865603644646926e-05, "loss": 1.5962, "step": 7894 }, { "epoch": 2.0746871612966795, "grad_norm": 0.8166598677635193, "learning_rate": 3.084808130366217e-05, "loss": 1.5563, "step": 7896 }, { "epoch": 2.075212664630341, "grad_norm": 0.6729124784469604, "learning_rate": 3.0830558962677415e-05, "loss": 1.5873, "step": 7898 }, { "epoch": 2.075738167964003, "grad_norm": 0.5767272114753723, "learning_rate": 3.081303662169266e-05, "loss": 1.6117, "step": 7900 }, { "epoch": 2.076263671297665, "grad_norm": 0.5525824427604675, "learning_rate": 3.07955142807079e-05, "loss": 1.6047, "step": 7902 }, { "epoch": 2.0767891746313265, "grad_norm": 0.6294286251068115, "learning_rate": 3.077799193972315e-05, "loss": 1.6076, "step": 7904 }, { "epoch": 2.0773146779649885, "grad_norm": 0.5846551060676575, "learning_rate": 3.07604695987384e-05, "loss": 1.6207, "step": 7906 }, { "epoch": 2.07784018129865, "grad_norm": 0.5915541052818298, "learning_rate": 3.074294725775363e-05, "loss": 1.6222, "step": 7908 }, { "epoch": 2.078365684632312, "grad_norm": 0.5178691148757935, "learning_rate": 3.072542491676888e-05, "loss": 1.6086, "step": 7910 }, { "epoch": 2.0788911879659735, "grad_norm": 0.5767921805381775, "learning_rate": 
3.070790257578413e-05, "loss": 1.6102, "step": 7912 }, { "epoch": 2.0794166912996355, "grad_norm": 0.7136160135269165, "learning_rate": 3.069038023479937e-05, "loss": 1.6103, "step": 7914 }, { "epoch": 2.079942194633297, "grad_norm": 0.5835669040679932, "learning_rate": 3.0672857893814616e-05, "loss": 1.6199, "step": 7916 }, { "epoch": 2.080467697966959, "grad_norm": 0.6543889045715332, "learning_rate": 3.065533555282986e-05, "loss": 1.6452, "step": 7918 }, { "epoch": 2.080993201300621, "grad_norm": 0.5414628386497498, "learning_rate": 3.0637813211845104e-05, "loss": 1.6277, "step": 7920 }, { "epoch": 2.0815187046342825, "grad_norm": 0.5655497908592224, "learning_rate": 3.0620290870860345e-05, "loss": 1.6294, "step": 7922 }, { "epoch": 2.0820442079679444, "grad_norm": 0.5398779511451721, "learning_rate": 3.060276852987559e-05, "loss": 1.5906, "step": 7924 }, { "epoch": 2.082569711301606, "grad_norm": 0.6147701740264893, "learning_rate": 3.058524618889084e-05, "loss": 1.6274, "step": 7926 }, { "epoch": 2.083095214635268, "grad_norm": 0.5175689458847046, "learning_rate": 3.056772384790608e-05, "loss": 1.6206, "step": 7928 }, { "epoch": 2.0836207179689294, "grad_norm": 0.5908101797103882, "learning_rate": 3.055020150692133e-05, "loss": 1.5894, "step": 7930 }, { "epoch": 2.0841462213025914, "grad_norm": 0.7519561648368835, "learning_rate": 3.0532679165936576e-05, "loss": 1.5862, "step": 7932 }, { "epoch": 2.0846717246362534, "grad_norm": 0.6898811459541321, "learning_rate": 3.0515156824951813e-05, "loss": 1.5917, "step": 7934 }, { "epoch": 2.085197227969915, "grad_norm": 0.6252642869949341, "learning_rate": 3.0497634483967057e-05, "loss": 1.616, "step": 7936 }, { "epoch": 2.085722731303577, "grad_norm": 0.6489799618721008, "learning_rate": 3.0480112142982305e-05, "loss": 1.5802, "step": 7938 }, { "epoch": 2.0862482346372384, "grad_norm": 0.544003427028656, "learning_rate": 3.046258980199755e-05, "loss": 1.6274, "step": 7940 }, { "epoch": 2.0867737379709004, 
"grad_norm": 0.6666077971458435, "learning_rate": 3.0445067461012793e-05, "loss": 1.5999, "step": 7942 }, { "epoch": 2.087299241304562, "grad_norm": 0.6988151669502258, "learning_rate": 3.042754512002804e-05, "loss": 1.6314, "step": 7944 }, { "epoch": 2.087824744638224, "grad_norm": 0.5597438812255859, "learning_rate": 3.0410022779043278e-05, "loss": 1.6348, "step": 7946 }, { "epoch": 2.0883502479718854, "grad_norm": 0.5571982264518738, "learning_rate": 3.0392500438058526e-05, "loss": 1.6356, "step": 7948 }, { "epoch": 2.0888757513055474, "grad_norm": 0.5218850374221802, "learning_rate": 3.037497809707377e-05, "loss": 1.588, "step": 7950 }, { "epoch": 2.089401254639209, "grad_norm": 0.627077579498291, "learning_rate": 3.0357455756089014e-05, "loss": 1.5931, "step": 7952 }, { "epoch": 2.089926757972871, "grad_norm": 0.6151776313781738, "learning_rate": 3.0339933415104262e-05, "loss": 1.6073, "step": 7954 }, { "epoch": 2.090452261306533, "grad_norm": 0.5469711422920227, "learning_rate": 3.0322411074119506e-05, "loss": 1.5982, "step": 7956 }, { "epoch": 2.0909777646401944, "grad_norm": 0.6806207299232483, "learning_rate": 3.030488873313475e-05, "loss": 1.6376, "step": 7958 }, { "epoch": 2.0915032679738563, "grad_norm": 0.6746312379837036, "learning_rate": 3.028736639214999e-05, "loss": 1.6113, "step": 7960 }, { "epoch": 2.092028771307518, "grad_norm": 0.6440801024436951, "learning_rate": 3.0269844051165235e-05, "loss": 1.6223, "step": 7962 }, { "epoch": 2.09255427464118, "grad_norm": 0.5747029185295105, "learning_rate": 3.0252321710180483e-05, "loss": 1.5959, "step": 7964 }, { "epoch": 2.0930797779748413, "grad_norm": 0.5977646708488464, "learning_rate": 3.0234799369195727e-05, "loss": 1.6403, "step": 7966 }, { "epoch": 2.0936052813085033, "grad_norm": 0.5634188055992126, "learning_rate": 3.021727702821097e-05, "loss": 1.6062, "step": 7968 }, { "epoch": 2.0941307846421653, "grad_norm": 0.661744236946106, "learning_rate": 3.019975468722622e-05, "loss": 1.6394, "step": 
7970 }, { "epoch": 2.094656287975827, "grad_norm": 0.5878733396530151, "learning_rate": 3.0182232346241456e-05, "loss": 1.5781, "step": 7972 }, { "epoch": 2.095181791309489, "grad_norm": 0.5121323466300964, "learning_rate": 3.0164710005256703e-05, "loss": 1.6238, "step": 7974 }, { "epoch": 2.0957072946431503, "grad_norm": 0.6021749377250671, "learning_rate": 3.0147187664271948e-05, "loss": 1.6077, "step": 7976 }, { "epoch": 2.0962327979768123, "grad_norm": 0.5374654531478882, "learning_rate": 3.0129665323287192e-05, "loss": 1.6252, "step": 7978 }, { "epoch": 2.096758301310474, "grad_norm": 0.5462518930435181, "learning_rate": 3.011214298230244e-05, "loss": 1.619, "step": 7980 }, { "epoch": 2.0972838046441358, "grad_norm": 0.5597232580184937, "learning_rate": 3.0094620641317684e-05, "loss": 1.6151, "step": 7982 }, { "epoch": 2.0978093079777973, "grad_norm": 0.6084597110748291, "learning_rate": 3.0077098300332924e-05, "loss": 1.6398, "step": 7984 }, { "epoch": 2.0983348113114593, "grad_norm": 0.5504382252693176, "learning_rate": 3.005957595934817e-05, "loss": 1.5902, "step": 7986 }, { "epoch": 2.0988603146451212, "grad_norm": 0.5516804456710815, "learning_rate": 3.0042053618363413e-05, "loss": 1.625, "step": 7988 }, { "epoch": 2.0993858179787828, "grad_norm": 0.5523831248283386, "learning_rate": 3.002453127737866e-05, "loss": 1.6291, "step": 7990 }, { "epoch": 2.0999113213124447, "grad_norm": 0.6193360686302185, "learning_rate": 3.0007008936393904e-05, "loss": 1.5764, "step": 7992 }, { "epoch": 2.1004368246461063, "grad_norm": 0.620893657207489, "learning_rate": 2.998948659540915e-05, "loss": 1.6295, "step": 7994 }, { "epoch": 2.1009623279797682, "grad_norm": 0.5495589971542358, "learning_rate": 2.9971964254424396e-05, "loss": 1.6159, "step": 7996 }, { "epoch": 2.1014878313134298, "grad_norm": 0.5830954909324646, "learning_rate": 2.9954441913439634e-05, "loss": 1.5844, "step": 7998 }, { "epoch": 2.1020133346470917, "grad_norm": 0.6515240669250488, "learning_rate": 
2.993691957245488e-05, "loss": 1.6092, "step": 8000 }, { "epoch": 2.1020133346470917, "eval_loss": 1.6652451753616333, "eval_runtime": 487.2001, "eval_samples_per_second": 249.977, "eval_steps_per_second": 31.248, "step": 8000 }, { "epoch": 2.1025388379807533, "grad_norm": 0.5562619566917419, "learning_rate": 2.9919397231470125e-05, "loss": 1.6272, "step": 8002 }, { "epoch": 2.103064341314415, "grad_norm": 0.8904392123222351, "learning_rate": 2.990187489048537e-05, "loss": 1.6286, "step": 8004 }, { "epoch": 2.103589844648077, "grad_norm": 0.6131122708320618, "learning_rate": 2.9884352549500617e-05, "loss": 1.6225, "step": 8006 }, { "epoch": 2.1041153479817387, "grad_norm": 0.5321511030197144, "learning_rate": 2.986683020851586e-05, "loss": 1.5881, "step": 8008 }, { "epoch": 2.1046408513154007, "grad_norm": 0.5121209621429443, "learning_rate": 2.9849307867531102e-05, "loss": 1.5884, "step": 8010 }, { "epoch": 2.105166354649062, "grad_norm": 0.5307869911193848, "learning_rate": 2.9831785526546346e-05, "loss": 1.5926, "step": 8012 }, { "epoch": 2.105691857982724, "grad_norm": 0.5231491327285767, "learning_rate": 2.981426318556159e-05, "loss": 1.6319, "step": 8014 }, { "epoch": 2.1062173613163857, "grad_norm": 0.5306436419487, "learning_rate": 2.9796740844576838e-05, "loss": 1.6526, "step": 8016 }, { "epoch": 2.1067428646500477, "grad_norm": 0.6463943123817444, "learning_rate": 2.9779218503592082e-05, "loss": 1.6074, "step": 8018 }, { "epoch": 2.107268367983709, "grad_norm": 0.5345489382743835, "learning_rate": 2.9761696162607326e-05, "loss": 1.6413, "step": 8020 }, { "epoch": 2.107793871317371, "grad_norm": 0.6284579634666443, "learning_rate": 2.9744173821622574e-05, "loss": 1.63, "step": 8022 }, { "epoch": 2.108319374651033, "grad_norm": 0.5651626586914062, "learning_rate": 2.972665148063781e-05, "loss": 1.5837, "step": 8024 }, { "epoch": 2.1088448779846947, "grad_norm": 0.692808210849762, "learning_rate": 2.970912913965306e-05, "loss": 1.6361, "step": 8026 }, { 
"epoch": 2.1093703813183566, "grad_norm": 0.5368518829345703, "learning_rate": 2.9691606798668303e-05, "loss": 1.6577, "step": 8028 }, { "epoch": 2.109895884652018, "grad_norm": 0.561772346496582, "learning_rate": 2.9674084457683547e-05, "loss": 1.5959, "step": 8030 }, { "epoch": 2.11042138798568, "grad_norm": 0.6566132307052612, "learning_rate": 2.9656562116698795e-05, "loss": 1.622, "step": 8032 }, { "epoch": 2.1109468913193417, "grad_norm": 0.7207258939743042, "learning_rate": 2.963903977571404e-05, "loss": 1.6318, "step": 8034 }, { "epoch": 2.1114723946530036, "grad_norm": 0.5623325705528259, "learning_rate": 2.962151743472928e-05, "loss": 1.5964, "step": 8036 }, { "epoch": 2.111997897986665, "grad_norm": 0.6407482624053955, "learning_rate": 2.9603995093744524e-05, "loss": 1.5759, "step": 8038 }, { "epoch": 2.112523401320327, "grad_norm": 0.5547378659248352, "learning_rate": 2.9586472752759768e-05, "loss": 1.6112, "step": 8040 }, { "epoch": 2.113048904653989, "grad_norm": 0.5998631715774536, "learning_rate": 2.9568950411775016e-05, "loss": 1.6102, "step": 8042 }, { "epoch": 2.1135744079876506, "grad_norm": 0.5907427072525024, "learning_rate": 2.955142807079026e-05, "loss": 1.6056, "step": 8044 }, { "epoch": 2.1140999113213126, "grad_norm": 0.5377500653266907, "learning_rate": 2.9533905729805504e-05, "loss": 1.626, "step": 8046 }, { "epoch": 2.114625414654974, "grad_norm": 0.5881298780441284, "learning_rate": 2.9516383388820745e-05, "loss": 1.6245, "step": 8048 }, { "epoch": 2.115150917988636, "grad_norm": 0.6338145136833191, "learning_rate": 2.949886104783599e-05, "loss": 1.6133, "step": 8050 }, { "epoch": 2.1156764213222976, "grad_norm": 0.7909775972366333, "learning_rate": 2.9481338706851236e-05, "loss": 1.625, "step": 8052 }, { "epoch": 2.1162019246559596, "grad_norm": 0.5986071228981018, "learning_rate": 2.946381636586648e-05, "loss": 1.6209, "step": 8054 }, { "epoch": 2.116727427989621, "grad_norm": 0.6373353600502014, "learning_rate": 
2.9446294024881725e-05, "loss": 1.5849, "step": 8056 }, { "epoch": 2.117252931323283, "grad_norm": 0.7317929267883301, "learning_rate": 2.9428771683896972e-05, "loss": 1.6032, "step": 8058 }, { "epoch": 2.117778434656945, "grad_norm": 0.5454069375991821, "learning_rate": 2.9411249342912217e-05, "loss": 1.6171, "step": 8060 }, { "epoch": 2.1183039379906066, "grad_norm": 0.5561736226081848, "learning_rate": 2.9393727001927457e-05, "loss": 1.6434, "step": 8062 }, { "epoch": 2.1188294413242685, "grad_norm": 0.685099720954895, "learning_rate": 2.93762046609427e-05, "loss": 1.5534, "step": 8064 }, { "epoch": 2.11935494465793, "grad_norm": 0.5449316501617432, "learning_rate": 2.935868231995795e-05, "loss": 1.5926, "step": 8066 }, { "epoch": 2.119880447991592, "grad_norm": 0.5778328776359558, "learning_rate": 2.9341159978973193e-05, "loss": 1.6166, "step": 8068 }, { "epoch": 2.1204059513252536, "grad_norm": 0.5505948662757874, "learning_rate": 2.9323637637988437e-05, "loss": 1.5876, "step": 8070 }, { "epoch": 2.1209314546589155, "grad_norm": 0.5759081244468689, "learning_rate": 2.9306115297003685e-05, "loss": 1.6145, "step": 8072 }, { "epoch": 2.121456957992577, "grad_norm": 0.5547658205032349, "learning_rate": 2.9288592956018922e-05, "loss": 1.564, "step": 8074 }, { "epoch": 2.121982461326239, "grad_norm": 0.6756736040115356, "learning_rate": 2.927107061503417e-05, "loss": 1.6301, "step": 8076 }, { "epoch": 2.122507964659901, "grad_norm": 0.8322302103042603, "learning_rate": 2.9253548274049414e-05, "loss": 1.6044, "step": 8078 }, { "epoch": 2.1230334679935625, "grad_norm": 0.6076900362968445, "learning_rate": 2.923602593306466e-05, "loss": 1.5982, "step": 8080 }, { "epoch": 2.1235589713272245, "grad_norm": 0.5850244760513306, "learning_rate": 2.9218503592079906e-05, "loss": 1.6248, "step": 8082 }, { "epoch": 2.124084474660886, "grad_norm": 0.5558748841285706, "learning_rate": 2.920098125109515e-05, "loss": 1.6224, "step": 8084 }, { "epoch": 2.124609977994548, "grad_norm": 
0.5563886165618896, "learning_rate": 2.918345891011039e-05, "loss": 1.5805, "step": 8086 }, { "epoch": 2.1251354813282095, "grad_norm": 0.6558648943901062, "learning_rate": 2.9165936569125635e-05, "loss": 1.6255, "step": 8088 }, { "epoch": 2.1256609846618715, "grad_norm": 0.6515272259712219, "learning_rate": 2.914841422814088e-05, "loss": 1.6127, "step": 8090 }, { "epoch": 2.1261864879955334, "grad_norm": 0.5626866221427917, "learning_rate": 2.9130891887156127e-05, "loss": 1.6125, "step": 8092 }, { "epoch": 2.126711991329195, "grad_norm": 0.6942625045776367, "learning_rate": 2.911336954617137e-05, "loss": 1.5824, "step": 8094 }, { "epoch": 2.127237494662857, "grad_norm": 0.5993350148200989, "learning_rate": 2.9095847205186615e-05, "loss": 1.6309, "step": 8096 }, { "epoch": 2.1277629979965185, "grad_norm": 0.5620863437652588, "learning_rate": 2.9078324864201863e-05, "loss": 1.6287, "step": 8098 }, { "epoch": 2.1282885013301804, "grad_norm": 0.5420330762863159, "learning_rate": 2.90608025232171e-05, "loss": 1.6263, "step": 8100 }, { "epoch": 2.128814004663842, "grad_norm": 0.5831283330917358, "learning_rate": 2.9043280182232348e-05, "loss": 1.6221, "step": 8102 }, { "epoch": 2.129339507997504, "grad_norm": 0.6942195296287537, "learning_rate": 2.9025757841247592e-05, "loss": 1.6411, "step": 8104 }, { "epoch": 2.1298650113311655, "grad_norm": 0.5689584016799927, "learning_rate": 2.9008235500262836e-05, "loss": 1.6443, "step": 8106 }, { "epoch": 2.1303905146648274, "grad_norm": 0.6463392376899719, "learning_rate": 2.8990713159278084e-05, "loss": 1.5987, "step": 8108 }, { "epoch": 2.130916017998489, "grad_norm": 0.591022253036499, "learning_rate": 2.8973190818293328e-05, "loss": 1.6283, "step": 8110 }, { "epoch": 2.131441521332151, "grad_norm": 0.6164734363555908, "learning_rate": 2.895566847730857e-05, "loss": 1.6188, "step": 8112 }, { "epoch": 2.131967024665813, "grad_norm": 0.5488820672035217, "learning_rate": 2.8938146136323813e-05, "loss": 1.6562, "step": 8114 }, { 
"epoch": 2.1324925279994744, "grad_norm": 0.7101133465766907, "learning_rate": 2.8920623795339057e-05, "loss": 1.6206, "step": 8116 }, { "epoch": 2.1330180313331364, "grad_norm": 0.5032434463500977, "learning_rate": 2.8903101454354304e-05, "loss": 1.6042, "step": 8118 }, { "epoch": 2.133543534666798, "grad_norm": 0.5224511623382568, "learning_rate": 2.888557911336955e-05, "loss": 1.5865, "step": 8120 }, { "epoch": 2.13406903800046, "grad_norm": 0.5269969701766968, "learning_rate": 2.8868056772384793e-05, "loss": 1.6296, "step": 8122 }, { "epoch": 2.1345945413341214, "grad_norm": 0.5620083212852478, "learning_rate": 2.885053443140004e-05, "loss": 1.6425, "step": 8124 }, { "epoch": 2.1351200446677834, "grad_norm": 0.6436342597007751, "learning_rate": 2.8833012090415278e-05, "loss": 1.6053, "step": 8126 }, { "epoch": 2.1356455480014453, "grad_norm": 0.6406291127204895, "learning_rate": 2.8815489749430525e-05, "loss": 1.6254, "step": 8128 }, { "epoch": 2.136171051335107, "grad_norm": 0.551295280456543, "learning_rate": 2.879796740844577e-05, "loss": 1.621, "step": 8130 }, { "epoch": 2.136696554668769, "grad_norm": 0.6591944098472595, "learning_rate": 2.8780445067461014e-05, "loss": 1.6578, "step": 8132 }, { "epoch": 2.1372220580024304, "grad_norm": 0.7538178563117981, "learning_rate": 2.876292272647626e-05, "loss": 1.6057, "step": 8134 }, { "epoch": 2.1377475613360923, "grad_norm": 0.5904437303543091, "learning_rate": 2.8745400385491505e-05, "loss": 1.5984, "step": 8136 }, { "epoch": 2.138273064669754, "grad_norm": 0.5584096312522888, "learning_rate": 2.8727878044506746e-05, "loss": 1.5751, "step": 8138 }, { "epoch": 2.138798568003416, "grad_norm": 0.6350988745689392, "learning_rate": 2.871035570352199e-05, "loss": 1.6102, "step": 8140 }, { "epoch": 2.1393240713370774, "grad_norm": 0.8004497289657593, "learning_rate": 2.8692833362537235e-05, "loss": 1.6205, "step": 8142 }, { "epoch": 2.1398495746707393, "grad_norm": 0.6031829118728638, "learning_rate": 
2.8675311021552482e-05, "loss": 1.615, "step": 8144 }, { "epoch": 2.140375078004401, "grad_norm": 0.5796357989311218, "learning_rate": 2.8657788680567726e-05, "loss": 1.6007, "step": 8146 }, { "epoch": 2.140900581338063, "grad_norm": 0.5166471004486084, "learning_rate": 2.864026633958297e-05, "loss": 1.592, "step": 8148 }, { "epoch": 2.141426084671725, "grad_norm": 0.6542901396751404, "learning_rate": 2.862274399859821e-05, "loss": 1.648, "step": 8150 }, { "epoch": 2.1419515880053863, "grad_norm": 0.5643502473831177, "learning_rate": 2.8605221657613455e-05, "loss": 1.6187, "step": 8152 }, { "epoch": 2.1424770913390483, "grad_norm": 0.5144004821777344, "learning_rate": 2.8587699316628703e-05, "loss": 1.5961, "step": 8154 }, { "epoch": 2.14300259467271, "grad_norm": 0.5907447934150696, "learning_rate": 2.8570176975643947e-05, "loss": 1.6091, "step": 8156 }, { "epoch": 2.143528098006372, "grad_norm": 0.6062257885932922, "learning_rate": 2.855265463465919e-05, "loss": 1.596, "step": 8158 }, { "epoch": 2.1440536013400333, "grad_norm": 0.5850198268890381, "learning_rate": 2.853513229367444e-05, "loss": 1.6202, "step": 8160 }, { "epoch": 2.1445791046736953, "grad_norm": 0.6049871444702148, "learning_rate": 2.8517609952689683e-05, "loss": 1.6151, "step": 8162 }, { "epoch": 2.1451046080073572, "grad_norm": 0.6060041785240173, "learning_rate": 2.8500087611704924e-05, "loss": 1.566, "step": 8164 }, { "epoch": 2.1456301113410188, "grad_norm": 0.5683371424674988, "learning_rate": 2.8482565270720168e-05, "loss": 1.6301, "step": 8166 }, { "epoch": 2.1461556146746807, "grad_norm": 0.5292571783065796, "learning_rate": 2.8465042929735412e-05, "loss": 1.6151, "step": 8168 }, { "epoch": 2.1466811180083423, "grad_norm": 0.6342765688896179, "learning_rate": 2.844752058875066e-05, "loss": 1.5728, "step": 8170 }, { "epoch": 2.1472066213420042, "grad_norm": 0.551582932472229, "learning_rate": 2.8429998247765904e-05, "loss": 1.6054, "step": 8172 }, { "epoch": 2.1477321246756658, 
"grad_norm": 0.548427939414978, "learning_rate": 2.8412475906781148e-05, "loss": 1.5991, "step": 8174 }, { "epoch": 2.1482576280093277, "grad_norm": 0.6584466695785522, "learning_rate": 2.839495356579639e-05, "loss": 1.6213, "step": 8176 }, { "epoch": 2.1487831313429893, "grad_norm": 0.6075412631034851, "learning_rate": 2.8377431224811633e-05, "loss": 1.6279, "step": 8178 }, { "epoch": 2.1493086346766512, "grad_norm": 0.569524884223938, "learning_rate": 2.835990888382688e-05, "loss": 1.6053, "step": 8180 }, { "epoch": 2.149834138010313, "grad_norm": 0.5383790731430054, "learning_rate": 2.8342386542842125e-05, "loss": 1.5938, "step": 8182 }, { "epoch": 2.1503596413439747, "grad_norm": 0.5958060026168823, "learning_rate": 2.832486420185737e-05, "loss": 1.6272, "step": 8184 }, { "epoch": 2.1508851446776367, "grad_norm": 0.5532635450363159, "learning_rate": 2.8307341860872617e-05, "loss": 1.6431, "step": 8186 }, { "epoch": 2.151410648011298, "grad_norm": 0.5432900190353394, "learning_rate": 2.8289819519887857e-05, "loss": 1.6579, "step": 8188 }, { "epoch": 2.15193615134496, "grad_norm": 0.6363177299499512, "learning_rate": 2.82722971789031e-05, "loss": 1.6349, "step": 8190 }, { "epoch": 2.1524616546786217, "grad_norm": 0.6205646395683289, "learning_rate": 2.8254774837918346e-05, "loss": 1.6014, "step": 8192 }, { "epoch": 2.1529871580122837, "grad_norm": 0.5696967244148254, "learning_rate": 2.8237252496933593e-05, "loss": 1.6086, "step": 8194 }, { "epoch": 2.153512661345945, "grad_norm": 0.5662211775779724, "learning_rate": 2.8219730155948837e-05, "loss": 1.6246, "step": 8196 }, { "epoch": 2.154038164679607, "grad_norm": 0.5794999599456787, "learning_rate": 2.820220781496408e-05, "loss": 1.6127, "step": 8198 }, { "epoch": 2.154563668013269, "grad_norm": 0.6433011889457703, "learning_rate": 2.818468547397933e-05, "loss": 1.5973, "step": 8200 }, { "epoch": 2.1550891713469307, "grad_norm": 0.5784979462623596, "learning_rate": 2.8167163132994567e-05, "loss": 1.6314, "step": 
8202 }, { "epoch": 2.1556146746805926, "grad_norm": 0.5415730476379395, "learning_rate": 2.8149640792009814e-05, "loss": 1.635, "step": 8204 }, { "epoch": 2.156140178014254, "grad_norm": 0.5347517132759094, "learning_rate": 2.8132118451025058e-05, "loss": 1.5721, "step": 8206 }, { "epoch": 2.156665681347916, "grad_norm": 0.5696678757667542, "learning_rate": 2.8114596110040302e-05, "loss": 1.6163, "step": 8208 }, { "epoch": 2.1571911846815777, "grad_norm": 0.5461645126342773, "learning_rate": 2.809707376905555e-05, "loss": 1.6097, "step": 8210 }, { "epoch": 2.1577166880152396, "grad_norm": 0.5799645781517029, "learning_rate": 2.8079551428070794e-05, "loss": 1.6526, "step": 8212 }, { "epoch": 2.158242191348901, "grad_norm": 0.5340853333473206, "learning_rate": 2.8062029087086035e-05, "loss": 1.5865, "step": 8214 }, { "epoch": 2.158767694682563, "grad_norm": 0.5770725607872009, "learning_rate": 2.804450674610128e-05, "loss": 1.6281, "step": 8216 }, { "epoch": 2.159293198016225, "grad_norm": 0.5207443237304688, "learning_rate": 2.8026984405116523e-05, "loss": 1.6303, "step": 8218 }, { "epoch": 2.1598187013498866, "grad_norm": 0.6137516498565674, "learning_rate": 2.800946206413177e-05, "loss": 1.639, "step": 8220 }, { "epoch": 2.1603442046835486, "grad_norm": 0.5726540088653564, "learning_rate": 2.7991939723147015e-05, "loss": 1.6108, "step": 8222 }, { "epoch": 2.16086970801721, "grad_norm": 0.5607831478118896, "learning_rate": 2.797441738216226e-05, "loss": 1.611, "step": 8224 }, { "epoch": 2.161395211350872, "grad_norm": 0.6390299201011658, "learning_rate": 2.79568950411775e-05, "loss": 1.6325, "step": 8226 }, { "epoch": 2.1619207146845336, "grad_norm": 0.548121452331543, "learning_rate": 2.7939372700192744e-05, "loss": 1.6311, "step": 8228 }, { "epoch": 2.1624462180181956, "grad_norm": 0.6899086833000183, "learning_rate": 2.7921850359207992e-05, "loss": 1.5686, "step": 8230 }, { "epoch": 2.162971721351857, "grad_norm": 0.6069139838218689, "learning_rate": 
2.7904328018223236e-05, "loss": 1.603, "step": 8232 }, { "epoch": 2.163497224685519, "grad_norm": 0.5628888010978699, "learning_rate": 2.788680567723848e-05, "loss": 1.616, "step": 8234 }, { "epoch": 2.164022728019181, "grad_norm": 0.5229991674423218, "learning_rate": 2.7869283336253728e-05, "loss": 1.6172, "step": 8236 }, { "epoch": 2.1645482313528426, "grad_norm": 0.5317997932434082, "learning_rate": 2.7851760995268972e-05, "loss": 1.5771, "step": 8238 }, { "epoch": 2.1650737346865045, "grad_norm": 0.5842552781105042, "learning_rate": 2.7834238654284213e-05, "loss": 1.6199, "step": 8240 }, { "epoch": 2.165599238020166, "grad_norm": 0.6741251349449158, "learning_rate": 2.7816716313299457e-05, "loss": 1.6156, "step": 8242 }, { "epoch": 2.166124741353828, "grad_norm": 0.5447443723678589, "learning_rate": 2.77991939723147e-05, "loss": 1.5832, "step": 8244 }, { "epoch": 2.1666502446874896, "grad_norm": 0.49737128615379333, "learning_rate": 2.778167163132995e-05, "loss": 1.6369, "step": 8246 }, { "epoch": 2.1671757480211515, "grad_norm": 0.6779609322547913, "learning_rate": 2.7764149290345193e-05, "loss": 1.631, "step": 8248 }, { "epoch": 2.1677012513548135, "grad_norm": 0.5730245113372803, "learning_rate": 2.7746626949360437e-05, "loss": 1.6004, "step": 8250 }, { "epoch": 2.168226754688475, "grad_norm": 0.5977212190628052, "learning_rate": 2.7729104608375678e-05, "loss": 1.6173, "step": 8252 }, { "epoch": 2.168752258022137, "grad_norm": 0.5202796459197998, "learning_rate": 2.7711582267390922e-05, "loss": 1.6027, "step": 8254 }, { "epoch": 2.1692777613557985, "grad_norm": 0.7288246154785156, "learning_rate": 2.769405992640617e-05, "loss": 1.6007, "step": 8256 }, { "epoch": 2.1698032646894605, "grad_norm": 0.5423540472984314, "learning_rate": 2.7676537585421414e-05, "loss": 1.6032, "step": 8258 }, { "epoch": 2.170328768023122, "grad_norm": 0.6103919744491577, "learning_rate": 2.7659015244436658e-05, "loss": 1.6147, "step": 8260 }, { "epoch": 2.170854271356784, 
"grad_norm": 0.5658676624298096, "learning_rate": 2.7641492903451905e-05, "loss": 1.6187, "step": 8262 }, { "epoch": 2.1713797746904455, "grad_norm": 0.5302108526229858, "learning_rate": 2.762397056246715e-05, "loss": 1.5885, "step": 8264 }, { "epoch": 2.1719052780241075, "grad_norm": 0.5685304999351501, "learning_rate": 2.760644822148239e-05, "loss": 1.5809, "step": 8266 }, { "epoch": 2.172430781357769, "grad_norm": 0.6211028695106506, "learning_rate": 2.7588925880497635e-05, "loss": 1.5982, "step": 8268 }, { "epoch": 2.172956284691431, "grad_norm": 0.5506071448326111, "learning_rate": 2.757140353951288e-05, "loss": 1.6059, "step": 8270 }, { "epoch": 2.173481788025093, "grad_norm": 0.5578741431236267, "learning_rate": 2.7553881198528126e-05, "loss": 1.6065, "step": 8272 }, { "epoch": 2.1740072913587545, "grad_norm": 0.6002737283706665, "learning_rate": 2.753635885754337e-05, "loss": 1.6622, "step": 8274 }, { "epoch": 2.1745327946924164, "grad_norm": 0.5421825647354126, "learning_rate": 2.7518836516558615e-05, "loss": 1.594, "step": 8276 }, { "epoch": 2.175058298026078, "grad_norm": 0.5473990440368652, "learning_rate": 2.7501314175573855e-05, "loss": 1.6006, "step": 8278 }, { "epoch": 2.17558380135974, "grad_norm": 0.6513854265213013, "learning_rate": 2.74837918345891e-05, "loss": 1.6222, "step": 8280 }, { "epoch": 2.1761093046934015, "grad_norm": 0.5265668034553528, "learning_rate": 2.7466269493604347e-05, "loss": 1.5859, "step": 8282 }, { "epoch": 2.1766348080270634, "grad_norm": 0.5653666853904724, "learning_rate": 2.744874715261959e-05, "loss": 1.6145, "step": 8284 }, { "epoch": 2.1771603113607254, "grad_norm": 0.6216524839401245, "learning_rate": 2.7431224811634835e-05, "loss": 1.5975, "step": 8286 }, { "epoch": 2.177685814694387, "grad_norm": 0.5801669955253601, "learning_rate": 2.7413702470650083e-05, "loss": 1.6318, "step": 8288 }, { "epoch": 2.178211318028049, "grad_norm": 0.6386235356330872, "learning_rate": 2.739618012966532e-05, "loss": 1.6193, "step": 
8290 }, { "epoch": 2.1787368213617104, "grad_norm": 0.5952389240264893, "learning_rate": 2.7378657788680568e-05, "loss": 1.5982, "step": 8292 }, { "epoch": 2.1792623246953724, "grad_norm": 0.5205252170562744, "learning_rate": 2.7361135447695812e-05, "loss": 1.6088, "step": 8294 }, { "epoch": 2.179787828029034, "grad_norm": 0.5996859669685364, "learning_rate": 2.7343613106711056e-05, "loss": 1.5929, "step": 8296 }, { "epoch": 2.180313331362696, "grad_norm": 0.5440699458122253, "learning_rate": 2.7326090765726304e-05, "loss": 1.6266, "step": 8298 }, { "epoch": 2.1808388346963574, "grad_norm": 0.5174278616905212, "learning_rate": 2.7308568424741548e-05, "loss": 1.6134, "step": 8300 }, { "epoch": 2.1813643380300194, "grad_norm": 0.5336670875549316, "learning_rate": 2.7291046083756792e-05, "loss": 1.6026, "step": 8302 }, { "epoch": 2.181889841363681, "grad_norm": 0.5199602246284485, "learning_rate": 2.7273523742772033e-05, "loss": 1.6117, "step": 8304 }, { "epoch": 2.182415344697343, "grad_norm": 0.5980170369148254, "learning_rate": 2.7256001401787277e-05, "loss": 1.6027, "step": 8306 }, { "epoch": 2.182940848031005, "grad_norm": 0.6026611924171448, "learning_rate": 2.7238479060802525e-05, "loss": 1.6326, "step": 8308 }, { "epoch": 2.1834663513646664, "grad_norm": 0.54527747631073, "learning_rate": 2.722095671981777e-05, "loss": 1.587, "step": 8310 }, { "epoch": 2.1839918546983283, "grad_norm": 0.6268514394760132, "learning_rate": 2.7203434378833013e-05, "loss": 1.6362, "step": 8312 }, { "epoch": 2.18451735803199, "grad_norm": 0.6219541430473328, "learning_rate": 2.718591203784826e-05, "loss": 1.5783, "step": 8314 }, { "epoch": 2.185042861365652, "grad_norm": 0.5530925393104553, "learning_rate": 2.71683896968635e-05, "loss": 1.6121, "step": 8316 }, { "epoch": 2.1855683646993134, "grad_norm": 0.6584067940711975, "learning_rate": 2.7150867355878746e-05, "loss": 1.6114, "step": 8318 }, { "epoch": 2.1860938680329753, "grad_norm": 0.5550515055656433, "learning_rate": 
2.713334501489399e-05, "loss": 1.6087, "step": 8320 }, { "epoch": 2.1866193713666373, "grad_norm": 0.595063328742981, "learning_rate": 2.7115822673909237e-05, "loss": 1.606, "step": 8322 }, { "epoch": 2.187144874700299, "grad_norm": 0.6574273705482483, "learning_rate": 2.709830033292448e-05, "loss": 1.6054, "step": 8324 }, { "epoch": 2.187670378033961, "grad_norm": 0.5191264748573303, "learning_rate": 2.7080777991939726e-05, "loss": 1.5956, "step": 8326 }, { "epoch": 2.1881958813676223, "grad_norm": 0.6227658987045288, "learning_rate": 2.7063255650954967e-05, "loss": 1.6053, "step": 8328 }, { "epoch": 2.1887213847012843, "grad_norm": 0.6377692222595215, "learning_rate": 2.704573330997021e-05, "loss": 1.6253, "step": 8330 }, { "epoch": 2.189246888034946, "grad_norm": 0.5611161589622498, "learning_rate": 2.7028210968985458e-05, "loss": 1.5996, "step": 8332 }, { "epoch": 2.189772391368608, "grad_norm": 0.628909170627594, "learning_rate": 2.7010688628000702e-05, "loss": 1.6267, "step": 8334 }, { "epoch": 2.1902978947022693, "grad_norm": 0.5963556170463562, "learning_rate": 2.6993166287015947e-05, "loss": 1.5845, "step": 8336 }, { "epoch": 2.1908233980359313, "grad_norm": 0.5764312744140625, "learning_rate": 2.6975643946031194e-05, "loss": 1.593, "step": 8338 }, { "epoch": 2.1913489013695933, "grad_norm": 0.7820791006088257, "learning_rate": 2.695812160504644e-05, "loss": 1.5945, "step": 8340 }, { "epoch": 2.191874404703255, "grad_norm": 0.542460560798645, "learning_rate": 2.694059926406168e-05, "loss": 1.6183, "step": 8342 }, { "epoch": 2.1923999080369168, "grad_norm": 0.6432899832725525, "learning_rate": 2.6923076923076923e-05, "loss": 1.6584, "step": 8344 }, { "epoch": 2.1929254113705783, "grad_norm": 0.5868924856185913, "learning_rate": 2.6905554582092167e-05, "loss": 1.6196, "step": 8346 }, { "epoch": 2.1934509147042403, "grad_norm": 0.5434276461601257, "learning_rate": 2.6888032241107415e-05, "loss": 1.581, "step": 8348 }, { "epoch": 2.1939764180379018, 
"grad_norm": 0.5824474692344666, "learning_rate": 2.687050990012266e-05, "loss": 1.5816, "step": 8350 }, { "epoch": 2.1945019213715637, "grad_norm": 0.5336950421333313, "learning_rate": 2.6852987559137903e-05, "loss": 1.574, "step": 8352 }, { "epoch": 2.1950274247052253, "grad_norm": 0.5753375291824341, "learning_rate": 2.6835465218153144e-05, "loss": 1.604, "step": 8354 }, { "epoch": 2.1955529280388872, "grad_norm": 0.762776792049408, "learning_rate": 2.681794287716839e-05, "loss": 1.6355, "step": 8356 }, { "epoch": 2.196078431372549, "grad_norm": 0.5597438812255859, "learning_rate": 2.6800420536183636e-05, "loss": 1.5729, "step": 8358 }, { "epoch": 2.1966039347062107, "grad_norm": 0.5700204968452454, "learning_rate": 2.678289819519888e-05, "loss": 1.6107, "step": 8360 }, { "epoch": 2.1971294380398727, "grad_norm": 0.5418670773506165, "learning_rate": 2.6765375854214124e-05, "loss": 1.6243, "step": 8362 }, { "epoch": 2.1976549413735342, "grad_norm": 0.5794771909713745, "learning_rate": 2.6747853513229372e-05, "loss": 1.6099, "step": 8364 }, { "epoch": 2.198180444707196, "grad_norm": 0.6689446568489075, "learning_rate": 2.6730331172244616e-05, "loss": 1.5979, "step": 8366 }, { "epoch": 2.1987059480408577, "grad_norm": 0.6027274131774902, "learning_rate": 2.6712808831259857e-05, "loss": 1.6143, "step": 8368 }, { "epoch": 2.1992314513745197, "grad_norm": 0.5987696051597595, "learning_rate": 2.66952864902751e-05, "loss": 1.5895, "step": 8370 }, { "epoch": 2.1997569547081812, "grad_norm": 0.566920816898346, "learning_rate": 2.6677764149290345e-05, "loss": 1.6106, "step": 8372 }, { "epoch": 2.200282458041843, "grad_norm": 0.611470639705658, "learning_rate": 2.6660241808305593e-05, "loss": 1.5572, "step": 8374 }, { "epoch": 2.200807961375505, "grad_norm": 0.666123628616333, "learning_rate": 2.6642719467320837e-05, "loss": 1.6263, "step": 8376 }, { "epoch": 2.2013334647091667, "grad_norm": 0.522304117679596, "learning_rate": 2.662519712633608e-05, "loss": 1.6031, "step": 
8378 }, { "epoch": 2.2018589680428287, "grad_norm": 0.518319845199585, "learning_rate": 2.6607674785351322e-05, "loss": 1.5743, "step": 8380 }, { "epoch": 2.20238447137649, "grad_norm": 0.547029435634613, "learning_rate": 2.6590152444366566e-05, "loss": 1.6111, "step": 8382 }, { "epoch": 2.202909974710152, "grad_norm": 0.5654579401016235, "learning_rate": 2.6572630103381814e-05, "loss": 1.6057, "step": 8384 }, { "epoch": 2.2034354780438137, "grad_norm": 0.6351662874221802, "learning_rate": 2.6555107762397058e-05, "loss": 1.6298, "step": 8386 }, { "epoch": 2.2039609813774756, "grad_norm": 0.5234793424606323, "learning_rate": 2.6537585421412302e-05, "loss": 1.5897, "step": 8388 }, { "epoch": 2.204486484711137, "grad_norm": 0.6238772869110107, "learning_rate": 2.652006308042755e-05, "loss": 1.6321, "step": 8390 }, { "epoch": 2.205011988044799, "grad_norm": 0.5562398433685303, "learning_rate": 2.6502540739442787e-05, "loss": 1.575, "step": 8392 }, { "epoch": 2.205537491378461, "grad_norm": 0.5776354670524597, "learning_rate": 2.6485018398458034e-05, "loss": 1.5714, "step": 8394 }, { "epoch": 2.2060629947121226, "grad_norm": 0.5097812414169312, "learning_rate": 2.646749605747328e-05, "loss": 1.6081, "step": 8396 }, { "epoch": 2.2065884980457846, "grad_norm": 0.5820745825767517, "learning_rate": 2.6449973716488523e-05, "loss": 1.646, "step": 8398 }, { "epoch": 2.207114001379446, "grad_norm": 0.5649163722991943, "learning_rate": 2.643245137550377e-05, "loss": 1.6181, "step": 8400 }, { "epoch": 2.207114001379446, "eval_loss": 1.6614795923233032, "eval_runtime": 487.2799, "eval_samples_per_second": 249.936, "eval_steps_per_second": 31.243, "step": 8400 }, { "epoch": 2.207639504713108, "grad_norm": 0.5877487063407898, "learning_rate": 2.6414929034519015e-05, "loss": 1.6239, "step": 8402 }, { "epoch": 2.2081650080467696, "grad_norm": 0.5796919465065002, "learning_rate": 2.639740669353426e-05, "loss": 1.6233, "step": 8404 }, { "epoch": 2.2086905113804316, "grad_norm": 
0.58669114112854, "learning_rate": 2.63798843525495e-05, "loss": 1.6164, "step": 8406 }, { "epoch": 2.2092160147140936, "grad_norm": 0.6143302917480469, "learning_rate": 2.6362362011564744e-05, "loss": 1.6609, "step": 8408 }, { "epoch": 2.209741518047755, "grad_norm": 0.6326837539672852, "learning_rate": 2.634483967057999e-05, "loss": 1.5887, "step": 8410 }, { "epoch": 2.210267021381417, "grad_norm": 0.5528976917266846, "learning_rate": 2.6327317329595235e-05, "loss": 1.5859, "step": 8412 }, { "epoch": 2.2107925247150786, "grad_norm": 0.654974102973938, "learning_rate": 2.630979498861048e-05, "loss": 1.5918, "step": 8414 }, { "epoch": 2.2113180280487406, "grad_norm": 0.5469368100166321, "learning_rate": 2.6292272647625727e-05, "loss": 1.6064, "step": 8416 }, { "epoch": 2.211843531382402, "grad_norm": 0.5513100028038025, "learning_rate": 2.6274750306640965e-05, "loss": 1.5964, "step": 8418 }, { "epoch": 2.212369034716064, "grad_norm": 0.5723447799682617, "learning_rate": 2.6257227965656212e-05, "loss": 1.5847, "step": 8420 }, { "epoch": 2.2128945380497256, "grad_norm": 0.5903245806694031, "learning_rate": 2.6239705624671456e-05, "loss": 1.6176, "step": 8422 }, { "epoch": 2.2134200413833875, "grad_norm": 0.7093879580497742, "learning_rate": 2.62221832836867e-05, "loss": 1.627, "step": 8424 }, { "epoch": 2.213945544717049, "grad_norm": 0.6263571977615356, "learning_rate": 2.6204660942701948e-05, "loss": 1.606, "step": 8426 }, { "epoch": 2.214471048050711, "grad_norm": 0.5385801196098328, "learning_rate": 2.6187138601717192e-05, "loss": 1.6006, "step": 8428 }, { "epoch": 2.214996551384373, "grad_norm": 0.5380145311355591, "learning_rate": 2.6169616260732433e-05, "loss": 1.6179, "step": 8430 }, { "epoch": 2.2155220547180345, "grad_norm": 0.6820893287658691, "learning_rate": 2.6152093919747677e-05, "loss": 1.5878, "step": 8432 }, { "epoch": 2.2160475580516965, "grad_norm": 0.6620394587516785, "learning_rate": 2.613457157876292e-05, "loss": 1.6155, "step": 8434 }, { 
"epoch": 2.216573061385358, "grad_norm": 0.5877991914749146, "learning_rate": 2.611704923777817e-05, "loss": 1.6346, "step": 8436 }, { "epoch": 2.21709856471902, "grad_norm": 0.7124659419059753, "learning_rate": 2.6099526896793413e-05, "loss": 1.624, "step": 8438 }, { "epoch": 2.2176240680526815, "grad_norm": 0.6077954769134521, "learning_rate": 2.608200455580866e-05, "loss": 1.606, "step": 8440 }, { "epoch": 2.2181495713863435, "grad_norm": 0.7110419869422913, "learning_rate": 2.6064482214823905e-05, "loss": 1.5749, "step": 8442 }, { "epoch": 2.2186750747200055, "grad_norm": 0.5183662176132202, "learning_rate": 2.6046959873839146e-05, "loss": 1.6047, "step": 8444 }, { "epoch": 2.219200578053667, "grad_norm": 0.6769959926605225, "learning_rate": 2.602943753285439e-05, "loss": 1.6635, "step": 8446 }, { "epoch": 2.219726081387329, "grad_norm": 0.610445499420166, "learning_rate": 2.6011915191869634e-05, "loss": 1.6053, "step": 8448 }, { "epoch": 2.2202515847209905, "grad_norm": 0.537892758846283, "learning_rate": 2.599439285088488e-05, "loss": 1.6046, "step": 8450 }, { "epoch": 2.2207770880546525, "grad_norm": 0.5192505121231079, "learning_rate": 2.5976870509900126e-05, "loss": 1.5956, "step": 8452 }, { "epoch": 2.221302591388314, "grad_norm": 0.5772652626037598, "learning_rate": 2.595934816891537e-05, "loss": 1.6013, "step": 8454 }, { "epoch": 2.221828094721976, "grad_norm": 0.5051235556602478, "learning_rate": 2.594182582793061e-05, "loss": 1.5965, "step": 8456 }, { "epoch": 2.2223535980556375, "grad_norm": 0.5779315233230591, "learning_rate": 2.5924303486945855e-05, "loss": 1.5888, "step": 8458 }, { "epoch": 2.2228791013892994, "grad_norm": 0.5708422660827637, "learning_rate": 2.5906781145961102e-05, "loss": 1.638, "step": 8460 }, { "epoch": 2.223404604722961, "grad_norm": 0.6042600274085999, "learning_rate": 2.5889258804976347e-05, "loss": 1.5939, "step": 8462 }, { "epoch": 2.223930108056623, "grad_norm": 0.5264221429824829, "learning_rate": 2.587173646399159e-05, 
"loss": 1.6029, "step": 8464 }, { "epoch": 2.224455611390285, "grad_norm": 0.5832411050796509, "learning_rate": 2.585421412300684e-05, "loss": 1.6332, "step": 8466 }, { "epoch": 2.2249811147239464, "grad_norm": 0.5503547191619873, "learning_rate": 2.5836691782022076e-05, "loss": 1.6139, "step": 8468 }, { "epoch": 2.2255066180576084, "grad_norm": 0.6002954244613647, "learning_rate": 2.5819169441037323e-05, "loss": 1.6085, "step": 8470 }, { "epoch": 2.22603212139127, "grad_norm": 0.5220708250999451, "learning_rate": 2.5801647100052567e-05, "loss": 1.5779, "step": 8472 }, { "epoch": 2.226557624724932, "grad_norm": 0.5423254370689392, "learning_rate": 2.578412475906781e-05, "loss": 1.5818, "step": 8474 }, { "epoch": 2.2270831280585934, "grad_norm": 0.5738494396209717, "learning_rate": 2.576660241808306e-05, "loss": 1.6046, "step": 8476 }, { "epoch": 2.2276086313922554, "grad_norm": 0.5414014458656311, "learning_rate": 2.5749080077098303e-05, "loss": 1.6238, "step": 8478 }, { "epoch": 2.2281341347259174, "grad_norm": 0.5559380650520325, "learning_rate": 2.5731557736113548e-05, "loss": 1.6421, "step": 8480 }, { "epoch": 2.228659638059579, "grad_norm": 0.5835655331611633, "learning_rate": 2.571403539512879e-05, "loss": 1.6208, "step": 8482 }, { "epoch": 2.229185141393241, "grad_norm": 0.5489543676376343, "learning_rate": 2.5696513054144033e-05, "loss": 1.5749, "step": 8484 }, { "epoch": 2.2297106447269024, "grad_norm": 0.5658118724822998, "learning_rate": 2.567899071315928e-05, "loss": 1.6382, "step": 8486 }, { "epoch": 2.2302361480605644, "grad_norm": 0.5926935076713562, "learning_rate": 2.5661468372174524e-05, "loss": 1.6327, "step": 8488 }, { "epoch": 2.230761651394226, "grad_norm": 0.5220993757247925, "learning_rate": 2.564394603118977e-05, "loss": 1.6266, "step": 8490 }, { "epoch": 2.231287154727888, "grad_norm": 0.5491675138473511, "learning_rate": 2.5626423690205016e-05, "loss": 1.615, "step": 8492 }, { "epoch": 2.2318126580615494, "grad_norm": 0.5789914131164551, 
"learning_rate": 2.5608901349220253e-05, "loss": 1.6568, "step": 8494 }, { "epoch": 2.2323381613952114, "grad_norm": 0.5792714357376099, "learning_rate": 2.55913790082355e-05, "loss": 1.6344, "step": 8496 }, { "epoch": 2.2328636647288733, "grad_norm": 0.4977031648159027, "learning_rate": 2.5573856667250745e-05, "loss": 1.6286, "step": 8498 }, { "epoch": 2.233389168062535, "grad_norm": 0.6100000739097595, "learning_rate": 2.555633432626599e-05, "loss": 1.6234, "step": 8500 }, { "epoch": 2.233914671396197, "grad_norm": 0.6163233518600464, "learning_rate": 2.5538811985281237e-05, "loss": 1.6096, "step": 8502 }, { "epoch": 2.2344401747298583, "grad_norm": 0.5867898464202881, "learning_rate": 2.552128964429648e-05, "loss": 1.604, "step": 8504 }, { "epoch": 2.2349656780635203, "grad_norm": 0.8214870095252991, "learning_rate": 2.5503767303311725e-05, "loss": 1.6476, "step": 8506 }, { "epoch": 2.235491181397182, "grad_norm": 0.7553618550300598, "learning_rate": 2.5486244962326966e-05, "loss": 1.588, "step": 8508 }, { "epoch": 2.236016684730844, "grad_norm": 0.5797671675682068, "learning_rate": 2.546872262134221e-05, "loss": 1.6125, "step": 8510 }, { "epoch": 2.2365421880645053, "grad_norm": 0.6248254179954529, "learning_rate": 2.5451200280357458e-05, "loss": 1.6339, "step": 8512 }, { "epoch": 2.2370676913981673, "grad_norm": 0.5321136116981506, "learning_rate": 2.5433677939372702e-05, "loss": 1.6058, "step": 8514 }, { "epoch": 2.2375931947318293, "grad_norm": 0.685704231262207, "learning_rate": 2.5416155598387946e-05, "loss": 1.6305, "step": 8516 }, { "epoch": 2.238118698065491, "grad_norm": 0.7109000086784363, "learning_rate": 2.5398633257403194e-05, "loss": 1.6176, "step": 8518 }, { "epoch": 2.2386442013991528, "grad_norm": 0.5430434346199036, "learning_rate": 2.538111091641843e-05, "loss": 1.5943, "step": 8520 }, { "epoch": 2.2391697047328143, "grad_norm": 0.5879561305046082, "learning_rate": 2.536358857543368e-05, "loss": 1.6418, "step": 8522 }, { "epoch": 
2.2396952080664763, "grad_norm": 0.5790303349494934, "learning_rate": 2.5346066234448923e-05, "loss": 1.6413, "step": 8524 }, { "epoch": 2.240220711400138, "grad_norm": 0.5287625193595886, "learning_rate": 2.5328543893464167e-05, "loss": 1.5898, "step": 8526 }, { "epoch": 2.2407462147337998, "grad_norm": 0.6279188394546509, "learning_rate": 2.5311021552479415e-05, "loss": 1.6066, "step": 8528 }, { "epoch": 2.2412717180674613, "grad_norm": 0.5477473139762878, "learning_rate": 2.529349921149466e-05, "loss": 1.6004, "step": 8530 }, { "epoch": 2.2417972214011233, "grad_norm": 0.5529081225395203, "learning_rate": 2.52759768705099e-05, "loss": 1.6265, "step": 8532 }, { "epoch": 2.242322724734785, "grad_norm": 0.6740888357162476, "learning_rate": 2.5258454529525144e-05, "loss": 1.6015, "step": 8534 }, { "epoch": 2.2428482280684467, "grad_norm": 0.5619795322418213, "learning_rate": 2.5240932188540388e-05, "loss": 1.6388, "step": 8536 }, { "epoch": 2.2433737314021087, "grad_norm": 0.5655804872512817, "learning_rate": 2.5223409847555635e-05, "loss": 1.6052, "step": 8538 }, { "epoch": 2.2438992347357702, "grad_norm": 0.5193714499473572, "learning_rate": 2.520588750657088e-05, "loss": 1.6223, "step": 8540 }, { "epoch": 2.244424738069432, "grad_norm": 0.5750133991241455, "learning_rate": 2.5188365165586124e-05, "loss": 1.6025, "step": 8542 }, { "epoch": 2.2449502414030937, "grad_norm": 0.673194169998169, "learning_rate": 2.517084282460137e-05, "loss": 1.6275, "step": 8544 }, { "epoch": 2.2454757447367557, "grad_norm": 0.5762125849723816, "learning_rate": 2.515332048361661e-05, "loss": 1.6061, "step": 8546 }, { "epoch": 2.2460012480704172, "grad_norm": 0.5318456292152405, "learning_rate": 2.5135798142631856e-05, "loss": 1.6048, "step": 8548 }, { "epoch": 2.246526751404079, "grad_norm": 0.5658667683601379, "learning_rate": 2.51182758016471e-05, "loss": 1.5761, "step": 8550 }, { "epoch": 2.247052254737741, "grad_norm": 0.5093829035758972, "learning_rate": 2.5100753460662345e-05, 
"loss": 1.5823, "step": 8552 }, { "epoch": 2.2475777580714027, "grad_norm": 0.6146730780601501, "learning_rate": 2.5083231119677592e-05, "loss": 1.6037, "step": 8554 }, { "epoch": 2.2481032614050647, "grad_norm": 0.5599241256713867, "learning_rate": 2.5065708778692836e-05, "loss": 1.6559, "step": 8556 }, { "epoch": 2.248628764738726, "grad_norm": 0.6648929119110107, "learning_rate": 2.5048186437708077e-05, "loss": 1.6054, "step": 8558 }, { "epoch": 2.249154268072388, "grad_norm": 0.6260311603546143, "learning_rate": 2.503066409672332e-05, "loss": 1.6113, "step": 8560 }, { "epoch": 2.2496797714060497, "grad_norm": 0.6752457022666931, "learning_rate": 2.5013141755738566e-05, "loss": 1.6019, "step": 8562 }, { "epoch": 2.2502052747397117, "grad_norm": 0.5815406441688538, "learning_rate": 2.4995619414753813e-05, "loss": 1.6158, "step": 8564 }, { "epoch": 2.2507307780733736, "grad_norm": 0.7072321772575378, "learning_rate": 2.4978097073769057e-05, "loss": 1.6549, "step": 8566 }, { "epoch": 2.251256281407035, "grad_norm": 0.5526368021965027, "learning_rate": 2.49605747327843e-05, "loss": 1.6122, "step": 8568 }, { "epoch": 2.251781784740697, "grad_norm": 0.644477128982544, "learning_rate": 2.4943052391799546e-05, "loss": 1.5802, "step": 8570 }, { "epoch": 2.2523072880743586, "grad_norm": 0.5599563717842102, "learning_rate": 2.492553005081479e-05, "loss": 1.6255, "step": 8572 }, { "epoch": 2.2528327914080206, "grad_norm": 0.5871409177780151, "learning_rate": 2.4908007709830034e-05, "loss": 1.5929, "step": 8574 }, { "epoch": 2.253358294741682, "grad_norm": 0.654251754283905, "learning_rate": 2.4890485368845278e-05, "loss": 1.6135, "step": 8576 }, { "epoch": 2.253883798075344, "grad_norm": 0.5650330185890198, "learning_rate": 2.4872963027860526e-05, "loss": 1.5967, "step": 8578 }, { "epoch": 2.2544093014090056, "grad_norm": 0.5818485021591187, "learning_rate": 2.4855440686875766e-05, "loss": 1.642, "step": 8580 }, { "epoch": 2.2549348047426676, "grad_norm": 
0.5410641431808472, "learning_rate": 2.483791834589101e-05, "loss": 1.6136, "step": 8582 }, { "epoch": 2.255460308076329, "grad_norm": 0.5259566903114319, "learning_rate": 2.4820396004906258e-05, "loss": 1.6212, "step": 8584 }, { "epoch": 2.255985811409991, "grad_norm": 0.5885828733444214, "learning_rate": 2.48028736639215e-05, "loss": 1.6015, "step": 8586 }, { "epoch": 2.256511314743653, "grad_norm": 0.6429271697998047, "learning_rate": 2.4785351322936747e-05, "loss": 1.621, "step": 8588 }, { "epoch": 2.2570368180773146, "grad_norm": 0.5706207156181335, "learning_rate": 2.476782898195199e-05, "loss": 1.5826, "step": 8590 }, { "epoch": 2.2575623214109766, "grad_norm": 0.6020461320877075, "learning_rate": 2.4750306640967235e-05, "loss": 1.6296, "step": 8592 }, { "epoch": 2.258087824744638, "grad_norm": 0.5262504816055298, "learning_rate": 2.473278429998248e-05, "loss": 1.6015, "step": 8594 }, { "epoch": 2.2586133280783, "grad_norm": 0.6201303005218506, "learning_rate": 2.4715261958997723e-05, "loss": 1.5931, "step": 8596 }, { "epoch": 2.2591388314119616, "grad_norm": 0.5339419841766357, "learning_rate": 2.4697739618012967e-05, "loss": 1.6369, "step": 8598 }, { "epoch": 2.2596643347456236, "grad_norm": 0.6167606115341187, "learning_rate": 2.468021727702821e-05, "loss": 1.6056, "step": 8600 }, { "epoch": 2.2601898380792855, "grad_norm": 0.6669736504554749, "learning_rate": 2.4662694936043456e-05, "loss": 1.6262, "step": 8602 }, { "epoch": 2.260715341412947, "grad_norm": 0.6823409795761108, "learning_rate": 2.4645172595058703e-05, "loss": 1.5777, "step": 8604 }, { "epoch": 2.261240844746609, "grad_norm": 0.5810272097587585, "learning_rate": 2.4627650254073944e-05, "loss": 1.6163, "step": 8606 }, { "epoch": 2.2617663480802706, "grad_norm": 0.5693057179450989, "learning_rate": 2.461012791308919e-05, "loss": 1.6286, "step": 8608 }, { "epoch": 2.2622918514139325, "grad_norm": 0.6601256132125854, "learning_rate": 2.4592605572104436e-05, "loss": 1.6179, "step": 8610 }, { 
"epoch": 2.262817354747594, "grad_norm": 0.5781053304672241, "learning_rate": 2.4575083231119677e-05, "loss": 1.6364, "step": 8612 }, { "epoch": 2.263342858081256, "grad_norm": 0.7442349195480347, "learning_rate": 2.4557560890134924e-05, "loss": 1.6105, "step": 8614 }, { "epoch": 2.2638683614149175, "grad_norm": 0.6304720044136047, "learning_rate": 2.454003854915017e-05, "loss": 1.6368, "step": 8616 }, { "epoch": 2.2643938647485795, "grad_norm": 0.8399868607521057, "learning_rate": 2.452251620816541e-05, "loss": 1.6196, "step": 8618 }, { "epoch": 2.264919368082241, "grad_norm": 0.6608265042304993, "learning_rate": 2.4504993867180657e-05, "loss": 1.6327, "step": 8620 }, { "epoch": 2.265444871415903, "grad_norm": 0.5849167108535767, "learning_rate": 2.44874715261959e-05, "loss": 1.6294, "step": 8622 }, { "epoch": 2.265970374749565, "grad_norm": 0.5310716032981873, "learning_rate": 2.4469949185211145e-05, "loss": 1.5839, "step": 8624 }, { "epoch": 2.2664958780832265, "grad_norm": 0.5808155536651611, "learning_rate": 2.445242684422639e-05, "loss": 1.6326, "step": 8626 }, { "epoch": 2.2670213814168885, "grad_norm": 0.6455567479133606, "learning_rate": 2.4434904503241633e-05, "loss": 1.6199, "step": 8628 }, { "epoch": 2.26754688475055, "grad_norm": 0.7273616790771484, "learning_rate": 2.441738216225688e-05, "loss": 1.6067, "step": 8630 }, { "epoch": 2.268072388084212, "grad_norm": 0.7093321681022644, "learning_rate": 2.4399859821272122e-05, "loss": 1.6274, "step": 8632 }, { "epoch": 2.2685978914178735, "grad_norm": 0.5976110696792603, "learning_rate": 2.438233748028737e-05, "loss": 1.6328, "step": 8634 }, { "epoch": 2.2691233947515355, "grad_norm": 0.6373220682144165, "learning_rate": 2.4364815139302614e-05, "loss": 1.6248, "step": 8636 }, { "epoch": 2.2696488980851974, "grad_norm": 0.5636451244354248, "learning_rate": 2.4347292798317854e-05, "loss": 1.6133, "step": 8638 }, { "epoch": 2.270174401418859, "grad_norm": 0.6106368899345398, "learning_rate": 
2.4329770457333102e-05, "loss": 1.6036, "step": 8640 }, { "epoch": 2.270699904752521, "grad_norm": 0.526772677898407, "learning_rate": 2.4312248116348346e-05, "loss": 1.5967, "step": 8642 }, { "epoch": 2.2712254080861825, "grad_norm": 0.5734396576881409, "learning_rate": 2.429472577536359e-05, "loss": 1.6056, "step": 8644 }, { "epoch": 2.2717509114198444, "grad_norm": 0.5490990281105042, "learning_rate": 2.4277203434378834e-05, "loss": 1.6057, "step": 8646 }, { "epoch": 2.272276414753506, "grad_norm": 0.5757094621658325, "learning_rate": 2.425968109339408e-05, "loss": 1.6127, "step": 8648 }, { "epoch": 2.272801918087168, "grad_norm": 0.5849709510803223, "learning_rate": 2.4242158752409323e-05, "loss": 1.6182, "step": 8650 }, { "epoch": 2.27332742142083, "grad_norm": 0.6258499026298523, "learning_rate": 2.4224636411424567e-05, "loss": 1.5819, "step": 8652 }, { "epoch": 2.2738529247544914, "grad_norm": 0.5439596176147461, "learning_rate": 2.420711407043981e-05, "loss": 1.6197, "step": 8654 }, { "epoch": 2.274378428088153, "grad_norm": 0.5642632246017456, "learning_rate": 2.4189591729455055e-05, "loss": 1.619, "step": 8656 }, { "epoch": 2.274903931421815, "grad_norm": 0.5832663178443909, "learning_rate": 2.41720693884703e-05, "loss": 1.6291, "step": 8658 }, { "epoch": 2.275429434755477, "grad_norm": 0.5677375197410583, "learning_rate": 2.4154547047485547e-05, "loss": 1.6087, "step": 8660 }, { "epoch": 2.2759549380891384, "grad_norm": 0.5964575409889221, "learning_rate": 2.413702470650079e-05, "loss": 1.5785, "step": 8662 }, { "epoch": 2.2764804414228004, "grad_norm": 0.6877849102020264, "learning_rate": 2.4119502365516032e-05, "loss": 1.6554, "step": 8664 }, { "epoch": 2.277005944756462, "grad_norm": 0.6050941348075867, "learning_rate": 2.410198002453128e-05, "loss": 1.6324, "step": 8666 }, { "epoch": 2.277531448090124, "grad_norm": 0.6498258113861084, "learning_rate": 2.4084457683546524e-05, "loss": 1.6018, "step": 8668 }, { "epoch": 2.2780569514237854, "grad_norm": 
0.7161571979522705, "learning_rate": 2.4066935342561768e-05, "loss": 1.6233, "step": 8670 }, { "epoch": 2.2785824547574474, "grad_norm": 0.6215277910232544, "learning_rate": 2.4049413001577012e-05, "loss": 1.5664, "step": 8672 }, { "epoch": 2.2791079580911093, "grad_norm": 0.6165092587471008, "learning_rate": 2.4031890660592256e-05, "loss": 1.6217, "step": 8674 }, { "epoch": 2.279633461424771, "grad_norm": 0.5820355415344238, "learning_rate": 2.40143683196075e-05, "loss": 1.631, "step": 8676 }, { "epoch": 2.280158964758433, "grad_norm": 0.5764251351356506, "learning_rate": 2.3996845978622745e-05, "loss": 1.6371, "step": 8678 }, { "epoch": 2.2806844680920944, "grad_norm": 0.5712262988090515, "learning_rate": 2.397932363763799e-05, "loss": 1.6164, "step": 8680 }, { "epoch": 2.2812099714257563, "grad_norm": 0.5973013639450073, "learning_rate": 2.3961801296653233e-05, "loss": 1.6543, "step": 8682 }, { "epoch": 2.281735474759418, "grad_norm": 0.5426216721534729, "learning_rate": 2.3944278955668477e-05, "loss": 1.5761, "step": 8684 }, { "epoch": 2.28226097809308, "grad_norm": 0.5732000470161438, "learning_rate": 2.3926756614683725e-05, "loss": 1.6052, "step": 8686 }, { "epoch": 2.282786481426742, "grad_norm": 0.581996500492096, "learning_rate": 2.3909234273698966e-05, "loss": 1.6382, "step": 8688 }, { "epoch": 2.2833119847604033, "grad_norm": 0.5811718702316284, "learning_rate": 2.3891711932714213e-05, "loss": 1.6127, "step": 8690 }, { "epoch": 2.2838374880940653, "grad_norm": 0.59296053647995, "learning_rate": 2.3874189591729457e-05, "loss": 1.6458, "step": 8692 }, { "epoch": 2.284362991427727, "grad_norm": 0.4986303746700287, "learning_rate": 2.38566672507447e-05, "loss": 1.6373, "step": 8694 }, { "epoch": 2.2848884947613888, "grad_norm": 0.711003839969635, "learning_rate": 2.3839144909759946e-05, "loss": 1.6123, "step": 8696 }, { "epoch": 2.2854139980950503, "grad_norm": 0.5871834754943848, "learning_rate": 2.382162256877519e-05, "loss": 1.6206, "step": 8698 }, { 
"epoch": 2.2859395014287123, "grad_norm": 0.5751228928565979, "learning_rate": 2.3804100227790434e-05, "loss": 1.6307, "step": 8700 }, { "epoch": 2.286465004762374, "grad_norm": 0.5439415574073792, "learning_rate": 2.3786577886805678e-05, "loss": 1.6081, "step": 8702 }, { "epoch": 2.2869905080960358, "grad_norm": 0.6993831992149353, "learning_rate": 2.3769055545820922e-05, "loss": 1.6197, "step": 8704 }, { "epoch": 2.2875160114296973, "grad_norm": 0.5735906958580017, "learning_rate": 2.375153320483617e-05, "loss": 1.6184, "step": 8706 }, { "epoch": 2.2880415147633593, "grad_norm": 0.7263919115066528, "learning_rate": 2.373401086385141e-05, "loss": 1.626, "step": 8708 }, { "epoch": 2.2885670180970212, "grad_norm": 0.6145097613334656, "learning_rate": 2.3716488522866655e-05, "loss": 1.5732, "step": 8710 }, { "epoch": 2.2890925214306828, "grad_norm": 0.5744717717170715, "learning_rate": 2.3698966181881902e-05, "loss": 1.6301, "step": 8712 }, { "epoch": 2.2896180247643447, "grad_norm": 0.5422975420951843, "learning_rate": 2.3681443840897143e-05, "loss": 1.6168, "step": 8714 }, { "epoch": 2.2901435280980063, "grad_norm": 0.5488165020942688, "learning_rate": 2.366392149991239e-05, "loss": 1.6252, "step": 8716 }, { "epoch": 2.2906690314316682, "grad_norm": 0.5232349634170532, "learning_rate": 2.3646399158927635e-05, "loss": 1.5893, "step": 8718 }, { "epoch": 2.2911945347653297, "grad_norm": 0.5339794754981995, "learning_rate": 2.3628876817942876e-05, "loss": 1.6187, "step": 8720 }, { "epoch": 2.2917200380989917, "grad_norm": 0.5548917651176453, "learning_rate": 2.3611354476958123e-05, "loss": 1.6204, "step": 8722 }, { "epoch": 2.2922455414326537, "grad_norm": 0.5368824005126953, "learning_rate": 2.3593832135973367e-05, "loss": 1.6096, "step": 8724 }, { "epoch": 2.292771044766315, "grad_norm": 0.581849217414856, "learning_rate": 2.357630979498861e-05, "loss": 1.6213, "step": 8726 }, { "epoch": 2.293296548099977, "grad_norm": 0.5897862315177917, "learning_rate": 
2.3558787454003856e-05, "loss": 1.6521, "step": 8728 }, { "epoch": 2.2938220514336387, "grad_norm": 0.49220868945121765, "learning_rate": 2.35412651130191e-05, "loss": 1.5872, "step": 8730 }, { "epoch": 2.2943475547673007, "grad_norm": 0.6717751622200012, "learning_rate": 2.3523742772034348e-05, "loss": 1.6153, "step": 8732 }, { "epoch": 2.294873058100962, "grad_norm": 0.7588834166526794, "learning_rate": 2.350622043104959e-05, "loss": 1.6137, "step": 8734 }, { "epoch": 2.295398561434624, "grad_norm": 0.6023603081703186, "learning_rate": 2.3488698090064832e-05, "loss": 1.5825, "step": 8736 }, { "epoch": 2.2959240647682857, "grad_norm": 0.590947687625885, "learning_rate": 2.347117574908008e-05, "loss": 1.5805, "step": 8738 }, { "epoch": 2.2964495681019477, "grad_norm": 0.5251069664955139, "learning_rate": 2.345365340809532e-05, "loss": 1.6111, "step": 8740 }, { "epoch": 2.296975071435609, "grad_norm": 0.5737573504447937, "learning_rate": 2.343613106711057e-05, "loss": 1.5913, "step": 8742 }, { "epoch": 2.297500574769271, "grad_norm": 0.5265782475471497, "learning_rate": 2.3418608726125813e-05, "loss": 1.6285, "step": 8744 }, { "epoch": 2.298026078102933, "grad_norm": 0.6365170478820801, "learning_rate": 2.3401086385141053e-05, "loss": 1.6699, "step": 8746 }, { "epoch": 2.2985515814365947, "grad_norm": 0.5703795552253723, "learning_rate": 2.33835640441563e-05, "loss": 1.6226, "step": 8748 }, { "epoch": 2.2990770847702566, "grad_norm": 0.6636425852775574, "learning_rate": 2.3366041703171545e-05, "loss": 1.6502, "step": 8750 }, { "epoch": 2.299602588103918, "grad_norm": 0.5792987942695618, "learning_rate": 2.334851936218679e-05, "loss": 1.6144, "step": 8752 }, { "epoch": 2.30012809143758, "grad_norm": 0.5882440209388733, "learning_rate": 2.3330997021202033e-05, "loss": 1.6421, "step": 8754 }, { "epoch": 2.3006535947712417, "grad_norm": 0.8438565135002136, "learning_rate": 2.3313474680217278e-05, "loss": 1.6205, "step": 8756 }, { "epoch": 2.3011790981049036, 
"grad_norm": 0.5310785174369812, "learning_rate": 2.3295952339232522e-05, "loss": 1.6295, "step": 8758 }, { "epoch": 2.3017046014385656, "grad_norm": 0.5153002738952637, "learning_rate": 2.3278429998247766e-05, "loss": 1.6023, "step": 8760 }, { "epoch": 2.302230104772227, "grad_norm": 0.5312076807022095, "learning_rate": 2.3260907657263014e-05, "loss": 1.617, "step": 8762 }, { "epoch": 2.302755608105889, "grad_norm": 0.554939329624176, "learning_rate": 2.3243385316278258e-05, "loss": 1.6376, "step": 8764 }, { "epoch": 2.3032811114395506, "grad_norm": 0.5337182283401489, "learning_rate": 2.32258629752935e-05, "loss": 1.6258, "step": 8766 }, { "epoch": 2.3038066147732126, "grad_norm": 0.5950394868850708, "learning_rate": 2.3208340634308746e-05, "loss": 1.5906, "step": 8768 }, { "epoch": 2.304332118106874, "grad_norm": 0.5580046772956848, "learning_rate": 2.319081829332399e-05, "loss": 1.602, "step": 8770 }, { "epoch": 2.304857621440536, "grad_norm": 0.5954751968383789, "learning_rate": 2.3173295952339234e-05, "loss": 1.6433, "step": 8772 }, { "epoch": 2.3053831247741976, "grad_norm": 0.5827406644821167, "learning_rate": 2.315577361135448e-05, "loss": 1.6155, "step": 8774 }, { "epoch": 2.3059086281078596, "grad_norm": 0.5616747140884399, "learning_rate": 2.3138251270369723e-05, "loss": 1.6427, "step": 8776 }, { "epoch": 2.306434131441521, "grad_norm": 0.5560279488563538, "learning_rate": 2.3120728929384967e-05, "loss": 1.5679, "step": 8778 }, { "epoch": 2.306959634775183, "grad_norm": 0.5234860777854919, "learning_rate": 2.310320658840021e-05, "loss": 1.6176, "step": 8780 }, { "epoch": 2.307485138108845, "grad_norm": 0.5881795287132263, "learning_rate": 2.3085684247415455e-05, "loss": 1.635, "step": 8782 }, { "epoch": 2.3080106414425066, "grad_norm": 0.6444844603538513, "learning_rate": 2.30681619064307e-05, "loss": 1.6151, "step": 8784 }, { "epoch": 2.3085361447761685, "grad_norm": 0.5576640367507935, "learning_rate": 2.3050639565445944e-05, "loss": 1.6469, "step": 
8786 }, { "epoch": 2.30906164810983, "grad_norm": 0.6228268146514893, "learning_rate": 2.303311722446119e-05, "loss": 1.6463, "step": 8788 }, { "epoch": 2.309587151443492, "grad_norm": 0.589922308921814, "learning_rate": 2.3015594883476432e-05, "loss": 1.6203, "step": 8790 }, { "epoch": 2.3101126547771536, "grad_norm": 0.6492023468017578, "learning_rate": 2.2998072542491676e-05, "loss": 1.6371, "step": 8792 }, { "epoch": 2.3106381581108155, "grad_norm": 0.5567497611045837, "learning_rate": 2.2980550201506924e-05, "loss": 1.6225, "step": 8794 }, { "epoch": 2.3111636614444775, "grad_norm": 0.568630576133728, "learning_rate": 2.2963027860522165e-05, "loss": 1.6063, "step": 8796 }, { "epoch": 2.311689164778139, "grad_norm": 0.5894288420677185, "learning_rate": 2.2945505519537412e-05, "loss": 1.6124, "step": 8798 }, { "epoch": 2.312214668111801, "grad_norm": 0.5140713453292847, "learning_rate": 2.2927983178552656e-05, "loss": 1.6183, "step": 8800 }, { "epoch": 2.312214668111801, "eval_loss": 1.6566152572631836, "eval_runtime": 486.4646, "eval_samples_per_second": 250.355, "eval_steps_per_second": 31.295, "step": 8800 }, { "epoch": 2.3127401714454625, "grad_norm": 0.5767953395843506, "learning_rate": 2.29104608375679e-05, "loss": 1.588, "step": 8802 }, { "epoch": 2.3132656747791245, "grad_norm": 0.5397815704345703, "learning_rate": 2.2892938496583145e-05, "loss": 1.6311, "step": 8804 }, { "epoch": 2.313791178112786, "grad_norm": 0.59912109375, "learning_rate": 2.287541615559839e-05, "loss": 1.6282, "step": 8806 }, { "epoch": 2.314316681446448, "grad_norm": 0.5919666886329651, "learning_rate": 2.2857893814613633e-05, "loss": 1.5869, "step": 8808 }, { "epoch": 2.31484218478011, "grad_norm": 0.6073286533355713, "learning_rate": 2.2840371473628877e-05, "loss": 1.6117, "step": 8810 }, { "epoch": 2.3153676881137715, "grad_norm": 0.5764369368553162, "learning_rate": 2.282284913264412e-05, "loss": 1.6216, "step": 8812 }, { "epoch": 2.315893191447433, "grad_norm": 
0.5165627598762512, "learning_rate": 2.280532679165937e-05, "loss": 1.5775, "step": 8814 }, { "epoch": 2.316418694781095, "grad_norm": 0.6328101754188538, "learning_rate": 2.278780445067461e-05, "loss": 1.6164, "step": 8816 }, { "epoch": 2.316944198114757, "grad_norm": 0.5554302930831909, "learning_rate": 2.2770282109689857e-05, "loss": 1.5912, "step": 8818 }, { "epoch": 2.3174697014484185, "grad_norm": 0.5721133947372437, "learning_rate": 2.27527597687051e-05, "loss": 1.5831, "step": 8820 }, { "epoch": 2.3179952047820804, "grad_norm": 0.5364890694618225, "learning_rate": 2.2735237427720342e-05, "loss": 1.6334, "step": 8822 }, { "epoch": 2.318520708115742, "grad_norm": 0.5981882214546204, "learning_rate": 2.271771508673559e-05, "loss": 1.5971, "step": 8824 }, { "epoch": 2.319046211449404, "grad_norm": 0.5527270436286926, "learning_rate": 2.2700192745750834e-05, "loss": 1.5844, "step": 8826 }, { "epoch": 2.3195717147830655, "grad_norm": 0.5365408062934875, "learning_rate": 2.2682670404766078e-05, "loss": 1.6016, "step": 8828 }, { "epoch": 2.3200972181167274, "grad_norm": 0.5421087741851807, "learning_rate": 2.2665148063781322e-05, "loss": 1.6095, "step": 8830 }, { "epoch": 2.3206227214503894, "grad_norm": 0.6584184765815735, "learning_rate": 2.2647625722796566e-05, "loss": 1.6167, "step": 8832 }, { "epoch": 2.321148224784051, "grad_norm": 0.6406242251396179, "learning_rate": 2.2630103381811814e-05, "loss": 1.6178, "step": 8834 }, { "epoch": 2.321673728117713, "grad_norm": 0.5838053226470947, "learning_rate": 2.2612581040827055e-05, "loss": 1.596, "step": 8836 }, { "epoch": 2.3221992314513744, "grad_norm": 0.5230047702789307, "learning_rate": 2.25950586998423e-05, "loss": 1.6143, "step": 8838 }, { "epoch": 2.3227247347850364, "grad_norm": 0.698841392993927, "learning_rate": 2.2577536358857547e-05, "loss": 1.5636, "step": 8840 }, { "epoch": 2.323250238118698, "grad_norm": 0.6338639855384827, "learning_rate": 2.2560014017872787e-05, "loss": 1.6137, "step": 8842 }, { 
"epoch": 2.32377574145236, "grad_norm": 0.5631201863288879, "learning_rate": 2.2542491676888035e-05, "loss": 1.6118, "step": 8844 }, { "epoch": 2.324301244786022, "grad_norm": 0.5797097086906433, "learning_rate": 2.252496933590328e-05, "loss": 1.6117, "step": 8846 }, { "epoch": 2.3248267481196834, "grad_norm": 0.55458003282547, "learning_rate": 2.250744699491852e-05, "loss": 1.6074, "step": 8848 }, { "epoch": 2.3253522514533453, "grad_norm": 0.6182234883308411, "learning_rate": 2.2489924653933767e-05, "loss": 1.5855, "step": 8850 }, { "epoch": 2.325877754787007, "grad_norm": 0.7982610464096069, "learning_rate": 2.247240231294901e-05, "loss": 1.6479, "step": 8852 }, { "epoch": 2.326403258120669, "grad_norm": 0.5972685813903809, "learning_rate": 2.2454879971964256e-05, "loss": 1.6508, "step": 8854 }, { "epoch": 2.3269287614543304, "grad_norm": 0.5340309739112854, "learning_rate": 2.24373576309795e-05, "loss": 1.587, "step": 8856 }, { "epoch": 2.3274542647879923, "grad_norm": 0.598004162311554, "learning_rate": 2.2419835289994744e-05, "loss": 1.5807, "step": 8858 }, { "epoch": 2.327979768121654, "grad_norm": 0.5330363512039185, "learning_rate": 2.2402312949009988e-05, "loss": 1.5864, "step": 8860 }, { "epoch": 2.328505271455316, "grad_norm": 0.5319264531135559, "learning_rate": 2.2384790608025232e-05, "loss": 1.5815, "step": 8862 }, { "epoch": 2.3290307747889774, "grad_norm": 0.7008064389228821, "learning_rate": 2.2367268267040477e-05, "loss": 1.5968, "step": 8864 }, { "epoch": 2.3295562781226393, "grad_norm": 0.5392124056816101, "learning_rate": 2.2349745926055724e-05, "loss": 1.5889, "step": 8866 }, { "epoch": 2.3300817814563013, "grad_norm": 0.5684927701950073, "learning_rate": 2.2332223585070965e-05, "loss": 1.5789, "step": 8868 }, { "epoch": 2.330607284789963, "grad_norm": 0.5385169386863708, "learning_rate": 2.2314701244086213e-05, "loss": 1.6039, "step": 8870 }, { "epoch": 2.331132788123625, "grad_norm": 0.5375089645385742, "learning_rate": 
2.2297178903101457e-05, "loss": 1.6012, "step": 8872 }, { "epoch": 2.3316582914572863, "grad_norm": 0.5316143035888672, "learning_rate": 2.2279656562116698e-05, "loss": 1.6277, "step": 8874 }, { "epoch": 2.3321837947909483, "grad_norm": 0.5862188339233398, "learning_rate": 2.2262134221131945e-05, "loss": 1.5808, "step": 8876 }, { "epoch": 2.33270929812461, "grad_norm": 0.5072478652000427, "learning_rate": 2.224461188014719e-05, "loss": 1.6257, "step": 8878 }, { "epoch": 2.333234801458272, "grad_norm": 0.7240272164344788, "learning_rate": 2.2227089539162433e-05, "loss": 1.6551, "step": 8880 }, { "epoch": 2.3337603047919337, "grad_norm": 0.5968764424324036, "learning_rate": 2.2209567198177678e-05, "loss": 1.6277, "step": 8882 }, { "epoch": 2.3342858081255953, "grad_norm": 0.576076328754425, "learning_rate": 2.2192044857192922e-05, "loss": 1.6187, "step": 8884 }, { "epoch": 2.3348113114592572, "grad_norm": 0.7722670435905457, "learning_rate": 2.2174522516208166e-05, "loss": 1.6401, "step": 8886 }, { "epoch": 2.3353368147929188, "grad_norm": 0.5726610422134399, "learning_rate": 2.215700017522341e-05, "loss": 1.6182, "step": 8888 }, { "epoch": 2.3358623181265807, "grad_norm": 0.6139241456985474, "learning_rate": 2.2139477834238658e-05, "loss": 1.6314, "step": 8890 }, { "epoch": 2.3363878214602423, "grad_norm": 0.6056875586509705, "learning_rate": 2.21219554932539e-05, "loss": 1.603, "step": 8892 }, { "epoch": 2.3369133247939042, "grad_norm": 0.6434742212295532, "learning_rate": 2.2104433152269143e-05, "loss": 1.6344, "step": 8894 }, { "epoch": 2.3374388281275658, "grad_norm": 0.5169597268104553, "learning_rate": 2.208691081128439e-05, "loss": 1.5879, "step": 8896 }, { "epoch": 2.3379643314612277, "grad_norm": 0.5403237342834473, "learning_rate": 2.206938847029963e-05, "loss": 1.5932, "step": 8898 }, { "epoch": 2.3384898347948893, "grad_norm": 0.6257784962654114, "learning_rate": 2.205186612931488e-05, "loss": 1.6506, "step": 8900 }, { "epoch": 2.3390153381285512, 
"grad_norm": 0.5608735084533691, "learning_rate": 2.2034343788330123e-05, "loss": 1.587, "step": 8902 }, { "epoch": 2.339540841462213, "grad_norm": 0.6551423668861389, "learning_rate": 2.2016821447345367e-05, "loss": 1.5972, "step": 8904 }, { "epoch": 2.3400663447958747, "grad_norm": 0.5031147599220276, "learning_rate": 2.199929910636061e-05, "loss": 1.5885, "step": 8906 }, { "epoch": 2.3405918481295367, "grad_norm": 0.6937734484672546, "learning_rate": 2.1981776765375855e-05, "loss": 1.6201, "step": 8908 }, { "epoch": 2.341117351463198, "grad_norm": 0.6449792385101318, "learning_rate": 2.19642544243911e-05, "loss": 1.6335, "step": 8910 }, { "epoch": 2.34164285479686, "grad_norm": 0.8069351315498352, "learning_rate": 2.1946732083406344e-05, "loss": 1.6147, "step": 8912 }, { "epoch": 2.3421683581305217, "grad_norm": 0.5487890839576721, "learning_rate": 2.1929209742421588e-05, "loss": 1.6011, "step": 8914 }, { "epoch": 2.3426938614641837, "grad_norm": 0.5937405228614807, "learning_rate": 2.1911687401436835e-05, "loss": 1.6196, "step": 8916 }, { "epoch": 2.3432193647978456, "grad_norm": 0.5741852521896362, "learning_rate": 2.1894165060452076e-05, "loss": 1.5813, "step": 8918 }, { "epoch": 2.343744868131507, "grad_norm": 0.5851845741271973, "learning_rate": 2.187664271946732e-05, "loss": 1.5938, "step": 8920 }, { "epoch": 2.344270371465169, "grad_norm": 0.5910730361938477, "learning_rate": 2.1859120378482568e-05, "loss": 1.6274, "step": 8922 }, { "epoch": 2.3447958747988307, "grad_norm": 0.5576980710029602, "learning_rate": 2.184159803749781e-05, "loss": 1.6288, "step": 8924 }, { "epoch": 2.3453213781324926, "grad_norm": 0.5467607975006104, "learning_rate": 2.1824075696513056e-05, "loss": 1.5965, "step": 8926 }, { "epoch": 2.345846881466154, "grad_norm": 0.5948976874351501, "learning_rate": 2.18065533555283e-05, "loss": 1.6238, "step": 8928 }, { "epoch": 2.346372384799816, "grad_norm": 0.6754350066184998, "learning_rate": 2.178903101454354e-05, "loss": 1.6123, "step": 
8930 }, { "epoch": 2.3468978881334777, "grad_norm": 0.5642646551132202, "learning_rate": 2.177150867355879e-05, "loss": 1.6032, "step": 8932 }, { "epoch": 2.3474233914671396, "grad_norm": 0.5213932991027832, "learning_rate": 2.1753986332574033e-05, "loss": 1.6275, "step": 8934 }, { "epoch": 2.347948894800801, "grad_norm": 0.5514481663703918, "learning_rate": 2.1736463991589277e-05, "loss": 1.6281, "step": 8936 }, { "epoch": 2.348474398134463, "grad_norm": 0.6020217537879944, "learning_rate": 2.171894165060452e-05, "loss": 1.5991, "step": 8938 }, { "epoch": 2.348999901468125, "grad_norm": 0.6889709830284119, "learning_rate": 2.1701419309619765e-05, "loss": 1.5949, "step": 8940 }, { "epoch": 2.3495254048017866, "grad_norm": 0.5877057313919067, "learning_rate": 2.1683896968635013e-05, "loss": 1.5978, "step": 8942 }, { "epoch": 2.3500509081354486, "grad_norm": 0.7173901200294495, "learning_rate": 2.1666374627650254e-05, "loss": 1.594, "step": 8944 }, { "epoch": 2.35057641146911, "grad_norm": 0.5696520805358887, "learning_rate": 2.16488522866655e-05, "loss": 1.6291, "step": 8946 }, { "epoch": 2.351101914802772, "grad_norm": 0.5779030919075012, "learning_rate": 2.1631329945680746e-05, "loss": 1.6113, "step": 8948 }, { "epoch": 2.3516274181364336, "grad_norm": 0.5482315421104431, "learning_rate": 2.1613807604695986e-05, "loss": 1.5829, "step": 8950 }, { "epoch": 2.3521529214700956, "grad_norm": 0.6139046549797058, "learning_rate": 2.1596285263711234e-05, "loss": 1.6207, "step": 8952 }, { "epoch": 2.3526784248037576, "grad_norm": 0.569508969783783, "learning_rate": 2.1578762922726478e-05, "loss": 1.6108, "step": 8954 }, { "epoch": 2.353203928137419, "grad_norm": 0.5926774740219116, "learning_rate": 2.1561240581741722e-05, "loss": 1.6094, "step": 8956 }, { "epoch": 2.353729431471081, "grad_norm": 0.527396559715271, "learning_rate": 2.1543718240756966e-05, "loss": 1.6116, "step": 8958 }, { "epoch": 2.3542549348047426, "grad_norm": 0.589083731174469, "learning_rate": 
2.152619589977221e-05, "loss": 1.6393, "step": 8960 }, { "epoch": 2.3547804381384045, "grad_norm": 0.5615043044090271, "learning_rate": 2.1508673558787455e-05, "loss": 1.5967, "step": 8962 }, { "epoch": 2.355305941472066, "grad_norm": 0.5468254685401917, "learning_rate": 2.14911512178027e-05, "loss": 1.6055, "step": 8964 }, { "epoch": 2.355831444805728, "grad_norm": 0.5892741084098816, "learning_rate": 2.1473628876817943e-05, "loss": 1.6191, "step": 8966 }, { "epoch": 2.35635694813939, "grad_norm": 0.5186066627502441, "learning_rate": 2.1456106535833187e-05, "loss": 1.6039, "step": 8968 }, { "epoch": 2.3568824514730515, "grad_norm": 0.5194823741912842, "learning_rate": 2.143858419484843e-05, "loss": 1.6223, "step": 8970 }, { "epoch": 2.357407954806713, "grad_norm": 0.590596616268158, "learning_rate": 2.142106185386368e-05, "loss": 1.6085, "step": 8972 }, { "epoch": 2.357933458140375, "grad_norm": 0.5250555872917175, "learning_rate": 2.1403539512878923e-05, "loss": 1.5753, "step": 8974 }, { "epoch": 2.358458961474037, "grad_norm": 0.5621309280395508, "learning_rate": 2.1386017171894164e-05, "loss": 1.5993, "step": 8976 }, { "epoch": 2.3589844648076985, "grad_norm": 0.5648888945579529, "learning_rate": 2.136849483090941e-05, "loss": 1.6124, "step": 8978 }, { "epoch": 2.3595099681413605, "grad_norm": 0.6254675388336182, "learning_rate": 2.1350972489924656e-05, "loss": 1.6137, "step": 8980 }, { "epoch": 2.360035471475022, "grad_norm": 0.5874773859977722, "learning_rate": 2.13334501489399e-05, "loss": 1.6192, "step": 8982 }, { "epoch": 2.360560974808684, "grad_norm": 0.5968673229217529, "learning_rate": 2.1315927807955144e-05, "loss": 1.5798, "step": 8984 }, { "epoch": 2.3610864781423455, "grad_norm": 0.5419182181358337, "learning_rate": 2.1298405466970388e-05, "loss": 1.5893, "step": 8986 }, { "epoch": 2.3616119814760075, "grad_norm": 0.5870303511619568, "learning_rate": 2.1280883125985632e-05, "loss": 1.6178, "step": 8988 }, { "epoch": 2.3621374848096695, "grad_norm": 
0.5218934416770935, "learning_rate": 2.1263360785000877e-05, "loss": 1.5627, "step": 8990 }, { "epoch": 2.362662988143331, "grad_norm": 0.7437348365783691, "learning_rate": 2.124583844401612e-05, "loss": 1.633, "step": 8992 }, { "epoch": 2.363188491476993, "grad_norm": 0.6861118078231812, "learning_rate": 2.1228316103031365e-05, "loss": 1.6051, "step": 8994 }, { "epoch": 2.3637139948106545, "grad_norm": 0.7474240660667419, "learning_rate": 2.121079376204661e-05, "loss": 1.6053, "step": 8996 }, { "epoch": 2.3642394981443164, "grad_norm": 0.6993446946144104, "learning_rate": 2.1193271421061857e-05, "loss": 1.6337, "step": 8998 }, { "epoch": 2.364765001477978, "grad_norm": 0.5786541104316711, "learning_rate": 2.1175749080077097e-05, "loss": 1.6258, "step": 9000 }, { "epoch": 2.36529050481164, "grad_norm": 0.6487932205200195, "learning_rate": 2.115822673909234e-05, "loss": 1.5785, "step": 9002 }, { "epoch": 2.365816008145302, "grad_norm": 0.6424734592437744, "learning_rate": 2.114070439810759e-05, "loss": 1.6047, "step": 9004 }, { "epoch": 2.3663415114789634, "grad_norm": 0.5754727721214294, "learning_rate": 2.1123182057122833e-05, "loss": 1.6199, "step": 9006 }, { "epoch": 2.3668670148126254, "grad_norm": 0.6415467858314514, "learning_rate": 2.1105659716138078e-05, "loss": 1.6112, "step": 9008 }, { "epoch": 2.367392518146287, "grad_norm": 0.5382227897644043, "learning_rate": 2.1088137375153322e-05, "loss": 1.6199, "step": 9010 }, { "epoch": 2.367918021479949, "grad_norm": 0.5709097385406494, "learning_rate": 2.1070615034168566e-05, "loss": 1.6274, "step": 9012 }, { "epoch": 2.3684435248136104, "grad_norm": 0.584823489189148, "learning_rate": 2.105309269318381e-05, "loss": 1.6364, "step": 9014 }, { "epoch": 2.3689690281472724, "grad_norm": 0.5769666433334351, "learning_rate": 2.1035570352199054e-05, "loss": 1.613, "step": 9016 }, { "epoch": 2.369494531480934, "grad_norm": 0.6009288430213928, "learning_rate": 2.1018048011214302e-05, "loss": 1.6115, "step": 9018 }, { 
"epoch": 2.370020034814596, "grad_norm": 0.5430034399032593, "learning_rate": 2.1000525670229543e-05, "loss": 1.5935, "step": 9020 }, { "epoch": 2.3705455381482574, "grad_norm": 0.5354077816009521, "learning_rate": 2.0983003329244787e-05, "loss": 1.6303, "step": 9022 }, { "epoch": 2.3710710414819194, "grad_norm": 0.5706160664558411, "learning_rate": 2.0965480988260034e-05, "loss": 1.5922, "step": 9024 }, { "epoch": 2.3715965448155814, "grad_norm": 0.5503175854682922, "learning_rate": 2.0947958647275275e-05, "loss": 1.5995, "step": 9026 }, { "epoch": 2.372122048149243, "grad_norm": 0.5860917568206787, "learning_rate": 2.0930436306290523e-05, "loss": 1.617, "step": 9028 }, { "epoch": 2.372647551482905, "grad_norm": 0.5663593411445618, "learning_rate": 2.0912913965305767e-05, "loss": 1.6142, "step": 9030 }, { "epoch": 2.3731730548165664, "grad_norm": 0.5283660888671875, "learning_rate": 2.0895391624321008e-05, "loss": 1.6111, "step": 9032 }, { "epoch": 2.3736985581502283, "grad_norm": 0.6298090815544128, "learning_rate": 2.0877869283336255e-05, "loss": 1.6362, "step": 9034 }, { "epoch": 2.37422406148389, "grad_norm": 0.6019343137741089, "learning_rate": 2.08603469423515e-05, "loss": 1.6493, "step": 9036 }, { "epoch": 2.374749564817552, "grad_norm": 0.6397587656974792, "learning_rate": 2.0842824601366744e-05, "loss": 1.6063, "step": 9038 }, { "epoch": 2.375275068151214, "grad_norm": 0.634833037853241, "learning_rate": 2.0825302260381988e-05, "loss": 1.6041, "step": 9040 }, { "epoch": 2.3758005714848753, "grad_norm": 0.535504162311554, "learning_rate": 2.0807779919397232e-05, "loss": 1.6136, "step": 9042 }, { "epoch": 2.3763260748185373, "grad_norm": 0.5898988246917725, "learning_rate": 2.079025757841248e-05, "loss": 1.5947, "step": 9044 }, { "epoch": 2.376851578152199, "grad_norm": 0.6104636192321777, "learning_rate": 2.077273523742772e-05, "loss": 1.6149, "step": 9046 }, { "epoch": 2.377377081485861, "grad_norm": 0.6539409756660461, "learning_rate": 
2.0755212896442964e-05, "loss": 1.5934, "step": 9048 }, { "epoch": 2.3779025848195223, "grad_norm": 0.6595098972320557, "learning_rate": 2.0737690555458212e-05, "loss": 1.588, "step": 9050 }, { "epoch": 2.3784280881531843, "grad_norm": 0.5743740200996399, "learning_rate": 2.0720168214473453e-05, "loss": 1.6563, "step": 9052 }, { "epoch": 2.378953591486846, "grad_norm": 0.7335343956947327, "learning_rate": 2.07026458734887e-05, "loss": 1.6172, "step": 9054 }, { "epoch": 2.379479094820508, "grad_norm": 0.5948194861412048, "learning_rate": 2.0685123532503945e-05, "loss": 1.6038, "step": 9056 }, { "epoch": 2.3800045981541693, "grad_norm": 0.6061370968818665, "learning_rate": 2.0667601191519185e-05, "loss": 1.6064, "step": 9058 }, { "epoch": 2.3805301014878313, "grad_norm": 0.5088295936584473, "learning_rate": 2.0650078850534433e-05, "loss": 1.5974, "step": 9060 }, { "epoch": 2.3810556048214933, "grad_norm": 0.5611675977706909, "learning_rate": 2.0632556509549677e-05, "loss": 1.6219, "step": 9062 }, { "epoch": 2.381581108155155, "grad_norm": 0.7418251037597656, "learning_rate": 2.061503416856492e-05, "loss": 1.6091, "step": 9064 }, { "epoch": 2.3821066114888167, "grad_norm": 0.729642927646637, "learning_rate": 2.0597511827580165e-05, "loss": 1.5959, "step": 9066 }, { "epoch": 2.3826321148224783, "grad_norm": 0.564006507396698, "learning_rate": 2.057998948659541e-05, "loss": 1.6484, "step": 9068 }, { "epoch": 2.3831576181561402, "grad_norm": 0.5700123906135559, "learning_rate": 2.0562467145610654e-05, "loss": 1.61, "step": 9070 }, { "epoch": 2.3836831214898018, "grad_norm": 0.5242093801498413, "learning_rate": 2.0544944804625898e-05, "loss": 1.6011, "step": 9072 }, { "epoch": 2.3842086248234637, "grad_norm": 0.5738998651504517, "learning_rate": 2.0527422463641146e-05, "loss": 1.6154, "step": 9074 }, { "epoch": 2.3847341281571257, "grad_norm": 0.5821254253387451, "learning_rate": 2.050990012265639e-05, "loss": 1.6443, "step": 9076 }, { "epoch": 2.3852596314907872, 
"grad_norm": 0.6593937873840332, "learning_rate": 2.049237778167163e-05, "loss": 1.6266, "step": 9078 }, { "epoch": 2.385785134824449, "grad_norm": 0.5662251710891724, "learning_rate": 2.0474855440686878e-05, "loss": 1.6036, "step": 9080 }, { "epoch": 2.3863106381581107, "grad_norm": 0.6064842939376831, "learning_rate": 2.0457333099702122e-05, "loss": 1.6067, "step": 9082 }, { "epoch": 2.3868361414917727, "grad_norm": 0.5419683456420898, "learning_rate": 2.0439810758717366e-05, "loss": 1.6429, "step": 9084 }, { "epoch": 2.3873616448254342, "grad_norm": 0.630354642868042, "learning_rate": 2.042228841773261e-05, "loss": 1.6354, "step": 9086 }, { "epoch": 2.387887148159096, "grad_norm": 0.5797430276870728, "learning_rate": 2.0404766076747855e-05, "loss": 1.596, "step": 9088 }, { "epoch": 2.3884126514927577, "grad_norm": 0.7524275183677673, "learning_rate": 2.03872437357631e-05, "loss": 1.6484, "step": 9090 }, { "epoch": 2.3889381548264197, "grad_norm": 0.5289912819862366, "learning_rate": 2.0369721394778343e-05, "loss": 1.6089, "step": 9092 }, { "epoch": 2.389463658160081, "grad_norm": 0.610967755317688, "learning_rate": 2.0352199053793587e-05, "loss": 1.5842, "step": 9094 }, { "epoch": 2.389989161493743, "grad_norm": 0.5420004725456238, "learning_rate": 2.033467671280883e-05, "loss": 1.6497, "step": 9096 }, { "epoch": 2.390514664827405, "grad_norm": 0.5605329275131226, "learning_rate": 2.0317154371824076e-05, "loss": 1.5927, "step": 9098 }, { "epoch": 2.3910401681610667, "grad_norm": 0.7482327222824097, "learning_rate": 2.0299632030839323e-05, "loss": 1.6462, "step": 9100 }, { "epoch": 2.3915656714947287, "grad_norm": 0.536505937576294, "learning_rate": 2.0282109689854564e-05, "loss": 1.5756, "step": 9102 }, { "epoch": 2.39209117482839, "grad_norm": 0.6347717046737671, "learning_rate": 2.0264587348869808e-05, "loss": 1.5967, "step": 9104 }, { "epoch": 2.392616678162052, "grad_norm": 0.8039228320121765, "learning_rate": 2.0247065007885056e-05, "loss": 1.589, "step": 
9106 }, { "epoch": 2.3931421814957137, "grad_norm": 0.5720674395561218, "learning_rate": 2.02295426669003e-05, "loss": 1.6207, "step": 9108 }, { "epoch": 2.3936676848293756, "grad_norm": 0.538987398147583, "learning_rate": 2.0212020325915544e-05, "loss": 1.604, "step": 9110 }, { "epoch": 2.3941931881630376, "grad_norm": 0.5741251111030579, "learning_rate": 2.0194497984930788e-05, "loss": 1.6345, "step": 9112 }, { "epoch": 2.394718691496699, "grad_norm": 0.6911687254905701, "learning_rate": 2.0176975643946032e-05, "loss": 1.596, "step": 9114 }, { "epoch": 2.395244194830361, "grad_norm": 0.5507877469062805, "learning_rate": 2.0159453302961277e-05, "loss": 1.5973, "step": 9116 }, { "epoch": 2.3957696981640226, "grad_norm": 0.6160647869110107, "learning_rate": 2.014193096197652e-05, "loss": 1.6113, "step": 9118 }, { "epoch": 2.3962952014976846, "grad_norm": 0.6313309669494629, "learning_rate": 2.0124408620991765e-05, "loss": 1.596, "step": 9120 }, { "epoch": 2.396820704831346, "grad_norm": 0.6924790740013123, "learning_rate": 2.010688628000701e-05, "loss": 1.6417, "step": 9122 }, { "epoch": 2.397346208165008, "grad_norm": 0.5447295904159546, "learning_rate": 2.0089363939022253e-05, "loss": 1.6247, "step": 9124 }, { "epoch": 2.39787171149867, "grad_norm": 0.5692989230155945, "learning_rate": 2.00718415980375e-05, "loss": 1.5952, "step": 9126 }, { "epoch": 2.3983972148323316, "grad_norm": 0.6437897682189941, "learning_rate": 2.005431925705274e-05, "loss": 1.6339, "step": 9128 }, { "epoch": 2.398922718165993, "grad_norm": 0.6150215268135071, "learning_rate": 2.0036796916067986e-05, "loss": 1.6362, "step": 9130 }, { "epoch": 2.399448221499655, "grad_norm": 0.6019598841667175, "learning_rate": 2.0019274575083233e-05, "loss": 1.6287, "step": 9132 }, { "epoch": 2.399973724833317, "grad_norm": 0.578983724117279, "learning_rate": 2.0001752234098474e-05, "loss": 1.6199, "step": 9134 }, { "epoch": 2.4004992281669786, "grad_norm": 0.5352420210838318, "learning_rate": 
1.9984229893113722e-05, "loss": 1.6066, "step": 9136 }, { "epoch": 2.4010247315006406, "grad_norm": 0.6357502341270447, "learning_rate": 1.9966707552128966e-05, "loss": 1.6451, "step": 9138 }, { "epoch": 2.401550234834302, "grad_norm": 0.6802014708518982, "learning_rate": 1.994918521114421e-05, "loss": 1.6011, "step": 9140 }, { "epoch": 2.402075738167964, "grad_norm": 0.6203175187110901, "learning_rate": 1.9931662870159454e-05, "loss": 1.6168, "step": 9142 }, { "epoch": 2.4026012415016256, "grad_norm": 0.5582800507545471, "learning_rate": 1.99141405291747e-05, "loss": 1.6206, "step": 9144 }, { "epoch": 2.4031267448352875, "grad_norm": 0.6481848955154419, "learning_rate": 1.9896618188189946e-05, "loss": 1.6198, "step": 9146 }, { "epoch": 2.4036522481689495, "grad_norm": 0.5764234662055969, "learning_rate": 1.9879095847205187e-05, "loss": 1.6328, "step": 9148 }, { "epoch": 2.404177751502611, "grad_norm": 0.6788586378097534, "learning_rate": 1.986157350622043e-05, "loss": 1.6166, "step": 9150 }, { "epoch": 2.404703254836273, "grad_norm": 0.5868250727653503, "learning_rate": 1.984405116523568e-05, "loss": 1.5952, "step": 9152 }, { "epoch": 2.4052287581699345, "grad_norm": 0.6610315442085266, "learning_rate": 1.982652882425092e-05, "loss": 1.597, "step": 9154 }, { "epoch": 2.4057542615035965, "grad_norm": 0.7634231448173523, "learning_rate": 1.9809006483266167e-05, "loss": 1.6235, "step": 9156 }, { "epoch": 2.406279764837258, "grad_norm": 0.5905351638793945, "learning_rate": 1.979148414228141e-05, "loss": 1.6214, "step": 9158 }, { "epoch": 2.40680526817092, "grad_norm": 0.5715067982673645, "learning_rate": 1.9773961801296652e-05, "loss": 1.5922, "step": 9160 }, { "epoch": 2.407330771504582, "grad_norm": 0.593390941619873, "learning_rate": 1.97564394603119e-05, "loss": 1.6207, "step": 9162 }, { "epoch": 2.4078562748382435, "grad_norm": 0.5678249597549438, "learning_rate": 1.9738917119327144e-05, "loss": 1.6033, "step": 9164 }, { "epoch": 2.4083817781719055, "grad_norm": 
0.6406270861625671, "learning_rate": 1.9721394778342388e-05, "loss": 1.6363, "step": 9166 }, { "epoch": 2.408907281505567, "grad_norm": 0.5460156202316284, "learning_rate": 1.9703872437357632e-05, "loss": 1.6043, "step": 9168 }, { "epoch": 2.409432784839229, "grad_norm": 0.5826367735862732, "learning_rate": 1.9686350096372876e-05, "loss": 1.6166, "step": 9170 }, { "epoch": 2.4099582881728905, "grad_norm": 0.5770626068115234, "learning_rate": 1.966882775538812e-05, "loss": 1.5933, "step": 9172 }, { "epoch": 2.4104837915065525, "grad_norm": 0.5596659183502197, "learning_rate": 1.9651305414403364e-05, "loss": 1.6123, "step": 9174 }, { "epoch": 2.411009294840214, "grad_norm": 0.6251547336578369, "learning_rate": 1.963378307341861e-05, "loss": 1.6105, "step": 9176 }, { "epoch": 2.411534798173876, "grad_norm": 0.566064715385437, "learning_rate": 1.9616260732433856e-05, "loss": 1.6127, "step": 9178 }, { "epoch": 2.4120603015075375, "grad_norm": 0.5645999908447266, "learning_rate": 1.9598738391449097e-05, "loss": 1.6105, "step": 9180 }, { "epoch": 2.4125858048411994, "grad_norm": 0.5924414396286011, "learning_rate": 1.9581216050464345e-05, "loss": 1.6258, "step": 9182 }, { "epoch": 2.4131113081748614, "grad_norm": 0.5936945080757141, "learning_rate": 1.956369370947959e-05, "loss": 1.6088, "step": 9184 }, { "epoch": 2.413636811508523, "grad_norm": 0.711117148399353, "learning_rate": 1.954617136849483e-05, "loss": 1.5842, "step": 9186 }, { "epoch": 2.414162314842185, "grad_norm": 0.5747193694114685, "learning_rate": 1.9528649027510077e-05, "loss": 1.5934, "step": 9188 }, { "epoch": 2.4146878181758464, "grad_norm": 0.6013287901878357, "learning_rate": 1.951112668652532e-05, "loss": 1.5854, "step": 9190 }, { "epoch": 2.4152133215095084, "grad_norm": 0.680145263671875, "learning_rate": 1.9493604345540565e-05, "loss": 1.6323, "step": 9192 }, { "epoch": 2.41573882484317, "grad_norm": 0.6533603072166443, "learning_rate": 1.947608200455581e-05, "loss": 1.5944, "step": 9194 }, { 
"epoch": 2.416264328176832, "grad_norm": 0.549355149269104, "learning_rate": 1.9458559663571054e-05, "loss": 1.628, "step": 9196 }, { "epoch": 2.416789831510494, "grad_norm": 0.5181589722633362, "learning_rate": 1.9441037322586298e-05, "loss": 1.6072, "step": 9198 }, { "epoch": 2.4173153348441554, "grad_norm": 0.5603959560394287, "learning_rate": 1.9423514981601542e-05, "loss": 1.6101, "step": 9200 }, { "epoch": 2.4173153348441554, "eval_loss": 1.6572836637496948, "eval_runtime": 486.1392, "eval_samples_per_second": 250.523, "eval_steps_per_second": 31.316, "step": 9200 }, { "epoch": 2.4178408381778174, "grad_norm": 0.5568393468856812, "learning_rate": 1.940599264061679e-05, "loss": 1.5855, "step": 9202 }, { "epoch": 2.418366341511479, "grad_norm": 0.6060105562210083, "learning_rate": 1.938847029963203e-05, "loss": 1.6208, "step": 9204 }, { "epoch": 2.418891844845141, "grad_norm": 0.5444419384002686, "learning_rate": 1.9370947958647275e-05, "loss": 1.6137, "step": 9206 }, { "epoch": 2.4194173481788024, "grad_norm": 0.7899800539016724, "learning_rate": 1.9353425617662522e-05, "loss": 1.6181, "step": 9208 }, { "epoch": 2.4199428515124644, "grad_norm": 0.5947116017341614, "learning_rate": 1.9335903276677763e-05, "loss": 1.5974, "step": 9210 }, { "epoch": 2.420468354846126, "grad_norm": 0.6116278171539307, "learning_rate": 1.931838093569301e-05, "loss": 1.6519, "step": 9212 }, { "epoch": 2.420993858179788, "grad_norm": 0.645462691783905, "learning_rate": 1.9300858594708255e-05, "loss": 1.586, "step": 9214 }, { "epoch": 2.4215193615134494, "grad_norm": 0.5454298853874207, "learning_rate": 1.92833362537235e-05, "loss": 1.5734, "step": 9216 }, { "epoch": 2.4220448648471113, "grad_norm": 0.595712423324585, "learning_rate": 1.9265813912738743e-05, "loss": 1.6554, "step": 9218 }, { "epoch": 2.4225703681807733, "grad_norm": 0.5496743321418762, "learning_rate": 1.9248291571753987e-05, "loss": 1.6148, "step": 9220 }, { "epoch": 2.423095871514435, "grad_norm": 
0.5601218342781067, "learning_rate": 1.923076923076923e-05, "loss": 1.59, "step": 9222 }, { "epoch": 2.423621374848097, "grad_norm": 0.5808682441711426, "learning_rate": 1.9213246889784476e-05, "loss": 1.6106, "step": 9224 }, { "epoch": 2.4241468781817583, "grad_norm": 0.6002793908119202, "learning_rate": 1.919572454879972e-05, "loss": 1.6091, "step": 9226 }, { "epoch": 2.4246723815154203, "grad_norm": 0.5775554776191711, "learning_rate": 1.9178202207814967e-05, "loss": 1.5891, "step": 9228 }, { "epoch": 2.425197884849082, "grad_norm": 0.6747956871986389, "learning_rate": 1.9160679866830208e-05, "loss": 1.5877, "step": 9230 }, { "epoch": 2.425723388182744, "grad_norm": 0.549020528793335, "learning_rate": 1.9143157525845452e-05, "loss": 1.5931, "step": 9232 }, { "epoch": 2.4262488915164058, "grad_norm": 0.6531383395195007, "learning_rate": 1.91256351848607e-05, "loss": 1.6393, "step": 9234 }, { "epoch": 2.4267743948500673, "grad_norm": 0.7158095240592957, "learning_rate": 1.910811284387594e-05, "loss": 1.5868, "step": 9236 }, { "epoch": 2.4272998981837293, "grad_norm": 0.5585731267929077, "learning_rate": 1.9090590502891188e-05, "loss": 1.6519, "step": 9238 }, { "epoch": 2.427825401517391, "grad_norm": 0.6200599670410156, "learning_rate": 1.9073068161906432e-05, "loss": 1.6401, "step": 9240 }, { "epoch": 2.4283509048510528, "grad_norm": 0.5844702124595642, "learning_rate": 1.9055545820921673e-05, "loss": 1.5893, "step": 9242 }, { "epoch": 2.4288764081847143, "grad_norm": 0.5763403177261353, "learning_rate": 1.903802347993692e-05, "loss": 1.6056, "step": 9244 }, { "epoch": 2.4294019115183763, "grad_norm": 0.6376482844352722, "learning_rate": 1.9020501138952165e-05, "loss": 1.6083, "step": 9246 }, { "epoch": 2.429927414852038, "grad_norm": 0.6373746991157532, "learning_rate": 1.900297879796741e-05, "loss": 1.5721, "step": 9248 }, { "epoch": 2.4304529181856998, "grad_norm": 0.572599470615387, "learning_rate": 1.8985456456982653e-05, "loss": 1.6161, "step": 9250 }, { 
"epoch": 2.4309784215193613, "grad_norm": 0.5486949682235718, "learning_rate": 1.8967934115997897e-05, "loss": 1.591, "step": 9252 }, { "epoch": 2.4315039248530232, "grad_norm": 0.6981235146522522, "learning_rate": 1.8950411775013145e-05, "loss": 1.6399, "step": 9254 }, { "epoch": 2.432029428186685, "grad_norm": 0.7503864169120789, "learning_rate": 1.8932889434028386e-05, "loss": 1.6222, "step": 9256 }, { "epoch": 2.4325549315203467, "grad_norm": 0.5889126062393188, "learning_rate": 1.891536709304363e-05, "loss": 1.6176, "step": 9258 }, { "epoch": 2.4330804348540087, "grad_norm": 0.5790210962295532, "learning_rate": 1.8897844752058878e-05, "loss": 1.6321, "step": 9260 }, { "epoch": 2.4336059381876702, "grad_norm": 0.5340055823326111, "learning_rate": 1.888032241107412e-05, "loss": 1.6009, "step": 9262 }, { "epoch": 2.434131441521332, "grad_norm": 0.5527550578117371, "learning_rate": 1.8862800070089366e-05, "loss": 1.6173, "step": 9264 }, { "epoch": 2.4346569448549937, "grad_norm": 0.6926127672195435, "learning_rate": 1.884527772910461e-05, "loss": 1.5944, "step": 9266 }, { "epoch": 2.4351824481886557, "grad_norm": 0.5821343064308167, "learning_rate": 1.8827755388119854e-05, "loss": 1.6235, "step": 9268 }, { "epoch": 2.4357079515223177, "grad_norm": 0.5754803419113159, "learning_rate": 1.88102330471351e-05, "loss": 1.6216, "step": 9270 }, { "epoch": 2.436233454855979, "grad_norm": 0.5613455176353455, "learning_rate": 1.8792710706150343e-05, "loss": 1.607, "step": 9272 }, { "epoch": 2.436758958189641, "grad_norm": 0.5742526650428772, "learning_rate": 1.8775188365165587e-05, "loss": 1.5704, "step": 9274 }, { "epoch": 2.4372844615233027, "grad_norm": 0.5530628561973572, "learning_rate": 1.875766602418083e-05, "loss": 1.5849, "step": 9276 }, { "epoch": 2.4378099648569647, "grad_norm": 0.5322666168212891, "learning_rate": 1.8740143683196075e-05, "loss": 1.5777, "step": 9278 }, { "epoch": 2.438335468190626, "grad_norm": 0.5801582932472229, "learning_rate": 
1.8722621342211323e-05, "loss": 1.6277, "step": 9280 }, { "epoch": 2.438860971524288, "grad_norm": 0.6362050175666809, "learning_rate": 1.8705099001226563e-05, "loss": 1.6053, "step": 9282 }, { "epoch": 2.43938647485795, "grad_norm": 0.6227912306785583, "learning_rate": 1.868757666024181e-05, "loss": 1.6558, "step": 9284 }, { "epoch": 2.4399119781916117, "grad_norm": 0.5319087505340576, "learning_rate": 1.8670054319257055e-05, "loss": 1.6402, "step": 9286 }, { "epoch": 2.440437481525273, "grad_norm": 0.5639674067497253, "learning_rate": 1.8652531978272296e-05, "loss": 1.6191, "step": 9288 }, { "epoch": 2.440962984858935, "grad_norm": 0.5504940748214722, "learning_rate": 1.8635009637287544e-05, "loss": 1.6054, "step": 9290 }, { "epoch": 2.441488488192597, "grad_norm": 0.535571277141571, "learning_rate": 1.8617487296302788e-05, "loss": 1.5875, "step": 9292 }, { "epoch": 2.4420139915262586, "grad_norm": 0.5902593731880188, "learning_rate": 1.8599964955318032e-05, "loss": 1.5771, "step": 9294 }, { "epoch": 2.4425394948599206, "grad_norm": 0.5818725824356079, "learning_rate": 1.8582442614333276e-05, "loss": 1.5895, "step": 9296 }, { "epoch": 2.443064998193582, "grad_norm": 0.5598388314247131, "learning_rate": 1.856492027334852e-05, "loss": 1.619, "step": 9298 }, { "epoch": 2.443590501527244, "grad_norm": 0.6018016934394836, "learning_rate": 1.8547397932363764e-05, "loss": 1.5975, "step": 9300 }, { "epoch": 2.4441160048609056, "grad_norm": 0.5516964197158813, "learning_rate": 1.852987559137901e-05, "loss": 1.6073, "step": 9302 }, { "epoch": 2.4446415081945676, "grad_norm": 0.5745545625686646, "learning_rate": 1.8512353250394253e-05, "loss": 1.6345, "step": 9304 }, { "epoch": 2.4451670115282296, "grad_norm": 0.5729696750640869, "learning_rate": 1.8494830909409497e-05, "loss": 1.6356, "step": 9306 }, { "epoch": 2.445692514861891, "grad_norm": 0.6375647783279419, "learning_rate": 1.847730856842474e-05, "loss": 1.6262, "step": 9308 }, { "epoch": 2.446218018195553, 
"grad_norm": 0.599912166595459, "learning_rate": 1.845978622743999e-05, "loss": 1.5833, "step": 9310 }, { "epoch": 2.4467435215292146, "grad_norm": 0.6265223026275635, "learning_rate": 1.844226388645523e-05, "loss": 1.5683, "step": 9312 }, { "epoch": 2.4472690248628766, "grad_norm": 0.5674051642417908, "learning_rate": 1.8424741545470474e-05, "loss": 1.6312, "step": 9314 }, { "epoch": 2.447794528196538, "grad_norm": 0.5264484286308289, "learning_rate": 1.840721920448572e-05, "loss": 1.6509, "step": 9316 }, { "epoch": 2.4483200315302, "grad_norm": 0.5643804669380188, "learning_rate": 1.8389696863500965e-05, "loss": 1.6002, "step": 9318 }, { "epoch": 2.448845534863862, "grad_norm": 0.5581643581390381, "learning_rate": 1.837217452251621e-05, "loss": 1.602, "step": 9320 }, { "epoch": 2.4493710381975236, "grad_norm": 0.5524643063545227, "learning_rate": 1.8354652181531454e-05, "loss": 1.6092, "step": 9322 }, { "epoch": 2.4498965415311855, "grad_norm": 0.7081454992294312, "learning_rate": 1.8337129840546698e-05, "loss": 1.5703, "step": 9324 }, { "epoch": 2.450422044864847, "grad_norm": 0.6080058217048645, "learning_rate": 1.8319607499561942e-05, "loss": 1.6261, "step": 9326 }, { "epoch": 2.450947548198509, "grad_norm": 0.5197411775588989, "learning_rate": 1.8302085158577186e-05, "loss": 1.6313, "step": 9328 }, { "epoch": 2.4514730515321705, "grad_norm": 0.5948798060417175, "learning_rate": 1.8284562817592434e-05, "loss": 1.6415, "step": 9330 }, { "epoch": 2.4519985548658325, "grad_norm": 0.4755839705467224, "learning_rate": 1.8267040476607675e-05, "loss": 1.5945, "step": 9332 }, { "epoch": 2.452524058199494, "grad_norm": 0.577564537525177, "learning_rate": 1.824951813562292e-05, "loss": 1.6056, "step": 9334 }, { "epoch": 2.453049561533156, "grad_norm": 0.5504733324050903, "learning_rate": 1.8231995794638166e-05, "loss": 1.5843, "step": 9336 }, { "epoch": 2.4535750648668175, "grad_norm": 0.5584169626235962, "learning_rate": 1.8214473453653407e-05, "loss": 1.6318, "step": 
9338 }, { "epoch": 2.4541005682004795, "grad_norm": 0.5598637461662292, "learning_rate": 1.8196951112668655e-05, "loss": 1.5763, "step": 9340 }, { "epoch": 2.4546260715341415, "grad_norm": 0.6208490133285522, "learning_rate": 1.81794287716839e-05, "loss": 1.6339, "step": 9342 }, { "epoch": 2.455151574867803, "grad_norm": 0.5243476033210754, "learning_rate": 1.816190643069914e-05, "loss": 1.5918, "step": 9344 }, { "epoch": 2.455677078201465, "grad_norm": 0.6066216826438904, "learning_rate": 1.8144384089714387e-05, "loss": 1.6419, "step": 9346 }, { "epoch": 2.4562025815351265, "grad_norm": 0.5985385179519653, "learning_rate": 1.812686174872963e-05, "loss": 1.6074, "step": 9348 }, { "epoch": 2.4567280848687885, "grad_norm": 0.6051018834114075, "learning_rate": 1.8109339407744876e-05, "loss": 1.6076, "step": 9350 }, { "epoch": 2.45725358820245, "grad_norm": 0.5187597274780273, "learning_rate": 1.809181706676012e-05, "loss": 1.6128, "step": 9352 }, { "epoch": 2.457779091536112, "grad_norm": 0.576802670955658, "learning_rate": 1.8074294725775364e-05, "loss": 1.6185, "step": 9354 }, { "epoch": 2.458304594869774, "grad_norm": 0.6253780722618103, "learning_rate": 1.805677238479061e-05, "loss": 1.5805, "step": 9356 }, { "epoch": 2.4588300982034355, "grad_norm": 0.5701897740364075, "learning_rate": 1.8039250043805852e-05, "loss": 1.622, "step": 9358 }, { "epoch": 2.4593556015370974, "grad_norm": 0.5329028367996216, "learning_rate": 1.8021727702821096e-05, "loss": 1.616, "step": 9360 }, { "epoch": 2.459881104870759, "grad_norm": 0.6378101706504822, "learning_rate": 1.8004205361836344e-05, "loss": 1.6283, "step": 9362 }, { "epoch": 2.460406608204421, "grad_norm": 0.5629276633262634, "learning_rate": 1.7986683020851585e-05, "loss": 1.5953, "step": 9364 }, { "epoch": 2.4609321115380824, "grad_norm": 0.55552738904953, "learning_rate": 1.7969160679866832e-05, "loss": 1.588, "step": 9366 }, { "epoch": 2.4614576148717444, "grad_norm": 0.6844131350517273, "learning_rate": 
1.7951638338882077e-05, "loss": 1.6477, "step": 9368 }, { "epoch": 2.461983118205406, "grad_norm": 0.5406688451766968, "learning_rate": 1.7934115997897317e-05, "loss": 1.6109, "step": 9370 }, { "epoch": 2.462508621539068, "grad_norm": 0.5940281748771667, "learning_rate": 1.7916593656912565e-05, "loss": 1.5788, "step": 9372 }, { "epoch": 2.4630341248727294, "grad_norm": 0.5564101338386536, "learning_rate": 1.789907131592781e-05, "loss": 1.5828, "step": 9374 }, { "epoch": 2.4635596282063914, "grad_norm": 0.8060266971588135, "learning_rate": 1.7881548974943053e-05, "loss": 1.5922, "step": 9376 }, { "epoch": 2.4640851315400534, "grad_norm": 0.6775248646736145, "learning_rate": 1.7864026633958297e-05, "loss": 1.6292, "step": 9378 }, { "epoch": 2.464610634873715, "grad_norm": 0.5210546851158142, "learning_rate": 1.784650429297354e-05, "loss": 1.5921, "step": 9380 }, { "epoch": 2.465136138207377, "grad_norm": 0.5210246443748474, "learning_rate": 1.7828981951988786e-05, "loss": 1.5947, "step": 9382 }, { "epoch": 2.4656616415410384, "grad_norm": 0.5449514985084534, "learning_rate": 1.781145961100403e-05, "loss": 1.5716, "step": 9384 }, { "epoch": 2.4661871448747004, "grad_norm": 0.5264145135879517, "learning_rate": 1.7793937270019274e-05, "loss": 1.5807, "step": 9386 }, { "epoch": 2.466712648208362, "grad_norm": 0.6389797329902649, "learning_rate": 1.777641492903452e-05, "loss": 1.5949, "step": 9388 }, { "epoch": 2.467238151542024, "grad_norm": 0.5606738924980164, "learning_rate": 1.7758892588049762e-05, "loss": 1.5973, "step": 9390 }, { "epoch": 2.467763654875686, "grad_norm": 0.634388267993927, "learning_rate": 1.774137024706501e-05, "loss": 1.6466, "step": 9392 }, { "epoch": 2.4682891582093474, "grad_norm": 0.6049827337265015, "learning_rate": 1.7723847906080254e-05, "loss": 1.6121, "step": 9394 }, { "epoch": 2.4688146615430093, "grad_norm": 0.6313089728355408, "learning_rate": 1.77063255650955e-05, "loss": 1.6176, "step": 9396 }, { "epoch": 2.469340164876671, 
"grad_norm": 0.6693941950798035, "learning_rate": 1.7688803224110743e-05, "loss": 1.6162, "step": 9398 }, { "epoch": 2.469865668210333, "grad_norm": 0.5389882326126099, "learning_rate": 1.7671280883125987e-05, "loss": 1.5935, "step": 9400 }, { "epoch": 2.4703911715439943, "grad_norm": 0.6095534563064575, "learning_rate": 1.765375854214123e-05, "loss": 1.624, "step": 9402 }, { "epoch": 2.4709166748776563, "grad_norm": 0.5612223744392395, "learning_rate": 1.7636236201156475e-05, "loss": 1.64, "step": 9404 }, { "epoch": 2.471442178211318, "grad_norm": 0.58417147397995, "learning_rate": 1.761871386017172e-05, "loss": 1.5979, "step": 9406 }, { "epoch": 2.47196768154498, "grad_norm": 0.5258997082710266, "learning_rate": 1.7601191519186963e-05, "loss": 1.5915, "step": 9408 }, { "epoch": 2.4724931848786413, "grad_norm": 0.5420629382133484, "learning_rate": 1.7583669178202208e-05, "loss": 1.6292, "step": 9410 }, { "epoch": 2.4730186882123033, "grad_norm": 0.5879486799240112, "learning_rate": 1.7566146837217455e-05, "loss": 1.5718, "step": 9412 }, { "epoch": 2.4735441915459653, "grad_norm": 0.5970991253852844, "learning_rate": 1.7548624496232696e-05, "loss": 1.6217, "step": 9414 }, { "epoch": 2.474069694879627, "grad_norm": 0.5610321760177612, "learning_rate": 1.753110215524794e-05, "loss": 1.6234, "step": 9416 }, { "epoch": 2.4745951982132888, "grad_norm": 0.6712002158164978, "learning_rate": 1.7513579814263188e-05, "loss": 1.5815, "step": 9418 }, { "epoch": 2.4751207015469503, "grad_norm": 0.5543602108955383, "learning_rate": 1.7496057473278432e-05, "loss": 1.616, "step": 9420 }, { "epoch": 2.4756462048806123, "grad_norm": 0.6275373101234436, "learning_rate": 1.7478535132293676e-05, "loss": 1.61, "step": 9422 }, { "epoch": 2.476171708214274, "grad_norm": 0.6385626792907715, "learning_rate": 1.746101279130892e-05, "loss": 1.5826, "step": 9424 }, { "epoch": 2.4766972115479358, "grad_norm": 0.5762410759925842, "learning_rate": 1.7443490450324164e-05, "loss": 1.5961, "step": 
9426 }, { "epoch": 2.4772227148815977, "grad_norm": 0.5379599332809448, "learning_rate": 1.742596810933941e-05, "loss": 1.6381, "step": 9428 }, { "epoch": 2.4777482182152593, "grad_norm": 0.5608949661254883, "learning_rate": 1.7408445768354653e-05, "loss": 1.6073, "step": 9430 }, { "epoch": 2.4782737215489212, "grad_norm": 0.5401448607444763, "learning_rate": 1.7390923427369897e-05, "loss": 1.6178, "step": 9432 }, { "epoch": 2.4787992248825828, "grad_norm": 0.648591935634613, "learning_rate": 1.737340108638514e-05, "loss": 1.5938, "step": 9434 }, { "epoch": 2.4793247282162447, "grad_norm": 0.710050106048584, "learning_rate": 1.7355878745400385e-05, "loss": 1.6104, "step": 9436 }, { "epoch": 2.4798502315499062, "grad_norm": 0.5923548340797424, "learning_rate": 1.7338356404415633e-05, "loss": 1.6235, "step": 9438 }, { "epoch": 2.480375734883568, "grad_norm": 0.6905649900436401, "learning_rate": 1.7320834063430874e-05, "loss": 1.5891, "step": 9440 }, { "epoch": 2.48090123821723, "grad_norm": 0.6398130059242249, "learning_rate": 1.7303311722446118e-05, "loss": 1.5641, "step": 9442 }, { "epoch": 2.4814267415508917, "grad_norm": 0.6153111457824707, "learning_rate": 1.7285789381461365e-05, "loss": 1.6005, "step": 9444 }, { "epoch": 2.4819522448845532, "grad_norm": 0.5550106763839722, "learning_rate": 1.7268267040476606e-05, "loss": 1.5992, "step": 9446 }, { "epoch": 2.482477748218215, "grad_norm": 0.5491114258766174, "learning_rate": 1.7250744699491854e-05, "loss": 1.5753, "step": 9448 }, { "epoch": 2.483003251551877, "grad_norm": 0.5267966985702515, "learning_rate": 1.7233222358507098e-05, "loss": 1.5962, "step": 9450 }, { "epoch": 2.4835287548855387, "grad_norm": 0.6979205012321472, "learning_rate": 1.7215700017522342e-05, "loss": 1.5966, "step": 9452 }, { "epoch": 2.4840542582192007, "grad_norm": 0.5285736918449402, "learning_rate": 1.7198177676537586e-05, "loss": 1.5854, "step": 9454 }, { "epoch": 2.484579761552862, "grad_norm": 0.5110975503921509, "learning_rate": 
1.718065533555283e-05, "loss": 1.6251, "step": 9456 }, { "epoch": 2.485105264886524, "grad_norm": 0.5918470025062561, "learning_rate": 1.7163132994568078e-05, "loss": 1.5997, "step": 9458 }, { "epoch": 2.4856307682201857, "grad_norm": 0.5723440051078796, "learning_rate": 1.714561065358332e-05, "loss": 1.5997, "step": 9460 }, { "epoch": 2.4861562715538477, "grad_norm": 0.7100077271461487, "learning_rate": 1.7128088312598563e-05, "loss": 1.5781, "step": 9462 }, { "epoch": 2.4866817748875096, "grad_norm": 0.6034718155860901, "learning_rate": 1.711056597161381e-05, "loss": 1.6122, "step": 9464 }, { "epoch": 2.487207278221171, "grad_norm": 0.6209121346473694, "learning_rate": 1.709304363062905e-05, "loss": 1.6385, "step": 9466 }, { "epoch": 2.487732781554833, "grad_norm": 0.6605438590049744, "learning_rate": 1.70755212896443e-05, "loss": 1.5764, "step": 9468 }, { "epoch": 2.4882582848884947, "grad_norm": 0.5605977773666382, "learning_rate": 1.7057998948659543e-05, "loss": 1.5951, "step": 9470 }, { "epoch": 2.4887837882221566, "grad_norm": 0.5933828949928284, "learning_rate": 1.7040476607674784e-05, "loss": 1.6321, "step": 9472 }, { "epoch": 2.489309291555818, "grad_norm": 0.6970359683036804, "learning_rate": 1.702295426669003e-05, "loss": 1.6462, "step": 9474 }, { "epoch": 2.48983479488948, "grad_norm": 0.5877122282981873, "learning_rate": 1.7005431925705276e-05, "loss": 1.6058, "step": 9476 }, { "epoch": 2.490360298223142, "grad_norm": 0.8348531126976013, "learning_rate": 1.698790958472052e-05, "loss": 1.6066, "step": 9478 }, { "epoch": 2.4908858015568036, "grad_norm": 0.5285534262657166, "learning_rate": 1.6970387243735764e-05, "loss": 1.6145, "step": 9480 }, { "epoch": 2.4914113048904656, "grad_norm": 0.5586251020431519, "learning_rate": 1.6952864902751008e-05, "loss": 1.618, "step": 9482 }, { "epoch": 2.491936808224127, "grad_norm": 0.6798741817474365, "learning_rate": 1.6935342561766252e-05, "loss": 1.5923, "step": 9484 }, { "epoch": 2.492462311557789, "grad_norm": 
0.6360746622085571, "learning_rate": 1.6917820220781496e-05, "loss": 1.5958, "step": 9486 }, { "epoch": 2.4929878148914506, "grad_norm": 0.6008504629135132, "learning_rate": 1.690029787979674e-05, "loss": 1.6299, "step": 9488 }, { "epoch": 2.4935133182251126, "grad_norm": 0.6286047697067261, "learning_rate": 1.6882775538811988e-05, "loss": 1.6249, "step": 9490 }, { "epoch": 2.494038821558774, "grad_norm": 0.5566932559013367, "learning_rate": 1.686525319782723e-05, "loss": 1.6175, "step": 9492 }, { "epoch": 2.494564324892436, "grad_norm": 0.5241519212722778, "learning_rate": 1.6847730856842477e-05, "loss": 1.5978, "step": 9494 }, { "epoch": 2.4950898282260976, "grad_norm": 0.5487810969352722, "learning_rate": 1.683020851585772e-05, "loss": 1.6114, "step": 9496 }, { "epoch": 2.4956153315597596, "grad_norm": 0.518281877040863, "learning_rate": 1.681268617487296e-05, "loss": 1.5993, "step": 9498 }, { "epoch": 2.4961408348934215, "grad_norm": 0.5262603163719177, "learning_rate": 1.679516383388821e-05, "loss": 1.5982, "step": 9500 }, { "epoch": 2.496666338227083, "grad_norm": 0.560440182685852, "learning_rate": 1.6777641492903453e-05, "loss": 1.6281, "step": 9502 }, { "epoch": 2.497191841560745, "grad_norm": 0.6901085376739502, "learning_rate": 1.6760119151918697e-05, "loss": 1.6069, "step": 9504 }, { "epoch": 2.4977173448944066, "grad_norm": 0.6319709420204163, "learning_rate": 1.674259681093394e-05, "loss": 1.6053, "step": 9506 }, { "epoch": 2.4982428482280685, "grad_norm": 0.6285496354103088, "learning_rate": 1.6725074469949186e-05, "loss": 1.6269, "step": 9508 }, { "epoch": 2.49876835156173, "grad_norm": 0.5803909301757812, "learning_rate": 1.670755212896443e-05, "loss": 1.5917, "step": 9510 }, { "epoch": 2.499293854895392, "grad_norm": 0.519927978515625, "learning_rate": 1.6690029787979674e-05, "loss": 1.6247, "step": 9512 }, { "epoch": 2.499819358229054, "grad_norm": 0.5453500747680664, "learning_rate": 1.6672507446994918e-05, "loss": 1.582, "step": 9514 }, { 
"epoch": 2.5003448615627155, "grad_norm": 0.7243382930755615, "learning_rate": 1.6654985106010162e-05, "loss": 1.5941, "step": 9516 }, { "epoch": 2.500870364896377, "grad_norm": 0.6101195812225342, "learning_rate": 1.6637462765025407e-05, "loss": 1.5646, "step": 9518 }, { "epoch": 2.501395868230039, "grad_norm": 0.5308881402015686, "learning_rate": 1.6619940424040654e-05, "loss": 1.5863, "step": 9520 }, { "epoch": 2.501921371563701, "grad_norm": 0.624251663684845, "learning_rate": 1.66024180830559e-05, "loss": 1.5914, "step": 9522 }, { "epoch": 2.5024468748973625, "grad_norm": 0.6056520938873291, "learning_rate": 1.6584895742071143e-05, "loss": 1.5891, "step": 9524 }, { "epoch": 2.5029723782310245, "grad_norm": 0.6396739482879639, "learning_rate": 1.6567373401086387e-05, "loss": 1.6248, "step": 9526 }, { "epoch": 2.5034978815646864, "grad_norm": 0.5661067962646484, "learning_rate": 1.654985106010163e-05, "loss": 1.5923, "step": 9528 }, { "epoch": 2.504023384898348, "grad_norm": 0.601337194442749, "learning_rate": 1.6532328719116875e-05, "loss": 1.6074, "step": 9530 }, { "epoch": 2.5045488882320095, "grad_norm": 0.603577733039856, "learning_rate": 1.651480637813212e-05, "loss": 1.6247, "step": 9532 }, { "epoch": 2.5050743915656715, "grad_norm": 0.5598317980766296, "learning_rate": 1.6497284037147363e-05, "loss": 1.6353, "step": 9534 }, { "epoch": 2.5055998948993334, "grad_norm": 0.6507828831672668, "learning_rate": 1.6479761696162608e-05, "loss": 1.6215, "step": 9536 }, { "epoch": 2.506125398232995, "grad_norm": 0.5373896360397339, "learning_rate": 1.6462239355177852e-05, "loss": 1.5898, "step": 9538 }, { "epoch": 2.506650901566657, "grad_norm": 0.6569491028785706, "learning_rate": 1.64447170141931e-05, "loss": 1.6061, "step": 9540 }, { "epoch": 2.5071764049003185, "grad_norm": 0.6949899196624756, "learning_rate": 1.642719467320834e-05, "loss": 1.5774, "step": 9542 }, { "epoch": 2.5077019082339804, "grad_norm": 0.6420979499816895, "learning_rate": 
1.6409672332223584e-05, "loss": 1.5848, "step": 9544 }, { "epoch": 2.508227411567642, "grad_norm": 0.5891046524047852, "learning_rate": 1.6392149991238832e-05, "loss": 1.6175, "step": 9546 }, { "epoch": 2.508752914901304, "grad_norm": 0.5385305285453796, "learning_rate": 1.6374627650254073e-05, "loss": 1.6017, "step": 9548 }, { "epoch": 2.509278418234966, "grad_norm": 0.5676510334014893, "learning_rate": 1.635710530926932e-05, "loss": 1.5919, "step": 9550 }, { "epoch": 2.5098039215686274, "grad_norm": 0.6153396964073181, "learning_rate": 1.6339582968284564e-05, "loss": 1.6062, "step": 9552 }, { "epoch": 2.5103294249022894, "grad_norm": 0.625535786151886, "learning_rate": 1.6322060627299805e-05, "loss": 1.6017, "step": 9554 }, { "epoch": 2.510854928235951, "grad_norm": 0.5933648347854614, "learning_rate": 1.6304538286315053e-05, "loss": 1.6464, "step": 9556 }, { "epoch": 2.511380431569613, "grad_norm": 0.5222533941268921, "learning_rate": 1.6287015945330297e-05, "loss": 1.6486, "step": 9558 }, { "epoch": 2.5119059349032744, "grad_norm": 0.511766791343689, "learning_rate": 1.626949360434554e-05, "loss": 1.6111, "step": 9560 }, { "epoch": 2.5124314382369364, "grad_norm": 0.6270935535430908, "learning_rate": 1.6251971263360785e-05, "loss": 1.6497, "step": 9562 }, { "epoch": 2.5129569415705983, "grad_norm": 0.6762127876281738, "learning_rate": 1.623444892237603e-05, "loss": 1.5915, "step": 9564 }, { "epoch": 2.51348244490426, "grad_norm": 0.7019566297531128, "learning_rate": 1.6216926581391277e-05, "loss": 1.5817, "step": 9566 }, { "epoch": 2.5140079482379214, "grad_norm": 0.6118047833442688, "learning_rate": 1.6199404240406518e-05, "loss": 1.6241, "step": 9568 }, { "epoch": 2.5145334515715834, "grad_norm": 0.5920790433883667, "learning_rate": 1.6181881899421762e-05, "loss": 1.607, "step": 9570 }, { "epoch": 2.5150589549052453, "grad_norm": 0.5979636311531067, "learning_rate": 1.616435955843701e-05, "loss": 1.5874, "step": 9572 }, { "epoch": 2.515584458238907, 
"grad_norm": 0.6232622265815735, "learning_rate": 1.614683721745225e-05, "loss": 1.5878, "step": 9574 }, { "epoch": 2.516109961572569, "grad_norm": 0.5250893235206604, "learning_rate": 1.6129314876467498e-05, "loss": 1.5994, "step": 9576 }, { "epoch": 2.5166354649062304, "grad_norm": 0.5512763857841492, "learning_rate": 1.6111792535482742e-05, "loss": 1.6041, "step": 9578 }, { "epoch": 2.5171609682398923, "grad_norm": 0.5368601083755493, "learning_rate": 1.6094270194497986e-05, "loss": 1.6131, "step": 9580 }, { "epoch": 2.517686471573554, "grad_norm": 0.6187223792076111, "learning_rate": 1.607674785351323e-05, "loss": 1.6238, "step": 9582 }, { "epoch": 2.518211974907216, "grad_norm": 0.5821846127510071, "learning_rate": 1.6059225512528475e-05, "loss": 1.6262, "step": 9584 }, { "epoch": 2.518737478240878, "grad_norm": 0.5627323985099792, "learning_rate": 1.604170317154372e-05, "loss": 1.6379, "step": 9586 }, { "epoch": 2.5192629815745393, "grad_norm": 0.5581722259521484, "learning_rate": 1.6024180830558963e-05, "loss": 1.5849, "step": 9588 }, { "epoch": 2.5197884849082013, "grad_norm": 0.677148163318634, "learning_rate": 1.6006658489574207e-05, "loss": 1.6116, "step": 9590 }, { "epoch": 2.520313988241863, "grad_norm": 0.5418302416801453, "learning_rate": 1.5989136148589455e-05, "loss": 1.5981, "step": 9592 }, { "epoch": 2.520839491575525, "grad_norm": 0.5710042715072632, "learning_rate": 1.5971613807604695e-05, "loss": 1.5515, "step": 9594 }, { "epoch": 2.5213649949091863, "grad_norm": 0.5451944470405579, "learning_rate": 1.5954091466619943e-05, "loss": 1.603, "step": 9596 }, { "epoch": 2.5218904982428483, "grad_norm": 0.7278261780738831, "learning_rate": 1.5936569125635187e-05, "loss": 1.6079, "step": 9598 }, { "epoch": 2.5224160015765102, "grad_norm": 0.5918084979057312, "learning_rate": 1.5919046784650428e-05, "loss": 1.6009, "step": 9600 }, { "epoch": 2.5224160015765102, "eval_loss": 1.6515436172485352, "eval_runtime": 487.0879, "eval_samples_per_second": 
250.035, "eval_steps_per_second": 31.255, "step": 9600 }, { "epoch": 2.5229415049101718, "grad_norm": 0.6685560345649719, "learning_rate": 1.5901524443665676e-05, "loss": 1.6337, "step": 9602 }, { "epoch": 2.5234670082438333, "grad_norm": 0.579550564289093, "learning_rate": 1.588400210268092e-05, "loss": 1.5911, "step": 9604 }, { "epoch": 2.5239925115774953, "grad_norm": 0.522769570350647, "learning_rate": 1.5866479761696164e-05, "loss": 1.612, "step": 9606 }, { "epoch": 2.5245180149111572, "grad_norm": 0.6259097456932068, "learning_rate": 1.5848957420711408e-05, "loss": 1.6076, "step": 9608 }, { "epoch": 2.5250435182448188, "grad_norm": 0.5262504816055298, "learning_rate": 1.5831435079726652e-05, "loss": 1.5794, "step": 9610 }, { "epoch": 2.5255690215784807, "grad_norm": 0.6325481534004211, "learning_rate": 1.5813912738741896e-05, "loss": 1.5997, "step": 9612 }, { "epoch": 2.5260945249121423, "grad_norm": 0.7307614088058472, "learning_rate": 1.579639039775714e-05, "loss": 1.624, "step": 9614 }, { "epoch": 2.5266200282458042, "grad_norm": 0.5761781334877014, "learning_rate": 1.5778868056772385e-05, "loss": 1.6469, "step": 9616 }, { "epoch": 2.5271455315794658, "grad_norm": 0.543783962726593, "learning_rate": 1.576134571578763e-05, "loss": 1.613, "step": 9618 }, { "epoch": 2.5276710349131277, "grad_norm": 0.5635935068130493, "learning_rate": 1.5743823374802873e-05, "loss": 1.6249, "step": 9620 }, { "epoch": 2.5281965382467897, "grad_norm": 0.5564760565757751, "learning_rate": 1.572630103381812e-05, "loss": 1.5941, "step": 9622 }, { "epoch": 2.528722041580451, "grad_norm": 0.7148152589797974, "learning_rate": 1.5708778692833365e-05, "loss": 1.6357, "step": 9624 }, { "epoch": 2.529247544914113, "grad_norm": 0.7149800062179565, "learning_rate": 1.5691256351848606e-05, "loss": 1.6298, "step": 9626 }, { "epoch": 2.5297730482477747, "grad_norm": 0.6550901532173157, "learning_rate": 1.5673734010863853e-05, "loss": 1.6008, "step": 9628 }, { "epoch": 2.5302985515814367, 
"grad_norm": 0.5404581427574158, "learning_rate": 1.5656211669879097e-05, "loss": 1.5608, "step": 9630 }, { "epoch": 2.530824054915098, "grad_norm": 0.674629807472229, "learning_rate": 1.563868932889434e-05, "loss": 1.6215, "step": 9632 }, { "epoch": 2.53134955824876, "grad_norm": 0.6244335770606995, "learning_rate": 1.5621166987909586e-05, "loss": 1.6011, "step": 9634 }, { "epoch": 2.531875061582422, "grad_norm": 0.6089237332344055, "learning_rate": 1.560364464692483e-05, "loss": 1.5962, "step": 9636 }, { "epoch": 2.5324005649160837, "grad_norm": 0.5669505000114441, "learning_rate": 1.5586122305940074e-05, "loss": 1.5876, "step": 9638 }, { "epoch": 2.532926068249745, "grad_norm": 0.5484370589256287, "learning_rate": 1.5568599964955318e-05, "loss": 1.5994, "step": 9640 }, { "epoch": 2.533451571583407, "grad_norm": 0.6164976954460144, "learning_rate": 1.5551077623970562e-05, "loss": 1.5965, "step": 9642 }, { "epoch": 2.533977074917069, "grad_norm": 0.6072930693626404, "learning_rate": 1.5533555282985807e-05, "loss": 1.6167, "step": 9644 }, { "epoch": 2.5345025782507307, "grad_norm": 0.6487789750099182, "learning_rate": 1.551603294200105e-05, "loss": 1.6376, "step": 9646 }, { "epoch": 2.5350280815843926, "grad_norm": 0.641263484954834, "learning_rate": 1.54985106010163e-05, "loss": 1.6516, "step": 9648 }, { "epoch": 2.535553584918054, "grad_norm": 0.6348699927330017, "learning_rate": 1.548098826003154e-05, "loss": 1.5979, "step": 9650 }, { "epoch": 2.536079088251716, "grad_norm": 0.5915436744689941, "learning_rate": 1.5463465919046787e-05, "loss": 1.6025, "step": 9652 }, { "epoch": 2.5366045915853777, "grad_norm": 0.6235045194625854, "learning_rate": 1.544594357806203e-05, "loss": 1.6093, "step": 9654 }, { "epoch": 2.5371300949190396, "grad_norm": 0.5824310183525085, "learning_rate": 1.542842123707727e-05, "loss": 1.6317, "step": 9656 }, { "epoch": 2.5376555982527016, "grad_norm": 0.7158166766166687, "learning_rate": 1.541089889609252e-05, "loss": 1.609, "step": 9658 
}, { "epoch": 2.538181101586363, "grad_norm": 0.6311430335044861, "learning_rate": 1.5393376555107763e-05, "loss": 1.6262, "step": 9660 }, { "epoch": 2.538706604920025, "grad_norm": 0.6356306076049805, "learning_rate": 1.5375854214123008e-05, "loss": 1.5805, "step": 9662 }, { "epoch": 2.5392321082536866, "grad_norm": 0.57065749168396, "learning_rate": 1.5358331873138252e-05, "loss": 1.6181, "step": 9664 }, { "epoch": 2.5397576115873486, "grad_norm": 0.7448508143424988, "learning_rate": 1.5340809532153496e-05, "loss": 1.6235, "step": 9666 }, { "epoch": 2.54028311492101, "grad_norm": 0.5627465844154358, "learning_rate": 1.5323287191168743e-05, "loss": 1.6273, "step": 9668 }, { "epoch": 2.540808618254672, "grad_norm": 0.6133062839508057, "learning_rate": 1.5305764850183984e-05, "loss": 1.5949, "step": 9670 }, { "epoch": 2.541334121588334, "grad_norm": 0.5348150134086609, "learning_rate": 1.528824250919923e-05, "loss": 1.6181, "step": 9672 }, { "epoch": 2.5418596249219956, "grad_norm": 0.5559690594673157, "learning_rate": 1.5270720168214476e-05, "loss": 1.6224, "step": 9674 }, { "epoch": 2.542385128255657, "grad_norm": 0.612876832485199, "learning_rate": 1.5253197827229718e-05, "loss": 1.6112, "step": 9676 }, { "epoch": 2.542910631589319, "grad_norm": 0.697488009929657, "learning_rate": 1.5235675486244963e-05, "loss": 1.598, "step": 9678 }, { "epoch": 2.543436134922981, "grad_norm": 0.6256018280982971, "learning_rate": 1.5218153145260209e-05, "loss": 1.6273, "step": 9680 }, { "epoch": 2.5439616382566426, "grad_norm": 0.583524763584137, "learning_rate": 1.5200630804275451e-05, "loss": 1.648, "step": 9682 }, { "epoch": 2.5444871415903045, "grad_norm": 0.682237446308136, "learning_rate": 1.5183108463290697e-05, "loss": 1.5871, "step": 9684 }, { "epoch": 2.5450126449239665, "grad_norm": 0.5728941559791565, "learning_rate": 1.5165586122305941e-05, "loss": 1.5953, "step": 9686 }, { "epoch": 2.545538148257628, "grad_norm": 0.5490486025810242, "learning_rate": 
1.5148063781321184e-05, "loss": 1.6289, "step": 9688 }, { "epoch": 2.5460636515912896, "grad_norm": 0.5297114849090576, "learning_rate": 1.513054144033643e-05, "loss": 1.6445, "step": 9690 }, { "epoch": 2.5465891549249515, "grad_norm": 0.5523902177810669, "learning_rate": 1.5113019099351675e-05, "loss": 1.5715, "step": 9692 }, { "epoch": 2.5471146582586135, "grad_norm": 0.5788924694061279, "learning_rate": 1.509549675836692e-05, "loss": 1.6178, "step": 9694 }, { "epoch": 2.547640161592275, "grad_norm": 0.5816612839698792, "learning_rate": 1.5077974417382162e-05, "loss": 1.5855, "step": 9696 }, { "epoch": 2.548165664925937, "grad_norm": 0.6361863017082214, "learning_rate": 1.5060452076397408e-05, "loss": 1.612, "step": 9698 }, { "epoch": 2.5486911682595985, "grad_norm": 0.6106998324394226, "learning_rate": 1.5042929735412654e-05, "loss": 1.6482, "step": 9700 }, { "epoch": 2.5492166715932605, "grad_norm": 0.623910665512085, "learning_rate": 1.5025407394427896e-05, "loss": 1.5743, "step": 9702 }, { "epoch": 2.549742174926922, "grad_norm": 0.7251524925231934, "learning_rate": 1.500788505344314e-05, "loss": 1.6176, "step": 9704 }, { "epoch": 2.550267678260584, "grad_norm": 0.5366904735565186, "learning_rate": 1.4990362712458386e-05, "loss": 1.5826, "step": 9706 }, { "epoch": 2.550793181594246, "grad_norm": 0.5754824876785278, "learning_rate": 1.4972840371473629e-05, "loss": 1.6006, "step": 9708 }, { "epoch": 2.5513186849279075, "grad_norm": 0.6944889426231384, "learning_rate": 1.4955318030488875e-05, "loss": 1.6318, "step": 9710 }, { "epoch": 2.5518441882615694, "grad_norm": 0.6008621454238892, "learning_rate": 1.4937795689504119e-05, "loss": 1.6363, "step": 9712 }, { "epoch": 2.552369691595231, "grad_norm": 0.6449068784713745, "learning_rate": 1.4920273348519361e-05, "loss": 1.6009, "step": 9714 }, { "epoch": 2.552895194928893, "grad_norm": 0.687323272228241, "learning_rate": 1.4902751007534607e-05, "loss": 1.6247, "step": 9716 }, { "epoch": 2.5534206982625545, 
"grad_norm": 0.6217597126960754, "learning_rate": 1.4885228666549853e-05, "loss": 1.6128, "step": 9718 }, { "epoch": 2.5539462015962164, "grad_norm": 0.6320381760597229, "learning_rate": 1.4867706325565095e-05, "loss": 1.643, "step": 9720 }, { "epoch": 2.5544717049298784, "grad_norm": 0.604827344417572, "learning_rate": 1.485018398458034e-05, "loss": 1.6073, "step": 9722 }, { "epoch": 2.55499720826354, "grad_norm": 0.5245053768157959, "learning_rate": 1.4832661643595585e-05, "loss": 1.5963, "step": 9724 }, { "epoch": 2.5555227115972015, "grad_norm": 0.6272046566009521, "learning_rate": 1.4815139302610828e-05, "loss": 1.5939, "step": 9726 }, { "epoch": 2.5560482149308634, "grad_norm": 0.5605988502502441, "learning_rate": 1.4797616961626074e-05, "loss": 1.6283, "step": 9728 }, { "epoch": 2.5565737182645254, "grad_norm": 0.5442174077033997, "learning_rate": 1.4780094620641318e-05, "loss": 1.5942, "step": 9730 }, { "epoch": 2.557099221598187, "grad_norm": 0.806302011013031, "learning_rate": 1.4762572279656564e-05, "loss": 1.6132, "step": 9732 }, { "epoch": 2.557624724931849, "grad_norm": 0.6235170960426331, "learning_rate": 1.4745049938671806e-05, "loss": 1.6062, "step": 9734 }, { "epoch": 2.5581502282655104, "grad_norm": 0.6286174654960632, "learning_rate": 1.4727527597687052e-05, "loss": 1.6112, "step": 9736 }, { "epoch": 2.5586757315991724, "grad_norm": 0.5533096194267273, "learning_rate": 1.4710005256702296e-05, "loss": 1.5906, "step": 9738 }, { "epoch": 2.559201234932834, "grad_norm": 0.5867462754249573, "learning_rate": 1.469248291571754e-05, "loss": 1.5778, "step": 9740 }, { "epoch": 2.559726738266496, "grad_norm": 0.5377023816108704, "learning_rate": 1.4674960574732785e-05, "loss": 1.5883, "step": 9742 }, { "epoch": 2.560252241600158, "grad_norm": 0.5589543581008911, "learning_rate": 1.465743823374803e-05, "loss": 1.602, "step": 9744 }, { "epoch": 2.5607777449338194, "grad_norm": 0.6195635795593262, "learning_rate": 1.4639915892763273e-05, "loss": 1.622, 
"step": 9746 }, { "epoch": 2.5613032482674813, "grad_norm": 0.5796042680740356, "learning_rate": 1.4622393551778519e-05, "loss": 1.6333, "step": 9748 }, { "epoch": 2.561828751601143, "grad_norm": 0.5899613499641418, "learning_rate": 1.4604871210793763e-05, "loss": 1.5945, "step": 9750 }, { "epoch": 2.562354254934805, "grad_norm": 0.5779309272766113, "learning_rate": 1.4587348869809006e-05, "loss": 1.59, "step": 9752 }, { "epoch": 2.5628797582684664, "grad_norm": 0.6350328326225281, "learning_rate": 1.4569826528824251e-05, "loss": 1.6087, "step": 9754 }, { "epoch": 2.5634052616021283, "grad_norm": 0.5645972490310669, "learning_rate": 1.4552304187839497e-05, "loss": 1.6058, "step": 9756 }, { "epoch": 2.5639307649357903, "grad_norm": 0.7305320501327515, "learning_rate": 1.453478184685474e-05, "loss": 1.5944, "step": 9758 }, { "epoch": 2.564456268269452, "grad_norm": 0.600288987159729, "learning_rate": 1.4517259505869984e-05, "loss": 1.589, "step": 9760 }, { "epoch": 2.5649817716031134, "grad_norm": 0.6395924091339111, "learning_rate": 1.449973716488523e-05, "loss": 1.5878, "step": 9762 }, { "epoch": 2.5655072749367753, "grad_norm": 0.6453663110733032, "learning_rate": 1.4482214823900476e-05, "loss": 1.5795, "step": 9764 }, { "epoch": 2.5660327782704373, "grad_norm": 0.6681911945343018, "learning_rate": 1.4464692482915718e-05, "loss": 1.5715, "step": 9766 }, { "epoch": 2.566558281604099, "grad_norm": 0.8908346891403198, "learning_rate": 1.4447170141930962e-05, "loss": 1.604, "step": 9768 }, { "epoch": 2.567083784937761, "grad_norm": 0.5479845404624939, "learning_rate": 1.4429647800946208e-05, "loss": 1.5835, "step": 9770 }, { "epoch": 2.5676092882714223, "grad_norm": 0.5948062539100647, "learning_rate": 1.441212545996145e-05, "loss": 1.5969, "step": 9772 }, { "epoch": 2.5681347916050843, "grad_norm": 0.5489128232002258, "learning_rate": 1.4394603118976697e-05, "loss": 1.6349, "step": 9774 }, { "epoch": 2.568660294938746, "grad_norm": 0.6163507103919983, 
"learning_rate": 1.437708077799194e-05, "loss": 1.5721, "step": 9776 }, { "epoch": 2.569185798272408, "grad_norm": 0.6882917881011963, "learning_rate": 1.4359558437007183e-05, "loss": 1.6179, "step": 9778 }, { "epoch": 2.5697113016060698, "grad_norm": 0.5954879522323608, "learning_rate": 1.434203609602243e-05, "loss": 1.6136, "step": 9780 }, { "epoch": 2.5702368049397313, "grad_norm": 0.6170343160629272, "learning_rate": 1.4324513755037675e-05, "loss": 1.585, "step": 9782 }, { "epoch": 2.5707623082733932, "grad_norm": 0.5451639890670776, "learning_rate": 1.4306991414052918e-05, "loss": 1.5914, "step": 9784 }, { "epoch": 2.5712878116070548, "grad_norm": 0.5690667629241943, "learning_rate": 1.4289469073068162e-05, "loss": 1.6272, "step": 9786 }, { "epoch": 2.5718133149407167, "grad_norm": 0.7590607404708862, "learning_rate": 1.4271946732083408e-05, "loss": 1.6112, "step": 9788 }, { "epoch": 2.5723388182743783, "grad_norm": 0.55152827501297, "learning_rate": 1.425442439109865e-05, "loss": 1.6291, "step": 9790 }, { "epoch": 2.5728643216080402, "grad_norm": 0.5713075399398804, "learning_rate": 1.4236902050113896e-05, "loss": 1.6375, "step": 9792 }, { "epoch": 2.573389824941702, "grad_norm": 0.5247568488121033, "learning_rate": 1.421937970912914e-05, "loss": 1.5643, "step": 9794 }, { "epoch": 2.5739153282753637, "grad_norm": 0.7281646132469177, "learning_rate": 1.4201857368144383e-05, "loss": 1.5993, "step": 9796 }, { "epoch": 2.5744408316090253, "grad_norm": 0.690531849861145, "learning_rate": 1.4184335027159628e-05, "loss": 1.6062, "step": 9798 }, { "epoch": 2.5749663349426872, "grad_norm": 0.5356507897377014, "learning_rate": 1.4166812686174874e-05, "loss": 1.6073, "step": 9800 }, { "epoch": 2.575491838276349, "grad_norm": 0.5589247941970825, "learning_rate": 1.4149290345190118e-05, "loss": 1.5874, "step": 9802 }, { "epoch": 2.5760173416100107, "grad_norm": 0.570418119430542, "learning_rate": 1.4131768004205363e-05, "loss": 1.6107, "step": 9804 }, { "epoch": 
2.5765428449436727, "grad_norm": 0.6067774295806885, "learning_rate": 1.4114245663220607e-05, "loss": 1.6309, "step": 9806 }, { "epoch": 2.577068348277334, "grad_norm": 0.5615005493164062, "learning_rate": 1.4096723322235853e-05, "loss": 1.6374, "step": 9808 }, { "epoch": 2.577593851610996, "grad_norm": 0.5466208457946777, "learning_rate": 1.4079200981251095e-05, "loss": 1.6007, "step": 9810 }, { "epoch": 2.5781193549446577, "grad_norm": 0.6845700740814209, "learning_rate": 1.4061678640266341e-05, "loss": 1.6121, "step": 9812 }, { "epoch": 2.5786448582783197, "grad_norm": 0.6770169734954834, "learning_rate": 1.4044156299281585e-05, "loss": 1.6089, "step": 9814 }, { "epoch": 2.5791703616119817, "grad_norm": 0.5839431881904602, "learning_rate": 1.4026633958296828e-05, "loss": 1.6216, "step": 9816 }, { "epoch": 2.579695864945643, "grad_norm": 0.5246254205703735, "learning_rate": 1.4009111617312074e-05, "loss": 1.6482, "step": 9818 }, { "epoch": 2.580221368279305, "grad_norm": 0.5945092439651489, "learning_rate": 1.399158927632732e-05, "loss": 1.6152, "step": 9820 }, { "epoch": 2.5807468716129667, "grad_norm": 0.5551746487617493, "learning_rate": 1.3974066935342562e-05, "loss": 1.6123, "step": 9822 }, { "epoch": 2.5812723749466286, "grad_norm": 0.5317911505699158, "learning_rate": 1.3956544594357806e-05, "loss": 1.6127, "step": 9824 }, { "epoch": 2.58179787828029, "grad_norm": 0.5956730842590332, "learning_rate": 1.3939022253373052e-05, "loss": 1.6179, "step": 9826 }, { "epoch": 2.582323381613952, "grad_norm": 0.5398740768432617, "learning_rate": 1.3921499912388294e-05, "loss": 1.6346, "step": 9828 }, { "epoch": 2.582848884947614, "grad_norm": 0.5810662508010864, "learning_rate": 1.390397757140354e-05, "loss": 1.6218, "step": 9830 }, { "epoch": 2.5833743882812756, "grad_norm": 0.5635197162628174, "learning_rate": 1.3886455230418784e-05, "loss": 1.5876, "step": 9832 }, { "epoch": 2.583899891614937, "grad_norm": 0.626593291759491, "learning_rate": 1.386893288943403e-05, 
"loss": 1.596, "step": 9834 }, { "epoch": 2.584425394948599, "grad_norm": 0.528767466545105, "learning_rate": 1.3851410548449273e-05, "loss": 1.6265, "step": 9836 }, { "epoch": 2.584950898282261, "grad_norm": 0.5446393489837646, "learning_rate": 1.3833888207464519e-05, "loss": 1.6101, "step": 9838 }, { "epoch": 2.5854764016159226, "grad_norm": 0.5221058130264282, "learning_rate": 1.3816365866479763e-05, "loss": 1.6393, "step": 9840 }, { "epoch": 2.5860019049495846, "grad_norm": 0.721125602722168, "learning_rate": 1.3798843525495005e-05, "loss": 1.5934, "step": 9842 }, { "epoch": 2.5865274082832466, "grad_norm": 0.6360064744949341, "learning_rate": 1.3781321184510251e-05, "loss": 1.613, "step": 9844 }, { "epoch": 2.587052911616908, "grad_norm": 0.5796793699264526, "learning_rate": 1.3763798843525497e-05, "loss": 1.6069, "step": 9846 }, { "epoch": 2.5875784149505696, "grad_norm": 0.6104748845100403, "learning_rate": 1.374627650254074e-05, "loss": 1.6083, "step": 9848 }, { "epoch": 2.5881039182842316, "grad_norm": 0.5367170572280884, "learning_rate": 1.3728754161555984e-05, "loss": 1.6034, "step": 9850 }, { "epoch": 2.5886294216178936, "grad_norm": 0.6482873558998108, "learning_rate": 1.371123182057123e-05, "loss": 1.5852, "step": 9852 }, { "epoch": 2.589154924951555, "grad_norm": 0.5794985294342041, "learning_rate": 1.3693709479586472e-05, "loss": 1.5974, "step": 9854 }, { "epoch": 2.589680428285217, "grad_norm": 0.6778134703636169, "learning_rate": 1.3676187138601718e-05, "loss": 1.5959, "step": 9856 }, { "epoch": 2.5902059316188786, "grad_norm": 0.5274595618247986, "learning_rate": 1.3658664797616962e-05, "loss": 1.5888, "step": 9858 }, { "epoch": 2.5907314349525405, "grad_norm": 0.648197591304779, "learning_rate": 1.3641142456632205e-05, "loss": 1.6369, "step": 9860 }, { "epoch": 2.591256938286202, "grad_norm": 0.5107806324958801, "learning_rate": 1.362362011564745e-05, "loss": 1.5997, "step": 9862 }, { "epoch": 2.591782441619864, "grad_norm": 0.5690180659294128, 
"learning_rate": 1.3606097774662696e-05, "loss": 1.6161, "step": 9864 }, { "epoch": 2.592307944953526, "grad_norm": 0.5976451635360718, "learning_rate": 1.358857543367794e-05, "loss": 1.5956, "step": 9866 }, { "epoch": 2.5928334482871875, "grad_norm": 0.5506587624549866, "learning_rate": 1.3571053092693185e-05, "loss": 1.5734, "step": 9868 }, { "epoch": 2.5933589516208495, "grad_norm": 0.5882534384727478, "learning_rate": 1.3553530751708429e-05, "loss": 1.5834, "step": 9870 }, { "epoch": 2.593884454954511, "grad_norm": 0.5685223340988159, "learning_rate": 1.3536008410723675e-05, "loss": 1.5997, "step": 9872 }, { "epoch": 2.594409958288173, "grad_norm": 0.725277304649353, "learning_rate": 1.3518486069738917e-05, "loss": 1.573, "step": 9874 }, { "epoch": 2.5949354616218345, "grad_norm": 0.563489556312561, "learning_rate": 1.3500963728754163e-05, "loss": 1.6113, "step": 9876 }, { "epoch": 2.5954609649554965, "grad_norm": 0.5687708258628845, "learning_rate": 1.3483441387769407e-05, "loss": 1.6314, "step": 9878 }, { "epoch": 2.5959864682891585, "grad_norm": 0.6753512620925903, "learning_rate": 1.346591904678465e-05, "loss": 1.5998, "step": 9880 }, { "epoch": 2.59651197162282, "grad_norm": 0.7767539620399475, "learning_rate": 1.3448396705799896e-05, "loss": 1.5696, "step": 9882 }, { "epoch": 2.5970374749564815, "grad_norm": 0.6438019275665283, "learning_rate": 1.3430874364815142e-05, "loss": 1.6511, "step": 9884 }, { "epoch": 2.5975629782901435, "grad_norm": 0.673339307308197, "learning_rate": 1.3413352023830384e-05, "loss": 1.6375, "step": 9886 }, { "epoch": 2.5980884816238055, "grad_norm": 0.6414830684661865, "learning_rate": 1.3395829682845628e-05, "loss": 1.6215, "step": 9888 }, { "epoch": 2.598613984957467, "grad_norm": 0.5403038263320923, "learning_rate": 1.3378307341860874e-05, "loss": 1.5912, "step": 9890 }, { "epoch": 2.599139488291129, "grad_norm": 0.6485872268676758, "learning_rate": 1.3360785000876117e-05, "loss": 1.6385, "step": 9892 }, { "epoch": 
2.5996649916247905, "grad_norm": 0.5818743109703064, "learning_rate": 1.3343262659891362e-05, "loss": 1.6295, "step": 9894 }, { "epoch": 2.6001904949584524, "grad_norm": 0.6335439682006836, "learning_rate": 1.3325740318906607e-05, "loss": 1.5726, "step": 9896 }, { "epoch": 2.600715998292114, "grad_norm": 0.5946658253669739, "learning_rate": 1.3308217977921849e-05, "loss": 1.6393, "step": 9898 }, { "epoch": 2.601241501625776, "grad_norm": 0.5518163442611694, "learning_rate": 1.3290695636937095e-05, "loss": 1.5873, "step": 9900 }, { "epoch": 2.601767004959438, "grad_norm": 0.5967461466789246, "learning_rate": 1.327317329595234e-05, "loss": 1.596, "step": 9902 }, { "epoch": 2.6022925082930994, "grad_norm": 0.613856852054596, "learning_rate": 1.3255650954967585e-05, "loss": 1.6073, "step": 9904 }, { "epoch": 2.6028180116267614, "grad_norm": 0.5403199791908264, "learning_rate": 1.3238128613982827e-05, "loss": 1.623, "step": 9906 }, { "epoch": 2.603343514960423, "grad_norm": 0.6116963624954224, "learning_rate": 1.3220606272998073e-05, "loss": 1.6139, "step": 9908 }, { "epoch": 2.603869018294085, "grad_norm": 0.6207565069198608, "learning_rate": 1.320308393201332e-05, "loss": 1.602, "step": 9910 }, { "epoch": 2.6043945216277464, "grad_norm": 0.6587697863578796, "learning_rate": 1.3185561591028562e-05, "loss": 1.6115, "step": 9912 }, { "epoch": 2.6049200249614084, "grad_norm": 0.5676011443138123, "learning_rate": 1.3168039250043806e-05, "loss": 1.6086, "step": 9914 }, { "epoch": 2.6054455282950704, "grad_norm": 0.5473529100418091, "learning_rate": 1.3150516909059052e-05, "loss": 1.6345, "step": 9916 }, { "epoch": 2.605971031628732, "grad_norm": 0.5622715950012207, "learning_rate": 1.3132994568074294e-05, "loss": 1.5891, "step": 9918 }, { "epoch": 2.6064965349623934, "grad_norm": 0.6009332537651062, "learning_rate": 1.311547222708954e-05, "loss": 1.6095, "step": 9920 }, { "epoch": 2.6070220382960554, "grad_norm": 0.746606707572937, "learning_rate": 1.3097949886104784e-05, 
"loss": 1.6328, "step": 9922 }, { "epoch": 2.6075475416297174, "grad_norm": 0.6133710741996765, "learning_rate": 1.3080427545120027e-05, "loss": 1.6085, "step": 9924 }, { "epoch": 2.608073044963379, "grad_norm": 0.6758058667182922, "learning_rate": 1.3062905204135273e-05, "loss": 1.62, "step": 9926 }, { "epoch": 2.608598548297041, "grad_norm": 0.5908955335617065, "learning_rate": 1.3045382863150518e-05, "loss": 1.5991, "step": 9928 }, { "epoch": 2.6091240516307024, "grad_norm": 0.5754519104957581, "learning_rate": 1.3027860522165761e-05, "loss": 1.6316, "step": 9930 }, { "epoch": 2.6096495549643643, "grad_norm": 0.5225171446800232, "learning_rate": 1.3010338181181007e-05, "loss": 1.5902, "step": 9932 }, { "epoch": 2.610175058298026, "grad_norm": 0.6654541492462158, "learning_rate": 1.2992815840196251e-05, "loss": 1.588, "step": 9934 }, { "epoch": 2.610700561631688, "grad_norm": 0.592370331287384, "learning_rate": 1.2975293499211497e-05, "loss": 1.6396, "step": 9936 }, { "epoch": 2.61122606496535, "grad_norm": 0.6326411366462708, "learning_rate": 1.295777115822674e-05, "loss": 1.6201, "step": 9938 }, { "epoch": 2.6117515682990113, "grad_norm": 0.6815057396888733, "learning_rate": 1.2940248817241985e-05, "loss": 1.6337, "step": 9940 }, { "epoch": 2.6122770716326733, "grad_norm": 0.6094640493392944, "learning_rate": 1.292272647625723e-05, "loss": 1.6009, "step": 9942 }, { "epoch": 2.612802574966335, "grad_norm": 0.6490272283554077, "learning_rate": 1.2905204135272472e-05, "loss": 1.6249, "step": 9944 }, { "epoch": 2.613328078299997, "grad_norm": 0.6855607032775879, "learning_rate": 1.2887681794287718e-05, "loss": 1.5982, "step": 9946 }, { "epoch": 2.6138535816336583, "grad_norm": 0.6429476141929626, "learning_rate": 1.2870159453302964e-05, "loss": 1.6119, "step": 9948 }, { "epoch": 2.6143790849673203, "grad_norm": 0.5955906510353088, "learning_rate": 1.2852637112318206e-05, "loss": 1.604, "step": 9950 }, { "epoch": 2.6149045883009823, "grad_norm": 0.7396878600120544, 
"learning_rate": 1.283511477133345e-05, "loss": 1.6084, "step": 9952 }, { "epoch": 2.615430091634644, "grad_norm": 0.631075382232666, "learning_rate": 1.2817592430348696e-05, "loss": 1.579, "step": 9954 }, { "epoch": 2.6159555949683053, "grad_norm": 0.5280560255050659, "learning_rate": 1.2800070089363939e-05, "loss": 1.5952, "step": 9956 }, { "epoch": 2.6164810983019673, "grad_norm": 0.5811305046081543, "learning_rate": 1.2782547748379184e-05, "loss": 1.5981, "step": 9958 }, { "epoch": 2.6170066016356293, "grad_norm": 0.616111159324646, "learning_rate": 1.2765025407394429e-05, "loss": 1.6146, "step": 9960 }, { "epoch": 2.617532104969291, "grad_norm": 0.6001870036125183, "learning_rate": 1.2747503066409671e-05, "loss": 1.612, "step": 9962 }, { "epoch": 2.6180576083029528, "grad_norm": 0.5698304176330566, "learning_rate": 1.2729980725424917e-05, "loss": 1.5854, "step": 9964 }, { "epoch": 2.6185831116366143, "grad_norm": 0.5722531676292419, "learning_rate": 1.2712458384440163e-05, "loss": 1.595, "step": 9966 }, { "epoch": 2.6191086149702762, "grad_norm": 0.556814968585968, "learning_rate": 1.2694936043455405e-05, "loss": 1.6079, "step": 9968 }, { "epoch": 2.6196341183039378, "grad_norm": 0.7477023601531982, "learning_rate": 1.267741370247065e-05, "loss": 1.6101, "step": 9970 }, { "epoch": 2.6201596216375997, "grad_norm": 0.5011134743690491, "learning_rate": 1.2659891361485895e-05, "loss": 1.5684, "step": 9972 }, { "epoch": 2.6206851249712617, "grad_norm": 0.5759318470954895, "learning_rate": 1.2642369020501141e-05, "loss": 1.5726, "step": 9974 }, { "epoch": 2.6212106283049232, "grad_norm": 0.5837996006011963, "learning_rate": 1.2624846679516384e-05, "loss": 1.5975, "step": 9976 }, { "epoch": 2.621736131638585, "grad_norm": 0.6139369606971741, "learning_rate": 1.2607324338531628e-05, "loss": 1.604, "step": 9978 }, { "epoch": 2.6222616349722467, "grad_norm": 0.59381103515625, "learning_rate": 1.2589801997546874e-05, "loss": 1.631, "step": 9980 }, { "epoch": 
2.6227871383059087, "grad_norm": 0.59110027551651, "learning_rate": 1.2572279656562116e-05, "loss": 1.6342, "step": 9982 }, { "epoch": 2.6233126416395702, "grad_norm": 0.6388522386550903, "learning_rate": 1.2554757315577362e-05, "loss": 1.602, "step": 9984 }, { "epoch": 2.623838144973232, "grad_norm": 0.5485536456108093, "learning_rate": 1.2537234974592606e-05, "loss": 1.5904, "step": 9986 }, { "epoch": 2.624363648306894, "grad_norm": 0.6147143840789795, "learning_rate": 1.2519712633607849e-05, "loss": 1.6193, "step": 9988 }, { "epoch": 2.6248891516405557, "grad_norm": 0.6774529814720154, "learning_rate": 1.2502190292623095e-05, "loss": 1.5894, "step": 9990 }, { "epoch": 2.625414654974217, "grad_norm": 0.5123162269592285, "learning_rate": 1.2484667951638339e-05, "loss": 1.5871, "step": 9992 }, { "epoch": 2.625940158307879, "grad_norm": 0.5840069651603699, "learning_rate": 1.2467145610653585e-05, "loss": 1.6093, "step": 9994 }, { "epoch": 2.626465661641541, "grad_norm": 0.6273425817489624, "learning_rate": 1.2449623269668829e-05, "loss": 1.5992, "step": 9996 }, { "epoch": 2.6269911649752027, "grad_norm": 0.5548253655433655, "learning_rate": 1.2432100928684073e-05, "loss": 1.5966, "step": 9998 }, { "epoch": 2.6275166683088647, "grad_norm": 0.6084941625595093, "learning_rate": 1.2414578587699317e-05, "loss": 1.6002, "step": 10000 }, { "epoch": 2.6275166683088647, "eval_loss": 1.6519938707351685, "eval_runtime": 487.1623, "eval_samples_per_second": 249.997, "eval_steps_per_second": 31.25, "step": 10000 }, { "epoch": 2.6280421716425266, "grad_norm": 0.5661596655845642, "learning_rate": 1.2397056246714561e-05, "loss": 1.6081, "step": 10002 }, { "epoch": 2.628567674976188, "grad_norm": 0.6092149615287781, "learning_rate": 1.2379533905729807e-05, "loss": 1.5984, "step": 10004 }, { "epoch": 2.6290931783098497, "grad_norm": 0.6022893786430359, "learning_rate": 1.236201156474505e-05, "loss": 1.594, "step": 10006 }, { "epoch": 2.6296186816435116, "grad_norm": 
0.6382527351379395, "learning_rate": 1.2344489223760294e-05, "loss": 1.6076, "step": 10008 }, { "epoch": 2.6301441849771736, "grad_norm": 0.6449958682060242, "learning_rate": 1.232696688277554e-05, "loss": 1.6056, "step": 10010 }, { "epoch": 2.630669688310835, "grad_norm": 0.6634608507156372, "learning_rate": 1.2309444541790784e-05, "loss": 1.5862, "step": 10012 }, { "epoch": 2.631195191644497, "grad_norm": 0.5740031003952026, "learning_rate": 1.2291922200806028e-05, "loss": 1.5988, "step": 10014 }, { "epoch": 2.6317206949781586, "grad_norm": 0.7303963899612427, "learning_rate": 1.2274399859821272e-05, "loss": 1.6342, "step": 10016 }, { "epoch": 2.6322461983118206, "grad_norm": 0.5871624946594238, "learning_rate": 1.2256877518836516e-05, "loss": 1.5836, "step": 10018 }, { "epoch": 2.632771701645482, "grad_norm": 0.5941208600997925, "learning_rate": 1.2239355177851762e-05, "loss": 1.5655, "step": 10020 }, { "epoch": 2.633297204979144, "grad_norm": 0.5943103432655334, "learning_rate": 1.2221832836867007e-05, "loss": 1.6448, "step": 10022 }, { "epoch": 2.633822708312806, "grad_norm": 0.6347323060035706, "learning_rate": 1.220431049588225e-05, "loss": 1.621, "step": 10024 }, { "epoch": 2.6343482116464676, "grad_norm": null, "learning_rate": 1.2195549325389873e-05, "loss": 1.6178, "step": 10026 }, { "epoch": 2.6348737149801296, "grad_norm": 0.6692628264427185, "learning_rate": 1.2178026984405117e-05, "loss": 1.6063, "step": 10028 }, { "epoch": 2.635399218313791, "grad_norm": 0.5815702080726624, "learning_rate": 1.2160504643420361e-05, "loss": 1.5884, "step": 10030 }, { "epoch": 2.635924721647453, "grad_norm": 0.5981874465942383, "learning_rate": 1.2142982302435605e-05, "loss": 1.614, "step": 10032 }, { "epoch": 2.6364502249811146, "grad_norm": 0.5921944975852966, "learning_rate": 1.2125459961450851e-05, "loss": 1.6172, "step": 10034 }, { "epoch": 2.6369757283147766, "grad_norm": 0.5707213878631592, "learning_rate": 1.2107937620466095e-05, "loss": 1.6205, "step": 
10036 }, { "epoch": 2.6375012316484385, "grad_norm": 0.5834996700286865, "learning_rate": 1.209041527948134e-05, "loss": 1.598, "step": 10038 }, { "epoch": 2.6380267349821, "grad_norm": 0.568755567073822, "learning_rate": 1.2072892938496584e-05, "loss": 1.5819, "step": 10040 }, { "epoch": 2.6385522383157616, "grad_norm": 0.6087754368782043, "learning_rate": 1.2055370597511828e-05, "loss": 1.6223, "step": 10042 }, { "epoch": 2.6390777416494235, "grad_norm": 0.5805062651634216, "learning_rate": 1.2037848256527072e-05, "loss": 1.6055, "step": 10044 }, { "epoch": 2.6396032449830855, "grad_norm": 0.5679923892021179, "learning_rate": 1.2020325915542318e-05, "loss": 1.6388, "step": 10046 }, { "epoch": 2.640128748316747, "grad_norm": 0.7020934820175171, "learning_rate": 1.200280357455756e-05, "loss": 1.6076, "step": 10048 }, { "epoch": 2.640654251650409, "grad_norm": 0.5716947913169861, "learning_rate": 1.1985281233572806e-05, "loss": 1.6035, "step": 10050 }, { "epoch": 2.6411797549840705, "grad_norm": 0.6460137963294983, "learning_rate": 1.196775889258805e-05, "loss": 1.5981, "step": 10052 }, { "epoch": 2.6417052583177325, "grad_norm": 0.5903118848800659, "learning_rate": 1.1950236551603295e-05, "loss": 1.5844, "step": 10054 }, { "epoch": 2.642230761651394, "grad_norm": 0.5728901028633118, "learning_rate": 1.193271421061854e-05, "loss": 1.5811, "step": 10056 }, { "epoch": 2.642756264985056, "grad_norm": 0.6795145273208618, "learning_rate": 1.1915191869633783e-05, "loss": 1.6334, "step": 10058 }, { "epoch": 2.643281768318718, "grad_norm": 0.552899956703186, "learning_rate": 1.1897669528649027e-05, "loss": 1.5956, "step": 10060 }, { "epoch": 2.6438072716523795, "grad_norm": 0.5750575065612793, "learning_rate": 1.1880147187664273e-05, "loss": 1.5974, "step": 10062 }, { "epoch": 2.6443327749860415, "grad_norm": 0.6233214735984802, "learning_rate": 1.1862624846679517e-05, "loss": 1.6177, "step": 10064 }, { "epoch": 2.644858278319703, "grad_norm": 0.7260066270828247, 
"learning_rate": 1.1845102505694761e-05, "loss": 1.6024, "step": 10066 }, { "epoch": 2.645383781653365, "grad_norm": 0.5926297903060913, "learning_rate": 1.1827580164710006e-05, "loss": 1.6003, "step": 10068 }, { "epoch": 2.6459092849870265, "grad_norm": 0.565710723400116, "learning_rate": 1.181005782372525e-05, "loss": 1.5879, "step": 10070 }, { "epoch": 2.6464347883206885, "grad_norm": 0.5911100506782532, "learning_rate": 1.1792535482740496e-05, "loss": 1.5862, "step": 10072 }, { "epoch": 2.6469602916543504, "grad_norm": 0.6018568277359009, "learning_rate": 1.177501314175574e-05, "loss": 1.6274, "step": 10074 }, { "epoch": 2.647485794988012, "grad_norm": 0.6663408875465393, "learning_rate": 1.1757490800770982e-05, "loss": 1.6331, "step": 10076 }, { "epoch": 2.6480112983216735, "grad_norm": 0.6028339266777039, "learning_rate": 1.1739968459786228e-05, "loss": 1.6208, "step": 10078 }, { "epoch": 2.6485368016553354, "grad_norm": 0.5986884236335754, "learning_rate": 1.1722446118801472e-05, "loss": 1.5974, "step": 10080 }, { "epoch": 2.6490623049889974, "grad_norm": 0.7179824113845825, "learning_rate": 1.1704923777816718e-05, "loss": 1.5959, "step": 10082 }, { "epoch": 2.649587808322659, "grad_norm": 0.6211957931518555, "learning_rate": 1.168740143683196e-05, "loss": 1.6052, "step": 10084 }, { "epoch": 2.650113311656321, "grad_norm": 0.5563115477561951, "learning_rate": 1.1669879095847205e-05, "loss": 1.6047, "step": 10086 }, { "epoch": 2.6506388149899824, "grad_norm": 0.6166182160377502, "learning_rate": 1.165235675486245e-05, "loss": 1.591, "step": 10088 }, { "epoch": 2.6511643183236444, "grad_norm": 0.5224390625953674, "learning_rate": 1.1634834413877695e-05, "loss": 1.6147, "step": 10090 }, { "epoch": 2.651689821657306, "grad_norm": 0.6834142208099365, "learning_rate": 1.1617312072892939e-05, "loss": 1.6194, "step": 10092 }, { "epoch": 2.652215324990968, "grad_norm": 0.5866041779518127, "learning_rate": 1.1599789731908183e-05, "loss": 1.6286, "step": 10094 }, { 
"epoch": 2.65274082832463, "grad_norm": 0.6454026699066162, "learning_rate": 1.1582267390923427e-05, "loss": 1.6181, "step": 10096 }, { "epoch": 2.6532663316582914, "grad_norm": 0.5898521542549133, "learning_rate": 1.1564745049938672e-05, "loss": 1.6157, "step": 10098 }, { "epoch": 2.6537918349919534, "grad_norm": 0.5379465222358704, "learning_rate": 1.1547222708953917e-05, "loss": 1.6015, "step": 10100 }, { "epoch": 2.654317338325615, "grad_norm": 0.624577522277832, "learning_rate": 1.1529700367969162e-05, "loss": 1.6161, "step": 10102 }, { "epoch": 2.654842841659277, "grad_norm": 0.5811554193496704, "learning_rate": 1.1512178026984406e-05, "loss": 1.598, "step": 10104 }, { "epoch": 2.6553683449929384, "grad_norm": 0.5827350616455078, "learning_rate": 1.149465568599965e-05, "loss": 1.625, "step": 10106 }, { "epoch": 2.6558938483266004, "grad_norm": 0.6221204996109009, "learning_rate": 1.1477133345014894e-05, "loss": 1.6011, "step": 10108 }, { "epoch": 2.6564193516602623, "grad_norm": 0.6008239984512329, "learning_rate": 1.145961100403014e-05, "loss": 1.6048, "step": 10110 }, { "epoch": 2.656944854993924, "grad_norm": 0.5547776222229004, "learning_rate": 1.1442088663045382e-05, "loss": 1.6149, "step": 10112 }, { "epoch": 2.6574703583275854, "grad_norm": 0.5586524605751038, "learning_rate": 1.1424566322060627e-05, "loss": 1.576, "step": 10114 }, { "epoch": 2.6579958616612473, "grad_norm": 0.6657798886299133, "learning_rate": 1.1407043981075873e-05, "loss": 1.5939, "step": 10116 }, { "epoch": 2.6585213649949093, "grad_norm": 0.6012902855873108, "learning_rate": 1.1389521640091117e-05, "loss": 1.6089, "step": 10118 }, { "epoch": 2.659046868328571, "grad_norm": 0.6174466013908386, "learning_rate": 1.1371999299106363e-05, "loss": 1.606, "step": 10120 }, { "epoch": 2.659572371662233, "grad_norm": 0.5988878011703491, "learning_rate": 1.1354476958121605e-05, "loss": 1.6162, "step": 10122 }, { "epoch": 2.6600978749958943, "grad_norm": 0.6152504086494446, "learning_rate": 
1.133695461713685e-05, "loss": 1.6265, "step": 10124 }, { "epoch": 2.6606233783295563, "grad_norm": 0.5866356492042542, "learning_rate": 1.1319432276152095e-05, "loss": 1.5945, "step": 10126 }, { "epoch": 2.661148881663218, "grad_norm": 0.6397704482078552, "learning_rate": 1.130190993516734e-05, "loss": 1.6141, "step": 10128 }, { "epoch": 2.66167438499688, "grad_norm": 0.5501188039779663, "learning_rate": 1.1284387594182583e-05, "loss": 1.6161, "step": 10130 }, { "epoch": 2.6621998883305418, "grad_norm": 0.5947812795639038, "learning_rate": 1.1266865253197828e-05, "loss": 1.6168, "step": 10132 }, { "epoch": 2.6627253916642033, "grad_norm": 0.571863055229187, "learning_rate": 1.1249342912213072e-05, "loss": 1.6012, "step": 10134 }, { "epoch": 2.6632508949978653, "grad_norm": 0.5414539575576782, "learning_rate": 1.1231820571228318e-05, "loss": 1.5672, "step": 10136 }, { "epoch": 2.663776398331527, "grad_norm": 0.5631955862045288, "learning_rate": 1.1214298230243562e-05, "loss": 1.6291, "step": 10138 }, { "epoch": 2.6643019016651888, "grad_norm": 0.6036393642425537, "learning_rate": 1.1196775889258804e-05, "loss": 1.6131, "step": 10140 }, { "epoch": 2.6648274049988503, "grad_norm": 0.5736912488937378, "learning_rate": 1.117925354827405e-05, "loss": 1.595, "step": 10142 }, { "epoch": 2.6653529083325123, "grad_norm": 0.6014968156814575, "learning_rate": 1.1161731207289294e-05, "loss": 1.6042, "step": 10144 }, { "epoch": 2.6658784116661742, "grad_norm": 0.564576268196106, "learning_rate": 1.1144208866304539e-05, "loss": 1.6199, "step": 10146 }, { "epoch": 2.6664039149998358, "grad_norm": 0.5501058101654053, "learning_rate": 1.1126686525319783e-05, "loss": 1.6063, "step": 10148 }, { "epoch": 2.6669294183334973, "grad_norm": 0.6601042151451111, "learning_rate": 1.1109164184335027e-05, "loss": 1.5838, "step": 10150 }, { "epoch": 2.6674549216671593, "grad_norm": 0.6125220656394958, "learning_rate": 1.1091641843350273e-05, "loss": 1.6151, "step": 10152 }, { "epoch": 
2.667980425000821, "grad_norm": 0.6514031291007996, "learning_rate": 1.1074119502365517e-05, "loss": 1.5978, "step": 10154 }, { "epoch": 2.6685059283344827, "grad_norm": 0.5755125284194946, "learning_rate": 1.1056597161380761e-05, "loss": 1.5971, "step": 10156 }, { "epoch": 2.6690314316681447, "grad_norm": 0.6414878368377686, "learning_rate": 1.1039074820396005e-05, "loss": 1.6212, "step": 10158 }, { "epoch": 2.6695569350018067, "grad_norm": 0.6092830300331116, "learning_rate": 1.102155247941125e-05, "loss": 1.6006, "step": 10160 }, { "epoch": 2.670082438335468, "grad_norm": 0.6845732927322388, "learning_rate": 1.1004030138426494e-05, "loss": 1.6473, "step": 10162 }, { "epoch": 2.6706079416691297, "grad_norm": 0.5648114681243896, "learning_rate": 1.098650779744174e-05, "loss": 1.5988, "step": 10164 }, { "epoch": 2.6711334450027917, "grad_norm": 0.5797778367996216, "learning_rate": 1.0968985456456984e-05, "loss": 1.6057, "step": 10166 }, { "epoch": 2.6716589483364537, "grad_norm": 0.6136969327926636, "learning_rate": 1.0951463115472228e-05, "loss": 1.5615, "step": 10168 }, { "epoch": 2.672184451670115, "grad_norm": 0.5537448525428772, "learning_rate": 1.0933940774487472e-05, "loss": 1.5739, "step": 10170 }, { "epoch": 2.672709955003777, "grad_norm": 0.6372264623641968, "learning_rate": 1.0916418433502716e-05, "loss": 1.5985, "step": 10172 }, { "epoch": 2.6732354583374387, "grad_norm": 0.5945001840591431, "learning_rate": 1.0898896092517962e-05, "loss": 1.6051, "step": 10174 }, { "epoch": 2.6737609616711007, "grad_norm": 0.5689924359321594, "learning_rate": 1.0881373751533205e-05, "loss": 1.6083, "step": 10176 }, { "epoch": 2.674286465004762, "grad_norm": 0.596948504447937, "learning_rate": 1.0863851410548449e-05, "loss": 1.6086, "step": 10178 }, { "epoch": 2.674811968338424, "grad_norm": 0.581723153591156, "learning_rate": 1.0846329069563695e-05, "loss": 1.6127, "step": 10180 }, { "epoch": 2.675337471672086, "grad_norm": 0.5663895010948181, "learning_rate": 
1.0828806728578939e-05, "loss": 1.571, "step": 10182 }, { "epoch": 2.6758629750057477, "grad_norm": 0.5802646279335022, "learning_rate": 1.0811284387594183e-05, "loss": 1.6068, "step": 10184 }, { "epoch": 2.6763884783394096, "grad_norm": 0.615695595741272, "learning_rate": 1.0793762046609427e-05, "loss": 1.6147, "step": 10186 }, { "epoch": 2.676913981673071, "grad_norm": 0.6105237007141113, "learning_rate": 1.0776239705624671e-05, "loss": 1.607, "step": 10188 }, { "epoch": 2.677439485006733, "grad_norm": 0.5385169982910156, "learning_rate": 1.0758717364639917e-05, "loss": 1.6079, "step": 10190 }, { "epoch": 2.6779649883403946, "grad_norm": 0.7373268008232117, "learning_rate": 1.0741195023655161e-05, "loss": 1.6043, "step": 10192 }, { "epoch": 2.6784904916740566, "grad_norm": 0.5916065573692322, "learning_rate": 1.0723672682670406e-05, "loss": 1.6349, "step": 10194 }, { "epoch": 2.6790159950077186, "grad_norm": 0.7171288132667542, "learning_rate": 1.070615034168565e-05, "loss": 1.6027, "step": 10196 }, { "epoch": 2.67954149834138, "grad_norm": 0.5862658619880676, "learning_rate": 1.0688628000700894e-05, "loss": 1.5782, "step": 10198 }, { "epoch": 2.6800670016750416, "grad_norm": 0.5845574736595154, "learning_rate": 1.0671105659716138e-05, "loss": 1.6043, "step": 10200 }, { "epoch": 2.6805925050087036, "grad_norm": 0.6461187601089478, "learning_rate": 1.0653583318731384e-05, "loss": 1.6035, "step": 10202 }, { "epoch": 2.6811180083423656, "grad_norm": 0.5671385526657104, "learning_rate": 1.0636060977746626e-05, "loss": 1.5873, "step": 10204 }, { "epoch": 2.681643511676027, "grad_norm": 0.614528238773346, "learning_rate": 1.0618538636761872e-05, "loss": 1.5885, "step": 10206 }, { "epoch": 2.682169015009689, "grad_norm": 0.6290176510810852, "learning_rate": 1.0601016295777116e-05, "loss": 1.6293, "step": 10208 }, { "epoch": 2.6826945183433506, "grad_norm": 0.7489297986030579, "learning_rate": 1.058349395479236e-05, "loss": 1.6248, "step": 10210 }, { "epoch": 
2.6832200216770126, "grad_norm": 0.5606198310852051, "learning_rate": 1.0565971613807605e-05, "loss": 1.5935, "step": 10212 }, { "epoch": 2.683745525010674, "grad_norm": 0.5877952575683594, "learning_rate": 1.0548449272822849e-05, "loss": 1.5945, "step": 10214 }, { "epoch": 2.684271028344336, "grad_norm": 0.6569766998291016, "learning_rate": 1.0530926931838093e-05, "loss": 1.5946, "step": 10216 }, { "epoch": 2.684796531677998, "grad_norm": 0.5706822276115417, "learning_rate": 1.0513404590853339e-05, "loss": 1.6302, "step": 10218 }, { "epoch": 2.6853220350116596, "grad_norm": 0.6443889141082764, "learning_rate": 1.0495882249868583e-05, "loss": 1.6055, "step": 10220 }, { "epoch": 2.6858475383453215, "grad_norm": 0.6561224460601807, "learning_rate": 1.0478359908883827e-05, "loss": 1.5979, "step": 10222 }, { "epoch": 2.686373041678983, "grad_norm": 0.6297589540481567, "learning_rate": 1.0460837567899072e-05, "loss": 1.621, "step": 10224 }, { "epoch": 2.686898545012645, "grad_norm": 0.5593584775924683, "learning_rate": 1.0443315226914316e-05, "loss": 1.5798, "step": 10226 }, { "epoch": 2.6874240483463065, "grad_norm": 0.5252684354782104, "learning_rate": 1.0425792885929562e-05, "loss": 1.5672, "step": 10228 }, { "epoch": 2.6879495516799685, "grad_norm": 0.7257969379425049, "learning_rate": 1.0408270544944806e-05, "loss": 1.6178, "step": 10230 }, { "epoch": 2.6884750550136305, "grad_norm": 0.5459820628166199, "learning_rate": 1.0390748203960048e-05, "loss": 1.589, "step": 10232 }, { "epoch": 2.689000558347292, "grad_norm": 0.6055821180343628, "learning_rate": 1.0373225862975294e-05, "loss": 1.5899, "step": 10234 }, { "epoch": 2.6895260616809535, "grad_norm": 0.7138866782188416, "learning_rate": 1.0355703521990538e-05, "loss": 1.6004, "step": 10236 }, { "epoch": 2.6900515650146155, "grad_norm": 0.5606485605239868, "learning_rate": 1.0338181181005784e-05, "loss": 1.5703, "step": 10238 }, { "epoch": 2.6905770683482775, "grad_norm": 0.6380048990249634, "learning_rate": 
1.0320658840021027e-05, "loss": 1.6236, "step": 10240 }, { "epoch": 2.691102571681939, "grad_norm": 0.59669029712677, "learning_rate": 1.030313649903627e-05, "loss": 1.5963, "step": 10242 }, { "epoch": 2.691628075015601, "grad_norm": 0.6684574484825134, "learning_rate": 1.0285614158051517e-05, "loss": 1.6199, "step": 10244 }, { "epoch": 2.6921535783492625, "grad_norm": 0.5515660047531128, "learning_rate": 1.0268091817066761e-05, "loss": 1.6143, "step": 10246 }, { "epoch": 2.6926790816829245, "grad_norm": 0.6562058329582214, "learning_rate": 1.0250569476082005e-05, "loss": 1.6112, "step": 10248 }, { "epoch": 2.693204585016586, "grad_norm": 0.5547401309013367, "learning_rate": 1.023304713509725e-05, "loss": 1.6124, "step": 10250 }, { "epoch": 2.693730088350248, "grad_norm": 0.6307016611099243, "learning_rate": 1.0215524794112493e-05, "loss": 1.6256, "step": 10252 }, { "epoch": 2.69425559168391, "grad_norm": 0.5722636580467224, "learning_rate": 1.0198002453127738e-05, "loss": 1.5713, "step": 10254 }, { "epoch": 2.6947810950175715, "grad_norm": 0.587989330291748, "learning_rate": 1.0180480112142983e-05, "loss": 1.6191, "step": 10256 }, { "epoch": 2.6953065983512334, "grad_norm": 0.7785957455635071, "learning_rate": 1.0162957771158228e-05, "loss": 1.6129, "step": 10258 }, { "epoch": 2.695832101684895, "grad_norm": 0.5290049910545349, "learning_rate": 1.0145435430173472e-05, "loss": 1.5561, "step": 10260 }, { "epoch": 2.696357605018557, "grad_norm": 0.6306317448616028, "learning_rate": 1.0127913089188716e-05, "loss": 1.5872, "step": 10262 }, { "epoch": 2.6968831083522184, "grad_norm": 0.5847423672676086, "learning_rate": 1.011039074820396e-05, "loss": 1.5783, "step": 10264 }, { "epoch": 2.6974086116858804, "grad_norm": 0.5815863609313965, "learning_rate": 1.0092868407219206e-05, "loss": 1.6138, "step": 10266 }, { "epoch": 2.6979341150195424, "grad_norm": 0.6759339570999146, "learning_rate": 1.0075346066234448e-05, "loss": 1.6146, "step": 10268 }, { "epoch": 
2.698459618353204, "grad_norm": 0.7321836948394775, "learning_rate": 1.0057823725249693e-05, "loss": 1.6333, "step": 10270 }, { "epoch": 2.6989851216868654, "grad_norm": 0.595303475856781, "learning_rate": 1.0040301384264939e-05, "loss": 1.5903, "step": 10272 }, { "epoch": 2.6995106250205274, "grad_norm": 0.6422771215438843, "learning_rate": 1.0022779043280183e-05, "loss": 1.6063, "step": 10274 }, { "epoch": 2.7000361283541894, "grad_norm": 0.6339772939682007, "learning_rate": 1.0005256702295427e-05, "loss": 1.6402, "step": 10276 }, { "epoch": 2.700561631687851, "grad_norm": 0.6175129413604736, "learning_rate": 9.987734361310671e-06, "loss": 1.5768, "step": 10278 }, { "epoch": 2.701087135021513, "grad_norm": 0.6692789793014526, "learning_rate": 9.970212020325915e-06, "loss": 1.6293, "step": 10280 }, { "epoch": 2.7016126383551744, "grad_norm": 0.6754617094993591, "learning_rate": 9.952689679341161e-06, "loss": 1.6127, "step": 10282 }, { "epoch": 2.7021381416888364, "grad_norm": 0.5878320336341858, "learning_rate": 9.935167338356405e-06, "loss": 1.6195, "step": 10284 }, { "epoch": 2.702663645022498, "grad_norm": 0.9269865155220032, "learning_rate": 9.91764499737165e-06, "loss": 1.5903, "step": 10286 }, { "epoch": 2.70318914835616, "grad_norm": 0.610824465751648, "learning_rate": 9.900122656386894e-06, "loss": 1.608, "step": 10288 }, { "epoch": 2.703714651689822, "grad_norm": 0.587494432926178, "learning_rate": 9.882600315402138e-06, "loss": 1.547, "step": 10290 }, { "epoch": 2.7042401550234834, "grad_norm": 0.6791782975196838, "learning_rate": 9.865077974417384e-06, "loss": 1.6039, "step": 10292 }, { "epoch": 2.7047656583571453, "grad_norm": 0.5835421681404114, "learning_rate": 9.847555633432628e-06, "loss": 1.6378, "step": 10294 }, { "epoch": 2.705291161690807, "grad_norm": 0.6221166253089905, "learning_rate": 9.83003329244787e-06, "loss": 1.5753, "step": 10296 }, { "epoch": 2.705816665024469, "grad_norm": 0.5716819763183594, "learning_rate": 9.812510951463116e-06, 
"loss": 1.5876, "step": 10298 }, { "epoch": 2.7063421683581304, "grad_norm": 0.725444495677948, "learning_rate": 9.79498861047836e-06, "loss": 1.6308, "step": 10300 }, { "epoch": 2.7068676716917923, "grad_norm": 0.6209605932235718, "learning_rate": 9.777466269493605e-06, "loss": 1.5797, "step": 10302 }, { "epoch": 2.7073931750254543, "grad_norm": 0.6639958620071411, "learning_rate": 9.759943928508849e-06, "loss": 1.6051, "step": 10304 }, { "epoch": 2.707918678359116, "grad_norm": 0.6042879819869995, "learning_rate": 9.742421587524093e-06, "loss": 1.5985, "step": 10306 }, { "epoch": 2.7084441816927773, "grad_norm": 0.5975037813186646, "learning_rate": 9.724899246539339e-06, "loss": 1.5945, "step": 10308 }, { "epoch": 2.7089696850264393, "grad_norm": 0.5482746362686157, "learning_rate": 9.707376905554583e-06, "loss": 1.5645, "step": 10310 }, { "epoch": 2.7094951883601013, "grad_norm": 0.6123608946800232, "learning_rate": 9.689854564569827e-06, "loss": 1.5862, "step": 10312 }, { "epoch": 2.710020691693763, "grad_norm": 0.6144323348999023, "learning_rate": 9.672332223585071e-06, "loss": 1.6422, "step": 10314 }, { "epoch": 2.7105461950274248, "grad_norm": 0.5697504878044128, "learning_rate": 9.654809882600315e-06, "loss": 1.6299, "step": 10316 }, { "epoch": 2.7110716983610867, "grad_norm": 0.5491029024124146, "learning_rate": 9.63728754161556e-06, "loss": 1.5843, "step": 10318 }, { "epoch": 2.7115972016947483, "grad_norm": 0.8691264390945435, "learning_rate": 9.619765200630806e-06, "loss": 1.6086, "step": 10320 }, { "epoch": 2.71212270502841, "grad_norm": 0.5798594355583191, "learning_rate": 9.60224285964605e-06, "loss": 1.6205, "step": 10322 }, { "epoch": 2.7126482083620718, "grad_norm": 0.5886126756668091, "learning_rate": 9.584720518661294e-06, "loss": 1.5923, "step": 10324 }, { "epoch": 2.7131737116957337, "grad_norm": 0.7271785140037537, "learning_rate": 9.567198177676538e-06, "loss": 1.623, "step": 10326 }, { "epoch": 2.7136992150293953, "grad_norm": 
0.5842270255088806, "learning_rate": 9.549675836691782e-06, "loss": 1.6141, "step": 10328 }, { "epoch": 2.7142247183630572, "grad_norm": 0.5803746581077576, "learning_rate": 9.532153495707028e-06, "loss": 1.611, "step": 10330 }, { "epoch": 2.7147502216967188, "grad_norm": 0.6185279488563538, "learning_rate": 9.51463115472227e-06, "loss": 1.6061, "step": 10332 }, { "epoch": 2.7152757250303807, "grad_norm": 0.5305478572845459, "learning_rate": 9.497108813737515e-06, "loss": 1.6341, "step": 10334 }, { "epoch": 2.7158012283640423, "grad_norm": 0.5952558517456055, "learning_rate": 9.47958647275276e-06, "loss": 1.6062, "step": 10336 }, { "epoch": 2.7163267316977042, "grad_norm": 0.5312665700912476, "learning_rate": 9.462064131768005e-06, "loss": 1.6027, "step": 10338 }, { "epoch": 2.716852235031366, "grad_norm": 0.6712905168533325, "learning_rate": 9.444541790783249e-06, "loss": 1.5846, "step": 10340 }, { "epoch": 2.7173777383650277, "grad_norm": 0.6250156164169312, "learning_rate": 9.427019449798493e-06, "loss": 1.6207, "step": 10342 }, { "epoch": 2.7179032416986897, "grad_norm": 0.6365962624549866, "learning_rate": 9.409497108813737e-06, "loss": 1.614, "step": 10344 }, { "epoch": 2.718428745032351, "grad_norm": 0.5564280152320862, "learning_rate": 9.391974767828983e-06, "loss": 1.6054, "step": 10346 }, { "epoch": 2.718954248366013, "grad_norm": 0.599860668182373, "learning_rate": 9.374452426844227e-06, "loss": 1.5914, "step": 10348 }, { "epoch": 2.7194797516996747, "grad_norm": 0.6647321581840515, "learning_rate": 9.356930085859472e-06, "loss": 1.6001, "step": 10350 }, { "epoch": 2.7200052550333367, "grad_norm": 0.5299103260040283, "learning_rate": 9.339407744874716e-06, "loss": 1.6181, "step": 10352 }, { "epoch": 2.7205307583669986, "grad_norm": 0.6260030269622803, "learning_rate": 9.32188540388996e-06, "loss": 1.5963, "step": 10354 }, { "epoch": 2.72105626170066, "grad_norm": 0.6055660247802734, "learning_rate": 9.304363062905204e-06, "loss": 1.564, "step": 10356 }, 
{ "epoch": 2.7215817650343217, "grad_norm": 0.6588335633277893, "learning_rate": 9.28684072192045e-06, "loss": 1.5692, "step": 10358 }, { "epoch": 2.7221072683679837, "grad_norm": 0.5844119787216187, "learning_rate": 9.269318380935692e-06, "loss": 1.5931, "step": 10360 }, { "epoch": 2.7226327717016456, "grad_norm": 0.5909096598625183, "learning_rate": 9.251796039950938e-06, "loss": 1.6011, "step": 10362 }, { "epoch": 2.723158275035307, "grad_norm": 0.5176669955253601, "learning_rate": 9.234273698966182e-06, "loss": 1.5484, "step": 10364 }, { "epoch": 2.723683778368969, "grad_norm": 0.5780521035194397, "learning_rate": 9.216751357981427e-06, "loss": 1.5989, "step": 10366 }, { "epoch": 2.7242092817026307, "grad_norm": 0.6192241311073303, "learning_rate": 9.19922901699667e-06, "loss": 1.5719, "step": 10368 }, { "epoch": 2.7247347850362926, "grad_norm": 0.5565381646156311, "learning_rate": 9.181706676011915e-06, "loss": 1.6109, "step": 10370 }, { "epoch": 2.725260288369954, "grad_norm": 0.6125465631484985, "learning_rate": 9.16418433502716e-06, "loss": 1.61, "step": 10372 }, { "epoch": 2.725785791703616, "grad_norm": 0.597527265548706, "learning_rate": 9.146661994042405e-06, "loss": 1.6093, "step": 10374 }, { "epoch": 2.726311295037278, "grad_norm": 0.633154034614563, "learning_rate": 9.12913965305765e-06, "loss": 1.6101, "step": 10376 }, { "epoch": 2.7268367983709396, "grad_norm": 0.5533692240715027, "learning_rate": 9.111617312072893e-06, "loss": 1.5807, "step": 10378 }, { "epoch": 2.7273623017046016, "grad_norm": 0.6450961232185364, "learning_rate": 9.094094971088138e-06, "loss": 1.5835, "step": 10380 }, { "epoch": 2.727887805038263, "grad_norm": 0.629943311214447, "learning_rate": 9.076572630103382e-06, "loss": 1.5977, "step": 10382 }, { "epoch": 2.728413308371925, "grad_norm": 0.6521432995796204, "learning_rate": 9.059050289118628e-06, "loss": 1.6267, "step": 10384 }, { "epoch": 2.7289388117055866, "grad_norm": 0.5852161645889282, "learning_rate": 
9.041527948133872e-06, "loss": 1.6112, "step": 10386 }, { "epoch": 2.7294643150392486, "grad_norm": 0.6588388085365295, "learning_rate": 9.024005607149114e-06, "loss": 1.6181, "step": 10388 }, { "epoch": 2.7299898183729105, "grad_norm": 0.6498621106147766, "learning_rate": 9.00648326616436e-06, "loss": 1.596, "step": 10390 }, { "epoch": 2.730515321706572, "grad_norm": 0.5801547169685364, "learning_rate": 8.988960925179604e-06, "loss": 1.5948, "step": 10392 }, { "epoch": 2.7310408250402336, "grad_norm": 0.6173077821731567, "learning_rate": 8.97143858419485e-06, "loss": 1.6164, "step": 10394 }, { "epoch": 2.7315663283738956, "grad_norm": 0.5936638116836548, "learning_rate": 8.953916243210093e-06, "loss": 1.5946, "step": 10396 }, { "epoch": 2.7320918317075575, "grad_norm": 0.6268817186355591, "learning_rate": 8.936393902225337e-06, "loss": 1.6422, "step": 10398 }, { "epoch": 2.732617335041219, "grad_norm": 0.7213032245635986, "learning_rate": 8.918871561240583e-06, "loss": 1.6387, "step": 10400 }, { "epoch": 2.732617335041219, "eval_loss": 1.649667501449585, "eval_runtime": 487.15, "eval_samples_per_second": 250.003, "eval_steps_per_second": 31.251, "step": 10400 }, { "epoch": 2.733142838374881, "grad_norm": 0.5801553130149841, "learning_rate": 8.901349220255827e-06, "loss": 1.5527, "step": 10402 }, { "epoch": 2.7336683417085426, "grad_norm": 0.6011367440223694, "learning_rate": 8.883826879271071e-06, "loss": 1.5901, "step": 10404 }, { "epoch": 2.7341938450422045, "grad_norm": 0.5433336496353149, "learning_rate": 8.866304538286315e-06, "loss": 1.605, "step": 10406 }, { "epoch": 2.734719348375866, "grad_norm": 0.5924966335296631, "learning_rate": 8.84878219730156e-06, "loss": 1.594, "step": 10408 }, { "epoch": 2.735244851709528, "grad_norm": 0.6247406601905823, "learning_rate": 8.831259856316805e-06, "loss": 1.5501, "step": 10410 }, { "epoch": 2.73577035504319, "grad_norm": 0.8343258500099182, "learning_rate": 8.81373751533205e-06, "loss": 1.6041, "step": 10412 }, { 
"epoch": 2.7362958583768515, "grad_norm": 0.5661430358886719, "learning_rate": 8.796215174347294e-06, "loss": 1.6153, "step": 10414 }, { "epoch": 2.7368213617105135, "grad_norm": 0.6813891530036926, "learning_rate": 8.778692833362538e-06, "loss": 1.6211, "step": 10416 }, { "epoch": 2.737346865044175, "grad_norm": 0.6394498348236084, "learning_rate": 8.761170492377782e-06, "loss": 1.5989, "step": 10418 }, { "epoch": 2.737872368377837, "grad_norm": 0.6066767573356628, "learning_rate": 8.743648151393026e-06, "loss": 1.6183, "step": 10420 }, { "epoch": 2.7383978717114985, "grad_norm": 0.5361884236335754, "learning_rate": 8.726125810408272e-06, "loss": 1.619, "step": 10422 }, { "epoch": 2.7389233750451605, "grad_norm": 0.5502820014953613, "learning_rate": 8.708603469423514e-06, "loss": 1.5936, "step": 10424 }, { "epoch": 2.7394488783788224, "grad_norm": 0.6126530170440674, "learning_rate": 8.691081128438759e-06, "loss": 1.5771, "step": 10426 }, { "epoch": 2.739974381712484, "grad_norm": 0.5665082335472107, "learning_rate": 8.673558787454005e-06, "loss": 1.608, "step": 10428 }, { "epoch": 2.7404998850461455, "grad_norm": 0.6264111399650574, "learning_rate": 8.656036446469249e-06, "loss": 1.6095, "step": 10430 }, { "epoch": 2.7410253883798075, "grad_norm": 0.6684525609016418, "learning_rate": 8.638514105484493e-06, "loss": 1.5786, "step": 10432 }, { "epoch": 2.7415508917134694, "grad_norm": 0.5436965823173523, "learning_rate": 8.620991764499737e-06, "loss": 1.5764, "step": 10434 }, { "epoch": 2.742076395047131, "grad_norm": 0.5792893171310425, "learning_rate": 8.603469423514981e-06, "loss": 1.6078, "step": 10436 }, { "epoch": 2.742601898380793, "grad_norm": 0.5539526343345642, "learning_rate": 8.585947082530227e-06, "loss": 1.5919, "step": 10438 }, { "epoch": 2.7431274017144545, "grad_norm": 0.5818884968757629, "learning_rate": 8.568424741545471e-06, "loss": 1.5926, "step": 10440 }, { "epoch": 2.7436529050481164, "grad_norm": 0.5815169811248779, "learning_rate": 
8.550902400560715e-06, "loss": 1.5986, "step": 10442 }, { "epoch": 2.744178408381778, "grad_norm": 0.6487326622009277, "learning_rate": 8.53338005957596e-06, "loss": 1.6054, "step": 10444 }, { "epoch": 2.74470391171544, "grad_norm": 0.6841205358505249, "learning_rate": 8.515857718591204e-06, "loss": 1.6031, "step": 10446 }, { "epoch": 2.745229415049102, "grad_norm": 0.6351028680801392, "learning_rate": 8.49833537760645e-06, "loss": 1.6272, "step": 10448 }, { "epoch": 2.7457549183827634, "grad_norm": 0.5550417900085449, "learning_rate": 8.480813036621694e-06, "loss": 1.6228, "step": 10450 }, { "epoch": 2.7462804217164254, "grad_norm": 0.6113694906234741, "learning_rate": 8.463290695636936e-06, "loss": 1.5799, "step": 10452 }, { "epoch": 2.746805925050087, "grad_norm": 0.5771463513374329, "learning_rate": 8.445768354652182e-06, "loss": 1.577, "step": 10454 }, { "epoch": 2.747331428383749, "grad_norm": 0.5693290829658508, "learning_rate": 8.428246013667426e-06, "loss": 1.5841, "step": 10456 }, { "epoch": 2.7478569317174104, "grad_norm": 0.5757125020027161, "learning_rate": 8.41072367268267e-06, "loss": 1.5785, "step": 10458 }, { "epoch": 2.7483824350510724, "grad_norm": 0.643251359462738, "learning_rate": 8.393201331697915e-06, "loss": 1.611, "step": 10460 }, { "epoch": 2.7489079383847344, "grad_norm": 0.6383783221244812, "learning_rate": 8.375678990713159e-06, "loss": 1.6065, "step": 10462 }, { "epoch": 2.749433441718396, "grad_norm": 0.5447094440460205, "learning_rate": 8.358156649728405e-06, "loss": 1.5887, "step": 10464 }, { "epoch": 2.7499589450520574, "grad_norm": 0.5493614077568054, "learning_rate": 8.340634308743649e-06, "loss": 1.5946, "step": 10466 }, { "epoch": 2.7504844483857194, "grad_norm": 0.7292418479919434, "learning_rate": 8.323111967758893e-06, "loss": 1.5853, "step": 10468 }, { "epoch": 2.7510099517193813, "grad_norm": 0.5726646780967712, "learning_rate": 8.305589626774137e-06, "loss": 1.6051, "step": 10470 }, { "epoch": 2.751535455053043, 
"grad_norm": 0.6066914200782776, "learning_rate": 8.288067285789381e-06, "loss": 1.5952, "step": 10472 }, { "epoch": 2.752060958386705, "grad_norm": 0.6483511328697205, "learning_rate": 8.270544944804626e-06, "loss": 1.6212, "step": 10474 }, { "epoch": 2.752586461720367, "grad_norm": 0.5741682648658752, "learning_rate": 8.253022603819872e-06, "loss": 1.6178, "step": 10476 }, { "epoch": 2.7531119650540283, "grad_norm": 0.5253785848617554, "learning_rate": 8.235500262835116e-06, "loss": 1.5889, "step": 10478 }, { "epoch": 2.75363746838769, "grad_norm": 0.557144820690155, "learning_rate": 8.21797792185036e-06, "loss": 1.6081, "step": 10480 }, { "epoch": 2.754162971721352, "grad_norm": 0.5865209698677063, "learning_rate": 8.200455580865604e-06, "loss": 1.603, "step": 10482 }, { "epoch": 2.754688475055014, "grad_norm": 0.5472773909568787, "learning_rate": 8.182933239880848e-06, "loss": 1.6022, "step": 10484 }, { "epoch": 2.7552139783886753, "grad_norm": 0.6681110262870789, "learning_rate": 8.165410898896094e-06, "loss": 1.5927, "step": 10486 }, { "epoch": 2.7557394817223373, "grad_norm": 0.6070604920387268, "learning_rate": 8.147888557911337e-06, "loss": 1.6051, "step": 10488 }, { "epoch": 2.756264985055999, "grad_norm": 0.5712462663650513, "learning_rate": 8.13036621692658e-06, "loss": 1.6124, "step": 10490 }, { "epoch": 2.756790488389661, "grad_norm": 0.5560539364814758, "learning_rate": 8.112843875941827e-06, "loss": 1.6116, "step": 10492 }, { "epoch": 2.7573159917233223, "grad_norm": 0.6540628671646118, "learning_rate": 8.09532153495707e-06, "loss": 1.5916, "step": 10494 }, { "epoch": 2.7578414950569843, "grad_norm": 0.5967172384262085, "learning_rate": 8.077799193972315e-06, "loss": 1.5789, "step": 10496 }, { "epoch": 2.7583669983906463, "grad_norm": 0.6844902634620667, "learning_rate": 8.060276852987559e-06, "loss": 1.6098, "step": 10498 }, { "epoch": 2.7588925017243078, "grad_norm": 0.6153303980827332, "learning_rate": 8.042754512002803e-06, "loss": 1.6149, 
"step": 10500 }, { "epoch": 2.7594180050579697, "grad_norm": 0.5895024538040161, "learning_rate": 8.02523217101805e-06, "loss": 1.6097, "step": 10502 }, { "epoch": 2.7599435083916313, "grad_norm": 0.6543148159980774, "learning_rate": 8.007709830033293e-06, "loss": 1.5865, "step": 10504 }, { "epoch": 2.7604690117252932, "grad_norm": 0.6354900002479553, "learning_rate": 7.990187489048538e-06, "loss": 1.587, "step": 10506 }, { "epoch": 2.7609945150589548, "grad_norm": 0.6232167482376099, "learning_rate": 7.972665148063782e-06, "loss": 1.622, "step": 10508 }, { "epoch": 2.7615200183926167, "grad_norm": 0.5841950178146362, "learning_rate": 7.955142807079026e-06, "loss": 1.6159, "step": 10510 }, { "epoch": 2.7620455217262787, "grad_norm": 0.5530897974967957, "learning_rate": 7.93762046609427e-06, "loss": 1.5696, "step": 10512 }, { "epoch": 2.7625710250599402, "grad_norm": 0.5793708562850952, "learning_rate": 7.920098125109516e-06, "loss": 1.5672, "step": 10514 }, { "epoch": 2.7630965283936018, "grad_norm": 0.595795750617981, "learning_rate": 7.902575784124758e-06, "loss": 1.6003, "step": 10516 }, { "epoch": 2.7636220317272637, "grad_norm": 0.5570265054702759, "learning_rate": 7.885053443140004e-06, "loss": 1.5881, "step": 10518 }, { "epoch": 2.7641475350609257, "grad_norm": 0.6062209606170654, "learning_rate": 7.867531102155248e-06, "loss": 1.5865, "step": 10520 }, { "epoch": 2.7646730383945872, "grad_norm": 0.6273199319839478, "learning_rate": 7.850008761170493e-06, "loss": 1.6095, "step": 10522 }, { "epoch": 2.765198541728249, "grad_norm": 0.6534038782119751, "learning_rate": 7.832486420185737e-06, "loss": 1.5834, "step": 10524 }, { "epoch": 2.7657240450619107, "grad_norm": 0.7082603573799133, "learning_rate": 7.814964079200981e-06, "loss": 1.5977, "step": 10526 }, { "epoch": 2.7662495483955727, "grad_norm": 0.8246460556983948, "learning_rate": 7.797441738216225e-06, "loss": 1.5952, "step": 10528 }, { "epoch": 2.766775051729234, "grad_norm": 0.6671035289764404, 
"learning_rate": 7.779919397231471e-06, "loss": 1.6045, "step": 10530 }, { "epoch": 2.767300555062896, "grad_norm": 0.6524311900138855, "learning_rate": 7.762397056246715e-06, "loss": 1.6305, "step": 10532 }, { "epoch": 2.767826058396558, "grad_norm": 0.5577046275138855, "learning_rate": 7.74487471526196e-06, "loss": 1.5955, "step": 10534 }, { "epoch": 2.7683515617302197, "grad_norm": 0.5540436506271362, "learning_rate": 7.727352374277204e-06, "loss": 1.6009, "step": 10536 }, { "epoch": 2.7688770650638816, "grad_norm": 0.5906193256378174, "learning_rate": 7.709830033292448e-06, "loss": 1.5597, "step": 10538 }, { "epoch": 2.769402568397543, "grad_norm": 0.7687748074531555, "learning_rate": 7.692307692307694e-06, "loss": 1.6085, "step": 10540 }, { "epoch": 2.769928071731205, "grad_norm": 0.5762370824813843, "learning_rate": 7.674785351322938e-06, "loss": 1.6209, "step": 10542 }, { "epoch": 2.7704535750648667, "grad_norm": 0.6532847285270691, "learning_rate": 7.65726301033818e-06, "loss": 1.5809, "step": 10544 }, { "epoch": 2.7709790783985286, "grad_norm": 0.6120353937149048, "learning_rate": 7.639740669353426e-06, "loss": 1.5994, "step": 10546 }, { "epoch": 2.7715045817321906, "grad_norm": 0.5683396458625793, "learning_rate": 7.62221832836867e-06, "loss": 1.6049, "step": 10548 }, { "epoch": 2.772030085065852, "grad_norm": 0.6380746364593506, "learning_rate": 7.604695987383915e-06, "loss": 1.5922, "step": 10550 }, { "epoch": 2.7725555883995137, "grad_norm": 0.619238018989563, "learning_rate": 7.5871736463991595e-06, "loss": 1.6036, "step": 10552 }, { "epoch": 2.7730810917331756, "grad_norm": 0.6409256458282471, "learning_rate": 7.569651305414404e-06, "loss": 1.6094, "step": 10554 }, { "epoch": 2.7736065950668376, "grad_norm": 0.6005852818489075, "learning_rate": 7.552128964429649e-06, "loss": 1.5932, "step": 10556 }, { "epoch": 2.774132098400499, "grad_norm": 0.5933837890625, "learning_rate": 7.534606623444893e-06, "loss": 1.5956, "step": 10558 }, { "epoch": 
2.774657601734161, "grad_norm": 0.6530826687812805, "learning_rate": 7.517084282460136e-06, "loss": 1.6305, "step": 10560 }, { "epoch": 2.7751831050678226, "grad_norm": 0.7053954601287842, "learning_rate": 7.499561941475382e-06, "loss": 1.6037, "step": 10562 }, { "epoch": 2.7757086084014846, "grad_norm": 0.5595335364341736, "learning_rate": 7.482039600490625e-06, "loss": 1.6028, "step": 10564 }, { "epoch": 2.776234111735146, "grad_norm": 0.6155191659927368, "learning_rate": 7.464517259505871e-06, "loss": 1.591, "step": 10566 }, { "epoch": 2.776759615068808, "grad_norm": 0.5995579361915588, "learning_rate": 7.446994918521115e-06, "loss": 1.6206, "step": 10568 }, { "epoch": 2.77728511840247, "grad_norm": 0.5798934698104858, "learning_rate": 7.429472577536359e-06, "loss": 1.5958, "step": 10570 }, { "epoch": 2.7778106217361316, "grad_norm": 0.6855894923210144, "learning_rate": 7.411950236551604e-06, "loss": 1.6102, "step": 10572 }, { "epoch": 2.7783361250697935, "grad_norm": 0.6109877228736877, "learning_rate": 7.394427895566848e-06, "loss": 1.5838, "step": 10574 }, { "epoch": 2.778861628403455, "grad_norm": 0.5831050276756287, "learning_rate": 7.376905554582092e-06, "loss": 1.5936, "step": 10576 }, { "epoch": 2.779387131737117, "grad_norm": 0.6219046711921692, "learning_rate": 7.359383213597337e-06, "loss": 1.6013, "step": 10578 }, { "epoch": 2.7799126350707786, "grad_norm": 0.6402764916419983, "learning_rate": 7.341860872612581e-06, "loss": 1.6106, "step": 10580 }, { "epoch": 2.7804381384044405, "grad_norm": 0.6024214625358582, "learning_rate": 7.324338531627826e-06, "loss": 1.6195, "step": 10582 }, { "epoch": 2.7809636417381025, "grad_norm": 0.5932384729385376, "learning_rate": 7.3068161906430705e-06, "loss": 1.5976, "step": 10584 }, { "epoch": 2.781489145071764, "grad_norm": 0.631115198135376, "learning_rate": 7.289293849658315e-06, "loss": 1.5967, "step": 10586 }, { "epoch": 2.7820146484054256, "grad_norm": 0.6470726132392883, "learning_rate": 
7.27177150867356e-06, "loss": 1.5992, "step": 10588 }, { "epoch": 2.7825401517390875, "grad_norm": 0.6247832775115967, "learning_rate": 7.254249167688804e-06, "loss": 1.6331, "step": 10590 }, { "epoch": 2.7830656550727495, "grad_norm": 0.5455314517021179, "learning_rate": 7.236726826704047e-06, "loss": 1.6089, "step": 10592 }, { "epoch": 2.783591158406411, "grad_norm": 0.7327371835708618, "learning_rate": 7.219204485719293e-06, "loss": 1.5804, "step": 10594 }, { "epoch": 2.784116661740073, "grad_norm": 0.6657344102859497, "learning_rate": 7.201682144734536e-06, "loss": 1.5925, "step": 10596 }, { "epoch": 2.7846421650737345, "grad_norm": 0.6697341203689575, "learning_rate": 7.184159803749781e-06, "loss": 1.5906, "step": 10598 }, { "epoch": 2.7851676684073965, "grad_norm": 0.6150529384613037, "learning_rate": 7.166637462765026e-06, "loss": 1.6017, "step": 10600 }, { "epoch": 2.785693171741058, "grad_norm": 0.5665842890739441, "learning_rate": 7.14911512178027e-06, "loss": 1.581, "step": 10602 }, { "epoch": 2.78621867507472, "grad_norm": 0.5935694575309753, "learning_rate": 7.131592780795515e-06, "loss": 1.5845, "step": 10604 }, { "epoch": 2.786744178408382, "grad_norm": 0.6057782769203186, "learning_rate": 7.114070439810759e-06, "loss": 1.5985, "step": 10606 }, { "epoch": 2.7872696817420435, "grad_norm": 0.5478567481040955, "learning_rate": 7.096548098826003e-06, "loss": 1.6067, "step": 10608 }, { "epoch": 2.7877951850757055, "grad_norm": 0.5651966333389282, "learning_rate": 7.079025757841248e-06, "loss": 1.6157, "step": 10610 }, { "epoch": 2.788320688409367, "grad_norm": 0.6189813017845154, "learning_rate": 7.061503416856492e-06, "loss": 1.5912, "step": 10612 }, { "epoch": 2.788846191743029, "grad_norm": 0.6159524917602539, "learning_rate": 7.0439810758717365e-06, "loss": 1.622, "step": 10614 }, { "epoch": 2.7893716950766905, "grad_norm": 0.5482856631278992, "learning_rate": 7.0264587348869816e-06, "loss": 1.6197, "step": 10616 }, { "epoch": 2.7898971984103524, 
"grad_norm": 0.6579813957214355, "learning_rate": 7.008936393902226e-06, "loss": 1.6182, "step": 10618 }, { "epoch": 2.7904227017440144, "grad_norm": 0.607364296913147, "learning_rate": 6.991414052917471e-06, "loss": 1.6183, "step": 10620 }, { "epoch": 2.790948205077676, "grad_norm": 0.649757444858551, "learning_rate": 6.973891711932715e-06, "loss": 1.6179, "step": 10622 }, { "epoch": 2.7914737084113375, "grad_norm": 0.5615342259407043, "learning_rate": 6.956369370947958e-06, "loss": 1.632, "step": 10624 }, { "epoch": 2.7919992117449994, "grad_norm": 0.6041872501373291, "learning_rate": 6.938847029963204e-06, "loss": 1.5943, "step": 10626 }, { "epoch": 2.7925247150786614, "grad_norm": 0.6032666563987732, "learning_rate": 6.9213246889784475e-06, "loss": 1.5552, "step": 10628 }, { "epoch": 2.793050218412323, "grad_norm": 0.5664829611778259, "learning_rate": 6.903802347993692e-06, "loss": 1.5918, "step": 10630 }, { "epoch": 2.793575721745985, "grad_norm": 0.633831262588501, "learning_rate": 6.886280007008937e-06, "loss": 1.5763, "step": 10632 }, { "epoch": 2.794101225079647, "grad_norm": 0.5904597640037537, "learning_rate": 6.868757666024181e-06, "loss": 1.5659, "step": 10634 }, { "epoch": 2.7946267284133084, "grad_norm": 0.556845486164093, "learning_rate": 6.851235325039426e-06, "loss": 1.5793, "step": 10636 }, { "epoch": 2.79515223174697, "grad_norm": 0.6646136045455933, "learning_rate": 6.83371298405467e-06, "loss": 1.6201, "step": 10638 }, { "epoch": 2.795677735080632, "grad_norm": 0.5443913340568542, "learning_rate": 6.816190643069914e-06, "loss": 1.6126, "step": 10640 }, { "epoch": 2.796203238414294, "grad_norm": 0.5882983803749084, "learning_rate": 6.798668302085159e-06, "loss": 1.572, "step": 10642 }, { "epoch": 2.7967287417479554, "grad_norm": 0.5482323169708252, "learning_rate": 6.781145961100403e-06, "loss": 1.6127, "step": 10644 }, { "epoch": 2.7972542450816174, "grad_norm": 0.5871306657791138, "learning_rate": 6.763623620115648e-06, "loss": 1.6031, 
"step": 10646 }, { "epoch": 2.797779748415279, "grad_norm": 0.6310222744941711, "learning_rate": 6.746101279130893e-06, "loss": 1.6017, "step": 10648 }, { "epoch": 2.798305251748941, "grad_norm": 0.8194197416305542, "learning_rate": 6.728578938146137e-06, "loss": 1.5373, "step": 10650 }, { "epoch": 2.7988307550826024, "grad_norm": 0.5392462015151978, "learning_rate": 6.711056597161382e-06, "loss": 1.6042, "step": 10652 }, { "epoch": 2.7993562584162643, "grad_norm": 0.5910424590110779, "learning_rate": 6.693534256176626e-06, "loss": 1.6332, "step": 10654 }, { "epoch": 2.7998817617499263, "grad_norm": 0.6581763029098511, "learning_rate": 6.676011915191869e-06, "loss": 1.617, "step": 10656 }, { "epoch": 2.800407265083588, "grad_norm": 0.6362857222557068, "learning_rate": 6.658489574207115e-06, "loss": 1.5952, "step": 10658 }, { "epoch": 2.80093276841725, "grad_norm": 0.6781132817268372, "learning_rate": 6.6409672332223585e-06, "loss": 1.571, "step": 10660 }, { "epoch": 2.8014582717509113, "grad_norm": 0.609855055809021, "learning_rate": 6.623444892237603e-06, "loss": 1.6155, "step": 10662 }, { "epoch": 2.8019837750845733, "grad_norm": 0.5758035182952881, "learning_rate": 6.605922551252848e-06, "loss": 1.5991, "step": 10664 }, { "epoch": 2.802509278418235, "grad_norm": 0.6021389961242676, "learning_rate": 6.588400210268092e-06, "loss": 1.6027, "step": 10666 }, { "epoch": 2.803034781751897, "grad_norm": 0.6531217098236084, "learning_rate": 6.570877869283337e-06, "loss": 1.6246, "step": 10668 }, { "epoch": 2.8035602850855588, "grad_norm": 0.7034488916397095, "learning_rate": 6.553355528298581e-06, "loss": 1.599, "step": 10670 }, { "epoch": 2.8040857884192203, "grad_norm": 0.5464747548103333, "learning_rate": 6.535833187313825e-06, "loss": 1.6301, "step": 10672 }, { "epoch": 2.804611291752882, "grad_norm": 0.6323386430740356, "learning_rate": 6.51831084632907e-06, "loss": 1.634, "step": 10674 }, { "epoch": 2.805136795086544, "grad_norm": 0.5574644207954407, 
"learning_rate": 6.5007885053443144e-06, "loss": 1.5634, "step": 10676 }, { "epoch": 2.8056622984202058, "grad_norm": 0.641542911529541, "learning_rate": 6.483266164359559e-06, "loss": 1.5822, "step": 10678 }, { "epoch": 2.8061878017538673, "grad_norm": 0.5666943788528442, "learning_rate": 6.465743823374804e-06, "loss": 1.5812, "step": 10680 }, { "epoch": 2.8067133050875293, "grad_norm": 0.5939244627952576, "learning_rate": 6.448221482390048e-06, "loss": 1.5907, "step": 10682 }, { "epoch": 2.807238808421191, "grad_norm": 0.5618119835853577, "learning_rate": 6.430699141405291e-06, "loss": 1.5859, "step": 10684 }, { "epoch": 2.8077643117548527, "grad_norm": 0.5734422206878662, "learning_rate": 6.413176800420537e-06, "loss": 1.5904, "step": 10686 }, { "epoch": 2.8082898150885143, "grad_norm": 0.5530012845993042, "learning_rate": 6.39565445943578e-06, "loss": 1.6041, "step": 10688 }, { "epoch": 2.8088153184221762, "grad_norm": 0.6470913290977478, "learning_rate": 6.378132118451026e-06, "loss": 1.594, "step": 10690 }, { "epoch": 2.809340821755838, "grad_norm": 0.6118698120117188, "learning_rate": 6.3606097774662695e-06, "loss": 1.625, "step": 10692 }, { "epoch": 2.8098663250894997, "grad_norm": 0.6750172972679138, "learning_rate": 6.343087436481514e-06, "loss": 1.5389, "step": 10694 }, { "epoch": 2.8103918284231617, "grad_norm": 0.6908929347991943, "learning_rate": 6.325565095496759e-06, "loss": 1.6215, "step": 10696 }, { "epoch": 2.8109173317568232, "grad_norm": 0.6111420392990112, "learning_rate": 6.308042754512003e-06, "loss": 1.5701, "step": 10698 }, { "epoch": 2.811442835090485, "grad_norm": 0.6819601655006409, "learning_rate": 6.290520413527247e-06, "loss": 1.5895, "step": 10700 }, { "epoch": 2.8119683384241467, "grad_norm": 0.6061453819274902, "learning_rate": 6.272998072542492e-06, "loss": 1.6076, "step": 10702 }, { "epoch": 2.8124938417578087, "grad_norm": 0.6342339515686035, "learning_rate": 6.255475731557736e-06, "loss": 1.6128, "step": 10704 }, { "epoch": 
2.8130193450914707, "grad_norm": 0.6219684481620789, "learning_rate": 6.2379533905729805e-06, "loss": 1.6264, "step": 10706 }, { "epoch": 2.813544848425132, "grad_norm": 0.779617190361023, "learning_rate": 6.2204310495882255e-06, "loss": 1.6218, "step": 10708 }, { "epoch": 2.8140703517587937, "grad_norm": 0.5725176334381104, "learning_rate": 6.20290870860347e-06, "loss": 1.5772, "step": 10710 }, { "epoch": 2.8145958550924557, "grad_norm": 0.6861016750335693, "learning_rate": 6.185386367618714e-06, "loss": 1.604, "step": 10712 }, { "epoch": 2.8151213584261177, "grad_norm": 0.6791368126869202, "learning_rate": 6.167864026633959e-06, "loss": 1.6244, "step": 10714 }, { "epoch": 2.815646861759779, "grad_norm": 0.5471606850624084, "learning_rate": 6.150341685649203e-06, "loss": 1.589, "step": 10716 }, { "epoch": 2.816172365093441, "grad_norm": 0.5449687242507935, "learning_rate": 6.132819344664448e-06, "loss": 1.6234, "step": 10718 }, { "epoch": 2.8166978684271027, "grad_norm": 0.6119715571403503, "learning_rate": 6.115297003679691e-06, "loss": 1.5809, "step": 10720 }, { "epoch": 2.8172233717607646, "grad_norm": 0.5771490931510925, "learning_rate": 6.097774662694936e-06, "loss": 1.5833, "step": 10722 }, { "epoch": 2.817748875094426, "grad_norm": 0.5519348978996277, "learning_rate": 6.0802523217101806e-06, "loss": 1.6195, "step": 10724 }, { "epoch": 2.818274378428088, "grad_norm": 0.7355902791023254, "learning_rate": 6.062729980725426e-06, "loss": 1.54, "step": 10726 }, { "epoch": 2.81879988176175, "grad_norm": 0.5478994250297546, "learning_rate": 6.04520763974067e-06, "loss": 1.6225, "step": 10728 }, { "epoch": 2.8193253850954116, "grad_norm": 0.5732230544090271, "learning_rate": 6.027685298755914e-06, "loss": 1.5899, "step": 10730 }, { "epoch": 2.8198508884290736, "grad_norm": 0.6284743547439575, "learning_rate": 6.010162957771159e-06, "loss": 1.6116, "step": 10732 }, { "epoch": 2.820376391762735, "grad_norm": 0.6271458864212036, "learning_rate": 5.992640616786403e-06, 
"loss": 1.5938, "step": 10734 }, { "epoch": 2.820901895096397, "grad_norm": 0.5997607111930847, "learning_rate": 5.975118275801647e-06, "loss": 1.6035, "step": 10736 }, { "epoch": 2.8214273984300586, "grad_norm": 0.6365160942077637, "learning_rate": 5.9575959348168915e-06, "loss": 1.6172, "step": 10738 }, { "epoch": 2.8219529017637206, "grad_norm": 0.7437217235565186, "learning_rate": 5.9400735938321365e-06, "loss": 1.605, "step": 10740 }, { "epoch": 2.8224784050973826, "grad_norm": 0.6684809923171997, "learning_rate": 5.922551252847381e-06, "loss": 1.6066, "step": 10742 }, { "epoch": 2.823003908431044, "grad_norm": 0.5696150064468384, "learning_rate": 5.905028911862625e-06, "loss": 1.6137, "step": 10744 }, { "epoch": 2.8235294117647056, "grad_norm": 0.6045554876327515, "learning_rate": 5.88750657087787e-06, "loss": 1.6169, "step": 10746 }, { "epoch": 2.8240549150983676, "grad_norm": 0.5769687294960022, "learning_rate": 5.869984229893114e-06, "loss": 1.612, "step": 10748 }, { "epoch": 2.8245804184320296, "grad_norm": 0.6565601825714111, "learning_rate": 5.852461888908359e-06, "loss": 1.6078, "step": 10750 }, { "epoch": 2.825105921765691, "grad_norm": 0.614622175693512, "learning_rate": 5.834939547923602e-06, "loss": 1.6229, "step": 10752 }, { "epoch": 2.825631425099353, "grad_norm": 0.5989246964454651, "learning_rate": 5.8174172069388474e-06, "loss": 1.6167, "step": 10754 }, { "epoch": 2.8261569284330146, "grad_norm": 0.6011819243431091, "learning_rate": 5.799894865954092e-06, "loss": 1.5975, "step": 10756 }, { "epoch": 2.8266824317666766, "grad_norm": 0.5927401185035706, "learning_rate": 5.782372524969336e-06, "loss": 1.6034, "step": 10758 }, { "epoch": 2.827207935100338, "grad_norm": 0.6807177066802979, "learning_rate": 5.764850183984581e-06, "loss": 1.6005, "step": 10760 }, { "epoch": 2.827733438434, "grad_norm": 0.637870728969574, "learning_rate": 5.747327842999825e-06, "loss": 1.6184, "step": 10762 }, { "epoch": 2.828258941767662, "grad_norm": 
0.5749346017837524, "learning_rate": 5.72980550201507e-06, "loss": 1.6022, "step": 10764 }, { "epoch": 2.8287844451013235, "grad_norm": 0.5985031127929688, "learning_rate": 5.712283161030313e-06, "loss": 1.5792, "step": 10766 }, { "epoch": 2.8293099484349855, "grad_norm": 0.5938543677330017, "learning_rate": 5.694760820045558e-06, "loss": 1.621, "step": 10768 }, { "epoch": 2.829835451768647, "grad_norm": 0.6295526027679443, "learning_rate": 5.6772384790608025e-06, "loss": 1.6134, "step": 10770 }, { "epoch": 2.830360955102309, "grad_norm": 0.5686202049255371, "learning_rate": 5.6597161380760476e-06, "loss": 1.5748, "step": 10772 }, { "epoch": 2.8308864584359705, "grad_norm": 0.5600228309631348, "learning_rate": 5.642193797091292e-06, "loss": 1.5901, "step": 10774 }, { "epoch": 2.8314119617696325, "grad_norm": 0.5651440024375916, "learning_rate": 5.624671456106536e-06, "loss": 1.6054, "step": 10776 }, { "epoch": 2.8319374651032945, "grad_norm": 0.5856150984764099, "learning_rate": 5.607149115121781e-06, "loss": 1.5641, "step": 10778 }, { "epoch": 2.832462968436956, "grad_norm": 0.6541823148727417, "learning_rate": 5.589626774137025e-06, "loss": 1.5969, "step": 10780 }, { "epoch": 2.8329884717706175, "grad_norm": 0.577987551689148, "learning_rate": 5.572104433152269e-06, "loss": 1.5768, "step": 10782 }, { "epoch": 2.8335139751042795, "grad_norm": 0.6325380206108093, "learning_rate": 5.5545820921675135e-06, "loss": 1.5626, "step": 10784 }, { "epoch": 2.8340394784379415, "grad_norm": 0.5774263143539429, "learning_rate": 5.5370597511827585e-06, "loss": 1.6056, "step": 10786 }, { "epoch": 2.834564981771603, "grad_norm": 0.5753619074821472, "learning_rate": 5.519537410198003e-06, "loss": 1.5734, "step": 10788 }, { "epoch": 2.835090485105265, "grad_norm": 0.5787561535835266, "learning_rate": 5.502015069213247e-06, "loss": 1.6027, "step": 10790 }, { "epoch": 2.835615988438927, "grad_norm": 0.616862952709198, "learning_rate": 5.484492728228492e-06, "loss": 1.6296, "step": 
10792 }, { "epoch": 2.8361414917725885, "grad_norm": 0.5734919905662537, "learning_rate": 5.466970387243736e-06, "loss": 1.5966, "step": 10794 }, { "epoch": 2.83666699510625, "grad_norm": 0.5936645865440369, "learning_rate": 5.449448046258981e-06, "loss": 1.6018, "step": 10796 }, { "epoch": 2.837192498439912, "grad_norm": 0.6087591052055359, "learning_rate": 5.431925705274224e-06, "loss": 1.6201, "step": 10798 }, { "epoch": 2.837718001773574, "grad_norm": 0.609794557094574, "learning_rate": 5.414403364289469e-06, "loss": 1.6401, "step": 10800 }, { "epoch": 2.837718001773574, "eval_loss": 1.6476643085479736, "eval_runtime": 487.2582, "eval_samples_per_second": 249.948, "eval_steps_per_second": 31.244, "step": 10800 }, { "epoch": 2.8382435051072354, "grad_norm": 0.8440867066383362, "learning_rate": 5.3968810233047136e-06, "loss": 1.613, "step": 10802 }, { "epoch": 2.8387690084408974, "grad_norm": 0.6034703254699707, "learning_rate": 5.379358682319959e-06, "loss": 1.5959, "step": 10804 }, { "epoch": 2.839294511774559, "grad_norm": 0.5443819165229797, "learning_rate": 5.361836341335203e-06, "loss": 1.5844, "step": 10806 }, { "epoch": 2.839820015108221, "grad_norm": 0.7041229605674744, "learning_rate": 5.344314000350447e-06, "loss": 1.611, "step": 10808 }, { "epoch": 2.8403455184418824, "grad_norm": 0.5578668713569641, "learning_rate": 5.326791659365692e-06, "loss": 1.6177, "step": 10810 }, { "epoch": 2.8408710217755444, "grad_norm": 0.5947692394256592, "learning_rate": 5.309269318380936e-06, "loss": 1.6223, "step": 10812 }, { "epoch": 2.8413965251092064, "grad_norm": 0.6893056631088257, "learning_rate": 5.29174697739618e-06, "loss": 1.6284, "step": 10814 }, { "epoch": 2.841922028442868, "grad_norm": 0.6605462431907654, "learning_rate": 5.2742246364114245e-06, "loss": 1.6525, "step": 10816 }, { "epoch": 2.84244753177653, "grad_norm": 0.5912427306175232, "learning_rate": 5.2567022954266695e-06, "loss": 1.5903, "step": 10818 }, { "epoch": 2.8429730351101914, "grad_norm": 
0.6220155954360962, "learning_rate": 5.239179954441914e-06, "loss": 1.6177, "step": 10820 }, { "epoch": 2.8434985384438534, "grad_norm": 0.5568714141845703, "learning_rate": 5.221657613457158e-06, "loss": 1.5975, "step": 10822 }, { "epoch": 2.844024041777515, "grad_norm": 0.6185790300369263, "learning_rate": 5.204135272472403e-06, "loss": 1.6108, "step": 10824 }, { "epoch": 2.844549545111177, "grad_norm": 0.5641743540763855, "learning_rate": 5.186612931487647e-06, "loss": 1.5923, "step": 10826 }, { "epoch": 2.845075048444839, "grad_norm": 0.5651496052742004, "learning_rate": 5.169090590502892e-06, "loss": 1.6242, "step": 10828 }, { "epoch": 2.8456005517785004, "grad_norm": 0.5548931360244751, "learning_rate": 5.151568249518135e-06, "loss": 1.5939, "step": 10830 }, { "epoch": 2.846126055112162, "grad_norm": 0.6729891896247864, "learning_rate": 5.1340459085333804e-06, "loss": 1.6279, "step": 10832 }, { "epoch": 2.846651558445824, "grad_norm": 0.603522002696991, "learning_rate": 5.116523567548625e-06, "loss": 1.5939, "step": 10834 }, { "epoch": 2.847177061779486, "grad_norm": 0.5417622923851013, "learning_rate": 5.099001226563869e-06, "loss": 1.5809, "step": 10836 }, { "epoch": 2.8477025651131473, "grad_norm": 0.6103736162185669, "learning_rate": 5.081478885579114e-06, "loss": 1.5864, "step": 10838 }, { "epoch": 2.8482280684468093, "grad_norm": 0.6213535666465759, "learning_rate": 5.063956544594358e-06, "loss": 1.6082, "step": 10840 }, { "epoch": 2.848753571780471, "grad_norm": 0.5709004998207092, "learning_rate": 5.046434203609603e-06, "loss": 1.5648, "step": 10842 }, { "epoch": 2.849279075114133, "grad_norm": 0.6318028569221497, "learning_rate": 5.028911862624846e-06, "loss": 1.6319, "step": 10844 }, { "epoch": 2.8498045784477943, "grad_norm": 0.5908393263816833, "learning_rate": 5.011389521640091e-06, "loss": 1.63, "step": 10846 }, { "epoch": 2.8503300817814563, "grad_norm": 0.5498826503753662, "learning_rate": 4.9938671806553355e-06, "loss": 1.6219, "step": 10848 
}, { "epoch": 2.8508555851151183, "grad_norm": 0.5806742906570435, "learning_rate": 4.9763448396705806e-06, "loss": 1.574, "step": 10850 }, { "epoch": 2.85138108844878, "grad_norm": 0.5563404560089111, "learning_rate": 4.958822498685825e-06, "loss": 1.6111, "step": 10852 }, { "epoch": 2.8519065917824418, "grad_norm": 0.5694655179977417, "learning_rate": 4.941300157701069e-06, "loss": 1.604, "step": 10854 }, { "epoch": 2.8524320951161033, "grad_norm": 0.5293794274330139, "learning_rate": 4.923777816716314e-06, "loss": 1.624, "step": 10856 }, { "epoch": 2.8529575984497653, "grad_norm": 0.5831487774848938, "learning_rate": 4.906255475731558e-06, "loss": 1.6151, "step": 10858 }, { "epoch": 2.853483101783427, "grad_norm": 0.6377184987068176, "learning_rate": 4.888733134746802e-06, "loss": 1.5698, "step": 10860 }, { "epoch": 2.8540086051170888, "grad_norm": 0.6772559881210327, "learning_rate": 4.8712107937620465e-06, "loss": 1.5918, "step": 10862 }, { "epoch": 2.8545341084507507, "grad_norm": 0.6782848238945007, "learning_rate": 4.8536884527772915e-06, "loss": 1.6303, "step": 10864 }, { "epoch": 2.8550596117844123, "grad_norm": 0.6130311489105225, "learning_rate": 4.836166111792536e-06, "loss": 1.6051, "step": 10866 }, { "epoch": 2.855585115118074, "grad_norm": 0.6167743802070618, "learning_rate": 4.81864377080778e-06, "loss": 1.6214, "step": 10868 }, { "epoch": 2.8561106184517357, "grad_norm": 0.5957754850387573, "learning_rate": 4.801121429823025e-06, "loss": 1.6118, "step": 10870 }, { "epoch": 2.8566361217853977, "grad_norm": 0.5984094738960266, "learning_rate": 4.783599088838269e-06, "loss": 1.5894, "step": 10872 }, { "epoch": 2.8571616251190592, "grad_norm": 0.6340751051902771, "learning_rate": 4.766076747853514e-06, "loss": 1.6018, "step": 10874 }, { "epoch": 2.857687128452721, "grad_norm": 0.6450504660606384, "learning_rate": 4.748554406868757e-06, "loss": 1.6009, "step": 10876 }, { "epoch": 2.8582126317863827, "grad_norm": 0.5570113062858582, "learning_rate": 
4.731032065884002e-06, "loss": 1.5877, "step": 10878 }, { "epoch": 2.8587381351200447, "grad_norm": 0.6482218503952026, "learning_rate": 4.7135097248992466e-06, "loss": 1.6192, "step": 10880 }, { "epoch": 2.8592636384537062, "grad_norm": 0.5896407961845398, "learning_rate": 4.695987383914492e-06, "loss": 1.5885, "step": 10882 }, { "epoch": 2.859789141787368, "grad_norm": 0.6039836406707764, "learning_rate": 4.678465042929736e-06, "loss": 1.6141, "step": 10884 }, { "epoch": 2.86031464512103, "grad_norm": 0.6409982442855835, "learning_rate": 4.66094270194498e-06, "loss": 1.6277, "step": 10886 }, { "epoch": 2.8608401484546917, "grad_norm": 0.5397644639015198, "learning_rate": 4.643420360960225e-06, "loss": 1.6035, "step": 10888 }, { "epoch": 2.8613656517883537, "grad_norm": 0.587805986404419, "learning_rate": 4.625898019975469e-06, "loss": 1.578, "step": 10890 }, { "epoch": 2.861891155122015, "grad_norm": 0.6628287434577942, "learning_rate": 4.608375678990713e-06, "loss": 1.6003, "step": 10892 }, { "epoch": 2.862416658455677, "grad_norm": 0.5740228891372681, "learning_rate": 4.5908533380059575e-06, "loss": 1.6358, "step": 10894 }, { "epoch": 2.8629421617893387, "grad_norm": 0.6140313148498535, "learning_rate": 4.5733309970212025e-06, "loss": 1.5789, "step": 10896 }, { "epoch": 2.8634676651230007, "grad_norm": 0.6435587406158447, "learning_rate": 4.555808656036447e-06, "loss": 1.6386, "step": 10898 }, { "epoch": 2.8639931684566626, "grad_norm": 0.6358049511909485, "learning_rate": 4.538286315051691e-06, "loss": 1.6128, "step": 10900 }, { "epoch": 2.864518671790324, "grad_norm": 0.5897725224494934, "learning_rate": 4.520763974066936e-06, "loss": 1.5958, "step": 10902 }, { "epoch": 2.8650441751239857, "grad_norm": 0.6232768893241882, "learning_rate": 4.50324163308218e-06, "loss": 1.6026, "step": 10904 }, { "epoch": 2.8655696784576477, "grad_norm": 0.5649431347846985, "learning_rate": 4.485719292097425e-06, "loss": 1.6001, "step": 10906 }, { "epoch": 2.8660951817913096, 
"grad_norm": 0.5962793827056885, "learning_rate": 4.468196951112668e-06, "loss": 1.5408, "step": 10908 }, { "epoch": 2.866620685124971, "grad_norm": 0.5510377883911133, "learning_rate": 4.4506746101279134e-06, "loss": 1.6295, "step": 10910 }, { "epoch": 2.867146188458633, "grad_norm": 0.5945908427238464, "learning_rate": 4.433152269143158e-06, "loss": 1.6192, "step": 10912 }, { "epoch": 2.8676716917922946, "grad_norm": 0.5374131798744202, "learning_rate": 4.415629928158403e-06, "loss": 1.6022, "step": 10914 }, { "epoch": 2.8681971951259566, "grad_norm": 0.5591030716896057, "learning_rate": 4.398107587173647e-06, "loss": 1.6104, "step": 10916 }, { "epoch": 2.868722698459618, "grad_norm": 0.6745641231536865, "learning_rate": 4.380585246188891e-06, "loss": 1.5931, "step": 10918 }, { "epoch": 2.86924820179328, "grad_norm": 0.7577717900276184, "learning_rate": 4.363062905204136e-06, "loss": 1.6324, "step": 10920 }, { "epoch": 2.869773705126942, "grad_norm": 0.7548211812973022, "learning_rate": 4.345540564219379e-06, "loss": 1.62, "step": 10922 }, { "epoch": 2.8702992084606036, "grad_norm": 0.6899915337562561, "learning_rate": 4.328018223234624e-06, "loss": 1.5757, "step": 10924 }, { "epoch": 2.8708247117942656, "grad_norm": 0.580837070941925, "learning_rate": 4.3104958822498685e-06, "loss": 1.5889, "step": 10926 }, { "epoch": 2.871350215127927, "grad_norm": 0.6023684740066528, "learning_rate": 4.2929735412651136e-06, "loss": 1.5814, "step": 10928 }, { "epoch": 2.871875718461589, "grad_norm": 0.63701331615448, "learning_rate": 4.275451200280358e-06, "loss": 1.6205, "step": 10930 }, { "epoch": 2.8724012217952506, "grad_norm": 0.5388381481170654, "learning_rate": 4.257928859295602e-06, "loss": 1.6093, "step": 10932 }, { "epoch": 2.8729267251289126, "grad_norm": 0.5374415516853333, "learning_rate": 4.240406518310847e-06, "loss": 1.6366, "step": 10934 }, { "epoch": 2.8734522284625745, "grad_norm": 0.7570417523384094, "learning_rate": 4.222884177326091e-06, "loss": 1.6263, 
"step": 10936 }, { "epoch": 2.873977731796236, "grad_norm": 0.6314011812210083, "learning_rate": 4.205361836341335e-06, "loss": 1.5977, "step": 10938 }, { "epoch": 2.8745032351298976, "grad_norm": 0.649953305721283, "learning_rate": 4.1878394953565794e-06, "loss": 1.6034, "step": 10940 }, { "epoch": 2.8750287384635596, "grad_norm": 0.6486716866493225, "learning_rate": 4.1703171543718245e-06, "loss": 1.5938, "step": 10942 }, { "epoch": 2.8755542417972215, "grad_norm": 0.6346645951271057, "learning_rate": 4.152794813387069e-06, "loss": 1.5878, "step": 10944 }, { "epoch": 2.876079745130883, "grad_norm": 0.624394953250885, "learning_rate": 4.135272472402313e-06, "loss": 1.6165, "step": 10946 }, { "epoch": 2.876605248464545, "grad_norm": 0.5654635429382324, "learning_rate": 4.117750131417558e-06, "loss": 1.5565, "step": 10948 }, { "epoch": 2.877130751798207, "grad_norm": 0.6420629024505615, "learning_rate": 4.100227790432802e-06, "loss": 1.6144, "step": 10950 }, { "epoch": 2.8776562551318685, "grad_norm": 0.6588164567947388, "learning_rate": 4.082705449448047e-06, "loss": 1.6324, "step": 10952 }, { "epoch": 2.87818175846553, "grad_norm": 0.5764179229736328, "learning_rate": 4.06518310846329e-06, "loss": 1.608, "step": 10954 }, { "epoch": 2.878707261799192, "grad_norm": 0.5966598391532898, "learning_rate": 4.047660767478535e-06, "loss": 1.5683, "step": 10956 }, { "epoch": 2.879232765132854, "grad_norm": 0.6094986796379089, "learning_rate": 4.0301384264937796e-06, "loss": 1.6101, "step": 10958 }, { "epoch": 2.8797582684665155, "grad_norm": 0.6777652502059937, "learning_rate": 4.012616085509025e-06, "loss": 1.6454, "step": 10960 }, { "epoch": 2.8802837718001775, "grad_norm": 0.54978346824646, "learning_rate": 3.995093744524269e-06, "loss": 1.5811, "step": 10962 }, { "epoch": 2.880809275133839, "grad_norm": 0.7338440418243408, "learning_rate": 3.977571403539513e-06, "loss": 1.6004, "step": 10964 }, { "epoch": 2.881334778467501, "grad_norm": 0.5676441192626953, 
"learning_rate": 3.960049062554758e-06, "loss": 1.6194, "step": 10966 }, { "epoch": 2.8818602818011625, "grad_norm": 0.6784756183624268, "learning_rate": 3.942526721570002e-06, "loss": 1.5941, "step": 10968 }, { "epoch": 2.8823857851348245, "grad_norm": 0.6024194955825806, "learning_rate": 3.925004380585246e-06, "loss": 1.6035, "step": 10970 }, { "epoch": 2.8829112884684864, "grad_norm": 0.5432921051979065, "learning_rate": 3.9074820396004905e-06, "loss": 1.6307, "step": 10972 }, { "epoch": 2.883436791802148, "grad_norm": 0.5449672937393188, "learning_rate": 3.8899596986157355e-06, "loss": 1.6078, "step": 10974 }, { "epoch": 2.88396229513581, "grad_norm": 0.582300066947937, "learning_rate": 3.87243735763098e-06, "loss": 1.5869, "step": 10976 }, { "epoch": 2.8844877984694715, "grad_norm": 0.5711895823478699, "learning_rate": 3.854915016646224e-06, "loss": 1.578, "step": 10978 }, { "epoch": 2.8850133018031334, "grad_norm": 0.6558634638786316, "learning_rate": 3.837392675661469e-06, "loss": 1.6074, "step": 10980 }, { "epoch": 2.885538805136795, "grad_norm": 0.6054571866989136, "learning_rate": 3.819870334676713e-06, "loss": 1.6015, "step": 10982 }, { "epoch": 2.886064308470457, "grad_norm": 0.5859559178352356, "learning_rate": 3.8023479936919577e-06, "loss": 1.575, "step": 10984 }, { "epoch": 2.886589811804119, "grad_norm": 0.7558565139770508, "learning_rate": 3.784825652707202e-06, "loss": 1.6048, "step": 10986 }, { "epoch": 2.8871153151377804, "grad_norm": 0.5738922357559204, "learning_rate": 3.7673033117224464e-06, "loss": 1.5895, "step": 10988 }, { "epoch": 2.887640818471442, "grad_norm": 0.6217196583747864, "learning_rate": 3.749780970737691e-06, "loss": 1.6017, "step": 10990 }, { "epoch": 2.888166321805104, "grad_norm": 0.590238094329834, "learning_rate": 3.7322586297529356e-06, "loss": 1.5937, "step": 10992 }, { "epoch": 2.888691825138766, "grad_norm": 0.5736632943153381, "learning_rate": 3.7147362887681794e-06, "loss": 1.6011, "step": 10994 }, { "epoch": 
2.8892173284724274, "grad_norm": 0.6404598951339722, "learning_rate": 3.697213947783424e-06, "loss": 1.6158, "step": 10996 }, { "epoch": 2.8897428318060894, "grad_norm": 0.5618501305580139, "learning_rate": 3.6796916067986686e-06, "loss": 1.5836, "step": 10998 }, { "epoch": 2.890268335139751, "grad_norm": 0.5837007761001587, "learning_rate": 3.662169265813913e-06, "loss": 1.6126, "step": 11000 }, { "epoch": 2.890793838473413, "grad_norm": 0.6110183596611023, "learning_rate": 3.6446469248291574e-06, "loss": 1.5715, "step": 11002 }, { "epoch": 2.8913193418070744, "grad_norm": 0.5909767746925354, "learning_rate": 3.627124583844402e-06, "loss": 1.5992, "step": 11004 }, { "epoch": 2.8918448451407364, "grad_norm": 0.5684041976928711, "learning_rate": 3.6096022428596465e-06, "loss": 1.6002, "step": 11006 }, { "epoch": 2.8923703484743983, "grad_norm": 0.5662478804588318, "learning_rate": 3.5920799018748903e-06, "loss": 1.6093, "step": 11008 }, { "epoch": 2.89289585180806, "grad_norm": 0.5797233581542969, "learning_rate": 3.574557560890135e-06, "loss": 1.5847, "step": 11010 }, { "epoch": 2.893421355141722, "grad_norm": 0.60663902759552, "learning_rate": 3.5570352199053795e-06, "loss": 1.5927, "step": 11012 }, { "epoch": 2.8939468584753834, "grad_norm": 0.5697915554046631, "learning_rate": 3.539512878920624e-06, "loss": 1.5718, "step": 11014 }, { "epoch": 2.8944723618090453, "grad_norm": 0.6556651592254639, "learning_rate": 3.5219905379358683e-06, "loss": 1.6116, "step": 11016 }, { "epoch": 2.894997865142707, "grad_norm": 0.6983522176742554, "learning_rate": 3.504468196951113e-06, "loss": 1.6157, "step": 11018 }, { "epoch": 2.895523368476369, "grad_norm": 0.5559704303741455, "learning_rate": 3.4869458559663575e-06, "loss": 1.5646, "step": 11020 }, { "epoch": 2.896048871810031, "grad_norm": 0.5538905262947083, "learning_rate": 3.469423514981602e-06, "loss": 1.6231, "step": 11022 }, { "epoch": 2.8965743751436923, "grad_norm": 0.5700134038925171, "learning_rate": 
3.451901173996846e-06, "loss": 1.5936, "step": 11024 }, { "epoch": 2.897099878477354, "grad_norm": 0.5542116761207581, "learning_rate": 3.4343788330120904e-06, "loss": 1.6293, "step": 11026 }, { "epoch": 2.897625381811016, "grad_norm": 0.5474702715873718, "learning_rate": 3.416856492027335e-06, "loss": 1.6112, "step": 11028 }, { "epoch": 2.898150885144678, "grad_norm": 0.6167372465133667, "learning_rate": 3.3993341510425796e-06, "loss": 1.5763, "step": 11030 }, { "epoch": 2.8986763884783393, "grad_norm": 0.6581651568412781, "learning_rate": 3.381811810057824e-06, "loss": 1.6004, "step": 11032 }, { "epoch": 2.8992018918120013, "grad_norm": 0.62739497423172, "learning_rate": 3.3642894690730684e-06, "loss": 1.6136, "step": 11034 }, { "epoch": 2.899727395145663, "grad_norm": 0.5928983688354492, "learning_rate": 3.346767128088313e-06, "loss": 1.622, "step": 11036 }, { "epoch": 2.9002528984793248, "grad_norm": 0.6407076716423035, "learning_rate": 3.3292447871035576e-06, "loss": 1.5815, "step": 11038 }, { "epoch": 2.9007784018129863, "grad_norm": 0.6269816756248474, "learning_rate": 3.3117224461188013e-06, "loss": 1.6119, "step": 11040 }, { "epoch": 2.9013039051466483, "grad_norm": 0.5729198455810547, "learning_rate": 3.294200105134046e-06, "loss": 1.6173, "step": 11042 }, { "epoch": 2.9018294084803102, "grad_norm": 0.5712758302688599, "learning_rate": 3.2766777641492905e-06, "loss": 1.5912, "step": 11044 }, { "epoch": 2.9023549118139718, "grad_norm": 0.6735134720802307, "learning_rate": 3.259155423164535e-06, "loss": 1.603, "step": 11046 }, { "epoch": 2.9028804151476337, "grad_norm": 0.6073388457298279, "learning_rate": 3.2416330821797793e-06, "loss": 1.6013, "step": 11048 }, { "epoch": 2.9034059184812953, "grad_norm": 0.611774742603302, "learning_rate": 3.224110741195024e-06, "loss": 1.6021, "step": 11050 }, { "epoch": 2.9039314218149572, "grad_norm": 0.6439489722251892, "learning_rate": 3.2065884002102685e-06, "loss": 1.6037, "step": 11052 }, { "epoch": 
2.9044569251486188, "grad_norm": 0.572325587272644, "learning_rate": 3.189066059225513e-06, "loss": 1.5629, "step": 11054 }, { "epoch": 2.9049824284822807, "grad_norm": 0.5802271962165833, "learning_rate": 3.171543718240757e-06, "loss": 1.5982, "step": 11056 }, { "epoch": 2.9055079318159427, "grad_norm": 0.6686844825744629, "learning_rate": 3.1540213772560015e-06, "loss": 1.626, "step": 11058 }, { "epoch": 2.906033435149604, "grad_norm": 0.5929732918739319, "learning_rate": 3.136499036271246e-06, "loss": 1.5946, "step": 11060 }, { "epoch": 2.9065589384832657, "grad_norm": 0.5740990042686462, "learning_rate": 3.1189766952864902e-06, "loss": 1.6365, "step": 11062 }, { "epoch": 2.9070844418169277, "grad_norm": 0.6125746369361877, "learning_rate": 3.101454354301735e-06, "loss": 1.5778, "step": 11064 }, { "epoch": 2.9076099451505897, "grad_norm": 0.5508874654769897, "learning_rate": 3.0839320133169794e-06, "loss": 1.6019, "step": 11066 }, { "epoch": 2.908135448484251, "grad_norm": 0.5743160247802734, "learning_rate": 3.066409672332224e-06, "loss": 1.6103, "step": 11068 }, { "epoch": 2.908660951817913, "grad_norm": 0.6552396416664124, "learning_rate": 3.048887331347468e-06, "loss": 1.6245, "step": 11070 }, { "epoch": 2.9091864551515747, "grad_norm": 0.5878915190696716, "learning_rate": 3.031364990362713e-06, "loss": 1.6111, "step": 11072 }, { "epoch": 2.9097119584852367, "grad_norm": 0.5548945665359497, "learning_rate": 3.013842649377957e-06, "loss": 1.6289, "step": 11074 }, { "epoch": 2.910237461818898, "grad_norm": 0.5816505551338196, "learning_rate": 2.9963203083932016e-06, "loss": 1.6, "step": 11076 }, { "epoch": 2.91076296515256, "grad_norm": 0.5567723512649536, "learning_rate": 2.9787979674084457e-06, "loss": 1.6144, "step": 11078 }, { "epoch": 2.911288468486222, "grad_norm": 0.542295515537262, "learning_rate": 2.9612756264236903e-06, "loss": 1.6051, "step": 11080 }, { "epoch": 2.9118139718198837, "grad_norm": 0.6353909969329834, "learning_rate": 
2.943753285438935e-06, "loss": 1.6135, "step": 11082 }, { "epoch": 2.9123394751535456, "grad_norm": 0.5646031498908997, "learning_rate": 2.9262309444541795e-06, "loss": 1.5934, "step": 11084 }, { "epoch": 2.912864978487207, "grad_norm": 0.5628175139427185, "learning_rate": 2.9087086034694237e-06, "loss": 1.5589, "step": 11086 }, { "epoch": 2.913390481820869, "grad_norm": 0.6169978380203247, "learning_rate": 2.891186262484668e-06, "loss": 1.5969, "step": 11088 }, { "epoch": 2.9139159851545307, "grad_norm": 0.5552890300750732, "learning_rate": 2.8736639214999125e-06, "loss": 1.6083, "step": 11090 }, { "epoch": 2.9144414884881926, "grad_norm": 0.5658928751945496, "learning_rate": 2.8561415805151567e-06, "loss": 1.5807, "step": 11092 }, { "epoch": 2.9149669918218546, "grad_norm": 0.5916785001754761, "learning_rate": 2.8386192395304013e-06, "loss": 1.6128, "step": 11094 }, { "epoch": 2.915492495155516, "grad_norm": 0.5797626376152039, "learning_rate": 2.821096898545646e-06, "loss": 1.6042, "step": 11096 }, { "epoch": 2.9160179984891776, "grad_norm": 0.661906361579895, "learning_rate": 2.8035745575608905e-06, "loss": 1.6357, "step": 11098 }, { "epoch": 2.9165435018228396, "grad_norm": 0.5823834538459778, "learning_rate": 2.7860522165761346e-06, "loss": 1.5987, "step": 11100 }, { "epoch": 2.9170690051565016, "grad_norm": 0.5549127459526062, "learning_rate": 2.7685298755913792e-06, "loss": 1.6271, "step": 11102 }, { "epoch": 2.917594508490163, "grad_norm": 0.633703351020813, "learning_rate": 2.7510075346066234e-06, "loss": 1.6639, "step": 11104 }, { "epoch": 2.918120011823825, "grad_norm": 0.6295391321182251, "learning_rate": 2.733485193621868e-06, "loss": 1.5803, "step": 11106 }, { "epoch": 2.918645515157487, "grad_norm": 0.7048977017402649, "learning_rate": 2.715962852637112e-06, "loss": 1.6112, "step": 11108 }, { "epoch": 2.9191710184911486, "grad_norm": 0.7996454238891602, "learning_rate": 2.6984405116523568e-06, "loss": 1.5931, "step": 11110 }, { "epoch": 
2.91969652182481, "grad_norm": 0.6191883683204651, "learning_rate": 2.6809181706676014e-06, "loss": 1.6265, "step": 11112 }, { "epoch": 2.920222025158472, "grad_norm": 0.6032472252845764, "learning_rate": 2.663395829682846e-06, "loss": 1.6078, "step": 11114 }, { "epoch": 2.920747528492134, "grad_norm": 0.5909383296966553, "learning_rate": 2.64587348869809e-06, "loss": 1.6047, "step": 11116 }, { "epoch": 2.9212730318257956, "grad_norm": 0.741261899471283, "learning_rate": 2.6283511477133348e-06, "loss": 1.5909, "step": 11118 }, { "epoch": 2.9217985351594575, "grad_norm": 0.5791400074958801, "learning_rate": 2.610828806728579e-06, "loss": 1.6037, "step": 11120 }, { "epoch": 2.922324038493119, "grad_norm": 0.5476728677749634, "learning_rate": 2.5933064657438235e-06, "loss": 1.5841, "step": 11122 }, { "epoch": 2.922849541826781, "grad_norm": 0.6799639463424683, "learning_rate": 2.5757841247590677e-06, "loss": 1.6055, "step": 11124 }, { "epoch": 2.9233750451604426, "grad_norm": 0.6471995115280151, "learning_rate": 2.5582617837743123e-06, "loss": 1.614, "step": 11126 }, { "epoch": 2.9239005484941045, "grad_norm": 0.5741569399833679, "learning_rate": 2.540739442789557e-06, "loss": 1.5667, "step": 11128 }, { "epoch": 2.9244260518277665, "grad_norm": 0.5387163162231445, "learning_rate": 2.5232171018048015e-06, "loss": 1.617, "step": 11130 }, { "epoch": 2.924951555161428, "grad_norm": 0.5384705066680908, "learning_rate": 2.5056947608200457e-06, "loss": 1.5939, "step": 11132 }, { "epoch": 2.92547705849509, "grad_norm": 0.5429417490959167, "learning_rate": 2.4881724198352903e-06, "loss": 1.5842, "step": 11134 }, { "epoch": 2.9260025618287515, "grad_norm": 0.6191485524177551, "learning_rate": 2.4706500788505345e-06, "loss": 1.6338, "step": 11136 }, { "epoch": 2.9265280651624135, "grad_norm": 0.7055456042289734, "learning_rate": 2.453127737865779e-06, "loss": 1.6349, "step": 11138 }, { "epoch": 2.927053568496075, "grad_norm": 0.5733761787414551, "learning_rate": 
2.4356053968810232e-06, "loss": 1.5677, "step": 11140 }, { "epoch": 2.927579071829737, "grad_norm": 0.6309702396392822, "learning_rate": 2.418083055896268e-06, "loss": 1.577, "step": 11142 }, { "epoch": 2.928104575163399, "grad_norm": 0.56844162940979, "learning_rate": 2.4005607149115124e-06, "loss": 1.6017, "step": 11144 }, { "epoch": 2.9286300784970605, "grad_norm": 0.5193071365356445, "learning_rate": 2.383038373926757e-06, "loss": 1.5855, "step": 11146 }, { "epoch": 2.929155581830722, "grad_norm": 0.6680015325546265, "learning_rate": 2.365516032942001e-06, "loss": 1.6215, "step": 11148 }, { "epoch": 2.929681085164384, "grad_norm": 0.6055448055267334, "learning_rate": 2.347993691957246e-06, "loss": 1.5748, "step": 11150 }, { "epoch": 2.930206588498046, "grad_norm": 0.763596773147583, "learning_rate": 2.33047135097249e-06, "loss": 1.567, "step": 11152 }, { "epoch": 2.9307320918317075, "grad_norm": 0.5230932235717773, "learning_rate": 2.3129490099877346e-06, "loss": 1.6069, "step": 11154 }, { "epoch": 2.9312575951653694, "grad_norm": 0.5715333819389343, "learning_rate": 2.2954266690029787e-06, "loss": 1.5798, "step": 11156 }, { "epoch": 2.931783098499031, "grad_norm": 0.5592380166053772, "learning_rate": 2.2779043280182233e-06, "loss": 1.616, "step": 11158 }, { "epoch": 2.932308601832693, "grad_norm": 0.5946754813194275, "learning_rate": 2.260381987033468e-06, "loss": 1.5689, "step": 11160 }, { "epoch": 2.9328341051663545, "grad_norm": 0.6536034345626831, "learning_rate": 2.2428596460487125e-06, "loss": 1.5949, "step": 11162 }, { "epoch": 2.9333596085000164, "grad_norm": 0.5874665975570679, "learning_rate": 2.2253373050639567e-06, "loss": 1.6125, "step": 11164 }, { "epoch": 2.9338851118336784, "grad_norm": 0.5944022536277771, "learning_rate": 2.2078149640792013e-06, "loss": 1.5724, "step": 11166 }, { "epoch": 2.93441061516734, "grad_norm": 0.5689641237258911, "learning_rate": 2.1902926230944455e-06, "loss": 1.587, "step": 11168 }, { "epoch": 2.934936118501002, 
"grad_norm": 0.6851060390472412, "learning_rate": 2.1727702821096897e-06, "loss": 1.5942, "step": 11170 }, { "epoch": 2.9354616218346634, "grad_norm": 0.6660142540931702, "learning_rate": 2.1552479411249343e-06, "loss": 1.5892, "step": 11172 }, { "epoch": 2.9359871251683254, "grad_norm": 0.6187747716903687, "learning_rate": 2.137725600140179e-06, "loss": 1.5622, "step": 11174 }, { "epoch": 2.936512628501987, "grad_norm": 0.5741645097732544, "learning_rate": 2.1202032591554235e-06, "loss": 1.5856, "step": 11176 }, { "epoch": 2.937038131835649, "grad_norm": 0.5658064484596252, "learning_rate": 2.1026809181706676e-06, "loss": 1.5984, "step": 11178 }, { "epoch": 2.937563635169311, "grad_norm": 0.6636450886726379, "learning_rate": 2.0851585771859122e-06, "loss": 1.6533, "step": 11180 }, { "epoch": 2.9380891385029724, "grad_norm": 0.5776405334472656, "learning_rate": 2.0676362362011564e-06, "loss": 1.567, "step": 11182 }, { "epoch": 2.938614641836634, "grad_norm": 0.6802220940589905, "learning_rate": 2.050113895216401e-06, "loss": 1.6211, "step": 11184 }, { "epoch": 2.939140145170296, "grad_norm": 0.6535943746566772, "learning_rate": 2.032591554231645e-06, "loss": 1.596, "step": 11186 }, { "epoch": 2.939665648503958, "grad_norm": 0.5814671516418457, "learning_rate": 2.0150692132468898e-06, "loss": 1.5689, "step": 11188 }, { "epoch": 2.9401911518376194, "grad_norm": 0.6042841076850891, "learning_rate": 1.9975468722621344e-06, "loss": 1.6142, "step": 11190 }, { "epoch": 2.9407166551712813, "grad_norm": 0.6148901581764221, "learning_rate": 1.980024531277379e-06, "loss": 1.5813, "step": 11192 }, { "epoch": 2.941242158504943, "grad_norm": 0.6761669516563416, "learning_rate": 1.962502190292623e-06, "loss": 1.5915, "step": 11194 }, { "epoch": 2.941767661838605, "grad_norm": 0.6184157133102417, "learning_rate": 1.9449798493078678e-06, "loss": 1.6291, "step": 11196 }, { "epoch": 2.9422931651722664, "grad_norm": 0.567001223564148, "learning_rate": 1.927457508323112e-06, "loss": 
1.5796, "step": 11198 }, { "epoch": 2.9428186685059283, "grad_norm": 0.5846920609474182, "learning_rate": 1.9099351673383565e-06, "loss": 1.6186, "step": 11200 }, { "epoch": 2.9428186685059283, "eval_loss": 1.6465901136398315, "eval_runtime": 487.2703, "eval_samples_per_second": 249.941, "eval_steps_per_second": 31.243, "step": 11200 }, { "epoch": 2.9433441718395903, "grad_norm": 0.5758314728736877, "learning_rate": 1.892412826353601e-06, "loss": 1.609, "step": 11202 }, { "epoch": 2.943869675173252, "grad_norm": 0.6517518758773804, "learning_rate": 1.8748904853688455e-06, "loss": 1.614, "step": 11204 }, { "epoch": 2.944395178506914, "grad_norm": 0.5758547186851501, "learning_rate": 1.8573681443840897e-06, "loss": 1.5874, "step": 11206 }, { "epoch": 2.9449206818405753, "grad_norm": 0.5640631914138794, "learning_rate": 1.8398458033993343e-06, "loss": 1.5723, "step": 11208 }, { "epoch": 2.9454461851742373, "grad_norm": 0.6348847150802612, "learning_rate": 1.8223234624145787e-06, "loss": 1.5789, "step": 11210 }, { "epoch": 2.945971688507899, "grad_norm": 0.6184114217758179, "learning_rate": 1.8048011214298233e-06, "loss": 1.5986, "step": 11212 }, { "epoch": 2.946497191841561, "grad_norm": 0.8769078850746155, "learning_rate": 1.7872787804450674e-06, "loss": 1.6315, "step": 11214 }, { "epoch": 2.9470226951752228, "grad_norm": 0.6675733923912048, "learning_rate": 1.769756439460312e-06, "loss": 1.6477, "step": 11216 }, { "epoch": 2.9475481985088843, "grad_norm": 0.5792946219444275, "learning_rate": 1.7522340984755564e-06, "loss": 1.6266, "step": 11218 }, { "epoch": 2.948073701842546, "grad_norm": 0.6874922513961792, "learning_rate": 1.734711757490801e-06, "loss": 1.6014, "step": 11220 }, { "epoch": 2.9485992051762078, "grad_norm": 0.6224451065063477, "learning_rate": 1.7171894165060452e-06, "loss": 1.5985, "step": 11222 }, { "epoch": 2.9491247085098697, "grad_norm": 0.5994266271591187, "learning_rate": 1.6996670755212898e-06, "loss": 1.6229, "step": 11224 }, { "epoch": 
2.9496502118435313, "grad_norm": 0.6327415704727173, "learning_rate": 1.6821447345365342e-06, "loss": 1.5961, "step": 11226 }, { "epoch": 2.9501757151771932, "grad_norm": 0.5327082276344299, "learning_rate": 1.6646223935517788e-06, "loss": 1.5832, "step": 11228 }, { "epoch": 2.9507012185108548, "grad_norm": 0.6451945900917053, "learning_rate": 1.647100052567023e-06, "loss": 1.6072, "step": 11230 }, { "epoch": 2.9512267218445167, "grad_norm": 0.6457750201225281, "learning_rate": 1.6295777115822676e-06, "loss": 1.59, "step": 11232 }, { "epoch": 2.9517522251781783, "grad_norm": 0.7860383987426758, "learning_rate": 1.612055370597512e-06, "loss": 1.6288, "step": 11234 }, { "epoch": 2.9522777285118402, "grad_norm": 0.5900551080703735, "learning_rate": 1.5945330296127566e-06, "loss": 1.6113, "step": 11236 }, { "epoch": 2.952803231845502, "grad_norm": 0.606599748134613, "learning_rate": 1.5770106886280007e-06, "loss": 1.6201, "step": 11238 }, { "epoch": 2.9533287351791637, "grad_norm": 0.6085255742073059, "learning_rate": 1.5594883476432451e-06, "loss": 1.5788, "step": 11240 }, { "epoch": 2.9538542385128257, "grad_norm": 0.689579427242279, "learning_rate": 1.5419660066584897e-06, "loss": 1.5928, "step": 11242 }, { "epoch": 2.954379741846487, "grad_norm": 0.5633424520492554, "learning_rate": 1.524443665673734e-06, "loss": 1.5989, "step": 11244 }, { "epoch": 2.954905245180149, "grad_norm": 0.6903582811355591, "learning_rate": 1.5069213246889785e-06, "loss": 1.6527, "step": 11246 }, { "epoch": 2.9554307485138107, "grad_norm": 0.6674286723136902, "learning_rate": 1.4893989837042229e-06, "loss": 1.6041, "step": 11248 }, { "epoch": 2.9559562518474727, "grad_norm": 0.5618950724601746, "learning_rate": 1.4718766427194675e-06, "loss": 1.6209, "step": 11250 }, { "epoch": 2.9564817551811347, "grad_norm": 0.60519939661026, "learning_rate": 1.4543543017347119e-06, "loss": 1.57, "step": 11252 }, { "epoch": 2.957007258514796, "grad_norm": 0.6382578611373901, "learning_rate": 
1.4368319607499562e-06, "loss": 1.6074, "step": 11254 }, { "epoch": 2.9575327618484577, "grad_norm": 0.5759736895561218, "learning_rate": 1.4193096197652006e-06, "loss": 1.5892, "step": 11256 }, { "epoch": 2.9580582651821197, "grad_norm": 0.5800794363021851, "learning_rate": 1.4017872787804452e-06, "loss": 1.643, "step": 11258 }, { "epoch": 2.9585837685157816, "grad_norm": 0.5824633240699768, "learning_rate": 1.3842649377956896e-06, "loss": 1.5935, "step": 11260 }, { "epoch": 2.959109271849443, "grad_norm": 0.5774669647216797, "learning_rate": 1.366742596810934e-06, "loss": 1.6421, "step": 11262 }, { "epoch": 2.959634775183105, "grad_norm": 0.5631816387176514, "learning_rate": 1.3492202558261784e-06, "loss": 1.6194, "step": 11264 }, { "epoch": 2.960160278516767, "grad_norm": 0.5980478525161743, "learning_rate": 1.331697914841423e-06, "loss": 1.6124, "step": 11266 }, { "epoch": 2.9606857818504286, "grad_norm": 0.5869792699813843, "learning_rate": 1.3141755738566674e-06, "loss": 1.6033, "step": 11268 }, { "epoch": 2.96121128518409, "grad_norm": 0.7216488122940063, "learning_rate": 1.2966532328719118e-06, "loss": 1.6396, "step": 11270 }, { "epoch": 2.961736788517752, "grad_norm": 0.6487390398979187, "learning_rate": 1.2791308918871562e-06, "loss": 1.611, "step": 11272 }, { "epoch": 2.962262291851414, "grad_norm": 0.6345615983009338, "learning_rate": 1.2616085509024008e-06, "loss": 1.575, "step": 11274 }, { "epoch": 2.9627877951850756, "grad_norm": 0.692871630191803, "learning_rate": 1.2440862099176451e-06, "loss": 1.6005, "step": 11276 }, { "epoch": 2.9633132985187376, "grad_norm": 0.6169196367263794, "learning_rate": 1.2265638689328895e-06, "loss": 1.5758, "step": 11278 }, { "epoch": 2.963838801852399, "grad_norm": 0.5923426151275635, "learning_rate": 1.209041527948134e-06, "loss": 1.5737, "step": 11280 }, { "epoch": 2.964364305186061, "grad_norm": 0.629149854183197, "learning_rate": 1.1915191869633785e-06, "loss": 1.6043, "step": 11282 }, { "epoch": 
2.9648898085197226, "grad_norm": 0.5396657586097717, "learning_rate": 1.173996845978623e-06, "loss": 1.5922, "step": 11284 }, { "epoch": 2.9654153118533846, "grad_norm": 0.6270614862442017, "learning_rate": 1.1564745049938673e-06, "loss": 1.6032, "step": 11286 }, { "epoch": 2.9659408151870466, "grad_norm": 0.6442562937736511, "learning_rate": 1.1389521640091117e-06, "loss": 1.5929, "step": 11288 }, { "epoch": 2.966466318520708, "grad_norm": 0.5687601566314697, "learning_rate": 1.1214298230243563e-06, "loss": 1.5735, "step": 11290 }, { "epoch": 2.96699182185437, "grad_norm": 0.6251326203346252, "learning_rate": 1.1039074820396007e-06, "loss": 1.5841, "step": 11292 }, { "epoch": 2.9675173251880316, "grad_norm": 0.6409140229225159, "learning_rate": 1.0863851410548448e-06, "loss": 1.6202, "step": 11294 }, { "epoch": 2.9680428285216935, "grad_norm": 0.5543487668037415, "learning_rate": 1.0688628000700894e-06, "loss": 1.6237, "step": 11296 }, { "epoch": 2.968568331855355, "grad_norm": 0.6813648343086243, "learning_rate": 1.0513404590853338e-06, "loss": 1.6548, "step": 11298 }, { "epoch": 2.969093835189017, "grad_norm": 0.5688581466674805, "learning_rate": 1.0338181181005782e-06, "loss": 1.597, "step": 11300 }, { "epoch": 2.969619338522679, "grad_norm": 0.6080614328384399, "learning_rate": 1.0162957771158226e-06, "loss": 1.6439, "step": 11302 }, { "epoch": 2.9701448418563405, "grad_norm": 0.5709330439567566, "learning_rate": 9.987734361310672e-07, "loss": 1.6394, "step": 11304 }, { "epoch": 2.970670345190002, "grad_norm": 0.6443371772766113, "learning_rate": 9.812510951463116e-07, "loss": 1.6215, "step": 11306 }, { "epoch": 2.971195848523664, "grad_norm": 0.6284047365188599, "learning_rate": 9.63728754161556e-07, "loss": 1.6182, "step": 11308 }, { "epoch": 2.971721351857326, "grad_norm": 0.5524962544441223, "learning_rate": 9.462064131768005e-07, "loss": 1.5953, "step": 11310 }, { "epoch": 2.9722468551909875, "grad_norm": 0.56756991147995, "learning_rate": 
9.286840721920448e-07, "loss": 1.6048, "step": 11312 }, { "epoch": 2.9727723585246495, "grad_norm": 0.729558527469635, "learning_rate": 9.111617312072893e-07, "loss": 1.6145, "step": 11314 }, { "epoch": 2.973297861858311, "grad_norm": 0.6247385144233704, "learning_rate": 8.936393902225337e-07, "loss": 1.6176, "step": 11316 }, { "epoch": 2.973823365191973, "grad_norm": 0.6063811779022217, "learning_rate": 8.761170492377782e-07, "loss": 1.5839, "step": 11318 }, { "epoch": 2.9743488685256345, "grad_norm": 0.6044145226478577, "learning_rate": 8.585947082530226e-07, "loss": 1.6051, "step": 11320 }, { "epoch": 2.9748743718592965, "grad_norm": 0.5743279457092285, "learning_rate": 8.410723672682671e-07, "loss": 1.6128, "step": 11322 }, { "epoch": 2.9753998751929585, "grad_norm": 0.5868866443634033, "learning_rate": 8.235500262835115e-07, "loss": 1.6428, "step": 11324 }, { "epoch": 2.97592537852662, "grad_norm": 0.5260694026947021, "learning_rate": 8.06027685298756e-07, "loss": 1.584, "step": 11326 }, { "epoch": 2.976450881860282, "grad_norm": 0.6764076948165894, "learning_rate": 7.885053443140004e-07, "loss": 1.6163, "step": 11328 }, { "epoch": 2.9769763851939435, "grad_norm": 0.5896710157394409, "learning_rate": 7.709830033292449e-07, "loss": 1.5717, "step": 11330 }, { "epoch": 2.9775018885276054, "grad_norm": 0.6759716272354126, "learning_rate": 7.534606623444892e-07, "loss": 1.5856, "step": 11332 }, { "epoch": 2.978027391861267, "grad_norm": 0.6693316102027893, "learning_rate": 7.359383213597337e-07, "loss": 1.6189, "step": 11334 }, { "epoch": 2.978552895194929, "grad_norm": 0.5483260154724121, "learning_rate": 7.184159803749781e-07, "loss": 1.6106, "step": 11336 }, { "epoch": 2.979078398528591, "grad_norm": 0.6094440817832947, "learning_rate": 7.008936393902226e-07, "loss": 1.6105, "step": 11338 }, { "epoch": 2.9796039018622524, "grad_norm": 0.6723849773406982, "learning_rate": 6.83371298405467e-07, "loss": 1.5995, "step": 11340 }, { "epoch": 2.980129405195914, 
"grad_norm": 0.6282168030738831, "learning_rate": 6.658489574207115e-07, "loss": 1.5797, "step": 11342 }, { "epoch": 2.980654908529576, "grad_norm": 0.5661284923553467, "learning_rate": 6.483266164359559e-07, "loss": 1.5908, "step": 11344 }, { "epoch": 2.981180411863238, "grad_norm": 0.598994255065918, "learning_rate": 6.308042754512004e-07, "loss": 1.592, "step": 11346 }, { "epoch": 2.9817059151968994, "grad_norm": 0.6033176183700562, "learning_rate": 6.132819344664448e-07, "loss": 1.5935, "step": 11348 }, { "epoch": 2.9822314185305614, "grad_norm": 0.6345306038856506, "learning_rate": 5.957595934816893e-07, "loss": 1.5789, "step": 11350 }, { "epoch": 2.982756921864223, "grad_norm": 0.606160044670105, "learning_rate": 5.782372524969336e-07, "loss": 1.5949, "step": 11352 }, { "epoch": 2.983282425197885, "grad_norm": 0.7126883268356323, "learning_rate": 5.607149115121781e-07, "loss": 1.5786, "step": 11354 }, { "epoch": 2.9838079285315464, "grad_norm": 0.6453995704650879, "learning_rate": 5.431925705274224e-07, "loss": 1.6359, "step": 11356 }, { "epoch": 2.9843334318652084, "grad_norm": 0.5591650605201721, "learning_rate": 5.256702295426669e-07, "loss": 1.596, "step": 11358 }, { "epoch": 2.9848589351988704, "grad_norm": 0.5831902027130127, "learning_rate": 5.081478885579113e-07, "loss": 1.5785, "step": 11360 }, { "epoch": 2.985384438532532, "grad_norm": 0.5874847173690796, "learning_rate": 4.906255475731558e-07, "loss": 1.6092, "step": 11362 }, { "epoch": 2.985909941866194, "grad_norm": 0.6131449937820435, "learning_rate": 4.7310320658840023e-07, "loss": 1.6002, "step": 11364 }, { "epoch": 2.9864354451998554, "grad_norm": 0.5965538620948792, "learning_rate": 4.5558086560364467e-07, "loss": 1.5937, "step": 11366 }, { "epoch": 2.9869609485335173, "grad_norm": 0.6877683401107788, "learning_rate": 4.380585246188891e-07, "loss": 1.5981, "step": 11368 }, { "epoch": 2.987486451867179, "grad_norm": 0.6720339059829712, "learning_rate": 4.2053618363413355e-07, "loss": 1.6071, 
"step": 11370 }, { "epoch": 2.988011955200841, "grad_norm": 0.7178999185562134, "learning_rate": 4.03013842649378e-07, "loss": 1.6089, "step": 11372 }, { "epoch": 2.988537458534503, "grad_norm": 0.5988190770149231, "learning_rate": 3.8549150166462243e-07, "loss": 1.6001, "step": 11374 }, { "epoch": 2.9890629618681643, "grad_norm": 0.6235753297805786, "learning_rate": 3.6796916067986687e-07, "loss": 1.6124, "step": 11376 }, { "epoch": 2.989588465201826, "grad_norm": 0.6031942367553711, "learning_rate": 3.504468196951113e-07, "loss": 1.6445, "step": 11378 }, { "epoch": 2.990113968535488, "grad_norm": 0.6021082401275635, "learning_rate": 3.3292447871035575e-07, "loss": 1.6139, "step": 11380 }, { "epoch": 2.99063947186915, "grad_norm": 0.5596811175346375, "learning_rate": 3.154021377256002e-07, "loss": 1.5923, "step": 11382 }, { "epoch": 2.9911649752028113, "grad_norm": 0.5226958990097046, "learning_rate": 2.9787979674084463e-07, "loss": 1.6145, "step": 11384 }, { "epoch": 2.9916904785364733, "grad_norm": 0.6241964101791382, "learning_rate": 2.8035745575608907e-07, "loss": 1.5997, "step": 11386 }, { "epoch": 2.992215981870135, "grad_norm": 0.5991497039794922, "learning_rate": 2.6283511477133345e-07, "loss": 1.6279, "step": 11388 }, { "epoch": 2.992741485203797, "grad_norm": 0.5512558817863464, "learning_rate": 2.453127737865779e-07, "loss": 1.5999, "step": 11390 }, { "epoch": 2.9932669885374583, "grad_norm": 0.6609853506088257, "learning_rate": 2.2779043280182233e-07, "loss": 1.6145, "step": 11392 }, { "epoch": 2.9937924918711203, "grad_norm": 0.6005377769470215, "learning_rate": 2.1026809181706677e-07, "loss": 1.5893, "step": 11394 }, { "epoch": 2.9943179952047823, "grad_norm": 0.5884697437286377, "learning_rate": 1.9274575083231121e-07, "loss": 1.5813, "step": 11396 }, { "epoch": 2.994843498538444, "grad_norm": 0.7103734612464905, "learning_rate": 1.7522340984755565e-07, "loss": 1.592, "step": 11398 }, { "epoch": 2.9953690018721058, "grad_norm": 0.5749437808990479, 
"learning_rate": 1.577010688628001e-07, "loss": 1.6045, "step": 11400 }, { "epoch": 2.9958945052057673, "grad_norm": 0.6688903570175171, "learning_rate": 1.4017872787804453e-07, "loss": 1.6188, "step": 11402 }, { "epoch": 2.9964200085394292, "grad_norm": 0.6899111866950989, "learning_rate": 1.2265638689328895e-07, "loss": 1.6102, "step": 11404 }, { "epoch": 2.9969455118730908, "grad_norm": 0.6025496125221252, "learning_rate": 1.0513404590853339e-07, "loss": 1.5927, "step": 11406 }, { "epoch": 2.9974710152067527, "grad_norm": 0.8233180642127991, "learning_rate": 8.761170492377783e-08, "loss": 1.6267, "step": 11408 }, { "epoch": 2.9979965185404147, "grad_norm": 0.633036196231842, "learning_rate": 7.008936393902227e-08, "loss": 1.6246, "step": 11410 }, { "epoch": 2.9985220218740762, "grad_norm": 0.6616869568824768, "learning_rate": 5.2567022954266694e-08, "loss": 1.5989, "step": 11412 }, { "epoch": 2.9990475252077378, "grad_norm": 0.7166438102722168, "learning_rate": 3.5044681969511133e-08, "loss": 1.6247, "step": 11414 }, { "epoch": 2.999310276874569, "step": 11415, "total_flos": 8.316227983158804e+17, "train_loss": 1.7114452323838383, "train_runtime": 54638.3985, "train_samples_per_second": 53.496, "train_steps_per_second": 0.209 }, { "epoch": 2.999310276874569, "eval_loss": 1.6470075845718384, "eval_runtime": 487.9081, "eval_samples_per_second": 249.615, "eval_steps_per_second": 31.203, "step": 11415 }, { "epoch": 2.999310276874569, "eval_loss": 1.6461076736450195, "eval_runtime": 486.7382, "eval_samples_per_second": 250.215, "eval_steps_per_second": 31.278, "step": 11415 } ], "logging_steps": 2, "max_steps": 11415, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.316227983158804e+17, "train_batch_size": 32, "trial_name": null, 
"trial_params": null }