diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40231 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999310276874569, + "eval_steps": 400, + "global_step": 11415, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005255033336617729, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 9.1222, + "step": 2 + }, + { + "epoch": 0.0010510066673235458, + "grad_norm": 38.698402404785156, + "learning_rate": 9.999123882950763e-05, + "loss": 9.1362, + "step": 4 + }, + { + "epoch": 0.0015765100009853187, + "grad_norm": 23.888906478881836, + "learning_rate": 9.997371648852288e-05, + "loss": 6.4836, + "step": 6 + }, + { + "epoch": 0.0021020133346470915, + "grad_norm": 12.829351425170898, + "learning_rate": 9.995619414753812e-05, + "loss": 5.5976, + "step": 8 + }, + { + "epoch": 0.0026275166683088647, + "grad_norm": 13.441329956054688, + "learning_rate": 9.993867180655336e-05, + "loss": 4.922, + "step": 10 + }, + { + "epoch": 0.0031530200019706375, + "grad_norm": 8.487077713012695, + "learning_rate": 9.99211494655686e-05, + "loss": 4.3391, + "step": 12 + }, + { + "epoch": 0.0036785233356324103, + "grad_norm": 7.427611351013184, + "learning_rate": 9.990362712458385e-05, + "loss": 3.8921, + "step": 14 + }, + { + "epoch": 0.004204026669294183, + "grad_norm": 6.296542167663574, + "learning_rate": 9.988610478359909e-05, + "loss": 3.5532, + "step": 16 + }, + { + "epoch": 0.004729530002955957, + "grad_norm": 5.726244926452637, + "learning_rate": 9.986858244261433e-05, + "loss": 3.2786, + "step": 18 + }, + { + "epoch": 0.0052550333366177295, + "grad_norm": 5.131892204284668, + "learning_rate": 9.985106010162958e-05, + "loss": 3.016, + "step": 20 + }, + { + "epoch": 0.005780536670279502, + "grad_norm": 4.895572185516357, + "learning_rate": 9.983353776064483e-05, + "loss": 2.8689, + "step": 22 + }, + { + "epoch": 0.006306040003941275, + "grad_norm": 4.9487714767456055, + "learning_rate": 9.981601541966006e-05, + "loss": 2.7813, + "step": 24 + }, + { + "epoch": 0.006831543337603048, + "grad_norm": 4.282960891723633, + "learning_rate": 9.979849307867531e-05, + "loss": 2.6648, + "step": 26 + }, + { + "epoch": 0.0073570466712648205, + "grad_norm": 3.610912561416626, + "learning_rate": 9.978097073769056e-05, + "loss": 2.6083, + "step": 28 + }, + { + "epoch": 0.007882550004926594, + "grad_norm": 3.4365551471710205, + "learning_rate": 9.97634483967058e-05, + "loss": 2.5791, + "step": 30 + }, + { + "epoch": 0.008408053338588366, + "grad_norm": 3.2651638984680176, + "learning_rate": 9.974592605572105e-05, + "loss": 2.5128, + "step": 32 + }, + { + "epoch": 0.00893355667225014, + "grad_norm": 3.259965658187866, + "learning_rate": 9.97284037147363e-05, + "loss": 2.4589, + "step": 34 + }, + { + "epoch": 0.009459060005911913, + "grad_norm": 3.229142904281616, + "learning_rate": 9.971088137375154e-05, + "loss": 2.4857, + "step": 36 + }, + { + "epoch": 0.009984563339573685, + "grad_norm": 3.1428916454315186, + "learning_rate": 9.969335903276678e-05, + "loss": 2.4305, + "step": 38 + }, + { + "epoch": 0.010510066673235459, + "grad_norm": 2.565325975418091, + "learning_rate": 9.967583669178203e-05, + "loss": 2.4047, + "step": 40 + }, + { + "epoch": 0.01103557000689723, + "grad_norm": 2.9086647033691406, + "learning_rate": 9.965831435079726e-05, + "loss": 2.3909, + "step": 42 + }, + { + "epoch": 0.011561073340559004, + "grad_norm": 2.6053147315979004, + "learning_rate": 9.964079200981251e-05, + "loss": 2.3981, + "step": 44 + }, + { + "epoch": 0.012086576674220776, + "grad_norm": 2.3066649436950684, + "learning_rate": 9.962326966882776e-05, + "loss": 2.3786, + "step": 46 + }, + { + "epoch": 0.01261208000788255, + "grad_norm": 2.5991337299346924, + "learning_rate": 9.960574732784301e-05, + "loss": 2.3309, + "step": 48 + }, + { + "epoch": 0.013137583341544324, + "grad_norm": 2.6027159690856934, + "learning_rate": 9.958822498685824e-05, + "loss": 2.3522, + "step": 50 + }, + { + "epoch": 0.013663086675206096, + "grad_norm": 2.2059361934661865, + "learning_rate": 9.957070264587349e-05, + "loss": 2.3221, + "step": 52 + }, + { + "epoch": 0.01418859000886787, + "grad_norm": 2.3481605052948, + "learning_rate": 9.955318030488874e-05, + "loss": 2.3223, + "step": 54 + }, + { + "epoch": 0.014714093342529641, + "grad_norm": 2.489518880844116, + "learning_rate": 9.953565796390398e-05, + "loss": 2.299, + "step": 56 + }, + { + "epoch": 0.015239596676191415, + "grad_norm": 1.992858648300171, + "learning_rate": 9.951813562291923e-05, + "loss": 2.2695, + "step": 58 + }, + { + "epoch": 0.01576510000985319, + "grad_norm": 2.3498594760894775, + "learning_rate": 9.950061328193448e-05, + "loss": 2.305, + "step": 60 + }, + { + "epoch": 0.01629060334351496, + "grad_norm": 2.3116061687469482, + "learning_rate": 9.948309094094971e-05, + "loss": 2.28, + "step": 62 + }, + { + "epoch": 0.016816106677176732, + "grad_norm": 1.8108229637145996, + "learning_rate": 9.946556859996496e-05, + "loss": 2.2844, + "step": 64 + }, + { + "epoch": 0.017341610010838508, + "grad_norm": 1.7650134563446045, + "learning_rate": 9.944804625898021e-05, + "loss": 2.2585, + "step": 66 + }, + { + "epoch": 0.01786711334450028, + "grad_norm": 2.000683546066284, + "learning_rate": 9.943052391799544e-05, + "loss": 2.2866, + "step": 68 + }, + { + "epoch": 0.01839261667816205, + "grad_norm": 1.7432600259780884, + "learning_rate": 9.941300157701069e-05, + "loss": 2.2551, + "step": 70 + }, + { + "epoch": 0.018918120011823827, + "grad_norm": 1.8153115510940552, + "learning_rate": 9.939547923602594e-05, + "loss": 2.2829, + "step": 72 + }, + { + "epoch": 0.0194436233454856, + "grad_norm": 1.7774672508239746, + "learning_rate": 9.937795689504118e-05, + "loss": 2.2656, + "step": 74 + }, + { + "epoch": 0.01996912667914737, + "grad_norm": 1.8277621269226074, + "learning_rate": 9.936043455405642e-05, + "loss": 2.2602, + "step": 76 + }, + { + "epoch": 0.020494630012809142, + "grad_norm": 1.8183931112289429, + "learning_rate": 9.934291221307167e-05, + "loss": 2.2581, + "step": 78 + }, + { + "epoch": 0.021020133346470918, + "grad_norm": 1.5881011486053467, + "learning_rate": 9.932538987208691e-05, + "loss": 2.2305, + "step": 80 + }, + { + "epoch": 0.02154563668013269, + "grad_norm": 1.653342843055725, + "learning_rate": 9.930786753110216e-05, + "loss": 2.2092, + "step": 82 + }, + { + "epoch": 0.02207114001379446, + "grad_norm": 1.7351162433624268, + "learning_rate": 9.929034519011741e-05, + "loss": 2.2031, + "step": 84 + }, + { + "epoch": 0.022596643347456237, + "grad_norm": 1.679905891418457, + "learning_rate": 9.927282284913266e-05, + "loss": 2.2373, + "step": 86 + }, + { + "epoch": 0.02312214668111801, + "grad_norm": 1.742342472076416, + "learning_rate": 9.925530050814789e-05, + "loss": 2.1937, + "step": 88 + }, + { + "epoch": 0.02364765001477978, + "grad_norm": 1.5579359531402588, + "learning_rate": 9.923777816716314e-05, + "loss": 2.2137, + "step": 90 + }, + { + "epoch": 0.024173153348441553, + "grad_norm": 1.548086404800415, + "learning_rate": 9.922025582617839e-05, + "loss": 2.2123, + "step": 92 + }, + { + "epoch": 0.024698656682103328, + "grad_norm": 1.6375842094421387, + "learning_rate": 9.920273348519362e-05, + "loss": 2.182, + "step": 94 + }, + { + "epoch": 0.0252241600157651, + "grad_norm": 1.6106681823730469, + "learning_rate": 9.918521114420887e-05, + "loss": 2.1649, + "step": 96 + }, + { + "epoch": 0.025749663349426872, + "grad_norm": 1.6409622430801392, + "learning_rate": 9.916768880322411e-05, + "loss": 2.1702, + "step": 98 + }, + { + "epoch": 0.026275166683088647, + "grad_norm": 2.1687161922454834, + "learning_rate": 9.915016646223936e-05, + "loss": 2.1519, + "step": 100 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 1.595245122909546, + "learning_rate": 9.91326441212546e-05, + "loss": 2.1314, + "step": 102 + }, + { + "epoch": 0.02732617335041219, + "grad_norm": 1.6276413202285767, + "learning_rate": 9.911512178026984e-05, + "loss": 2.1189, + "step": 104 + }, + { + "epoch": 0.027851676684073963, + "grad_norm": 1.7238609790802002, + "learning_rate": 9.909759943928509e-05, + "loss": 2.1617, + "step": 106 + }, + { + "epoch": 0.02837718001773574, + "grad_norm": 1.5262254476547241, + "learning_rate": 9.908007709830034e-05, + "loss": 2.1105, + "step": 108 + }, + { + "epoch": 0.02890268335139751, + "grad_norm": 1.6144453287124634, + "learning_rate": 9.906255475731559e-05, + "loss": 2.1287, + "step": 110 + }, + { + "epoch": 0.029428186685059282, + "grad_norm": 1.528970718383789, + "learning_rate": 9.904503241633083e-05, + "loss": 2.0744, + "step": 112 + }, + { + "epoch": 0.029953690018721058, + "grad_norm": 1.703235149383545, + "learning_rate": 9.902751007534607e-05, + "loss": 2.1285, + "step": 114 + }, + { + "epoch": 0.03047919335238283, + "grad_norm": 1.567522406578064, + "learning_rate": 9.900998773436132e-05, + "loss": 2.0736, + "step": 116 + }, + { + "epoch": 0.0310046966860446, + "grad_norm": 1.4350054264068604, + "learning_rate": 9.899246539337655e-05, + "loss": 2.0745, + "step": 118 + }, + { + "epoch": 0.03153020001970638, + "grad_norm": 1.438694953918457, + "learning_rate": 9.89749430523918e-05, + "loss": 2.0607, + "step": 120 + }, + { + "epoch": 0.032055703353368145, + "grad_norm": 1.3507471084594727, + "learning_rate": 9.895742071140705e-05, + "loss": 2.0601, + "step": 122 + }, + { + "epoch": 0.03258120668702992, + "grad_norm": 1.2798346281051636, + "learning_rate": 9.893989837042229e-05, + "loss": 2.0925, + "step": 124 + }, + { + "epoch": 0.033106710020691696, + "grad_norm": 1.1719093322753906, + "learning_rate": 9.892237602943754e-05, + "loss": 2.069, + "step": 126 + }, + { + "epoch": 0.033632213354353464, + "grad_norm": 1.3536409139633179, + "learning_rate": 9.890485368845279e-05, + "loss": 2.0716, + "step": 128 + }, + { + "epoch": 0.03415771668801524, + "grad_norm": 1.3446725606918335, + "learning_rate": 9.888733134746802e-05, + "loss": 2.0164, + "step": 130 + }, + { + "epoch": 0.034683220021677015, + "grad_norm": 1.322689414024353, + "learning_rate": 9.886980900648327e-05, + "loss": 2.0752, + "step": 132 + }, + { + "epoch": 0.035208723355338784, + "grad_norm": 1.3145771026611328, + "learning_rate": 9.885228666549852e-05, + "loss": 2.0852, + "step": 134 + }, + { + "epoch": 0.03573422668900056, + "grad_norm": 1.326093316078186, + "learning_rate": 9.883476432451376e-05, + "loss": 2.0414, + "step": 136 + }, + { + "epoch": 0.036259730022662334, + "grad_norm": 1.234911561012268, + "learning_rate": 9.881724198352901e-05, + "loss": 2.0627, + "step": 138 + }, + { + "epoch": 0.0367852333563241, + "grad_norm": 1.2666422128677368, + "learning_rate": 9.879971964254426e-05, + "loss": 2.039, + "step": 140 + }, + { + "epoch": 0.03731073668998588, + "grad_norm": 1.285933494567871, + "learning_rate": 9.87821973015595e-05, + "loss": 2.0428, + "step": 142 + }, + { + "epoch": 0.03783624002364765, + "grad_norm": 1.330095648765564, + "learning_rate": 9.876467496057473e-05, + "loss": 2.0458, + "step": 144 + }, + { + "epoch": 0.03836174335730942, + "grad_norm": 1.338354229927063, + "learning_rate": 9.874715261958998e-05, + "loss": 2.0588, + "step": 146 + }, + { + "epoch": 0.0388872466909712, + "grad_norm": 1.2980471849441528, + "learning_rate": 9.872963027860522e-05, + "loss": 2.092, + "step": 148 + }, + { + "epoch": 0.039412750024632966, + "grad_norm": 1.1613810062408447, + "learning_rate": 9.871210793762047e-05, + "loss": 2.0558, + "step": 150 + }, + { + "epoch": 0.03993825335829474, + "grad_norm": 1.2073849439620972, + "learning_rate": 9.869458559663572e-05, + "loss": 2.0317, + "step": 152 + }, + { + "epoch": 0.040463756691956516, + "grad_norm": 1.2727243900299072, + "learning_rate": 9.867706325565097e-05, + "loss": 2.0637, + "step": 154 + }, + { + "epoch": 0.040989260025618285, + "grad_norm": 1.1971285343170166, + "learning_rate": 9.86595409146662e-05, + "loss": 2.0245, + "step": 156 + }, + { + "epoch": 0.04151476335928006, + "grad_norm": 1.2773100137710571, + "learning_rate": 9.864201857368145e-05, + "loss": 1.9971, + "step": 158 + }, + { + "epoch": 0.042040266692941836, + "grad_norm": 1.1450934410095215, + "learning_rate": 9.86244962326967e-05, + "loss": 2.0367, + "step": 160 + }, + { + "epoch": 0.042565770026603604, + "grad_norm": 1.0777571201324463, + "learning_rate": 9.860697389171194e-05, + "loss": 2.0667, + "step": 162 + }, + { + "epoch": 0.04309127336026538, + "grad_norm": 1.1527281999588013, + "learning_rate": 9.858945155072719e-05, + "loss": 2.0556, + "step": 164 + }, + { + "epoch": 0.043616776693927155, + "grad_norm": 1.2145709991455078, + "learning_rate": 9.857192920974244e-05, + "loss": 2.0638, + "step": 166 + }, + { + "epoch": 0.04414228002758892, + "grad_norm": 1.2327994108200073, + "learning_rate": 9.855440686875767e-05, + "loss": 2.0521, + "step": 168 + }, + { + "epoch": 0.0446677833612507, + "grad_norm": 1.1262874603271484, + "learning_rate": 9.85368845277729e-05, + "loss": 2.0456, + "step": 170 + }, + { + "epoch": 0.045193286694912474, + "grad_norm": 1.261991262435913, + "learning_rate": 9.851936218678815e-05, + "loss": 2.0438, + "step": 172 + }, + { + "epoch": 0.04571879002857424, + "grad_norm": 1.1301138401031494, + "learning_rate": 9.85018398458034e-05, + "loss": 2.0385, + "step": 174 + }, + { + "epoch": 0.04624429336223602, + "grad_norm": 1.1710069179534912, + "learning_rate": 9.848431750481865e-05, + "loss": 1.9769, + "step": 176 + }, + { + "epoch": 0.046769796695897786, + "grad_norm": 1.417973279953003, + "learning_rate": 9.84667951638339e-05, + "loss": 2.0309, + "step": 178 + }, + { + "epoch": 0.04729530002955956, + "grad_norm": 1.1423699855804443, + "learning_rate": 9.844927282284914e-05, + "loss": 2.0254, + "step": 180 + }, + { + "epoch": 0.04782080336322134, + "grad_norm": 1.036130428314209, + "learning_rate": 9.843175048186438e-05, + "loss": 2.0361, + "step": 182 + }, + { + "epoch": 0.048346306696883105, + "grad_norm": 1.1310175657272339, + "learning_rate": 9.841422814087962e-05, + "loss": 2.0172, + "step": 184 + }, + { + "epoch": 0.04887181003054488, + "grad_norm": 1.1112390756607056, + "learning_rate": 9.839670579989487e-05, + "loss": 2.021, + "step": 186 + }, + { + "epoch": 0.049397313364206656, + "grad_norm": 1.1196744441986084, + "learning_rate": 9.837918345891012e-05, + "loss": 2.0398, + "step": 188 + }, + { + "epoch": 0.049922816697868425, + "grad_norm": 1.1641274690628052, + "learning_rate": 9.836166111792537e-05, + "loss": 2.0211, + "step": 190 + }, + { + "epoch": 0.0504483200315302, + "grad_norm": 1.231698989868164, + "learning_rate": 9.834413877694061e-05, + "loss": 1.9938, + "step": 192 + }, + { + "epoch": 0.050973823365191975, + "grad_norm": 1.116532325744629, + "learning_rate": 9.832661643595585e-05, + "loss": 1.9916, + "step": 194 + }, + { + "epoch": 0.051499326698853744, + "grad_norm": 1.1912415027618408, + "learning_rate": 9.830909409497108e-05, + "loss": 2.0206, + "step": 196 + }, + { + "epoch": 0.05202483003251552, + "grad_norm": 1.0664820671081543, + "learning_rate": 9.829157175398633e-05, + "loss": 2.0244, + "step": 198 + }, + { + "epoch": 0.052550333366177295, + "grad_norm": 1.1066045761108398, + "learning_rate": 9.827404941300158e-05, + "loss": 2.0005, + "step": 200 + }, + { + "epoch": 0.05307583669983906, + "grad_norm": 1.1267125606536865, + "learning_rate": 9.825652707201683e-05, + "loss": 2.0185, + "step": 202 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 1.14983069896698, + "learning_rate": 9.823900473103207e-05, + "loss": 1.9726, + "step": 204 + }, + { + "epoch": 0.054126843367162614, + "grad_norm": 1.2234214544296265, + "learning_rate": 9.822148239004732e-05, + "loss": 2.0497, + "step": 206 + }, + { + "epoch": 0.05465234670082438, + "grad_norm": 1.1713297367095947, + "learning_rate": 9.820396004906255e-05, + "loss": 2.0167, + "step": 208 + }, + { + "epoch": 0.05517785003448616, + "grad_norm": 1.2977114915847778, + "learning_rate": 9.81864377080778e-05, + "loss": 1.9795, + "step": 210 + }, + { + "epoch": 0.055703353368147926, + "grad_norm": 1.1145280599594116, + "learning_rate": 9.816891536709305e-05, + "loss": 2.0099, + "step": 212 + }, + { + "epoch": 0.0562288567018097, + "grad_norm": 1.126206874847412, + "learning_rate": 9.81513930261083e-05, + "loss": 2.0053, + "step": 214 + }, + { + "epoch": 0.05675436003547148, + "grad_norm": 1.1073702573776245, + "learning_rate": 9.813387068512355e-05, + "loss": 1.985, + "step": 216 + }, + { + "epoch": 0.057279863369133245, + "grad_norm": 1.2039167881011963, + "learning_rate": 9.811634834413879e-05, + "loss": 1.9898, + "step": 218 + }, + { + "epoch": 0.05780536670279502, + "grad_norm": 1.2644699811935425, + "learning_rate": 9.809882600315403e-05, + "loss": 1.9904, + "step": 220 + }, + { + "epoch": 0.058330870036456796, + "grad_norm": 0.9593138694763184, + "learning_rate": 9.808130366216926e-05, + "loss": 2.0065, + "step": 222 + }, + { + "epoch": 0.058856373370118564, + "grad_norm": 0.9157779216766357, + "learning_rate": 9.806378132118451e-05, + "loss": 1.9853, + "step": 224 + }, + { + "epoch": 0.05938187670378034, + "grad_norm": 1.0334917306900024, + "learning_rate": 9.804625898019976e-05, + "loss": 2.0129, + "step": 226 + }, + { + "epoch": 0.059907380037442115, + "grad_norm": 1.0476865768432617, + "learning_rate": 9.8028736639215e-05, + "loss": 1.9816, + "step": 228 + }, + { + "epoch": 0.060432883371103883, + "grad_norm": 1.1626946926116943, + "learning_rate": 9.801121429823025e-05, + "loss": 2.0278, + "step": 230 + }, + { + "epoch": 0.06095838670476566, + "grad_norm": 1.3142833709716797, + "learning_rate": 9.79936919572455e-05, + "loss": 1.998, + "step": 232 + }, + { + "epoch": 0.061483890038427434, + "grad_norm": 1.195894718170166, + "learning_rate": 9.797616961626073e-05, + "loss": 1.9813, + "step": 234 + }, + { + "epoch": 0.0620093933720892, + "grad_norm": 1.0905667543411255, + "learning_rate": 9.795864727527598e-05, + "loss": 1.9725, + "step": 236 + }, + { + "epoch": 0.06253489670575098, + "grad_norm": 1.0762920379638672, + "learning_rate": 9.794112493429123e-05, + "loss": 2.0036, + "step": 238 + }, + { + "epoch": 0.06306040003941275, + "grad_norm": 0.9937522411346436, + "learning_rate": 9.792360259330648e-05, + "loss": 2.0208, + "step": 240 + }, + { + "epoch": 0.06358590337307453, + "grad_norm": 1.3950233459472656, + "learning_rate": 9.790608025232172e-05, + "loss": 1.9667, + "step": 242 + }, + { + "epoch": 0.06411140670673629, + "grad_norm": 1.0673043727874756, + "learning_rate": 9.788855791133697e-05, + "loss": 1.984, + "step": 244 + }, + { + "epoch": 0.06463691004039807, + "grad_norm": 1.0824543237686157, + "learning_rate": 9.78710355703522e-05, + "loss": 2.0271, + "step": 246 + }, + { + "epoch": 0.06516241337405984, + "grad_norm": 1.2005363702774048, + "learning_rate": 9.785351322936744e-05, + "loss": 1.9912, + "step": 248 + }, + { + "epoch": 0.06568791670772162, + "grad_norm": 1.0987967252731323, + "learning_rate": 9.783599088838269e-05, + "loss": 1.9886, + "step": 250 + }, + { + "epoch": 0.06621342004138339, + "grad_norm": 1.0441079139709473, + "learning_rate": 9.781846854739793e-05, + "loss": 1.9888, + "step": 252 + }, + { + "epoch": 0.06673892337504517, + "grad_norm": 1.3533669710159302, + "learning_rate": 9.780094620641318e-05, + "loss": 2.0232, + "step": 254 + }, + { + "epoch": 0.06726442670870693, + "grad_norm": 0.9276557564735413, + "learning_rate": 9.778342386542843e-05, + "loss": 1.969, + "step": 256 + }, + { + "epoch": 0.0677899300423687, + "grad_norm": 1.143988847732544, + "learning_rate": 9.776590152444368e-05, + "loss": 1.9643, + "step": 258 + }, + { + "epoch": 0.06831543337603048, + "grad_norm": 0.9717089533805847, + "learning_rate": 9.774837918345891e-05, + "loss": 2.0169, + "step": 260 + }, + { + "epoch": 0.06884093670969225, + "grad_norm": 1.328565001487732, + "learning_rate": 9.773085684247416e-05, + "loss": 1.9733, + "step": 262 + }, + { + "epoch": 0.06936644004335403, + "grad_norm": 1.0031944513320923, + "learning_rate": 9.77133345014894e-05, + "loss": 2.0026, + "step": 264 + }, + { + "epoch": 0.06989194337701579, + "grad_norm": 0.9560984969139099, + "learning_rate": 9.769581216050465e-05, + "loss": 1.9893, + "step": 266 + }, + { + "epoch": 0.07041744671067757, + "grad_norm": 1.1324944496154785, + "learning_rate": 9.76782898195199e-05, + "loss": 1.9499, + "step": 268 + }, + { + "epoch": 0.07094295004433934, + "grad_norm": 1.1018491983413696, + "learning_rate": 9.766076747853515e-05, + "loss": 1.9915, + "step": 270 + }, + { + "epoch": 0.07146845337800112, + "grad_norm": 1.1043239831924438, + "learning_rate": 9.764324513755038e-05, + "loss": 2.0021, + "step": 272 + }, + { + "epoch": 0.0719939567116629, + "grad_norm": 1.1171072721481323, + "learning_rate": 9.762572279656562e-05, + "loss": 1.9665, + "step": 274 + }, + { + "epoch": 0.07251946004532467, + "grad_norm": 1.040802001953125, + "learning_rate": 9.760820045558086e-05, + "loss": 2.0249, + "step": 276 + }, + { + "epoch": 0.07304496337898643, + "grad_norm": 1.0344669818878174, + "learning_rate": 9.759067811459611e-05, + "loss": 2.0102, + "step": 278 + }, + { + "epoch": 0.0735704667126482, + "grad_norm": 0.9199603199958801, + "learning_rate": 9.757315577361136e-05, + "loss": 1.9836, + "step": 280 + }, + { + "epoch": 0.07409597004630998, + "grad_norm": 1.129347562789917, + "learning_rate": 9.75556334326266e-05, + "loss": 1.9141, + "step": 282 + }, + { + "epoch": 0.07462147337997176, + "grad_norm": 0.9745813012123108, + "learning_rate": 9.753811109164185e-05, + "loss": 1.9807, + "step": 284 + }, + { + "epoch": 0.07514697671363353, + "grad_norm": 1.2225230932235718, + "learning_rate": 9.752058875065709e-05, + "loss": 1.9786, + "step": 286 + }, + { + "epoch": 0.0756724800472953, + "grad_norm": 1.7652974128723145, + "learning_rate": 9.750306640967234e-05, + "loss": 1.9575, + "step": 288 + }, + { + "epoch": 0.07619798338095707, + "grad_norm": 1.075140118598938, + "learning_rate": 9.748554406868758e-05, + "loss": 2.0034, + "step": 290 + }, + { + "epoch": 0.07672348671461884, + "grad_norm": 1.0169627666473389, + "learning_rate": 9.746802172770283e-05, + "loss": 2.0072, + "step": 292 + }, + { + "epoch": 0.07724899004828062, + "grad_norm": 1.0737385749816895, + "learning_rate": 9.745049938671808e-05, + "loss": 1.9446, + "step": 294 + }, + { + "epoch": 0.0777744933819424, + "grad_norm": 1.2098944187164307, + "learning_rate": 9.743297704573331e-05, + "loss": 1.9875, + "step": 296 + }, + { + "epoch": 0.07829999671560417, + "grad_norm": 1.030888557434082, + "learning_rate": 9.741545470474856e-05, + "loss": 2.0323, + "step": 298 + }, + { + "epoch": 0.07882550004926593, + "grad_norm": 1.1690922975540161, + "learning_rate": 9.73979323637638e-05, + "loss": 1.9544, + "step": 300 + }, + { + "epoch": 0.0793510033829277, + "grad_norm": 0.9702273607254028, + "learning_rate": 9.738041002277904e-05, + "loss": 1.954, + "step": 302 + }, + { + "epoch": 0.07987650671658948, + "grad_norm": 1.1210618019104004, + "learning_rate": 9.736288768179429e-05, + "loss": 1.9786, + "step": 304 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 1.0246062278747559, + "learning_rate": 9.734536534080954e-05, + "loss": 1.9777, + "step": 306 + }, + { + "epoch": 0.08092751338391303, + "grad_norm": 0.9073227047920227, + "learning_rate": 9.732784299982478e-05, + "loss": 1.9737, + "step": 308 + }, + { + "epoch": 0.08145301671757481, + "grad_norm": 0.9827994704246521, + "learning_rate": 9.731032065884003e-05, + "loss": 1.9794, + "step": 310 + }, + { + "epoch": 0.08197852005123657, + "grad_norm": 0.9744251370429993, + "learning_rate": 9.729279831785527e-05, + "loss": 1.9771, + "step": 312 + }, + { + "epoch": 0.08250402338489835, + "grad_norm": 1.0426684617996216, + "learning_rate": 9.727527597687051e-05, + "loss": 1.9522, + "step": 314 + }, + { + "epoch": 0.08302952671856012, + "grad_norm": 1.0296825170516968, + "learning_rate": 9.725775363588576e-05, + "loss": 1.9727, + "step": 316 + }, + { + "epoch": 0.0835550300522219, + "grad_norm": 1.2643887996673584, + "learning_rate": 9.724023129490101e-05, + "loss": 1.9414, + "step": 318 + }, + { + "epoch": 0.08408053338588367, + "grad_norm": 1.0346665382385254, + "learning_rate": 9.722270895391626e-05, + "loss": 1.9964, + "step": 320 + }, + { + "epoch": 0.08460603671954543, + "grad_norm": 0.9016798734664917, + "learning_rate": 9.720518661293149e-05, + "loss": 1.9762, + "step": 322 + }, + { + "epoch": 0.08513154005320721, + "grad_norm": 1.0269689559936523, + "learning_rate": 9.718766427194674e-05, + "loss": 1.9969, + "step": 324 + }, + { + "epoch": 0.08565704338686898, + "grad_norm": 1.0594022274017334, + "learning_rate": 9.717014193096197e-05, + "loss": 1.9624, + "step": 326 + }, + { + "epoch": 0.08618254672053076, + "grad_norm": 1.1567680835723877, + "learning_rate": 9.715261958997722e-05, + "loss": 1.959, + "step": 328 + }, + { + "epoch": 0.08670805005419253, + "grad_norm": 0.870560884475708, + "learning_rate": 9.713509724899247e-05, + "loss": 1.9569, + "step": 330 + }, + { + "epoch": 0.08723355338785431, + "grad_norm": 1.0716711282730103, + "learning_rate": 9.711757490800771e-05, + "loss": 1.951, + "step": 332 + }, + { + "epoch": 0.08775905672151607, + "grad_norm": 1.1096898317337036, + "learning_rate": 9.710005256702296e-05, + "loss": 1.9796, + "step": 334 + }, + { + "epoch": 0.08828456005517785, + "grad_norm": 0.8860819339752197, + "learning_rate": 9.708253022603821e-05, + "loss": 1.9591, + "step": 336 + }, + { + "epoch": 0.08881006338883962, + "grad_norm": 0.9616803526878357, + "learning_rate": 9.706500788505344e-05, + "loss": 1.9743, + "step": 338 + }, + { + "epoch": 0.0893355667225014, + "grad_norm": 0.8491392135620117, + "learning_rate": 9.704748554406869e-05, + "loss": 1.959, + "step": 340 + }, + { + "epoch": 0.08986107005616317, + "grad_norm": 1.068489670753479, + "learning_rate": 9.702996320308394e-05, + "loss": 1.971, + "step": 342 + }, + { + "epoch": 0.09038657338982495, + "grad_norm": 0.8963595032691956, + "learning_rate": 9.701244086209919e-05, + "loss": 1.9239, + "step": 344 + }, + { + "epoch": 0.09091207672348671, + "grad_norm": 1.232599139213562, + "learning_rate": 9.699491852111443e-05, + "loss": 1.9282, + "step": 346 + }, + { + "epoch": 0.09143758005714848, + "grad_norm": 1.0067182779312134, + "learning_rate": 9.697739618012967e-05, + "loss": 1.9539, + "step": 348 + }, + { + "epoch": 0.09196308339081026, + "grad_norm": 0.927861750125885, + "learning_rate": 9.695987383914491e-05, + "loss": 1.9518, + "step": 350 + }, + { + "epoch": 0.09248858672447204, + "grad_norm": 1.1443390846252441, + "learning_rate": 9.694235149816015e-05, + "loss": 1.9527, + "step": 352 + }, + { + "epoch": 0.09301409005813381, + "grad_norm": 1.3024427890777588, + "learning_rate": 9.69248291571754e-05, + "loss": 1.9276, + "step": 354 + }, + { + "epoch": 0.09353959339179557, + "grad_norm": 1.0598125457763672, + "learning_rate": 9.690730681619064e-05, + "loss": 1.9728, + "step": 356 + }, + { + "epoch": 0.09406509672545735, + "grad_norm": 1.0384907722473145, + "learning_rate": 9.688978447520589e-05, + "loss": 1.9543, + "step": 358 + }, + { + "epoch": 0.09459060005911912, + "grad_norm": 1.4060821533203125, + "learning_rate": 9.687226213422114e-05, + "loss": 1.9005, + "step": 360 + }, + { + "epoch": 0.0951161033927809, + "grad_norm": 0.9630605578422546, + "learning_rate": 9.685473979323639e-05, + "loss": 1.9306, + "step": 362 + }, + { + "epoch": 0.09564160672644267, + "grad_norm": 0.9025930166244507, + "learning_rate": 9.683721745225162e-05, + "loss": 1.9391, + "step": 364 + }, + { + "epoch": 0.09616711006010445, + "grad_norm": 1.0383387804031372, + "learning_rate": 9.681969511126687e-05, + "loss": 1.9506, + "step": 366 + }, + { + "epoch": 0.09669261339376621, + "grad_norm": 0.9397739768028259, + "learning_rate": 9.680217277028212e-05, + "loss": 1.9132, + "step": 368 + }, + { + "epoch": 0.09721811672742799, + "grad_norm": 0.9472708106040955, + "learning_rate": 9.678465042929736e-05, + "loss": 1.9731, + "step": 370 + }, + { + "epoch": 0.09774362006108976, + "grad_norm": 0.8391740322113037, + "learning_rate": 9.676712808831261e-05, + "loss": 1.9956, + "step": 372 + }, + { + "epoch": 0.09826912339475154, + "grad_norm": 1.160340666770935, + "learning_rate": 9.674960574732785e-05, + "loss": 1.9576, + "step": 374 + }, + { + "epoch": 0.09879462672841331, + "grad_norm": 1.1132557392120361, + "learning_rate": 9.673208340634309e-05, + "loss": 1.9407, + "step": 376 + }, + { + "epoch": 0.09932013006207509, + "grad_norm": 0.9549061059951782, + "learning_rate": 9.671456106535834e-05, + "loss": 1.96, + "step": 378 + }, + { + "epoch": 0.09984563339573685, + "grad_norm": 1.953350305557251, + "learning_rate": 9.669703872437357e-05, + "loss": 1.9146, + "step": 380 + }, + { + "epoch": 0.10037113672939862, + "grad_norm": 0.8253780603408813, + "learning_rate": 9.667951638338882e-05, + "loss": 1.9576, + "step": 382 + }, + { + "epoch": 0.1008966400630604, + "grad_norm": 1.76423180103302, + "learning_rate": 9.666199404240407e-05, + "loss": 1.9411, + "step": 384 + }, + { + "epoch": 0.10142214339672218, + "grad_norm": 1.2772194147109985, + "learning_rate": 9.664447170141932e-05, + "loss": 1.939, + "step": 386 + }, + { + "epoch": 0.10194764673038395, + "grad_norm": 1.1129158735275269, + "learning_rate": 9.662694936043456e-05, + "loss": 1.9587, + "step": 388 + }, + { + "epoch": 0.10247315006404571, + "grad_norm": 1.5891733169555664, + "learning_rate": 9.660942701944981e-05, + "loss": 1.9677, + "step": 390 + }, + { + "epoch": 0.10299865339770749, + "grad_norm": 0.8642210364341736, + "learning_rate": 9.659190467846505e-05, + "loss": 1.9356, + "step": 392 + }, + { + "epoch": 0.10352415673136926, + "grad_norm": 1.0561408996582031, + "learning_rate": 9.65743823374803e-05, + "loss": 1.961, + "step": 394 + }, + { + "epoch": 0.10404966006503104, + "grad_norm": 0.9040923118591309, + "learning_rate": 9.655685999649554e-05, + "loss": 1.9217, + "step": 396 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.9783763289451599, + "learning_rate": 9.653933765551078e-05, + "loss": 1.9042, + "step": 398 + }, + { + "epoch": 0.10510066673235459, + "grad_norm": 1.0281099081039429, + "learning_rate": 9.652181531452602e-05, + "loss": 1.8989, + "step": 400 + }, + { + "epoch": 0.10510066673235459, + "eval_loss": 1.89004647731781, + "eval_runtime": 487.2614, + "eval_samples_per_second": 249.946, + "eval_steps_per_second": 31.244, + "step": 400 + }, + { + "epoch": 0.10562617006601635, + "grad_norm": 0.9645543694496155, + "learning_rate": 9.650429297354127e-05, + "loss": 1.9487, + "step": 402 + }, + { + "epoch": 0.10615167339967813, + "grad_norm": 1.1213475465774536, + "learning_rate": 9.648677063255652e-05, + "loss": 1.963, + "step": 404 + }, + { + "epoch": 0.1066771767333399, + "grad_norm": 0.8866876363754272, + "learning_rate": 9.646924829157175e-05, + "loss": 1.925, + "step": 406 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.9458855390548706, + "learning_rate": 9.6451725950587e-05, + "loss": 1.9641, + "step": 408 + }, + { + "epoch": 0.10772818340066345, + "grad_norm": 0.9262669086456299, + "learning_rate": 9.643420360960225e-05, + "loss": 1.9257, + "step": 410 + }, + { + "epoch": 0.10825368673432523, + "grad_norm": 0.9147223830223083, + "learning_rate": 9.64166812686175e-05, + "loss": 1.9242, + "step": 412 + }, + { + "epoch": 0.10877919006798699, + "grad_norm": 1.1678857803344727, + "learning_rate": 9.639915892763274e-05, + "loss": 1.9474, + "step": 414 + }, + { + "epoch": 0.10930469340164876, + "grad_norm": 1.0557537078857422, + "learning_rate": 9.638163658664799e-05, + "loss": 1.9527, + "step": 416 + }, + { + "epoch": 0.10983019673531054, + "grad_norm": 1.047229528427124, + "learning_rate": 9.636411424566322e-05, + "loss": 1.9116, + "step": 418 + }, + { + "epoch": 0.11035570006897231, + "grad_norm": 0.9046667814254761, + "learning_rate": 9.634659190467847e-05, + "loss": 1.914, + "step": 420 + }, + { + "epoch": 0.11088120340263409, + "grad_norm": 1.019170880317688, + "learning_rate": 9.632906956369372e-05, + "loss": 1.9667, + "step": 422 + }, + { + "epoch": 0.11140670673629585, + "grad_norm": 1.061903953552246, + "learning_rate": 9.631154722270895e-05, + "loss": 1.8915, + "step": 424 + }, + { + "epoch": 0.11193221006995763, + "grad_norm": 1.1627757549285889, + "learning_rate": 9.62940248817242e-05, + "loss": 1.9272, + "step": 426 + }, + { + "epoch": 0.1124577134036194, + "grad_norm": 0.9440149068832397, + "learning_rate": 9.627650254073945e-05, + "loss": 1.9686, + "step": 428 + }, + { + "epoch": 0.11298321673728118, + "grad_norm": 1.0250455141067505, + "learning_rate": 9.62589801997547e-05, + "loss": 1.937, + "step": 430 + }, + { + "epoch": 0.11350872007094295, + "grad_norm": 0.8273470401763916, + "learning_rate": 9.624145785876993e-05, + "loss": 1.9411, + "step": 432 + }, + { + "epoch": 0.11403422340460473, + "grad_norm": 1.4610145092010498, + "learning_rate": 9.622393551778518e-05, + "loss": 1.954, + "step": 434 + }, + { + "epoch": 0.11455972673826649, + "grad_norm": 0.9790822267532349, + "learning_rate": 9.620641317680042e-05, + "loss": 1.9255, + "step": 436 + }, + { + "epoch": 0.11508523007192827, + "grad_norm": 1.1833688020706177, + "learning_rate": 9.618889083581567e-05, + "loss": 1.9031, + "step": 438 + }, + { + "epoch": 0.11561073340559004, + "grad_norm": 0.879008948802948, + "learning_rate": 9.617136849483092e-05, + "loss": 1.937, + "step": 440 + }, + { + "epoch": 0.11613623673925182, + "grad_norm": 1.2231090068817139, + "learning_rate": 9.615384615384617e-05, + "loss": 1.9649, + "step": 442 + }, + { + "epoch": 0.11666174007291359, + "grad_norm": 1.1259514093399048, + "learning_rate": 9.61363238128614e-05, + "loss": 1.8921, + "step": 444 + }, + { + "epoch": 0.11718724340657537, + "grad_norm": 1.294339656829834, + "learning_rate": 9.611880147187665e-05, + "loss": 1.9084, + "step": 446 + }, + { + "epoch": 0.11771274674023713, + "grad_norm": 0.9451860785484314, + "learning_rate": 9.61012791308919e-05, + "loss": 1.9441, + "step": 448 + }, + { + "epoch": 0.1182382500738989, + "grad_norm": 0.9239658117294312, + "learning_rate": 9.608375678990713e-05, + "loss": 1.9291, + "step": 450 + }, + { + "epoch": 0.11876375340756068, + "grad_norm": 1.0765795707702637, + "learning_rate": 9.606623444892238e-05, + "loss": 1.9249, + "step": 452 + }, + { + "epoch": 0.11928925674122245, + "grad_norm": 0.919032633304596, + "learning_rate": 9.604871210793763e-05, + "loss": 1.9399, + "step": 454 + }, + { + "epoch": 0.11981476007488423, + "grad_norm": 0.8893537521362305, + "learning_rate": 9.603118976695287e-05, + "loss": 1.8985, + "step": 456 + }, + { + "epoch": 0.12034026340854599, + "grad_norm": 1.4801400899887085, + "learning_rate": 9.601366742596811e-05, + "loss": 1.8984, + "step": 458 + }, + { + "epoch": 0.12086576674220777, + "grad_norm": 0.8458815217018127, + "learning_rate": 9.599614508498335e-05, + "loss": 1.9365, + "step": 460 + }, + { + "epoch": 0.12139127007586954, + "grad_norm": 1.32188081741333, + "learning_rate": 9.59786227439986e-05, + "loss": 1.914, + "step": 462 + }, + { + "epoch": 0.12191677340953132, + "grad_norm": 0.8433484435081482, + "learning_rate": 9.596110040301385e-05, + "loss": 1.9746, + "step": 464 + }, + { + "epoch": 0.1224422767431931, + "grad_norm": 1.1092686653137207, + "learning_rate": 9.59435780620291e-05, + "loss": 1.9672, + "step": 466 + }, + { + "epoch": 0.12296778007685487, + "grad_norm": 0.9566219449043274, + "learning_rate": 9.592605572104434e-05, + "loss": 1.9225, + "step": 468 + }, + { + "epoch": 0.12349328341051663, + "grad_norm": 1.2604984045028687, + "learning_rate": 9.590853338005958e-05, + "loss": 1.9309, + "step": 470 + }, + { + "epoch": 0.1240187867441784, + "grad_norm": 1.2344852685928345, + "learning_rate": 9.589101103907483e-05, + "loss": 1.9236, + "step": 472 + }, + { + "epoch": 0.12454429007784018, + "grad_norm": 1.2368342876434326, + "learning_rate": 9.587348869809007e-05, + "loss": 1.9058, + "step": 474 + }, + { + "epoch": 0.12506979341150196, + "grad_norm": 1.3377318382263184, + "learning_rate": 9.585596635710531e-05, + "loss": 1.9345, + "step": 476 + }, + { + "epoch": 0.12559529674516373, + "grad_norm": 0.981948733329773, + "learning_rate": 9.583844401612056e-05, + "loss": 1.9253, + "step": 478 + }, + { + "epoch": 0.1261208000788255, + "grad_norm": 1.3136436939239502, + "learning_rate": 9.58209216751358e-05, + "loss": 1.933, + "step": 480 + }, + { + "epoch": 0.12664630341248728, + "grad_norm": 0.9691250324249268, + "learning_rate": 9.580339933415105e-05, + "loss": 1.9274, + "step": 482 + }, + { + "epoch": 0.12717180674614906, + "grad_norm": 1.4272805452346802, + "learning_rate": 9.578587699316628e-05, + "loss": 1.9242, + "step": 484 + }, + { + "epoch": 0.1276973100798108, + "grad_norm": 1.0407310724258423, + "learning_rate": 9.576835465218153e-05, + "loss": 1.9399, + "step": 486 + }, + { + "epoch": 0.12822281341347258, + "grad_norm": 0.8317910432815552, + "learning_rate": 9.575083231119678e-05, + "loss": 1.8733, + "step": 488 + }, + { + "epoch": 0.12874831674713436, + "grad_norm": 1.1145474910736084, + "learning_rate": 9.573330997021203e-05, + "loss": 1.9443, + "step": 490 + }, + { + "epoch": 0.12927382008079613, + "grad_norm": 1.0935791730880737, + "learning_rate": 9.571578762922728e-05, + "loss": 1.9324, + "step": 492 + }, + { + "epoch": 0.1297993234144579, + "grad_norm": 1.340178370475769, + "learning_rate": 9.569826528824252e-05, + "loss": 1.9046, + "step": 494 + }, + { + "epoch": 0.13032482674811968, + "grad_norm": 0.8448821902275085, + "learning_rate": 9.568074294725776e-05, + "loss": 1.9172, + "step": 496 + }, + { + "epoch": 0.13085033008178146, + "grad_norm": 1.1193829774856567, + "learning_rate": 9.5663220606273e-05, + "loss": 1.9471, + "step": 498 + }, + { + "epoch": 0.13137583341544323, + "grad_norm": 1.0313880443572998, + "learning_rate": 9.564569826528824e-05, + "loss": 1.9027, + "step": 500 + }, + { + "epoch": 0.131901336749105, + "grad_norm": 1.037778615951538, + "learning_rate": 9.562817592430349e-05, + "loss": 1.9375, + "step": 502 + }, + { + "epoch": 0.13242684008276678, + "grad_norm": 1.1615263223648071, + "learning_rate": 9.561065358331873e-05, + "loss": 1.9226, + "step": 504 + }, + { + "epoch": 0.13295234341642856, + "grad_norm": 1.5699961185455322, + "learning_rate": 9.559313124233398e-05, + "loss": 1.9097, + "step": 506 + }, + { + "epoch": 0.13347784675009033, + "grad_norm": 2.0286099910736084, + "learning_rate": 9.557560890134923e-05, + "loss": 1.9207, + "step": 508 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 1.0447641611099243, + "learning_rate": 9.555808656036446e-05, + "loss": 1.9415, + "step": 510 + }, + { + "epoch": 0.13452885341741386, + "grad_norm": 1.357127070426941, + "learning_rate": 9.554056421937971e-05, + "loss": 1.9165, + "step": 512 + }, + { + "epoch": 0.13505435675107563, + "grad_norm": 1.3550409078598022, + "learning_rate": 9.552304187839496e-05, + "loss": 1.9043, + "step": 514 + }, + { + "epoch": 0.1355798600847374, + "grad_norm": 1.1390573978424072, + "learning_rate": 9.55055195374102e-05, + "loss": 1.8865, + "step": 516 + }, + { + "epoch": 0.13610536341839918, + "grad_norm": 1.2006975412368774, + "learning_rate": 9.548799719642545e-05, + "loss": 1.9478, + "step": 518 + }, + { + "epoch": 0.13663086675206096, + "grad_norm": 0.9007882475852966, + "learning_rate": 9.54704748554407e-05, + "loss": 1.8973, + "step": 520 + }, + { + "epoch": 0.13715637008572273, + "grad_norm": 1.0151760578155518, + "learning_rate": 9.545295251445593e-05, + "loss": 1.9178, + "step": 522 + }, + { + "epoch": 0.1376818734193845, + "grad_norm": 1.0484004020690918, + "learning_rate": 9.543543017347118e-05, + "loss": 1.9464, + "step": 524 + }, + { + "epoch": 0.13820737675304628, + "grad_norm": 0.9942461848258972, + "learning_rate": 9.541790783248642e-05, + "loss": 1.8783, + "step": 526 + }, + { + "epoch": 0.13873288008670806, + "grad_norm": 1.2193222045898438, + "learning_rate": 9.540038549150166e-05, + "loss": 1.911, + "step": 528 + }, + { + "epoch": 0.13925838342036984, + "grad_norm": 0.9065030813217163, + "learning_rate": 9.538286315051691e-05, + "loss": 1.9214, + "step": 530 + }, + { + "epoch": 0.13978388675403158, + "grad_norm": 1.1993494033813477, + "learning_rate": 9.536534080953216e-05, + "loss": 1.9159, + "step": 532 + }, + { + "epoch": 0.14030939008769336, + "grad_norm": 1.1094871759414673, + "learning_rate": 9.53478184685474e-05, + "loss": 1.9393, + "step": 534 + }, + { + "epoch": 0.14083489342135513, + "grad_norm": 1.1761337518692017, + "learning_rate": 9.533029612756264e-05, + "loss": 1.9315, + "step": 536 + }, + { + "epoch": 0.1413603967550169, + "grad_norm": 0.8991339802742004, + "learning_rate": 9.531277378657789e-05, + "loss": 1.9138, + "step": 538 + }, + { + "epoch": 0.14188590008867868, + "grad_norm": 1.0923110246658325, + "learning_rate": 9.529525144559314e-05, + "loss": 1.9254, + "step": 540 + }, + { + "epoch": 0.14241140342234046, + "grad_norm": 0.8643242716789246, + "learning_rate": 9.527772910460838e-05, + "loss": 1.912, + "step": 542 + }, + { + "epoch": 0.14293690675600224, + "grad_norm": 0.8578360080718994, + "learning_rate": 9.526020676362363e-05, + "loss": 1.9403, + "step": 544 + }, + { + "epoch": 0.143462410089664, + "grad_norm": 0.854049563407898, + "learning_rate": 9.524268442263888e-05, + "loss": 1.9251, + "step": 546 + }, + { + "epoch": 0.1439879134233258, + "grad_norm": 1.1244609355926514, + "learning_rate": 9.522516208165411e-05, + "loss": 1.928, + "step": 548 + }, + { + "epoch": 0.14451341675698756, + "grad_norm": 1.0357218980789185, + "learning_rate": 9.520763974066936e-05, + "loss": 1.9137, + "step": 550 + }, + { + "epoch": 0.14503892009064934, + "grad_norm": 1.2136287689208984, + "learning_rate": 9.51901173996846e-05, + "loss": 1.9009, + "step": 552 + }, + { + "epoch": 0.14556442342431108, + "grad_norm": 1.5104690790176392, + "learning_rate": 9.517259505869984e-05, + "loss": 1.8873, + "step": 554 + }, + { + "epoch": 0.14608992675797286, + "grad_norm": 0.9920956492424011, + "learning_rate": 9.515507271771509e-05, + "loss": 1.9168, + "step": 556 + }, + { + "epoch": 0.14661543009163464, + "grad_norm": 1.0754443407058716, + "learning_rate": 9.513755037673034e-05, + "loss": 1.9258, + "step": 558 + }, + { + "epoch": 0.1471409334252964, + "grad_norm": 0.9329949617385864, + "learning_rate": 9.512002803574558e-05, + "loss": 1.8999, + "step": 560 + }, + { + "epoch": 0.14766643675895819, + "grad_norm": 0.9859107136726379, + "learning_rate": 9.510250569476082e-05, + "loss": 1.8973, + "step": 562 + }, + { + "epoch": 0.14819194009261996, + "grad_norm": 0.8728905916213989, + "learning_rate": 9.508498335377607e-05, + "loss": 1.8981, + "step": 564 + }, + { + "epoch": 0.14871744342628174, + "grad_norm": 0.9786973595619202, + "learning_rate": 9.506746101279131e-05, + "loss": 1.9167, + "step": 566 + }, + { + "epoch": 0.1492429467599435, + "grad_norm": 0.9139885902404785, + "learning_rate": 9.504993867180656e-05, + "loss": 1.8988, + "step": 568 + }, + { + "epoch": 0.1497684500936053, + "grad_norm": 0.9002274870872498, + "learning_rate": 9.503241633082181e-05, + "loss": 1.8964, + "step": 570 + }, + { + "epoch": 0.15029395342726706, + "grad_norm": 0.9470378756523132, + "learning_rate": 9.501489398983706e-05, + "loss": 1.8877, + "step": 572 + }, + { + "epoch": 0.15081945676092884, + "grad_norm": 1.2751479148864746, + "learning_rate": 9.499737164885229e-05, + "loss": 1.9328, + "step": 574 + }, + { + "epoch": 0.1513449600945906, + "grad_norm": 1.0107779502868652, + "learning_rate": 9.497984930786754e-05, + "loss": 1.9271, + "step": 576 + }, + { + "epoch": 0.15187046342825236, + "grad_norm": 1.3902311325073242, + "learning_rate": 9.496232696688277e-05, + "loss": 1.8903, + "step": 578 + }, + { + "epoch": 0.15239596676191414, + "grad_norm": 0.8747096657752991, + "learning_rate": 9.494480462589802e-05, + "loss": 1.9058, + "step": 580 + }, + { + "epoch": 0.1529214700955759, + "grad_norm": 0.9474460482597351, + "learning_rate": 9.492728228491327e-05, + "loss": 1.9357, + "step": 582 + }, + { + "epoch": 0.1534469734292377, + "grad_norm": 0.9786020517349243, + "learning_rate": 9.490975994392851e-05, + "loss": 1.9267, + "step": 584 + }, + { + "epoch": 0.15397247676289946, + "grad_norm": 0.9129196405410767, + "learning_rate": 9.489223760294376e-05, + "loss": 1.9129, + "step": 586 + }, + { + "epoch": 0.15449798009656124, + "grad_norm": 1.2065305709838867, + "learning_rate": 9.4874715261959e-05, + "loss": 1.8809, + "step": 588 + }, + { + "epoch": 0.155023483430223, + "grad_norm": 1.0768754482269287, + "learning_rate": 9.485719292097424e-05, + "loss": 1.8827, + "step": 590 + }, + { + "epoch": 0.1555489867638848, + "grad_norm": 1.3688833713531494, + "learning_rate": 9.483967057998949e-05, + "loss": 1.9325, + "step": 592 + }, + { + "epoch": 0.15607449009754656, + "grad_norm": 1.0570260286331177, + "learning_rate": 9.482214823900474e-05, + "loss": 1.8599, + "step": 594 + }, + { + "epoch": 0.15659999343120834, + "grad_norm": 1.0146839618682861, + "learning_rate": 9.480462589801999e-05, + "loss": 1.9205, + "step": 596 + }, + { + "epoch": 0.15712549676487012, + "grad_norm": 1.6670814752578735, + "learning_rate": 9.478710355703523e-05, + "loss": 1.8699, + "step": 598 + }, + { + "epoch": 0.15765100009853186, + "grad_norm": 1.178289532661438, + "learning_rate": 9.476958121605047e-05, + "loss": 1.9022, + "step": 600 + }, + { + "epoch": 0.15817650343219364, + "grad_norm": 1.6808775663375854, + "learning_rate": 9.47520588750657e-05, + "loss": 1.9264, + "step": 602 + }, + { + "epoch": 0.1587020067658554, + "grad_norm": 0.9852617383003235, + "learning_rate": 9.473453653408095e-05, + "loss": 1.9114, + "step": 604 + }, + { + "epoch": 0.1592275100995172, + "grad_norm": 0.8299278616905212, + "learning_rate": 9.47170141930962e-05, + "loss": 1.9012, + "step": 606 + }, + { + "epoch": 0.15975301343317896, + "grad_norm": 0.9899255037307739, + "learning_rate": 9.469949185211144e-05, + "loss": 1.9187, + "step": 608 + }, + { + "epoch": 0.16027851676684074, + "grad_norm": 0.9521119594573975, + "learning_rate": 9.468196951112669e-05, + "loss": 1.8972, + "step": 610 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 1.2446365356445312, + "learning_rate": 9.466444717014194e-05, + "loss": 1.8962, + "step": 612 + }, + { + "epoch": 0.1613295234341643, + "grad_norm": 1.1197859048843384, + "learning_rate": 9.464692482915717e-05, + "loss": 1.8753, + "step": 614 + }, + { + "epoch": 0.16185502676782607, + "grad_norm": 0.9934601783752441, + "learning_rate": 9.462940248817242e-05, + "loss": 1.8974, + "step": 616 + }, + { + "epoch": 0.16238053010148784, + "grad_norm": 0.9261951446533203, + "learning_rate": 9.461188014718767e-05, + "loss": 1.9052, + "step": 618 + }, + { + "epoch": 0.16290603343514962, + "grad_norm": 1.2140916585922241, + "learning_rate": 9.459435780620292e-05, + "loss": 1.8779, + "step": 620 + }, + { + "epoch": 0.16343153676881136, + "grad_norm": 1.067958116531372, + "learning_rate": 9.457683546521816e-05, + "loss": 1.8771, + "step": 622 + }, + { + "epoch": 0.16395704010247314, + "grad_norm": 0.9392744302749634, + "learning_rate": 9.455931312423341e-05, + "loss": 1.892, + "step": 624 + }, + { + "epoch": 0.16448254343613491, + "grad_norm": 1.6888792514801025, + "learning_rate": 9.454179078324864e-05, + "loss": 1.8829, + "step": 626 + }, + { + "epoch": 0.1650080467697967, + "grad_norm": 0.9804869890213013, + "learning_rate": 9.452426844226389e-05, + "loss": 1.9113, + "step": 628 + }, + { + "epoch": 0.16553355010345847, + "grad_norm": 1.0176154375076294, + "learning_rate": 9.450674610127913e-05, + "loss": 1.9063, + "step": 630 + }, + { + "epoch": 0.16605905343712024, + "grad_norm": 0.9681494235992432, + "learning_rate": 9.448922376029437e-05, + "loss": 1.9095, + "step": 632 + }, + { + "epoch": 0.16658455677078202, + "grad_norm": 0.9195823669433594, + "learning_rate": 9.447170141930962e-05, + "loss": 1.8881, + "step": 634 + }, + { + "epoch": 0.1671100601044438, + "grad_norm": 0.9165635108947754, + "learning_rate": 9.445417907832487e-05, + "loss": 1.8788, + "step": 636 + }, + { + "epoch": 0.16763556343810557, + "grad_norm": 1.2198373079299927, + "learning_rate": 9.443665673734012e-05, + "loss": 1.8717, + "step": 638 + }, + { + "epoch": 0.16816106677176734, + "grad_norm": 0.8923284411430359, + "learning_rate": 9.441913439635536e-05, + "loss": 1.918, + "step": 640 + }, + { + "epoch": 0.16868657010542912, + "grad_norm": 1.1938835382461548, + "learning_rate": 9.44016120553706e-05, + "loss": 1.9118, + "step": 642 + }, + { + "epoch": 0.16921207343909087, + "grad_norm": 0.8569881319999695, + "learning_rate": 9.438408971438585e-05, + "loss": 1.8894, + "step": 644 + }, + { + "epoch": 0.16973757677275264, + "grad_norm": 0.79718017578125, + "learning_rate": 9.43665673734011e-05, + "loss": 1.8815, + "step": 646 + }, + { + "epoch": 0.17026308010641442, + "grad_norm": 0.9728055000305176, + "learning_rate": 9.434904503241634e-05, + "loss": 1.8942, + "step": 648 + }, + { + "epoch": 0.1707885834400762, + "grad_norm": 1.2369781732559204, + "learning_rate": 9.433152269143159e-05, + "loss": 1.8663, + "step": 650 + }, + { + "epoch": 0.17131408677373797, + "grad_norm": 1.0035600662231445, + "learning_rate": 9.431400035044684e-05, + "loss": 1.9365, + "step": 652 + }, + { + "epoch": 0.17183959010739974, + "grad_norm": 0.8765824437141418, + "learning_rate": 9.429647800946207e-05, + "loss": 1.8818, + "step": 654 + }, + { + "epoch": 0.17236509344106152, + "grad_norm": 1.0349746942520142, + "learning_rate": 9.42789556684773e-05, + "loss": 1.8732, + "step": 656 + }, + { + "epoch": 0.1728905967747233, + "grad_norm": 0.9079018235206604, + "learning_rate": 9.426143332749255e-05, + "loss": 1.8841, + "step": 658 + }, + { + "epoch": 0.17341610010838507, + "grad_norm": 0.9596872925758362, + "learning_rate": 9.42439109865078e-05, + "loss": 1.8824, + "step": 660 + }, + { + "epoch": 0.17394160344204684, + "grad_norm": 0.8027825355529785, + "learning_rate": 9.422638864552305e-05, + "loss": 1.9056, + "step": 662 + }, + { + "epoch": 0.17446710677570862, + "grad_norm": 1.0182610750198364, + "learning_rate": 9.42088663045383e-05, + "loss": 1.8637, + "step": 664 + }, + { + "epoch": 0.1749926101093704, + "grad_norm": 1.0941540002822876, + "learning_rate": 9.419134396355354e-05, + "loss": 1.9068, + "step": 666 + }, + { + "epoch": 0.17551811344303214, + "grad_norm": 0.8376652598381042, + "learning_rate": 9.417382162256878e-05, + "loss": 1.88, + "step": 668 + }, + { + "epoch": 0.17604361677669392, + "grad_norm": 1.1947648525238037, + "learning_rate": 9.415629928158402e-05, + "loss": 1.8974, + "step": 670 + }, + { + "epoch": 0.1765691201103557, + "grad_norm": 1.1815409660339355, + "learning_rate": 9.413877694059927e-05, + "loss": 1.9003, + "step": 672 + }, + { + "epoch": 0.17709462344401747, + "grad_norm": 1.298322319984436, + "learning_rate": 9.412125459961452e-05, + "loss": 1.9069, + "step": 674 + }, + { + "epoch": 0.17762012677767924, + "grad_norm": 0.9089726209640503, + "learning_rate": 9.410373225862977e-05, + "loss": 1.8631, + "step": 676 + }, + { + "epoch": 0.17814563011134102, + "grad_norm": 2.489449977874756, + "learning_rate": 9.408620991764501e-05, + "loss": 1.8517, + "step": 678 + }, + { + "epoch": 0.1786711334450028, + "grad_norm": 1.456616759300232, + "learning_rate": 9.406868757666025e-05, + "loss": 1.9026, + "step": 680 + }, + { + "epoch": 0.17919663677866457, + "grad_norm": 1.3760594129562378, + "learning_rate": 9.405116523567548e-05, + "loss": 1.9106, + "step": 682 + }, + { + "epoch": 0.17972214011232635, + "grad_norm": 1.0166149139404297, + "learning_rate": 9.403364289469073e-05, + "loss": 1.8985, + "step": 684 + }, + { + "epoch": 0.18024764344598812, + "grad_norm": 0.9207403659820557, + "learning_rate": 9.401612055370598e-05, + "loss": 1.9231, + "step": 686 + }, + { + "epoch": 0.1807731467796499, + "grad_norm": 1.1127294301986694, + "learning_rate": 9.399859821272122e-05, + "loss": 1.9001, + "step": 688 + }, + { + "epoch": 0.18129865011331164, + "grad_norm": 1.1024377346038818, + "learning_rate": 9.398107587173647e-05, + "loss": 1.8673, + "step": 690 + }, + { + "epoch": 0.18182415344697342, + "grad_norm": 1.2524852752685547, + "learning_rate": 9.396355353075172e-05, + "loss": 1.8481, + "step": 692 + }, + { + "epoch": 0.1823496567806352, + "grad_norm": 1.0531530380249023, + "learning_rate": 9.394603118976695e-05, + "loss": 1.8818, + "step": 694 + }, + { + "epoch": 0.18287516011429697, + "grad_norm": 1.0276319980621338, + "learning_rate": 9.39285088487822e-05, + "loss": 1.8516, + "step": 696 + }, + { + "epoch": 0.18340066344795874, + "grad_norm": 0.9953783750534058, + "learning_rate": 9.391098650779745e-05, + "loss": 1.8906, + "step": 698 + }, + { + "epoch": 0.18392616678162052, + "grad_norm": 1.245374321937561, + "learning_rate": 9.38934641668127e-05, + "loss": 1.8797, + "step": 700 + }, + { + "epoch": 0.1844516701152823, + "grad_norm": 0.7994580864906311, + "learning_rate": 9.387594182582794e-05, + "loss": 1.8902, + "step": 702 + }, + { + "epoch": 0.18497717344894407, + "grad_norm": 0.941091775894165, + "learning_rate": 9.385841948484318e-05, + "loss": 1.8964, + "step": 704 + }, + { + "epoch": 0.18550267678260585, + "grad_norm": 0.8709573149681091, + "learning_rate": 9.384089714385843e-05, + "loss": 1.894, + "step": 706 + }, + { + "epoch": 0.18602818011626762, + "grad_norm": 1.1883244514465332, + "learning_rate": 9.382337480287366e-05, + "loss": 1.9171, + "step": 708 + }, + { + "epoch": 0.1865536834499294, + "grad_norm": 2.1829898357391357, + "learning_rate": 9.380585246188891e-05, + "loss": 1.9089, + "step": 710 + }, + { + "epoch": 0.18707918678359114, + "grad_norm": 0.9389178156852722, + "learning_rate": 9.378833012090415e-05, + "loss": 1.9028, + "step": 712 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.8789083957672119, + "learning_rate": 9.37708077799194e-05, + "loss": 1.8881, + "step": 714 + }, + { + "epoch": 0.1881301934509147, + "grad_norm": 1.3515336513519287, + "learning_rate": 9.375328543893465e-05, + "loss": 1.897, + "step": 716 + }, + { + "epoch": 0.18865569678457647, + "grad_norm": 1.123223066329956, + "learning_rate": 9.37357630979499e-05, + "loss": 1.9337, + "step": 718 + }, + { + "epoch": 0.18918120011823825, + "grad_norm": 1.16353440284729, + "learning_rate": 9.371824075696513e-05, + "loss": 1.9257, + "step": 720 + }, + { + "epoch": 0.18970670345190002, + "grad_norm": 0.8678929209709167, + "learning_rate": 9.370071841598038e-05, + "loss": 1.8686, + "step": 722 + }, + { + "epoch": 0.1902322067855618, + "grad_norm": 0.8892553448677063, + "learning_rate": 9.368319607499563e-05, + "loss": 1.8724, + "step": 724 + }, + { + "epoch": 0.19075771011922357, + "grad_norm": 1.2203401327133179, + "learning_rate": 9.366567373401087e-05, + "loss": 1.8663, + "step": 726 + }, + { + "epoch": 0.19128321345288535, + "grad_norm": 0.8035688400268555, + "learning_rate": 9.364815139302612e-05, + "loss": 1.8748, + "step": 728 + }, + { + "epoch": 0.19180871678654712, + "grad_norm": 0.8692450523376465, + "learning_rate": 9.363062905204136e-05, + "loss": 1.9093, + "step": 730 + }, + { + "epoch": 0.1923342201202089, + "grad_norm": 1.1297273635864258, + "learning_rate": 9.36131067110566e-05, + "loss": 1.8618, + "step": 732 + }, + { + "epoch": 0.19285972345387067, + "grad_norm": 0.853670597076416, + "learning_rate": 9.359558437007184e-05, + "loss": 1.8859, + "step": 734 + }, + { + "epoch": 0.19338522678753242, + "grad_norm": 1.115268588066101, + "learning_rate": 9.357806202908708e-05, + "loss": 1.8823, + "step": 736 + }, + { + "epoch": 0.1939107301211942, + "grad_norm": 1.002034068107605, + "learning_rate": 9.356053968810233e-05, + "loss": 1.8743, + "step": 738 + }, + { + "epoch": 0.19443623345485597, + "grad_norm": 1.3825932741165161, + "learning_rate": 9.354301734711758e-05, + "loss": 1.8897, + "step": 740 + }, + { + "epoch": 0.19496173678851775, + "grad_norm": 1.4974256753921509, + "learning_rate": 9.352549500613283e-05, + "loss": 1.899, + "step": 742 + }, + { + "epoch": 0.19548724012217952, + "grad_norm": 1.2865314483642578, + "learning_rate": 9.350797266514808e-05, + "loss": 1.9042, + "step": 744 + }, + { + "epoch": 0.1960127434558413, + "grad_norm": 1.036019206047058, + "learning_rate": 9.349045032416331e-05, + "loss": 1.9169, + "step": 746 + }, + { + "epoch": 0.19653824678950307, + "grad_norm": 1.1401413679122925, + "learning_rate": 9.347292798317856e-05, + "loss": 1.8908, + "step": 748 + }, + { + "epoch": 0.19706375012316485, + "grad_norm": 1.3356719017028809, + "learning_rate": 9.34554056421938e-05, + "loss": 1.8642, + "step": 750 + }, + { + "epoch": 0.19758925345682662, + "grad_norm": 1.025230884552002, + "learning_rate": 9.343788330120905e-05, + "loss": 1.8733, + "step": 752 + }, + { + "epoch": 0.1981147567904884, + "grad_norm": 0.981281578540802, + "learning_rate": 9.34203609602243e-05, + "loss": 1.8894, + "step": 754 + }, + { + "epoch": 0.19864026012415018, + "grad_norm": 1.0571757555007935, + "learning_rate": 9.340283861923953e-05, + "loss": 1.8753, + "step": 756 + }, + { + "epoch": 0.19916576345781192, + "grad_norm": 0.9619776606559753, + "learning_rate": 9.338531627825478e-05, + "loss": 1.8791, + "step": 758 + }, + { + "epoch": 0.1996912667914737, + "grad_norm": 0.8430715203285217, + "learning_rate": 9.336779393727001e-05, + "loss": 1.9133, + "step": 760 + }, + { + "epoch": 0.20021677012513547, + "grad_norm": 1.8572043180465698, + "learning_rate": 9.335027159628526e-05, + "loss": 1.9123, + "step": 762 + }, + { + "epoch": 0.20074227345879725, + "grad_norm": 1.0308524370193481, + "learning_rate": 9.333274925530051e-05, + "loss": 1.8696, + "step": 764 + }, + { + "epoch": 0.20126777679245902, + "grad_norm": 1.3068006038665771, + "learning_rate": 9.331522691431576e-05, + "loss": 1.8932, + "step": 766 + }, + { + "epoch": 0.2017932801261208, + "grad_norm": 0.9089499711990356, + "learning_rate": 9.3297704573331e-05, + "loss": 1.8717, + "step": 768 + }, + { + "epoch": 0.20231878345978258, + "grad_norm": 1.1663358211517334, + "learning_rate": 9.328018223234625e-05, + "loss": 1.8356, + "step": 770 + }, + { + "epoch": 0.20284428679344435, + "grad_norm": 1.1666746139526367, + "learning_rate": 9.326265989136149e-05, + "loss": 1.8785, + "step": 772 + }, + { + "epoch": 0.20336979012710613, + "grad_norm": 0.8820154666900635, + "learning_rate": 9.324513755037673e-05, + "loss": 1.8353, + "step": 774 + }, + { + "epoch": 0.2038952934607679, + "grad_norm": 1.0544767379760742, + "learning_rate": 9.322761520939198e-05, + "loss": 1.9217, + "step": 776 + }, + { + "epoch": 0.20442079679442968, + "grad_norm": 1.0300171375274658, + "learning_rate": 9.321009286840723e-05, + "loss": 1.8982, + "step": 778 + }, + { + "epoch": 0.20494630012809142, + "grad_norm": 0.884556770324707, + "learning_rate": 9.319257052742248e-05, + "loss": 1.8783, + "step": 780 + }, + { + "epoch": 0.2054718034617532, + "grad_norm": 1.150823712348938, + "learning_rate": 9.317504818643771e-05, + "loss": 1.9116, + "step": 782 + }, + { + "epoch": 0.20599730679541498, + "grad_norm": 1.1606664657592773, + "learning_rate": 9.315752584545296e-05, + "loss": 1.8594, + "step": 784 + }, + { + "epoch": 0.20652281012907675, + "grad_norm": 0.9920266270637512, + "learning_rate": 9.314000350446819e-05, + "loss": 1.8659, + "step": 786 + }, + { + "epoch": 0.20704831346273853, + "grad_norm": 1.0931354761123657, + "learning_rate": 9.312248116348344e-05, + "loss": 1.8876, + "step": 788 + }, + { + "epoch": 0.2075738167964003, + "grad_norm": 1.097580075263977, + "learning_rate": 9.310495882249869e-05, + "loss": 1.8961, + "step": 790 + }, + { + "epoch": 0.20809932013006208, + "grad_norm": 1.160636067390442, + "learning_rate": 9.308743648151394e-05, + "loss": 1.8925, + "step": 792 + }, + { + "epoch": 0.20862482346372385, + "grad_norm": 0.9510796666145325, + "learning_rate": 9.306991414052918e-05, + "loss": 1.8449, + "step": 794 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 1.0860143899917603, + "learning_rate": 9.305239179954443e-05, + "loss": 1.8647, + "step": 796 + }, + { + "epoch": 0.2096758301310474, + "grad_norm": 1.1009169816970825, + "learning_rate": 9.303486945855966e-05, + "loss": 1.8992, + "step": 798 + }, + { + "epoch": 0.21020133346470918, + "grad_norm": 1.0895287990570068, + "learning_rate": 9.301734711757491e-05, + "loss": 1.8563, + "step": 800 + }, + { + "epoch": 0.21020133346470918, + "eval_loss": 1.8377995491027832, + "eval_runtime": 487.1876, + "eval_samples_per_second": 249.984, + "eval_steps_per_second": 31.249, + "step": 800 + }, + { + "epoch": 0.21072683679837095, + "grad_norm": 1.2973535060882568, + "learning_rate": 9.299982477659016e-05, + "loss": 1.8701, + "step": 802 + }, + { + "epoch": 0.2112523401320327, + "grad_norm": 0.9440937042236328, + "learning_rate": 9.298230243560541e-05, + "loss": 1.8858, + "step": 804 + }, + { + "epoch": 0.21177784346569448, + "grad_norm": 0.8491653203964233, + "learning_rate": 9.296478009462064e-05, + "loss": 1.872, + "step": 806 + }, + { + "epoch": 0.21230334679935625, + "grad_norm": 1.0194580554962158, + "learning_rate": 9.294725775363589e-05, + "loss": 1.8803, + "step": 808 + }, + { + "epoch": 0.21282885013301803, + "grad_norm": 0.957872748374939, + "learning_rate": 9.292973541265114e-05, + "loss": 1.8825, + "step": 810 + }, + { + "epoch": 0.2133543534666798, + "grad_norm": 1.1070436239242554, + "learning_rate": 9.291221307166637e-05, + "loss": 1.8616, + "step": 812 + }, + { + "epoch": 0.21387985680034158, + "grad_norm": 0.8715662956237793, + "learning_rate": 9.289469073068162e-05, + "loss": 1.8634, + "step": 814 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 1.1930649280548096, + "learning_rate": 9.287716838969687e-05, + "loss": 1.842, + "step": 816 + }, + { + "epoch": 0.21493086346766513, + "grad_norm": 1.0091701745986938, + "learning_rate": 9.285964604871211e-05, + "loss": 1.9128, + "step": 818 + }, + { + "epoch": 0.2154563668013269, + "grad_norm": 1.1418068408966064, + "learning_rate": 9.284212370772736e-05, + "loss": 1.8697, + "step": 820 + }, + { + "epoch": 0.21598187013498868, + "grad_norm": 1.0154426097869873, + "learning_rate": 9.282460136674261e-05, + "loss": 1.9324, + "step": 822 + }, + { + "epoch": 0.21650737346865045, + "grad_norm": 1.2113468647003174, + "learning_rate": 9.280707902575784e-05, + "loss": 1.8379, + "step": 824 + }, + { + "epoch": 0.2170328768023122, + "grad_norm": 1.0505681037902832, + "learning_rate": 9.278955668477309e-05, + "loss": 1.866, + "step": 826 + }, + { + "epoch": 0.21755838013597398, + "grad_norm": 0.938463568687439, + "learning_rate": 9.277203434378834e-05, + "loss": 1.8504, + "step": 828 + }, + { + "epoch": 0.21808388346963575, + "grad_norm": 0.7944304943084717, + "learning_rate": 9.275451200280358e-05, + "loss": 1.8919, + "step": 830 + }, + { + "epoch": 0.21860938680329753, + "grad_norm": 0.9631441235542297, + "learning_rate": 9.273698966181882e-05, + "loss": 1.8451, + "step": 832 + }, + { + "epoch": 0.2191348901369593, + "grad_norm": 0.9219480156898499, + "learning_rate": 9.271946732083407e-05, + "loss": 1.8546, + "step": 834 + }, + { + "epoch": 0.21966039347062108, + "grad_norm": 0.8851411938667297, + "learning_rate": 9.270194497984931e-05, + "loss": 1.8685, + "step": 836 + }, + { + "epoch": 0.22018589680428285, + "grad_norm": 0.9707255959510803, + "learning_rate": 9.268442263886455e-05, + "loss": 1.887, + "step": 838 + }, + { + "epoch": 0.22071140013794463, + "grad_norm": 0.9294309616088867, + "learning_rate": 9.26669002978798e-05, + "loss": 1.8967, + "step": 840 + }, + { + "epoch": 0.2212369034716064, + "grad_norm": 0.9156199097633362, + "learning_rate": 9.264937795689504e-05, + "loss": 1.8756, + "step": 842 + }, + { + "epoch": 0.22176240680526818, + "grad_norm": 0.8118696808815002, + "learning_rate": 9.263185561591029e-05, + "loss": 1.8766, + "step": 844 + }, + { + "epoch": 0.22228791013892996, + "grad_norm": 0.9590555429458618, + "learning_rate": 9.261433327492554e-05, + "loss": 1.8693, + "step": 846 + }, + { + "epoch": 0.2228134134725917, + "grad_norm": 1.385361671447754, + "learning_rate": 9.259681093394079e-05, + "loss": 1.8824, + "step": 848 + }, + { + "epoch": 0.22333891680625348, + "grad_norm": 0.9501360654830933, + "learning_rate": 9.257928859295602e-05, + "loss": 1.8414, + "step": 850 + }, + { + "epoch": 0.22386442013991525, + "grad_norm": 1.0095267295837402, + "learning_rate": 9.256176625197127e-05, + "loss": 1.8499, + "step": 852 + }, + { + "epoch": 0.22438992347357703, + "grad_norm": 0.8820069432258606, + "learning_rate": 9.254424391098651e-05, + "loss": 1.8513, + "step": 854 + }, + { + "epoch": 0.2249154268072388, + "grad_norm": 0.7832709550857544, + "learning_rate": 9.252672157000176e-05, + "loss": 1.8585, + "step": 856 + }, + { + "epoch": 0.22544093014090058, + "grad_norm": 1.4282846450805664, + "learning_rate": 9.2509199229017e-05, + "loss": 1.8776, + "step": 858 + }, + { + "epoch": 0.22596643347456236, + "grad_norm": 0.8925049901008606, + "learning_rate": 9.249167688803224e-05, + "loss": 1.8626, + "step": 860 + }, + { + "epoch": 0.22649193680822413, + "grad_norm": 1.4945679903030396, + "learning_rate": 9.247415454704749e-05, + "loss": 1.874, + "step": 862 + }, + { + "epoch": 0.2270174401418859, + "grad_norm": 0.9479649066925049, + "learning_rate": 9.245663220606273e-05, + "loss": 1.8789, + "step": 864 + }, + { + "epoch": 0.22754294347554768, + "grad_norm": 1.023941159248352, + "learning_rate": 9.243910986507797e-05, + "loss": 1.8945, + "step": 866 + }, + { + "epoch": 0.22806844680920946, + "grad_norm": 1.0005570650100708, + "learning_rate": 9.242158752409322e-05, + "loss": 1.8736, + "step": 868 + }, + { + "epoch": 0.2285939501428712, + "grad_norm": 0.9343464374542236, + "learning_rate": 9.240406518310847e-05, + "loss": 1.844, + "step": 870 + }, + { + "epoch": 0.22911945347653298, + "grad_norm": 0.8942081332206726, + "learning_rate": 9.238654284212372e-05, + "loss": 1.8737, + "step": 872 + }, + { + "epoch": 0.22964495681019476, + "grad_norm": 1.0856554508209229, + "learning_rate": 9.236902050113896e-05, + "loss": 1.8492, + "step": 874 + }, + { + "epoch": 0.23017046014385653, + "grad_norm": 0.8268112540245056, + "learning_rate": 9.23514981601542e-05, + "loss": 1.8804, + "step": 876 + }, + { + "epoch": 0.2306959634775183, + "grad_norm": 0.9946966171264648, + "learning_rate": 9.233397581916944e-05, + "loss": 1.8592, + "step": 878 + }, + { + "epoch": 0.23122146681118008, + "grad_norm": 1.0663763284683228, + "learning_rate": 9.231645347818469e-05, + "loss": 1.8991, + "step": 880 + }, + { + "epoch": 0.23174697014484186, + "grad_norm": 1.2675755023956299, + "learning_rate": 9.229893113719993e-05, + "loss": 1.8621, + "step": 882 + }, + { + "epoch": 0.23227247347850363, + "grad_norm": 1.1921007633209229, + "learning_rate": 9.228140879621517e-05, + "loss": 1.9014, + "step": 884 + }, + { + "epoch": 0.2327979768121654, + "grad_norm": 0.8989017009735107, + "learning_rate": 9.226388645523042e-05, + "loss": 1.8575, + "step": 886 + }, + { + "epoch": 0.23332348014582718, + "grad_norm": 1.2373161315917969, + "learning_rate": 9.224636411424567e-05, + "loss": 1.8905, + "step": 888 + }, + { + "epoch": 0.23384898347948896, + "grad_norm": 0.938845694065094, + "learning_rate": 9.222884177326092e-05, + "loss": 1.9011, + "step": 890 + }, + { + "epoch": 0.23437448681315073, + "grad_norm": 0.8705965876579285, + "learning_rate": 9.221131943227615e-05, + "loss": 1.831, + "step": 892 + }, + { + "epoch": 0.23489999014681248, + "grad_norm": 1.3164907693862915, + "learning_rate": 9.21937970912914e-05, + "loss": 1.8455, + "step": 894 + }, + { + "epoch": 0.23542549348047426, + "grad_norm": 0.9536553025245667, + "learning_rate": 9.217627475030665e-05, + "loss": 1.8557, + "step": 896 + }, + { + "epoch": 0.23595099681413603, + "grad_norm": 1.0079597234725952, + "learning_rate": 9.21587524093219e-05, + "loss": 1.8758, + "step": 898 + }, + { + "epoch": 0.2364765001477978, + "grad_norm": 2.14665150642395, + "learning_rate": 9.214123006833714e-05, + "loss": 1.9172, + "step": 900 + }, + { + "epoch": 0.23700200348145958, + "grad_norm": 1.007122278213501, + "learning_rate": 9.212370772735239e-05, + "loss": 1.9175, + "step": 902 + }, + { + "epoch": 0.23752750681512136, + "grad_norm": 0.9156002998352051, + "learning_rate": 9.210618538636762e-05, + "loss": 1.8589, + "step": 904 + }, + { + "epoch": 0.23805301014878313, + "grad_norm": 1.2697999477386475, + "learning_rate": 9.208866304538287e-05, + "loss": 1.8996, + "step": 906 + }, + { + "epoch": 0.2385785134824449, + "grad_norm": 0.8666015863418579, + "learning_rate": 9.20711407043981e-05, + "loss": 1.8595, + "step": 908 + }, + { + "epoch": 0.23910401681610668, + "grad_norm": 1.5307285785675049, + "learning_rate": 9.205361836341335e-05, + "loss": 1.8551, + "step": 910 + }, + { + "epoch": 0.23962952014976846, + "grad_norm": 1.4660929441452026, + "learning_rate": 9.20360960224286e-05, + "loss": 1.8653, + "step": 912 + }, + { + "epoch": 0.24015502348343024, + "grad_norm": 0.9962916970252991, + "learning_rate": 9.201857368144385e-05, + "loss": 1.8713, + "step": 914 + }, + { + "epoch": 0.24068052681709198, + "grad_norm": 1.3509130477905273, + "learning_rate": 9.20010513404591e-05, + "loss": 1.8946, + "step": 916 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.8232421875, + "learning_rate": 9.198352899947433e-05, + "loss": 1.8795, + "step": 918 + }, + { + "epoch": 0.24173153348441553, + "grad_norm": 1.6278520822525024, + "learning_rate": 9.196600665848958e-05, + "loss": 1.901, + "step": 920 + }, + { + "epoch": 0.2422570368180773, + "grad_norm": 0.7896947860717773, + "learning_rate": 9.194848431750482e-05, + "loss": 1.8681, + "step": 922 + }, + { + "epoch": 0.24278254015173908, + "grad_norm": 1.1732158660888672, + "learning_rate": 9.193096197652007e-05, + "loss": 1.8632, + "step": 924 + }, + { + "epoch": 0.24330804348540086, + "grad_norm": 1.7037454843521118, + "learning_rate": 9.191343963553532e-05, + "loss": 1.8928, + "step": 926 + }, + { + "epoch": 0.24383354681906264, + "grad_norm": 0.9831770062446594, + "learning_rate": 9.189591729455057e-05, + "loss": 1.8491, + "step": 928 + }, + { + "epoch": 0.2443590501527244, + "grad_norm": 1.2588465213775635, + "learning_rate": 9.18783949535658e-05, + "loss": 1.8743, + "step": 930 + }, + { + "epoch": 0.2448845534863862, + "grad_norm": 0.8186757564544678, + "learning_rate": 9.186087261258105e-05, + "loss": 1.8961, + "step": 932 + }, + { + "epoch": 0.24541005682004796, + "grad_norm": 1.0041245222091675, + "learning_rate": 9.184335027159628e-05, + "loss": 1.8264, + "step": 934 + }, + { + "epoch": 0.24593556015370974, + "grad_norm": 1.397493600845337, + "learning_rate": 9.182582793061153e-05, + "loss": 1.8483, + "step": 936 + }, + { + "epoch": 0.24646106348737148, + "grad_norm": 0.9180475473403931, + "learning_rate": 9.180830558962678e-05, + "loss": 1.8609, + "step": 938 + }, + { + "epoch": 0.24698656682103326, + "grad_norm": 0.9013431072235107, + "learning_rate": 9.179078324864202e-05, + "loss": 1.8537, + "step": 940 + }, + { + "epoch": 0.24751207015469504, + "grad_norm": 0.7891268134117126, + "learning_rate": 9.177326090765727e-05, + "loss": 1.8224, + "step": 942 + }, + { + "epoch": 0.2480375734883568, + "grad_norm": 1.082982063293457, + "learning_rate": 9.17557385666725e-05, + "loss": 1.8799, + "step": 944 + }, + { + "epoch": 0.24856307682201859, + "grad_norm": 0.9956438541412354, + "learning_rate": 9.173821622568775e-05, + "loss": 1.8638, + "step": 946 + }, + { + "epoch": 0.24908858015568036, + "grad_norm": 1.1269092559814453, + "learning_rate": 9.1720693884703e-05, + "loss": 1.8589, + "step": 948 + }, + { + "epoch": 0.24961408348934214, + "grad_norm": 0.88730788230896, + "learning_rate": 9.170317154371825e-05, + "loss": 1.8901, + "step": 950 + }, + { + "epoch": 0.2501395868230039, + "grad_norm": 0.9314135313034058, + "learning_rate": 9.16856492027335e-05, + "loss": 1.8457, + "step": 952 + }, + { + "epoch": 0.25066509015666566, + "grad_norm": 1.0120025873184204, + "learning_rate": 9.166812686174874e-05, + "loss": 1.8666, + "step": 954 + }, + { + "epoch": 0.25119059349032746, + "grad_norm": 1.0400328636169434, + "learning_rate": 9.165060452076398e-05, + "loss": 1.8666, + "step": 956 + }, + { + "epoch": 0.2517160968239892, + "grad_norm": 1.084693431854248, + "learning_rate": 9.163308217977923e-05, + "loss": 1.8318, + "step": 958 + }, + { + "epoch": 0.252241600157651, + "grad_norm": 0.911669909954071, + "learning_rate": 9.161555983879446e-05, + "loss": 1.859, + "step": 960 + }, + { + "epoch": 0.25276710349131276, + "grad_norm": 1.0443209409713745, + "learning_rate": 9.159803749780971e-05, + "loss": 1.8647, + "step": 962 + }, + { + "epoch": 0.25329260682497456, + "grad_norm": 0.8681670427322388, + "learning_rate": 9.158051515682495e-05, + "loss": 1.8975, + "step": 964 + }, + { + "epoch": 0.2538181101586363, + "grad_norm": 1.2208961248397827, + "learning_rate": 9.15629928158402e-05, + "loss": 1.8722, + "step": 966 + }, + { + "epoch": 0.2543436134922981, + "grad_norm": 0.8562275767326355, + "learning_rate": 9.154547047485545e-05, + "loss": 1.8535, + "step": 968 + }, + { + "epoch": 0.25486911682595986, + "grad_norm": 0.8852279782295227, + "learning_rate": 9.152794813387068e-05, + "loss": 1.8713, + "step": 970 + }, + { + "epoch": 0.2553946201596216, + "grad_norm": 0.8528086543083191, + "learning_rate": 9.151042579288593e-05, + "loss": 1.8736, + "step": 972 + }, + { + "epoch": 0.2559201234932834, + "grad_norm": 0.886330246925354, + "learning_rate": 9.149290345190118e-05, + "loss": 1.8915, + "step": 974 + }, + { + "epoch": 0.25644562682694516, + "grad_norm": 0.8512532711029053, + "learning_rate": 9.147538111091643e-05, + "loss": 1.8293, + "step": 976 + }, + { + "epoch": 0.25697113016060696, + "grad_norm": 0.9382111430168152, + "learning_rate": 9.145785876993167e-05, + "loss": 1.8717, + "step": 978 + }, + { + "epoch": 0.2574966334942687, + "grad_norm": 0.8720589876174927, + "learning_rate": 9.144033642894692e-05, + "loss": 1.8839, + "step": 980 + }, + { + "epoch": 0.2580221368279305, + "grad_norm": 1.6592185497283936, + "learning_rate": 9.142281408796216e-05, + "loss": 1.8409, + "step": 982 + }, + { + "epoch": 0.25854764016159226, + "grad_norm": 1.2780932188034058, + "learning_rate": 9.140529174697739e-05, + "loss": 1.8717, + "step": 984 + }, + { + "epoch": 0.25907314349525407, + "grad_norm": 0.9220293164253235, + "learning_rate": 9.138776940599264e-05, + "loss": 1.8539, + "step": 986 + }, + { + "epoch": 0.2595986468289158, + "grad_norm": 0.8890568017959595, + "learning_rate": 9.137024706500788e-05, + "loss": 1.8678, + "step": 988 + }, + { + "epoch": 0.2601241501625776, + "grad_norm": 1.0034205913543701, + "learning_rate": 9.135272472402313e-05, + "loss": 1.8262, + "step": 990 + }, + { + "epoch": 0.26064965349623936, + "grad_norm": 1.0338081121444702, + "learning_rate": 9.133520238303838e-05, + "loss": 1.8566, + "step": 992 + }, + { + "epoch": 0.26117515682990117, + "grad_norm": 1.4746791124343872, + "learning_rate": 9.131768004205363e-05, + "loss": 1.8843, + "step": 994 + }, + { + "epoch": 0.2617006601635629, + "grad_norm": 1.0208336114883423, + "learning_rate": 9.130015770106886e-05, + "loss": 1.8503, + "step": 996 + }, + { + "epoch": 0.26222616349722466, + "grad_norm": 0.9133326411247253, + "learning_rate": 9.128263536008411e-05, + "loss": 1.8805, + "step": 998 + }, + { + "epoch": 0.26275166683088647, + "grad_norm": 1.1855682134628296, + "learning_rate": 9.126511301909936e-05, + "loss": 1.8862, + "step": 1000 + }, + { + "epoch": 0.2632771701645482, + "grad_norm": 0.9511350393295288, + "learning_rate": 9.12475906781146e-05, + "loss": 1.816, + "step": 1002 + }, + { + "epoch": 0.26380267349821, + "grad_norm": 1.5805948972702026, + "learning_rate": 9.123006833712985e-05, + "loss": 1.8539, + "step": 1004 + }, + { + "epoch": 0.26432817683187176, + "grad_norm": 1.7137740850448608, + "learning_rate": 9.12125459961451e-05, + "loss": 1.8516, + "step": 1006 + }, + { + "epoch": 0.26485368016553357, + "grad_norm": 1.1085962057113647, + "learning_rate": 9.119502365516033e-05, + "loss": 1.8548, + "step": 1008 + }, + { + "epoch": 0.2653791834991953, + "grad_norm": 0.927699625492096, + "learning_rate": 9.117750131417557e-05, + "loss": 1.8411, + "step": 1010 + }, + { + "epoch": 0.2659046868328571, + "grad_norm": 1.0528203248977661, + "learning_rate": 9.115997897319081e-05, + "loss": 1.8672, + "step": 1012 + }, + { + "epoch": 0.26643019016651887, + "grad_norm": 0.8325463533401489, + "learning_rate": 9.114245663220606e-05, + "loss": 1.8081, + "step": 1014 + }, + { + "epoch": 0.26695569350018067, + "grad_norm": 0.9019527435302734, + "learning_rate": 9.112493429122131e-05, + "loss": 1.869, + "step": 1016 + }, + { + "epoch": 0.2674811968338424, + "grad_norm": 1.3394633531570435, + "learning_rate": 9.110741195023656e-05, + "loss": 1.8063, + "step": 1018 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 1.0652636289596558, + "learning_rate": 9.10898896092518e-05, + "loss": 1.8685, + "step": 1020 + }, + { + "epoch": 0.26853220350116597, + "grad_norm": 1.0782673358917236, + "learning_rate": 9.107236726826704e-05, + "loss": 1.8749, + "step": 1022 + }, + { + "epoch": 0.2690577068348277, + "grad_norm": 1.4112943410873413, + "learning_rate": 9.105484492728229e-05, + "loss": 1.8942, + "step": 1024 + }, + { + "epoch": 0.2695832101684895, + "grad_norm": 0.9202266931533813, + "learning_rate": 9.103732258629753e-05, + "loss": 1.8578, + "step": 1026 + }, + { + "epoch": 0.27010871350215127, + "grad_norm": 1.4176150560379028, + "learning_rate": 9.101980024531278e-05, + "loss": 1.8788, + "step": 1028 + }, + { + "epoch": 0.27063421683581307, + "grad_norm": 1.0629339218139648, + "learning_rate": 9.100227790432803e-05, + "loss": 1.8597, + "step": 1030 + }, + { + "epoch": 0.2711597201694748, + "grad_norm": 1.4514985084533691, + "learning_rate": 9.098475556334328e-05, + "loss": 1.8748, + "step": 1032 + }, + { + "epoch": 0.2716852235031366, + "grad_norm": 1.7836532592773438, + "learning_rate": 9.096723322235851e-05, + "loss": 1.8205, + "step": 1034 + }, + { + "epoch": 0.27221072683679837, + "grad_norm": 0.9807853698730469, + "learning_rate": 9.094971088137374e-05, + "loss": 1.8556, + "step": 1036 + }, + { + "epoch": 0.27273623017046017, + "grad_norm": 0.9897574782371521, + "learning_rate": 9.093218854038899e-05, + "loss": 1.8298, + "step": 1038 + }, + { + "epoch": 0.2732617335041219, + "grad_norm": 0.8160204887390137, + "learning_rate": 9.091466619940424e-05, + "loss": 1.8612, + "step": 1040 + }, + { + "epoch": 0.27378723683778367, + "grad_norm": 1.4802687168121338, + "learning_rate": 9.089714385841949e-05, + "loss": 1.8657, + "step": 1042 + }, + { + "epoch": 0.27431274017144547, + "grad_norm": 1.0503878593444824, + "learning_rate": 9.087962151743474e-05, + "loss": 1.8625, + "step": 1044 + }, + { + "epoch": 0.2748382435051072, + "grad_norm": 1.03403639793396, + "learning_rate": 9.086209917644998e-05, + "loss": 1.8339, + "step": 1046 + }, + { + "epoch": 0.275363746838769, + "grad_norm": 1.1939598321914673, + "learning_rate": 9.084457683546522e-05, + "loss": 1.8322, + "step": 1048 + }, + { + "epoch": 0.27588925017243077, + "grad_norm": 0.8081462383270264, + "learning_rate": 9.082705449448046e-05, + "loss": 1.8527, + "step": 1050 + }, + { + "epoch": 0.27641475350609257, + "grad_norm": 0.9340723156929016, + "learning_rate": 9.080953215349571e-05, + "loss": 1.8462, + "step": 1052 + }, + { + "epoch": 0.2769402568397543, + "grad_norm": 2.116253137588501, + "learning_rate": 9.079200981251096e-05, + "loss": 1.8157, + "step": 1054 + }, + { + "epoch": 0.2774657601734161, + "grad_norm": 1.5195270776748657, + "learning_rate": 9.077448747152621e-05, + "loss": 1.8554, + "step": 1056 + }, + { + "epoch": 0.27799126350707787, + "grad_norm": 1.1265277862548828, + "learning_rate": 9.075696513054145e-05, + "loss": 1.8535, + "step": 1058 + }, + { + "epoch": 0.27851676684073967, + "grad_norm": 0.8470209240913391, + "learning_rate": 9.073944278955669e-05, + "loss": 1.8552, + "step": 1060 + }, + { + "epoch": 0.2790422701744014, + "grad_norm": 1.014785885810852, + "learning_rate": 9.072192044857192e-05, + "loss": 1.8314, + "step": 1062 + }, + { + "epoch": 0.27956777350806317, + "grad_norm": 0.9315053820610046, + "learning_rate": 9.070439810758717e-05, + "loss": 1.852, + "step": 1064 + }, + { + "epoch": 0.28009327684172497, + "grad_norm": 0.8854875564575195, + "learning_rate": 9.068687576660242e-05, + "loss": 1.853, + "step": 1066 + }, + { + "epoch": 0.2806187801753867, + "grad_norm": 1.0083775520324707, + "learning_rate": 9.066935342561767e-05, + "loss": 1.8525, + "step": 1068 + }, + { + "epoch": 0.2811442835090485, + "grad_norm": 0.8299185633659363, + "learning_rate": 9.065183108463291e-05, + "loss": 1.858, + "step": 1070 + }, + { + "epoch": 0.28166978684271027, + "grad_norm": 0.859104573726654, + "learning_rate": 9.063430874364816e-05, + "loss": 1.8478, + "step": 1072 + }, + { + "epoch": 0.28219529017637207, + "grad_norm": 0.8011692762374878, + "learning_rate": 9.06167864026634e-05, + "loss": 1.8531, + "step": 1074 + }, + { + "epoch": 0.2827207935100338, + "grad_norm": 0.8882426023483276, + "learning_rate": 9.059926406167864e-05, + "loss": 1.87, + "step": 1076 + }, + { + "epoch": 0.2832462968436956, + "grad_norm": 0.9461469650268555, + "learning_rate": 9.058174172069389e-05, + "loss": 1.8531, + "step": 1078 + }, + { + "epoch": 0.28377180017735737, + "grad_norm": 0.92154461145401, + "learning_rate": 9.056421937970914e-05, + "loss": 1.8378, + "step": 1080 + }, + { + "epoch": 0.2842973035110192, + "grad_norm": 0.9303539395332336, + "learning_rate": 9.054669703872438e-05, + "loss": 1.8897, + "step": 1082 + }, + { + "epoch": 0.2848228068446809, + "grad_norm": 0.8764487504959106, + "learning_rate": 9.052917469773963e-05, + "loss": 1.8908, + "step": 1084 + }, + { + "epoch": 0.28534831017834267, + "grad_norm": 1.0205122232437134, + "learning_rate": 9.051165235675487e-05, + "loss": 1.8712, + "step": 1086 + }, + { + "epoch": 0.28587381351200447, + "grad_norm": 1.2372097969055176, + "learning_rate": 9.04941300157701e-05, + "loss": 1.8288, + "step": 1088 + }, + { + "epoch": 0.2863993168456662, + "grad_norm": 0.9842033386230469, + "learning_rate": 9.047660767478535e-05, + "loss": 1.8767, + "step": 1090 + }, + { + "epoch": 0.286924820179328, + "grad_norm": 1.4316095113754272, + "learning_rate": 9.04590853338006e-05, + "loss": 1.8624, + "step": 1092 + }, + { + "epoch": 0.28745032351298977, + "grad_norm": 1.0971202850341797, + "learning_rate": 9.044156299281584e-05, + "loss": 1.858, + "step": 1094 + }, + { + "epoch": 0.2879758268466516, + "grad_norm": 1.3766525983810425, + "learning_rate": 9.042404065183109e-05, + "loss": 1.8366, + "step": 1096 + }, + { + "epoch": 0.2885013301803133, + "grad_norm": 1.5556044578552246, + "learning_rate": 9.040651831084634e-05, + "loss": 1.827, + "step": 1098 + }, + { + "epoch": 0.2890268335139751, + "grad_norm": 0.803501307964325, + "learning_rate": 9.038899596986157e-05, + "loss": 1.9033, + "step": 1100 + }, + { + "epoch": 0.28955233684763687, + "grad_norm": 1.090751051902771, + "learning_rate": 9.037147362887682e-05, + "loss": 1.8605, + "step": 1102 + }, + { + "epoch": 0.2900778401812987, + "grad_norm": 1.6796822547912598, + "learning_rate": 9.035395128789207e-05, + "loss": 1.8421, + "step": 1104 + }, + { + "epoch": 0.2906033435149604, + "grad_norm": 0.8966239094734192, + "learning_rate": 9.033642894690731e-05, + "loss": 1.8679, + "step": 1106 + }, + { + "epoch": 0.29112884684862217, + "grad_norm": 1.390019416809082, + "learning_rate": 9.031890660592256e-05, + "loss": 1.8322, + "step": 1108 + }, + { + "epoch": 0.291654350182284, + "grad_norm": 0.8526667356491089, + "learning_rate": 9.030138426493781e-05, + "loss": 1.8364, + "step": 1110 + }, + { + "epoch": 0.2921798535159457, + "grad_norm": 1.1773560047149658, + "learning_rate": 9.028386192395304e-05, + "loss": 1.8466, + "step": 1112 + }, + { + "epoch": 0.2927053568496075, + "grad_norm": 1.0654343366622925, + "learning_rate": 9.026633958296828e-05, + "loss": 1.8767, + "step": 1114 + }, + { + "epoch": 0.29323086018326927, + "grad_norm": 1.023926854133606, + "learning_rate": 9.024881724198353e-05, + "loss": 1.8461, + "step": 1116 + }, + { + "epoch": 0.2937563635169311, + "grad_norm": 0.94902503490448, + "learning_rate": 9.023129490099877e-05, + "loss": 1.8412, + "step": 1118 + }, + { + "epoch": 0.2942818668505928, + "grad_norm": 1.0602984428405762, + "learning_rate": 9.021377256001402e-05, + "loss": 1.8704, + "step": 1120 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.8396863341331482, + "learning_rate": 9.019625021902927e-05, + "loss": 1.8557, + "step": 1122 + }, + { + "epoch": 0.29533287351791637, + "grad_norm": 1.0940845012664795, + "learning_rate": 9.017872787804452e-05, + "loss": 1.8773, + "step": 1124 + }, + { + "epoch": 0.2958583768515782, + "grad_norm": 0.8471454977989197, + "learning_rate": 9.016120553705975e-05, + "loss": 1.8232, + "step": 1126 + }, + { + "epoch": 0.2963838801852399, + "grad_norm": 0.7603086829185486, + "learning_rate": 9.0143683196075e-05, + "loss": 1.855, + "step": 1128 + }, + { + "epoch": 0.29690938351890167, + "grad_norm": 0.8293117880821228, + "learning_rate": 9.012616085509024e-05, + "loss": 1.8497, + "step": 1130 + }, + { + "epoch": 0.2974348868525635, + "grad_norm": 0.8437036275863647, + "learning_rate": 9.010863851410549e-05, + "loss": 1.8476, + "step": 1132 + }, + { + "epoch": 0.2979603901862252, + "grad_norm": 0.9667044878005981, + "learning_rate": 9.009111617312074e-05, + "loss": 1.7826, + "step": 1134 + }, + { + "epoch": 0.298485893519887, + "grad_norm": 0.7626157402992249, + "learning_rate": 9.007359383213599e-05, + "loss": 1.8588, + "step": 1136 + }, + { + "epoch": 0.29901139685354877, + "grad_norm": 0.782361626625061, + "learning_rate": 9.005607149115122e-05, + "loss": 1.8495, + "step": 1138 + }, + { + "epoch": 0.2995369001872106, + "grad_norm": 0.9417736530303955, + "learning_rate": 9.003854915016647e-05, + "loss": 1.8875, + "step": 1140 + }, + { + "epoch": 0.3000624035208723, + "grad_norm": 0.8616002202033997, + "learning_rate": 9.00210268091817e-05, + "loss": 1.852, + "step": 1142 + }, + { + "epoch": 0.3005879068545341, + "grad_norm": 0.8036054372787476, + "learning_rate": 9.000350446819695e-05, + "loss": 1.8638, + "step": 1144 + }, + { + "epoch": 0.3011134101881959, + "grad_norm": 1.0912983417510986, + "learning_rate": 8.99859821272122e-05, + "loss": 1.8191, + "step": 1146 + }, + { + "epoch": 0.3016389135218577, + "grad_norm": 0.9259098172187805, + "learning_rate": 8.996845978622745e-05, + "loss": 1.8228, + "step": 1148 + }, + { + "epoch": 0.3021644168555194, + "grad_norm": 1.3986306190490723, + "learning_rate": 8.99509374452427e-05, + "loss": 1.8365, + "step": 1150 + }, + { + "epoch": 0.3026899201891812, + "grad_norm": 0.8834369778633118, + "learning_rate": 8.993341510425794e-05, + "loss": 1.857, + "step": 1152 + }, + { + "epoch": 0.303215423522843, + "grad_norm": 0.9686596989631653, + "learning_rate": 8.991589276327317e-05, + "loss": 1.8559, + "step": 1154 + }, + { + "epoch": 0.3037409268565047, + "grad_norm": 0.913817822933197, + "learning_rate": 8.989837042228842e-05, + "loss": 1.8618, + "step": 1156 + }, + { + "epoch": 0.3042664301901665, + "grad_norm": 1.0107851028442383, + "learning_rate": 8.988084808130367e-05, + "loss": 1.8706, + "step": 1158 + }, + { + "epoch": 0.3047919335238283, + "grad_norm": 1.2873750925064087, + "learning_rate": 8.986332574031892e-05, + "loss": 1.8451, + "step": 1160 + }, + { + "epoch": 0.3053174368574901, + "grad_norm": 0.9408276677131653, + "learning_rate": 8.984580339933417e-05, + "loss": 1.8417, + "step": 1162 + }, + { + "epoch": 0.3058429401911518, + "grad_norm": 1.078941822052002, + "learning_rate": 8.98282810583494e-05, + "loss": 1.8526, + "step": 1164 + }, + { + "epoch": 0.3063684435248136, + "grad_norm": 0.9041505455970764, + "learning_rate": 8.981075871736465e-05, + "loss": 1.8779, + "step": 1166 + }, + { + "epoch": 0.3068939468584754, + "grad_norm": 0.8624897599220276, + "learning_rate": 8.979323637637988e-05, + "loss": 1.7867, + "step": 1168 + }, + { + "epoch": 0.3074194501921372, + "grad_norm": 0.9410212635993958, + "learning_rate": 8.977571403539513e-05, + "loss": 1.8498, + "step": 1170 + }, + { + "epoch": 0.3079449535257989, + "grad_norm": 0.9149646162986755, + "learning_rate": 8.975819169441038e-05, + "loss": 1.8229, + "step": 1172 + }, + { + "epoch": 0.30847045685946073, + "grad_norm": 0.7817291021347046, + "learning_rate": 8.974066935342562e-05, + "loss": 1.8886, + "step": 1174 + }, + { + "epoch": 0.3089959601931225, + "grad_norm": 1.3264005184173584, + "learning_rate": 8.972314701244087e-05, + "loss": 1.851, + "step": 1176 + }, + { + "epoch": 0.3095214635267842, + "grad_norm": 1.0288749933242798, + "learning_rate": 8.970562467145612e-05, + "loss": 1.8514, + "step": 1178 + }, + { + "epoch": 0.310046966860446, + "grad_norm": 0.9613611698150635, + "learning_rate": 8.968810233047135e-05, + "loss": 1.8633, + "step": 1180 + }, + { + "epoch": 0.3105724701941078, + "grad_norm": 1.0935230255126953, + "learning_rate": 8.96705799894866e-05, + "loss": 1.8175, + "step": 1182 + }, + { + "epoch": 0.3110979735277696, + "grad_norm": 0.821371853351593, + "learning_rate": 8.965305764850185e-05, + "loss": 1.8275, + "step": 1184 + }, + { + "epoch": 0.3116234768614313, + "grad_norm": 1.0035851001739502, + "learning_rate": 8.96355353075171e-05, + "loss": 1.8379, + "step": 1186 + }, + { + "epoch": 0.31214898019509313, + "grad_norm": 1.4299391508102417, + "learning_rate": 8.961801296653233e-05, + "loss": 1.8456, + "step": 1188 + }, + { + "epoch": 0.3126744835287549, + "grad_norm": 1.1284465789794922, + "learning_rate": 8.960049062554758e-05, + "loss": 1.8317, + "step": 1190 + }, + { + "epoch": 0.3131999868624167, + "grad_norm": 0.8965946435928345, + "learning_rate": 8.958296828456282e-05, + "loss": 1.8265, + "step": 1192 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.7729229927062988, + "learning_rate": 8.956544594357806e-05, + "loss": 1.8479, + "step": 1194 + }, + { + "epoch": 0.31425099352974023, + "grad_norm": 1.0302493572235107, + "learning_rate": 8.95479236025933e-05, + "loss": 1.8563, + "step": 1196 + }, + { + "epoch": 0.314776496863402, + "grad_norm": 1.2506144046783447, + "learning_rate": 8.953040126160855e-05, + "loss": 1.8672, + "step": 1198 + }, + { + "epoch": 0.3153020001970637, + "grad_norm": 1.190073847770691, + "learning_rate": 8.95128789206238e-05, + "loss": 1.8476, + "step": 1200 + }, + { + "epoch": 0.3153020001970637, + "eval_loss": 1.7993388175964355, + "eval_runtime": 487.2115, + "eval_samples_per_second": 249.972, + "eval_steps_per_second": 31.247, + "step": 1200 + }, + { + "epoch": 0.31582750353072553, + "grad_norm": 1.772186279296875, + "learning_rate": 8.949535657963905e-05, + "loss": 1.8635, + "step": 1202 + }, + { + "epoch": 0.3163530068643873, + "grad_norm": 0.8721492290496826, + "learning_rate": 8.94778342386543e-05, + "loss": 1.8243, + "step": 1204 + }, + { + "epoch": 0.3168785101980491, + "grad_norm": 0.9871326684951782, + "learning_rate": 8.946031189766953e-05, + "loss": 1.8491, + "step": 1206 + }, + { + "epoch": 0.3174040135317108, + "grad_norm": 0.8752848505973816, + "learning_rate": 8.944278955668478e-05, + "loss": 1.8638, + "step": 1208 + }, + { + "epoch": 0.31792951686537263, + "grad_norm": 0.9636614918708801, + "learning_rate": 8.942526721570003e-05, + "loss": 1.8779, + "step": 1210 + }, + { + "epoch": 0.3184550201990344, + "grad_norm": 1.027335524559021, + "learning_rate": 8.940774487471527e-05, + "loss": 1.857, + "step": 1212 + }, + { + "epoch": 0.3189805235326962, + "grad_norm": 0.8861249089241028, + "learning_rate": 8.939022253373051e-05, + "loss": 1.875, + "step": 1214 + }, + { + "epoch": 0.31950602686635793, + "grad_norm": 0.8626223802566528, + "learning_rate": 8.937270019274575e-05, + "loss": 1.8096, + "step": 1216 + }, + { + "epoch": 0.32003153020001973, + "grad_norm": 0.8333232402801514, + "learning_rate": 8.9355177851761e-05, + "loss": 1.8275, + "step": 1218 + }, + { + "epoch": 0.3205570335336815, + "grad_norm": 0.7367919087409973, + "learning_rate": 8.933765551077624e-05, + "loss": 1.8653, + "step": 1220 + }, + { + "epoch": 0.3210825368673432, + "grad_norm": 1.1696327924728394, + "learning_rate": 8.932013316979148e-05, + "loss": 1.8558, + "step": 1222 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.9526751637458801, + "learning_rate": 8.930261082880673e-05, + "loss": 1.8429, + "step": 1224 + }, + { + "epoch": 0.3221335435346668, + "grad_norm": 1.0763540267944336, + "learning_rate": 8.928508848782198e-05, + "loss": 1.8365, + "step": 1226 + }, + { + "epoch": 0.3226590468683286, + "grad_norm": 0.8945477604866028, + "learning_rate": 8.926756614683723e-05, + "loss": 1.8303, + "step": 1228 + }, + { + "epoch": 0.32318455020199033, + "grad_norm": 0.8092701435089111, + "learning_rate": 8.925004380585247e-05, + "loss": 1.8454, + "step": 1230 + }, + { + "epoch": 0.32371005353565213, + "grad_norm": 1.0602396726608276, + "learning_rate": 8.923252146486771e-05, + "loss": 1.8256, + "step": 1232 + }, + { + "epoch": 0.3242355568693139, + "grad_norm": 0.8251585960388184, + "learning_rate": 8.921499912388296e-05, + "loss": 1.8057, + "step": 1234 + }, + { + "epoch": 0.3247610602029757, + "grad_norm": 1.2272833585739136, + "learning_rate": 8.91974767828982e-05, + "loss": 1.8549, + "step": 1236 + }, + { + "epoch": 0.32528656353663743, + "grad_norm": 0.9338359832763672, + "learning_rate": 8.917995444191345e-05, + "loss": 1.8618, + "step": 1238 + }, + { + "epoch": 0.32581206687029923, + "grad_norm": 0.8794339299201965, + "learning_rate": 8.916243210092868e-05, + "loss": 1.846, + "step": 1240 + }, + { + "epoch": 0.326337570203961, + "grad_norm": 0.9014391899108887, + "learning_rate": 8.914490975994393e-05, + "loss": 1.8469, + "step": 1242 + }, + { + "epoch": 0.32686307353762273, + "grad_norm": 0.9050634503364563, + "learning_rate": 8.912738741895918e-05, + "loss": 1.798, + "step": 1244 + }, + { + "epoch": 0.32738857687128453, + "grad_norm": 0.9596816897392273, + "learning_rate": 8.910986507797441e-05, + "loss": 1.8617, + "step": 1246 + }, + { + "epoch": 0.3279140802049463, + "grad_norm": 0.8555053472518921, + "learning_rate": 8.909234273698966e-05, + "loss": 1.8262, + "step": 1248 + }, + { + "epoch": 0.3284395835386081, + "grad_norm": 0.7877684831619263, + "learning_rate": 8.907482039600491e-05, + "loss": 1.8536, + "step": 1250 + }, + { + "epoch": 0.32896508687226983, + "grad_norm": 0.9233472347259521, + "learning_rate": 8.905729805502016e-05, + "loss": 1.8681, + "step": 1252 + }, + { + "epoch": 0.32949059020593163, + "grad_norm": 0.8724532127380371, + "learning_rate": 8.90397757140354e-05, + "loss": 1.8169, + "step": 1254 + }, + { + "epoch": 0.3300160935395934, + "grad_norm": 0.8664790987968445, + "learning_rate": 8.902225337305065e-05, + "loss": 1.8327, + "step": 1256 + }, + { + "epoch": 0.3305415968732552, + "grad_norm": 1.0527539253234863, + "learning_rate": 8.900473103206589e-05, + "loss": 1.849, + "step": 1258 + }, + { + "epoch": 0.33106710020691693, + "grad_norm": 1.0830262899398804, + "learning_rate": 8.898720869108113e-05, + "loss": 1.8293, + "step": 1260 + }, + { + "epoch": 0.33159260354057873, + "grad_norm": 0.9514210820198059, + "learning_rate": 8.896968635009638e-05, + "loss": 1.8509, + "step": 1262 + }, + { + "epoch": 0.3321181068742405, + "grad_norm": 1.2806978225708008, + "learning_rate": 8.895216400911163e-05, + "loss": 1.8622, + "step": 1264 + }, + { + "epoch": 0.33264361020790223, + "grad_norm": 0.8732459545135498, + "learning_rate": 8.893464166812686e-05, + "loss": 1.8385, + "step": 1266 + }, + { + "epoch": 0.33316911354156403, + "grad_norm": 0.9644619822502136, + "learning_rate": 8.891711932714211e-05, + "loss": 1.8368, + "step": 1268 + }, + { + "epoch": 0.3336946168752258, + "grad_norm": 0.9549365043640137, + "learning_rate": 8.889959698615736e-05, + "loss": 1.8363, + "step": 1270 + }, + { + "epoch": 0.3342201202088876, + "grad_norm": 0.9528286457061768, + "learning_rate": 8.888207464517259e-05, + "loss": 1.8548, + "step": 1272 + }, + { + "epoch": 0.33474562354254933, + "grad_norm": 1.0315399169921875, + "learning_rate": 8.886455230418784e-05, + "loss": 1.8354, + "step": 1274 + }, + { + "epoch": 0.33527112687621113, + "grad_norm": 0.8084585070610046, + "learning_rate": 8.884702996320309e-05, + "loss": 1.8423, + "step": 1276 + }, + { + "epoch": 0.3357966302098729, + "grad_norm": 1.1312843561172485, + "learning_rate": 8.882950762221833e-05, + "loss": 1.8374, + "step": 1278 + }, + { + "epoch": 0.3363221335435347, + "grad_norm": 1.1717549562454224, + "learning_rate": 8.881198528123358e-05, + "loss": 1.8204, + "step": 1280 + }, + { + "epoch": 0.33684763687719643, + "grad_norm": 0.8000698089599609, + "learning_rate": 8.879446294024883e-05, + "loss": 1.8577, + "step": 1282 + }, + { + "epoch": 0.33737314021085824, + "grad_norm": 1.3130030632019043, + "learning_rate": 8.877694059926406e-05, + "loss": 1.8072, + "step": 1284 + }, + { + "epoch": 0.33789864354452, + "grad_norm": 0.8949963450431824, + "learning_rate": 8.875941825827931e-05, + "loss": 1.8586, + "step": 1286 + }, + { + "epoch": 0.33842414687818173, + "grad_norm": 0.8775150775909424, + "learning_rate": 8.874189591729456e-05, + "loss": 1.831, + "step": 1288 + }, + { + "epoch": 0.33894965021184353, + "grad_norm": 0.8946396708488464, + "learning_rate": 8.872437357630979e-05, + "loss": 1.8243, + "step": 1290 + }, + { + "epoch": 0.3394751535455053, + "grad_norm": 1.271799921989441, + "learning_rate": 8.870685123532504e-05, + "loss": 1.8174, + "step": 1292 + }, + { + "epoch": 0.3400006568791671, + "grad_norm": 0.8557697534561157, + "learning_rate": 8.868932889434029e-05, + "loss": 1.8429, + "step": 1294 + }, + { + "epoch": 0.34052616021282883, + "grad_norm": 0.9884776473045349, + "learning_rate": 8.867180655335554e-05, + "loss": 1.7985, + "step": 1296 + }, + { + "epoch": 0.34105166354649064, + "grad_norm": 0.9385315775871277, + "learning_rate": 8.865428421237077e-05, + "loss": 1.7948, + "step": 1298 + }, + { + "epoch": 0.3415771668801524, + "grad_norm": 1.1224939823150635, + "learning_rate": 8.863676187138602e-05, + "loss": 1.8564, + "step": 1300 + }, + { + "epoch": 0.3421026702138142, + "grad_norm": 0.9227058291435242, + "learning_rate": 8.861923953040126e-05, + "loss": 1.8432, + "step": 1302 + }, + { + "epoch": 0.34262817354747593, + "grad_norm": 1.0591615438461304, + "learning_rate": 8.860171718941651e-05, + "loss": 1.8267, + "step": 1304 + }, + { + "epoch": 0.34315367688113774, + "grad_norm": 0.8901565670967102, + "learning_rate": 8.858419484843176e-05, + "loss": 1.8452, + "step": 1306 + }, + { + "epoch": 0.3436791802147995, + "grad_norm": 0.7925954461097717, + "learning_rate": 8.856667250744701e-05, + "loss": 1.8395, + "step": 1308 + }, + { + "epoch": 0.3442046835484613, + "grad_norm": 0.8542584776878357, + "learning_rate": 8.854915016646224e-05, + "loss": 1.821, + "step": 1310 + }, + { + "epoch": 0.34473018688212304, + "grad_norm": 0.9138728380203247, + "learning_rate": 8.853162782547749e-05, + "loss": 1.8466, + "step": 1312 + }, + { + "epoch": 0.3452556902157848, + "grad_norm": 1.0735788345336914, + "learning_rate": 8.851410548449274e-05, + "loss": 1.8318, + "step": 1314 + }, + { + "epoch": 0.3457811935494466, + "grad_norm": 1.3310229778289795, + "learning_rate": 8.849658314350797e-05, + "loss": 1.8334, + "step": 1316 + }, + { + "epoch": 0.34630669688310833, + "grad_norm": 0.7600061297416687, + "learning_rate": 8.847906080252322e-05, + "loss": 1.8089, + "step": 1318 + }, + { + "epoch": 0.34683220021677014, + "grad_norm": 0.818154513835907, + "learning_rate": 8.846153846153847e-05, + "loss": 1.8577, + "step": 1320 + }, + { + "epoch": 0.3473577035504319, + "grad_norm": 1.0234004259109497, + "learning_rate": 8.844401612055371e-05, + "loss": 1.8364, + "step": 1322 + }, + { + "epoch": 0.3478832068840937, + "grad_norm": 0.880425751209259, + "learning_rate": 8.842649377956895e-05, + "loss": 1.8671, + "step": 1324 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.8950909376144409, + "learning_rate": 8.84089714385842e-05, + "loss": 1.8217, + "step": 1326 + }, + { + "epoch": 0.34893421355141724, + "grad_norm": 1.029801607131958, + "learning_rate": 8.839144909759944e-05, + "loss": 1.8273, + "step": 1328 + }, + { + "epoch": 0.349459716885079, + "grad_norm": 1.1284875869750977, + "learning_rate": 8.837392675661469e-05, + "loss": 1.8523, + "step": 1330 + }, + { + "epoch": 0.3499852202187408, + "grad_norm": 1.3472214937210083, + "learning_rate": 8.835640441562994e-05, + "loss": 1.8878, + "step": 1332 + }, + { + "epoch": 0.35051072355240254, + "grad_norm": 0.8898762464523315, + "learning_rate": 8.833888207464518e-05, + "loss": 1.8225, + "step": 1334 + }, + { + "epoch": 0.3510362268860643, + "grad_norm": 1.2737003564834595, + "learning_rate": 8.832135973366042e-05, + "loss": 1.7853, + "step": 1336 + }, + { + "epoch": 0.3515617302197261, + "grad_norm": 0.9682241678237915, + "learning_rate": 8.830383739267567e-05, + "loss": 1.8502, + "step": 1338 + }, + { + "epoch": 0.35208723355338784, + "grad_norm": 0.8494237661361694, + "learning_rate": 8.828631505169091e-05, + "loss": 1.8515, + "step": 1340 + }, + { + "epoch": 0.35261273688704964, + "grad_norm": 0.923283040523529, + "learning_rate": 8.826879271070615e-05, + "loss": 1.849, + "step": 1342 + }, + { + "epoch": 0.3531382402207114, + "grad_norm": 1.0368821620941162, + "learning_rate": 8.82512703697214e-05, + "loss": 1.8228, + "step": 1344 + }, + { + "epoch": 0.3536637435543732, + "grad_norm": 0.76881343126297, + "learning_rate": 8.823374802873664e-05, + "loss": 1.8156, + "step": 1346 + }, + { + "epoch": 0.35418924688803494, + "grad_norm": 0.7315630316734314, + "learning_rate": 8.821622568775189e-05, + "loss": 1.8068, + "step": 1348 + }, + { + "epoch": 0.35471475022169674, + "grad_norm": 1.254550814628601, + "learning_rate": 8.819870334676712e-05, + "loss": 1.8498, + "step": 1350 + }, + { + "epoch": 0.3552402535553585, + "grad_norm": 1.1354317665100098, + "learning_rate": 8.818118100578237e-05, + "loss": 1.8547, + "step": 1352 + }, + { + "epoch": 0.3557657568890203, + "grad_norm": 1.0078952312469482, + "learning_rate": 8.816365866479762e-05, + "loss": 1.8635, + "step": 1354 + }, + { + "epoch": 0.35629126022268204, + "grad_norm": 1.4484366178512573, + "learning_rate": 8.814613632381287e-05, + "loss": 1.8405, + "step": 1356 + }, + { + "epoch": 0.3568167635563438, + "grad_norm": 0.8407228589057922, + "learning_rate": 8.812861398282811e-05, + "loss": 1.8295, + "step": 1358 + }, + { + "epoch": 0.3573422668900056, + "grad_norm": 0.9024233818054199, + "learning_rate": 8.811109164184336e-05, + "loss": 1.8052, + "step": 1360 + }, + { + "epoch": 0.35786777022366734, + "grad_norm": 0.9681188464164734, + "learning_rate": 8.80935693008586e-05, + "loss": 1.8024, + "step": 1362 + }, + { + "epoch": 0.35839327355732914, + "grad_norm": 0.9130085706710815, + "learning_rate": 8.807604695987384e-05, + "loss": 1.8159, + "step": 1364 + }, + { + "epoch": 0.3589187768909909, + "grad_norm": 0.938353419303894, + "learning_rate": 8.805852461888909e-05, + "loss": 1.8134, + "step": 1366 + }, + { + "epoch": 0.3594442802246527, + "grad_norm": 0.8700679540634155, + "learning_rate": 8.804100227790433e-05, + "loss": 1.8091, + "step": 1368 + }, + { + "epoch": 0.35996978355831444, + "grad_norm": 0.8863296508789062, + "learning_rate": 8.802347993691957e-05, + "loss": 1.7961, + "step": 1370 + }, + { + "epoch": 0.36049528689197624, + "grad_norm": 0.9155923128128052, + "learning_rate": 8.800595759593482e-05, + "loss": 1.8098, + "step": 1372 + }, + { + "epoch": 0.361020790225638, + "grad_norm": 1.020551323890686, + "learning_rate": 8.798843525495007e-05, + "loss": 1.848, + "step": 1374 + }, + { + "epoch": 0.3615462935592998, + "grad_norm": 0.9836577773094177, + "learning_rate": 8.79709129139653e-05, + "loss": 1.7839, + "step": 1376 + }, + { + "epoch": 0.36207179689296154, + "grad_norm": 0.9969834089279175, + "learning_rate": 8.795339057298055e-05, + "loss": 1.8062, + "step": 1378 + }, + { + "epoch": 0.3625973002266233, + "grad_norm": 0.8620086312294006, + "learning_rate": 8.79358682319958e-05, + "loss": 1.7944, + "step": 1380 + }, + { + "epoch": 0.3631228035602851, + "grad_norm": 1.2116692066192627, + "learning_rate": 8.791834589101104e-05, + "loss": 1.8718, + "step": 1382 + }, + { + "epoch": 0.36364830689394684, + "grad_norm": 0.8402097225189209, + "learning_rate": 8.790082355002629e-05, + "loss": 1.8276, + "step": 1384 + }, + { + "epoch": 0.36417381022760864, + "grad_norm": 0.9271780848503113, + "learning_rate": 8.788330120904154e-05, + "loss": 1.8222, + "step": 1386 + }, + { + "epoch": 0.3646993135612704, + "grad_norm": 0.8769554495811462, + "learning_rate": 8.786577886805677e-05, + "loss": 1.8416, + "step": 1388 + }, + { + "epoch": 0.3652248168949322, + "grad_norm": 0.9306502938270569, + "learning_rate": 8.784825652707202e-05, + "loss": 1.8506, + "step": 1390 + }, + { + "epoch": 0.36575032022859394, + "grad_norm": 0.8423568606376648, + "learning_rate": 8.783073418608726e-05, + "loss": 1.8617, + "step": 1392 + }, + { + "epoch": 0.36627582356225574, + "grad_norm": 0.9485574960708618, + "learning_rate": 8.78132118451025e-05, + "loss": 1.8448, + "step": 1394 + }, + { + "epoch": 0.3668013268959175, + "grad_norm": 1.1368005275726318, + "learning_rate": 8.779568950411775e-05, + "loss": 1.8217, + "step": 1396 + }, + { + "epoch": 0.3673268302295793, + "grad_norm": 0.9294119477272034, + "learning_rate": 8.7778167163133e-05, + "loss": 1.8099, + "step": 1398 + }, + { + "epoch": 0.36785233356324104, + "grad_norm": 0.8389936685562134, + "learning_rate": 8.776064482214825e-05, + "loss": 1.8601, + "step": 1400 + }, + { + "epoch": 0.3683778368969028, + "grad_norm": 0.7817425727844238, + "learning_rate": 8.77431224811635e-05, + "loss": 1.8218, + "step": 1402 + }, + { + "epoch": 0.3689033402305646, + "grad_norm": 1.1421295404434204, + "learning_rate": 8.772560014017873e-05, + "loss": 1.8673, + "step": 1404 + }, + { + "epoch": 0.36942884356422634, + "grad_norm": 0.8209173083305359, + "learning_rate": 8.770807779919397e-05, + "loss": 1.8298, + "step": 1406 + }, + { + "epoch": 0.36995434689788814, + "grad_norm": 1.0874011516571045, + "learning_rate": 8.769055545820922e-05, + "loss": 1.8008, + "step": 1408 + }, + { + "epoch": 0.3704798502315499, + "grad_norm": 0.839116096496582, + "learning_rate": 8.767303311722447e-05, + "loss": 1.8511, + "step": 1410 + }, + { + "epoch": 0.3710053535652117, + "grad_norm": 0.956777036190033, + "learning_rate": 8.765551077623972e-05, + "loss": 1.8761, + "step": 1412 + }, + { + "epoch": 0.37153085689887344, + "grad_norm": 0.7702937722206116, + "learning_rate": 8.763798843525497e-05, + "loss": 1.8231, + "step": 1414 + }, + { + "epoch": 0.37205636023253524, + "grad_norm": 0.8248230814933777, + "learning_rate": 8.76204660942702e-05, + "loss": 1.8538, + "step": 1416 + }, + { + "epoch": 0.372581863566197, + "grad_norm": 1.0771416425704956, + "learning_rate": 8.760294375328543e-05, + "loss": 1.8515, + "step": 1418 + }, + { + "epoch": 0.3731073668998588, + "grad_norm": 0.8044272661209106, + "learning_rate": 8.758542141230068e-05, + "loss": 1.8295, + "step": 1420 + }, + { + "epoch": 0.37363287023352054, + "grad_norm": 1.0227196216583252, + "learning_rate": 8.756789907131593e-05, + "loss": 1.8659, + "step": 1422 + }, + { + "epoch": 0.3741583735671823, + "grad_norm": 0.8310641646385193, + "learning_rate": 8.755037673033118e-05, + "loss": 1.8286, + "step": 1424 + }, + { + "epoch": 0.3746838769008441, + "grad_norm": 0.9817164540290833, + "learning_rate": 8.753285438934642e-05, + "loss": 1.8863, + "step": 1426 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.872424840927124, + "learning_rate": 8.751533204836167e-05, + "loss": 1.8489, + "step": 1428 + }, + { + "epoch": 0.37573488356816764, + "grad_norm": 0.9776557087898254, + "learning_rate": 8.74978097073769e-05, + "loss": 1.8536, + "step": 1430 + }, + { + "epoch": 0.3762603869018294, + "grad_norm": 1.3488025665283203, + "learning_rate": 8.748028736639215e-05, + "loss": 1.8432, + "step": 1432 + }, + { + "epoch": 0.3767858902354912, + "grad_norm": 0.8517011404037476, + "learning_rate": 8.74627650254074e-05, + "loss": 1.847, + "step": 1434 + }, + { + "epoch": 0.37731139356915294, + "grad_norm": 0.8631575703620911, + "learning_rate": 8.744524268442265e-05, + "loss": 1.8053, + "step": 1436 + }, + { + "epoch": 0.37783689690281475, + "grad_norm": 0.881100058555603, + "learning_rate": 8.74277203434379e-05, + "loss": 1.8662, + "step": 1438 + }, + { + "epoch": 0.3783624002364765, + "grad_norm": 0.8032435178756714, + "learning_rate": 8.741019800245314e-05, + "loss": 1.8438, + "step": 1440 + }, + { + "epoch": 0.3788879035701383, + "grad_norm": 0.7921327948570251, + "learning_rate": 8.739267566146838e-05, + "loss": 1.8685, + "step": 1442 + }, + { + "epoch": 0.37941340690380004, + "grad_norm": 1.060738444328308, + "learning_rate": 8.737515332048361e-05, + "loss": 1.8365, + "step": 1444 + }, + { + "epoch": 0.37993891023746185, + "grad_norm": 1.0198917388916016, + "learning_rate": 8.735763097949886e-05, + "loss": 1.8278, + "step": 1446 + }, + { + "epoch": 0.3804644135711236, + "grad_norm": 0.9688281416893005, + "learning_rate": 8.73401086385141e-05, + "loss": 1.8513, + "step": 1448 + }, + { + "epoch": 0.38098991690478534, + "grad_norm": 1.2723430395126343, + "learning_rate": 8.732258629752935e-05, + "loss": 1.7984, + "step": 1450 + }, + { + "epoch": 0.38151542023844714, + "grad_norm": 0.8690189123153687, + "learning_rate": 8.73050639565446e-05, + "loss": 1.8288, + "step": 1452 + }, + { + "epoch": 0.3820409235721089, + "grad_norm": 0.9124467968940735, + "learning_rate": 8.728754161555985e-05, + "loss": 1.8684, + "step": 1454 + }, + { + "epoch": 0.3825664269057707, + "grad_norm": 1.1950373649597168, + "learning_rate": 8.727001927457508e-05, + "loss": 1.8431, + "step": 1456 + }, + { + "epoch": 0.38309193023943244, + "grad_norm": 0.7846235632896423, + "learning_rate": 8.725249693359033e-05, + "loss": 1.8409, + "step": 1458 + }, + { + "epoch": 0.38361743357309425, + "grad_norm": 1.450654149055481, + "learning_rate": 8.723497459260558e-05, + "loss": 1.844, + "step": 1460 + }, + { + "epoch": 0.384142936906756, + "grad_norm": 1.0545793771743774, + "learning_rate": 8.721745225162083e-05, + "loss": 1.8332, + "step": 1462 + }, + { + "epoch": 0.3846684402404178, + "grad_norm": 1.000705599784851, + "learning_rate": 8.719992991063607e-05, + "loss": 1.8486, + "step": 1464 + }, + { + "epoch": 0.38519394357407954, + "grad_norm": 1.2795532941818237, + "learning_rate": 8.718240756965132e-05, + "loss": 1.8323, + "step": 1466 + }, + { + "epoch": 0.38571944690774135, + "grad_norm": 0.7551513314247131, + "learning_rate": 8.716488522866655e-05, + "loss": 1.8573, + "step": 1468 + }, + { + "epoch": 0.3862449502414031, + "grad_norm": 1.2810308933258057, + "learning_rate": 8.714736288768179e-05, + "loss": 1.8167, + "step": 1470 + }, + { + "epoch": 0.38677045357506484, + "grad_norm": 1.0538434982299805, + "learning_rate": 8.712984054669704e-05, + "loss": 1.8501, + "step": 1472 + }, + { + "epoch": 0.38729595690872665, + "grad_norm": 1.2018911838531494, + "learning_rate": 8.711231820571228e-05, + "loss": 1.8291, + "step": 1474 + }, + { + "epoch": 0.3878214602423884, + "grad_norm": 1.4515736103057861, + "learning_rate": 8.709479586472753e-05, + "loss": 1.8728, + "step": 1476 + }, + { + "epoch": 0.3883469635760502, + "grad_norm": 0.855747640132904, + "learning_rate": 8.707727352374278e-05, + "loss": 1.8048, + "step": 1478 + }, + { + "epoch": 0.38887246690971194, + "grad_norm": 1.3377580642700195, + "learning_rate": 8.705975118275803e-05, + "loss": 1.8599, + "step": 1480 + }, + { + "epoch": 0.38939797024337375, + "grad_norm": 0.9842968583106995, + "learning_rate": 8.704222884177326e-05, + "loss": 1.8038, + "step": 1482 + }, + { + "epoch": 0.3899234735770355, + "grad_norm": 1.4240106344223022, + "learning_rate": 8.702470650078851e-05, + "loss": 1.8305, + "step": 1484 + }, + { + "epoch": 0.3904489769106973, + "grad_norm": 0.7605730295181274, + "learning_rate": 8.700718415980376e-05, + "loss": 1.8321, + "step": 1486 + }, + { + "epoch": 0.39097448024435905, + "grad_norm": 0.9584787487983704, + "learning_rate": 8.6989661818819e-05, + "loss": 1.799, + "step": 1488 + }, + { + "epoch": 0.39149998357802085, + "grad_norm": 0.8087942004203796, + "learning_rate": 8.697213947783425e-05, + "loss": 1.8011, + "step": 1490 + }, + { + "epoch": 0.3920254869116826, + "grad_norm": 0.7870105504989624, + "learning_rate": 8.69546171368495e-05, + "loss": 1.7972, + "step": 1492 + }, + { + "epoch": 0.39255099024534434, + "grad_norm": 1.1304738521575928, + "learning_rate": 8.693709479586473e-05, + "loss": 1.8088, + "step": 1494 + }, + { + "epoch": 0.39307649357900615, + "grad_norm": 0.8902273178100586, + "learning_rate": 8.691957245487997e-05, + "loss": 1.8101, + "step": 1496 + }, + { + "epoch": 0.3936019969126679, + "grad_norm": 1.1424989700317383, + "learning_rate": 8.690205011389521e-05, + "loss": 1.8018, + "step": 1498 + }, + { + "epoch": 0.3941275002463297, + "grad_norm": 0.9772897362709045, + "learning_rate": 8.688452777291046e-05, + "loss": 1.8191, + "step": 1500 + }, + { + "epoch": 0.39465300357999145, + "grad_norm": 0.9879363775253296, + "learning_rate": 8.686700543192571e-05, + "loss": 1.7934, + "step": 1502 + }, + { + "epoch": 0.39517850691365325, + "grad_norm": 0.8215435147285461, + "learning_rate": 8.684948309094096e-05, + "loss": 1.8125, + "step": 1504 + }, + { + "epoch": 0.395704010247315, + "grad_norm": 0.8453714847564697, + "learning_rate": 8.68319607499562e-05, + "loss": 1.8385, + "step": 1506 + }, + { + "epoch": 0.3962295135809768, + "grad_norm": 0.9266200661659241, + "learning_rate": 8.681443840897144e-05, + "loss": 1.8537, + "step": 1508 + }, + { + "epoch": 0.39675501691463855, + "grad_norm": 1.2535603046417236, + "learning_rate": 8.679691606798669e-05, + "loss": 1.8138, + "step": 1510 + }, + { + "epoch": 0.39728052024830035, + "grad_norm": 1.0080575942993164, + "learning_rate": 8.677939372700193e-05, + "loss": 1.8193, + "step": 1512 + }, + { + "epoch": 0.3978060235819621, + "grad_norm": 0.8419904112815857, + "learning_rate": 8.676187138601718e-05, + "loss": 1.845, + "step": 1514 + }, + { + "epoch": 0.39833152691562385, + "grad_norm": 1.2089139223098755, + "learning_rate": 8.674434904503243e-05, + "loss": 1.8274, + "step": 1516 + }, + { + "epoch": 0.39885703024928565, + "grad_norm": 0.9421194791793823, + "learning_rate": 8.672682670404768e-05, + "loss": 1.8132, + "step": 1518 + }, + { + "epoch": 0.3993825335829474, + "grad_norm": 1.0286279916763306, + "learning_rate": 8.670930436306291e-05, + "loss": 1.8045, + "step": 1520 + }, + { + "epoch": 0.3999080369166092, + "grad_norm": 1.1791476011276245, + "learning_rate": 8.669178202207814e-05, + "loss": 1.8148, + "step": 1522 + }, + { + "epoch": 0.40043354025027095, + "grad_norm": 0.9198878407478333, + "learning_rate": 8.667425968109339e-05, + "loss": 1.8261, + "step": 1524 + }, + { + "epoch": 0.40095904358393275, + "grad_norm": 1.0204558372497559, + "learning_rate": 8.665673734010864e-05, + "loss": 1.8324, + "step": 1526 + }, + { + "epoch": 0.4014845469175945, + "grad_norm": 1.1662263870239258, + "learning_rate": 8.663921499912389e-05, + "loss": 1.7949, + "step": 1528 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 1.2517145872116089, + "learning_rate": 8.662169265813913e-05, + "loss": 1.7924, + "step": 1530 + }, + { + "epoch": 0.40253555358491805, + "grad_norm": 0.7785090208053589, + "learning_rate": 8.660417031715438e-05, + "loss": 1.8023, + "step": 1532 + }, + { + "epoch": 0.40306105691857985, + "grad_norm": 0.8084584474563599, + "learning_rate": 8.658664797616962e-05, + "loss": 1.8132, + "step": 1534 + }, + { + "epoch": 0.4035865602522416, + "grad_norm": 0.8784323930740356, + "learning_rate": 8.656912563518486e-05, + "loss": 1.8047, + "step": 1536 + }, + { + "epoch": 0.40411206358590335, + "grad_norm": 0.8626761436462402, + "learning_rate": 8.655160329420011e-05, + "loss": 1.7985, + "step": 1538 + }, + { + "epoch": 0.40463756691956515, + "grad_norm": 0.8983022570610046, + "learning_rate": 8.653408095321536e-05, + "loss": 1.8068, + "step": 1540 + }, + { + "epoch": 0.4051630702532269, + "grad_norm": 1.2110192775726318, + "learning_rate": 8.65165586122306e-05, + "loss": 1.815, + "step": 1542 + }, + { + "epoch": 0.4056885735868887, + "grad_norm": 0.9025999307632446, + "learning_rate": 8.649903627124585e-05, + "loss": 1.8305, + "step": 1544 + }, + { + "epoch": 0.40621407692055045, + "grad_norm": 0.8550492525100708, + "learning_rate": 8.648151393026109e-05, + "loss": 1.7915, + "step": 1546 + }, + { + "epoch": 0.40673958025421225, + "grad_norm": 0.8646672368049622, + "learning_rate": 8.646399158927632e-05, + "loss": 1.8338, + "step": 1548 + }, + { + "epoch": 0.407265083587874, + "grad_norm": 0.8966619372367859, + "learning_rate": 8.644646924829157e-05, + "loss": 1.8072, + "step": 1550 + }, + { + "epoch": 0.4077905869215358, + "grad_norm": 0.9003387689590454, + "learning_rate": 8.642894690730682e-05, + "loss": 1.8041, + "step": 1552 + }, + { + "epoch": 0.40831609025519755, + "grad_norm": 1.035400152206421, + "learning_rate": 8.641142456632206e-05, + "loss": 1.8484, + "step": 1554 + }, + { + "epoch": 0.40884159358885935, + "grad_norm": 0.8281182050704956, + "learning_rate": 8.639390222533731e-05, + "loss": 1.7943, + "step": 1556 + }, + { + "epoch": 0.4093670969225211, + "grad_norm": 0.8706338405609131, + "learning_rate": 8.637637988435256e-05, + "loss": 1.848, + "step": 1558 + }, + { + "epoch": 0.40989260025618285, + "grad_norm": 0.9510646462440491, + "learning_rate": 8.63588575433678e-05, + "loss": 1.81, + "step": 1560 + }, + { + "epoch": 0.41041810358984465, + "grad_norm": 1.2758607864379883, + "learning_rate": 8.634133520238304e-05, + "loss": 1.831, + "step": 1562 + }, + { + "epoch": 0.4109436069235064, + "grad_norm": 0.8133296966552734, + "learning_rate": 8.632381286139829e-05, + "loss": 1.833, + "step": 1564 + }, + { + "epoch": 0.4114691102571682, + "grad_norm": 0.8663495779037476, + "learning_rate": 8.630629052041354e-05, + "loss": 1.7999, + "step": 1566 + }, + { + "epoch": 0.41199461359082995, + "grad_norm": 0.8473132252693176, + "learning_rate": 8.628876817942878e-05, + "loss": 1.8148, + "step": 1568 + }, + { + "epoch": 0.41252011692449175, + "grad_norm": 0.7791121006011963, + "learning_rate": 8.627124583844402e-05, + "loss": 1.8107, + "step": 1570 + }, + { + "epoch": 0.4130456202581535, + "grad_norm": 0.8510565161705017, + "learning_rate": 8.625372349745927e-05, + "loss": 1.8434, + "step": 1572 + }, + { + "epoch": 0.4135711235918153, + "grad_norm": 0.7872109413146973, + "learning_rate": 8.62362011564745e-05, + "loss": 1.86, + "step": 1574 + }, + { + "epoch": 0.41409662692547705, + "grad_norm": 0.9160329699516296, + "learning_rate": 8.621867881548975e-05, + "loss": 1.8408, + "step": 1576 + }, + { + "epoch": 0.41462213025913885, + "grad_norm": 0.9173769354820251, + "learning_rate": 8.6201156474505e-05, + "loss": 1.8057, + "step": 1578 + }, + { + "epoch": 0.4151476335928006, + "grad_norm": 1.086470603942871, + "learning_rate": 8.618363413352024e-05, + "loss": 1.8199, + "step": 1580 + }, + { + "epoch": 0.41567313692646235, + "grad_norm": 0.8830829858779907, + "learning_rate": 8.616611179253549e-05, + "loss": 1.8107, + "step": 1582 + }, + { + "epoch": 0.41619864026012415, + "grad_norm": 0.8462435007095337, + "learning_rate": 8.614858945155074e-05, + "loss": 1.8159, + "step": 1584 + }, + { + "epoch": 0.4167241435937859, + "grad_norm": 0.9439370632171631, + "learning_rate": 8.613106711056597e-05, + "loss": 1.8311, + "step": 1586 + }, + { + "epoch": 0.4172496469274477, + "grad_norm": 0.933134138584137, + "learning_rate": 8.611354476958122e-05, + "loss": 1.8203, + "step": 1588 + }, + { + "epoch": 0.41777515026110945, + "grad_norm": 0.7939304709434509, + "learning_rate": 8.609602242859647e-05, + "loss": 1.7991, + "step": 1590 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.9367759823799133, + "learning_rate": 8.607850008761171e-05, + "loss": 1.8391, + "step": 1592 + }, + { + "epoch": 0.418826156928433, + "grad_norm": 0.8476933240890503, + "learning_rate": 8.606097774662696e-05, + "loss": 1.8135, + "step": 1594 + }, + { + "epoch": 0.4193516602620948, + "grad_norm": 0.9385167360305786, + "learning_rate": 8.60434554056422e-05, + "loss": 1.8396, + "step": 1596 + }, + { + "epoch": 0.41987716359575655, + "grad_norm": 0.9880960583686829, + "learning_rate": 8.602593306465744e-05, + "loss": 1.8282, + "step": 1598 + }, + { + "epoch": 0.42040266692941836, + "grad_norm": 1.3682297468185425, + "learning_rate": 8.600841072367268e-05, + "loss": 1.8063, + "step": 1600 + }, + { + "epoch": 0.42040266692941836, + "eval_loss": 1.785941243171692, + "eval_runtime": 487.1976, + "eval_samples_per_second": 249.979, + "eval_steps_per_second": 31.248, + "step": 1600 + }, + { + "epoch": 0.4209281702630801, + "grad_norm": 0.8974788784980774, + "learning_rate": 8.599088838268792e-05, + "loss": 1.8264, + "step": 1602 + }, + { + "epoch": 0.4214536735967419, + "grad_norm": 1.3051773309707642, + "learning_rate": 8.597336604170317e-05, + "loss": 1.8204, + "step": 1604 + }, + { + "epoch": 0.42197917693040365, + "grad_norm": 0.8313725590705872, + "learning_rate": 8.595584370071842e-05, + "loss": 1.8276, + "step": 1606 + }, + { + "epoch": 0.4225046802640654, + "grad_norm": 0.7489058375358582, + "learning_rate": 8.593832135973367e-05, + "loss": 1.8039, + "step": 1608 + }, + { + "epoch": 0.4230301835977272, + "grad_norm": 1.303904414176941, + "learning_rate": 8.592079901874891e-05, + "loss": 1.8117, + "step": 1610 + }, + { + "epoch": 0.42355568693138895, + "grad_norm": 0.7930120229721069, + "learning_rate": 8.590327667776415e-05, + "loss": 1.816, + "step": 1612 + }, + { + "epoch": 0.42408119026505076, + "grad_norm": 1.1683326959609985, + "learning_rate": 8.58857543367794e-05, + "loss": 1.8485, + "step": 1614 + }, + { + "epoch": 0.4246066935987125, + "grad_norm": 1.129786491394043, + "learning_rate": 8.586823199579464e-05, + "loss": 1.8091, + "step": 1616 + }, + { + "epoch": 0.4251321969323743, + "grad_norm": 1.006664752960205, + "learning_rate": 8.585070965480989e-05, + "loss": 1.801, + "step": 1618 + }, + { + "epoch": 0.42565770026603605, + "grad_norm": 1.2593824863433838, + "learning_rate": 8.583318731382514e-05, + "loss": 1.874, + "step": 1620 + }, + { + "epoch": 0.42618320359969786, + "grad_norm": 0.7356145977973938, + "learning_rate": 8.581566497284037e-05, + "loss": 1.8156, + "step": 1622 + }, + { + "epoch": 0.4267087069333596, + "grad_norm": 1.3224732875823975, + "learning_rate": 8.579814263185562e-05, + "loss": 1.8485, + "step": 1624 + }, + { + "epoch": 0.4272342102670214, + "grad_norm": 1.0026780366897583, + "learning_rate": 8.578062029087085e-05, + "loss": 1.8051, + "step": 1626 + }, + { + "epoch": 0.42775971360068316, + "grad_norm": 1.235370397567749, + "learning_rate": 8.57630979498861e-05, + "loss": 1.8267, + "step": 1628 + }, + { + "epoch": 0.4282852169343449, + "grad_norm": 1.0629823207855225, + "learning_rate": 8.574557560890135e-05, + "loss": 1.7937, + "step": 1630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.9405169486999512, + "learning_rate": 8.57280532679166e-05, + "loss": 1.8023, + "step": 1632 + }, + { + "epoch": 0.42933622360166845, + "grad_norm": 1.5205514430999756, + "learning_rate": 8.571053092693184e-05, + "loss": 1.8112, + "step": 1634 + }, + { + "epoch": 0.42986172693533026, + "grad_norm": 1.038159728050232, + "learning_rate": 8.569300858594709e-05, + "loss": 1.7833, + "step": 1636 + }, + { + "epoch": 0.430387230268992, + "grad_norm": 0.8901309370994568, + "learning_rate": 8.567548624496233e-05, + "loss": 1.8241, + "step": 1638 + }, + { + "epoch": 0.4309127336026538, + "grad_norm": 1.166390061378479, + "learning_rate": 8.565796390397757e-05, + "loss": 1.7899, + "step": 1640 + }, + { + "epoch": 0.43143823693631556, + "grad_norm": 1.0582796335220337, + "learning_rate": 8.564044156299282e-05, + "loss": 1.8208, + "step": 1642 + }, + { + "epoch": 0.43196374026997736, + "grad_norm": 0.8496580123901367, + "learning_rate": 8.562291922200807e-05, + "loss": 1.8206, + "step": 1644 + }, + { + "epoch": 0.4324892436036391, + "grad_norm": 0.761249303817749, + "learning_rate": 8.560539688102332e-05, + "loss": 1.7858, + "step": 1646 + }, + { + "epoch": 0.4330147469373009, + "grad_norm": 0.8980756402015686, + "learning_rate": 8.558787454003855e-05, + "loss": 1.8053, + "step": 1648 + }, + { + "epoch": 0.43354025027096266, + "grad_norm": 0.9203025698661804, + "learning_rate": 8.55703521990538e-05, + "loss": 1.8379, + "step": 1650 + }, + { + "epoch": 0.4340657536046244, + "grad_norm": 0.9592378735542297, + "learning_rate": 8.555282985806905e-05, + "loss": 1.861, + "step": 1652 + }, + { + "epoch": 0.4345912569382862, + "grad_norm": 1.0187515020370483, + "learning_rate": 8.553530751708428e-05, + "loss": 1.7764, + "step": 1654 + }, + { + "epoch": 0.43511676027194796, + "grad_norm": 0.8016685247421265, + "learning_rate": 8.551778517609953e-05, + "loss": 1.8176, + "step": 1656 + }, + { + "epoch": 0.43564226360560976, + "grad_norm": 0.7380330562591553, + "learning_rate": 8.550026283511477e-05, + "loss": 1.8218, + "step": 1658 + }, + { + "epoch": 0.4361677669392715, + "grad_norm": 0.6815687417984009, + "learning_rate": 8.548274049413002e-05, + "loss": 1.8127, + "step": 1660 + }, + { + "epoch": 0.4366932702729333, + "grad_norm": 0.6906920075416565, + "learning_rate": 8.546521815314527e-05, + "loss": 1.773, + "step": 1662 + }, + { + "epoch": 0.43721877360659506, + "grad_norm": 0.9510621428489685, + "learning_rate": 8.544769581216052e-05, + "loss": 1.8305, + "step": 1664 + }, + { + "epoch": 0.43774427694025686, + "grad_norm": 0.727105438709259, + "learning_rate": 8.543017347117575e-05, + "loss": 1.8192, + "step": 1666 + }, + { + "epoch": 0.4382697802739186, + "grad_norm": 0.7399454712867737, + "learning_rate": 8.5412651130191e-05, + "loss": 1.8133, + "step": 1668 + }, + { + "epoch": 0.4387952836075804, + "grad_norm": 0.8177588582038879, + "learning_rate": 8.539512878920625e-05, + "loss": 1.7933, + "step": 1670 + }, + { + "epoch": 0.43932078694124216, + "grad_norm": 0.7681954503059387, + "learning_rate": 8.537760644822148e-05, + "loss": 1.837, + "step": 1672 + }, + { + "epoch": 0.4398462902749039, + "grad_norm": 1.5765389204025269, + "learning_rate": 8.536008410723673e-05, + "loss": 1.8291, + "step": 1674 + }, + { + "epoch": 0.4403717936085657, + "grad_norm": 0.7724891304969788, + "learning_rate": 8.534256176625198e-05, + "loss": 1.8124, + "step": 1676 + }, + { + "epoch": 0.44089729694222746, + "grad_norm": 0.8893011808395386, + "learning_rate": 8.532503942526722e-05, + "loss": 1.8405, + "step": 1678 + }, + { + "epoch": 0.44142280027588926, + "grad_norm": 0.878136932849884, + "learning_rate": 8.530751708428246e-05, + "loss": 1.8106, + "step": 1680 + }, + { + "epoch": 0.441948303609551, + "grad_norm": 0.9325633645057678, + "learning_rate": 8.52899947432977e-05, + "loss": 1.7963, + "step": 1682 + }, + { + "epoch": 0.4424738069432128, + "grad_norm": 1.0837180614471436, + "learning_rate": 8.527247240231295e-05, + "loss": 1.8144, + "step": 1684 + }, + { + "epoch": 0.44299931027687456, + "grad_norm": 0.8428369164466858, + "learning_rate": 8.52549500613282e-05, + "loss": 1.8001, + "step": 1686 + }, + { + "epoch": 0.44352481361053636, + "grad_norm": 0.930844783782959, + "learning_rate": 8.523742772034345e-05, + "loss": 1.8038, + "step": 1688 + }, + { + "epoch": 0.4440503169441981, + "grad_norm": 0.7409669756889343, + "learning_rate": 8.52199053793587e-05, + "loss": 1.7926, + "step": 1690 + }, + { + "epoch": 0.4445758202778599, + "grad_norm": 1.0107098817825317, + "learning_rate": 8.520238303837393e-05, + "loss": 1.7864, + "step": 1692 + }, + { + "epoch": 0.44510132361152166, + "grad_norm": 0.760370671749115, + "learning_rate": 8.518486069738918e-05, + "loss": 1.7881, + "step": 1694 + }, + { + "epoch": 0.4456268269451834, + "grad_norm": 1.2437944412231445, + "learning_rate": 8.516733835640442e-05, + "loss": 1.8152, + "step": 1696 + }, + { + "epoch": 0.4461523302788452, + "grad_norm": 0.8944531679153442, + "learning_rate": 8.514981601541966e-05, + "loss": 1.8272, + "step": 1698 + }, + { + "epoch": 0.44667783361250696, + "grad_norm": 0.9550314545631409, + "learning_rate": 8.51322936744349e-05, + "loss": 1.7936, + "step": 1700 + }, + { + "epoch": 0.44720333694616876, + "grad_norm": 0.7015916109085083, + "learning_rate": 8.511477133345015e-05, + "loss": 1.8737, + "step": 1702 + }, + { + "epoch": 0.4477288402798305, + "grad_norm": 1.0813145637512207, + "learning_rate": 8.50972489924654e-05, + "loss": 1.8118, + "step": 1704 + }, + { + "epoch": 0.4482543436134923, + "grad_norm": 0.8479138016700745, + "learning_rate": 8.507972665148064e-05, + "loss": 1.8074, + "step": 1706 + }, + { + "epoch": 0.44877984694715406, + "grad_norm": 0.9790395498275757, + "learning_rate": 8.506220431049588e-05, + "loss": 1.7785, + "step": 1708 + }, + { + "epoch": 0.44930535028081586, + "grad_norm": 0.8719221353530884, + "learning_rate": 8.504468196951113e-05, + "loss": 1.8162, + "step": 1710 + }, + { + "epoch": 0.4498308536144776, + "grad_norm": 1.064282774925232, + "learning_rate": 8.502715962852638e-05, + "loss": 1.8148, + "step": 1712 + }, + { + "epoch": 0.4503563569481394, + "grad_norm": 0.8482780456542969, + "learning_rate": 8.500963728754163e-05, + "loss": 1.7768, + "step": 1714 + }, + { + "epoch": 0.45088186028180116, + "grad_norm": 0.901155412197113, + "learning_rate": 8.499211494655687e-05, + "loss": 1.8182, + "step": 1716 + }, + { + "epoch": 0.4514073636154629, + "grad_norm": 1.0124598741531372, + "learning_rate": 8.497459260557211e-05, + "loss": 1.8142, + "step": 1718 + }, + { + "epoch": 0.4519328669491247, + "grad_norm": 0.8708586692810059, + "learning_rate": 8.495707026458735e-05, + "loss": 1.8247, + "step": 1720 + }, + { + "epoch": 0.45245837028278646, + "grad_norm": 0.9597557187080383, + "learning_rate": 8.49395479236026e-05, + "loss": 1.7786, + "step": 1722 + }, + { + "epoch": 0.45298387361644826, + "grad_norm": 1.1022772789001465, + "learning_rate": 8.492202558261784e-05, + "loss": 1.8139, + "step": 1724 + }, + { + "epoch": 0.45350937695011, + "grad_norm": 1.0891538858413696, + "learning_rate": 8.490450324163308e-05, + "loss": 1.7986, + "step": 1726 + }, + { + "epoch": 0.4540348802837718, + "grad_norm": 0.8127626776695251, + "learning_rate": 8.488698090064833e-05, + "loss": 1.8026, + "step": 1728 + }, + { + "epoch": 0.45456038361743356, + "grad_norm": 0.9313668608665466, + "learning_rate": 8.486945855966358e-05, + "loss": 1.788, + "step": 1730 + }, + { + "epoch": 0.45508588695109536, + "grad_norm": 0.8581985235214233, + "learning_rate": 8.485193621867881e-05, + "loss": 1.8097, + "step": 1732 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.7745251059532166, + "learning_rate": 8.483441387769406e-05, + "loss": 1.8095, + "step": 1734 + }, + { + "epoch": 0.4561368936184189, + "grad_norm": 0.7251246571540833, + "learning_rate": 8.481689153670931e-05, + "loss": 1.8195, + "step": 1736 + }, + { + "epoch": 0.45666239695208066, + "grad_norm": 0.8949863314628601, + "learning_rate": 8.479936919572456e-05, + "loss": 1.8665, + "step": 1738 + }, + { + "epoch": 0.4571879002857424, + "grad_norm": 0.8396829962730408, + "learning_rate": 8.47818468547398e-05, + "loss": 1.808, + "step": 1740 + }, + { + "epoch": 0.4577134036194042, + "grad_norm": 0.8307755589485168, + "learning_rate": 8.476432451375505e-05, + "loss": 1.8332, + "step": 1742 + }, + { + "epoch": 0.45823890695306596, + "grad_norm": 0.8172729015350342, + "learning_rate": 8.474680217277028e-05, + "loss": 1.7829, + "step": 1744 + }, + { + "epoch": 0.45876441028672776, + "grad_norm": 0.7859178185462952, + "learning_rate": 8.472927983178553e-05, + "loss": 1.8111, + "step": 1746 + }, + { + "epoch": 0.4592899136203895, + "grad_norm": 0.8412219882011414, + "learning_rate": 8.471175749080078e-05, + "loss": 1.8106, + "step": 1748 + }, + { + "epoch": 0.4598154169540513, + "grad_norm": 1.1843855381011963, + "learning_rate": 8.469423514981601e-05, + "loss": 1.8064, + "step": 1750 + }, + { + "epoch": 0.46034092028771306, + "grad_norm": 0.7547696232795715, + "learning_rate": 8.467671280883126e-05, + "loss": 1.7785, + "step": 1752 + }, + { + "epoch": 0.46086642362137487, + "grad_norm": 0.8816393613815308, + "learning_rate": 8.465919046784651e-05, + "loss": 1.7851, + "step": 1754 + }, + { + "epoch": 0.4613919269550366, + "grad_norm": 0.985379695892334, + "learning_rate": 8.464166812686176e-05, + "loss": 1.7841, + "step": 1756 + }, + { + "epoch": 0.4619174302886984, + "grad_norm": 0.7585499286651611, + "learning_rate": 8.462414578587699e-05, + "loss": 1.7928, + "step": 1758 + }, + { + "epoch": 0.46244293362236016, + "grad_norm": 0.8072088956832886, + "learning_rate": 8.460662344489224e-05, + "loss": 1.7867, + "step": 1760 + }, + { + "epoch": 0.46296843695602197, + "grad_norm": 0.8818538784980774, + "learning_rate": 8.458910110390749e-05, + "loss": 1.8085, + "step": 1762 + }, + { + "epoch": 0.4634939402896837, + "grad_norm": 0.7545977830886841, + "learning_rate": 8.457157876292273e-05, + "loss": 1.8134, + "step": 1764 + }, + { + "epoch": 0.46401944362334546, + "grad_norm": 0.7525529265403748, + "learning_rate": 8.455405642193798e-05, + "loss": 1.8235, + "step": 1766 + }, + { + "epoch": 0.46454494695700727, + "grad_norm": 1.0930671691894531, + "learning_rate": 8.453653408095323e-05, + "loss": 1.7796, + "step": 1768 + }, + { + "epoch": 0.465070450290669, + "grad_norm": 0.8903842568397522, + "learning_rate": 8.451901173996846e-05, + "loss": 1.8413, + "step": 1770 + }, + { + "epoch": 0.4655959536243308, + "grad_norm": 0.9519714117050171, + "learning_rate": 8.450148939898371e-05, + "loss": 1.8189, + "step": 1772 + }, + { + "epoch": 0.46612145695799256, + "grad_norm": 0.816856861114502, + "learning_rate": 8.448396705799894e-05, + "loss": 1.8338, + "step": 1774 + }, + { + "epoch": 0.46664696029165437, + "grad_norm": 0.8597956299781799, + "learning_rate": 8.446644471701419e-05, + "loss": 1.7963, + "step": 1776 + }, + { + "epoch": 0.4671724636253161, + "grad_norm": 0.8783820271492004, + "learning_rate": 8.444892237602944e-05, + "loss": 1.7985, + "step": 1778 + }, + { + "epoch": 0.4676979669589779, + "grad_norm": 0.9130274653434753, + "learning_rate": 8.443140003504469e-05, + "loss": 1.791, + "step": 1780 + }, + { + "epoch": 0.46822347029263967, + "grad_norm": 1.001056432723999, + "learning_rate": 8.441387769405993e-05, + "loss": 1.8115, + "step": 1782 + }, + { + "epoch": 0.46874897362630147, + "grad_norm": 0.8054397702217102, + "learning_rate": 8.439635535307517e-05, + "loss": 1.8411, + "step": 1784 + }, + { + "epoch": 0.4692744769599632, + "grad_norm": 0.8605888485908508, + "learning_rate": 8.437883301209042e-05, + "loss": 1.7763, + "step": 1786 + }, + { + "epoch": 0.46979998029362496, + "grad_norm": 1.1046404838562012, + "learning_rate": 8.436131067110566e-05, + "loss": 1.8331, + "step": 1788 + }, + { + "epoch": 0.47032548362728677, + "grad_norm": 0.8004000186920166, + "learning_rate": 8.434378833012091e-05, + "loss": 1.826, + "step": 1790 + }, + { + "epoch": 0.4708509869609485, + "grad_norm": 1.05780827999115, + "learning_rate": 8.432626598913616e-05, + "loss": 1.8101, + "step": 1792 + }, + { + "epoch": 0.4713764902946103, + "grad_norm": 0.9407793879508972, + "learning_rate": 8.43087436481514e-05, + "loss": 1.7962, + "step": 1794 + }, + { + "epoch": 0.47190199362827207, + "grad_norm": 0.7705127000808716, + "learning_rate": 8.429122130716664e-05, + "loss": 1.7978, + "step": 1796 + }, + { + "epoch": 0.47242749696193387, + "grad_norm": 0.8127232193946838, + "learning_rate": 8.427369896618189e-05, + "loss": 1.7736, + "step": 1798 + }, + { + "epoch": 0.4729530002955956, + "grad_norm": 1.0332077741622925, + "learning_rate": 8.425617662519712e-05, + "loss": 1.7882, + "step": 1800 + }, + { + "epoch": 0.4734785036292574, + "grad_norm": 0.8787586092948914, + "learning_rate": 8.423865428421237e-05, + "loss": 1.7942, + "step": 1802 + }, + { + "epoch": 0.47400400696291917, + "grad_norm": 1.0014612674713135, + "learning_rate": 8.422113194322762e-05, + "loss": 1.8046, + "step": 1804 + }, + { + "epoch": 0.47452951029658097, + "grad_norm": 0.7657825946807861, + "learning_rate": 8.420360960224286e-05, + "loss": 1.7933, + "step": 1806 + }, + { + "epoch": 0.4750550136302427, + "grad_norm": 0.8316423296928406, + "learning_rate": 8.418608726125811e-05, + "loss": 1.7988, + "step": 1808 + }, + { + "epoch": 0.47558051696390446, + "grad_norm": 0.7967455387115479, + "learning_rate": 8.416856492027335e-05, + "loss": 1.8043, + "step": 1810 + }, + { + "epoch": 0.47610602029756627, + "grad_norm": 0.7839574217796326, + "learning_rate": 8.41510425792886e-05, + "loss": 1.8401, + "step": 1812 + }, + { + "epoch": 0.476631523631228, + "grad_norm": 0.8147667646408081, + "learning_rate": 8.413352023830384e-05, + "loss": 1.8019, + "step": 1814 + }, + { + "epoch": 0.4771570269648898, + "grad_norm": 1.0701135396957397, + "learning_rate": 8.411599789731909e-05, + "loss": 1.8301, + "step": 1816 + }, + { + "epoch": 0.47768253029855157, + "grad_norm": 0.8631109595298767, + "learning_rate": 8.409847555633434e-05, + "loss": 1.8335, + "step": 1818 + }, + { + "epoch": 0.47820803363221337, + "grad_norm": 0.9797492623329163, + "learning_rate": 8.408095321534958e-05, + "loss": 1.8189, + "step": 1820 + }, + { + "epoch": 0.4787335369658751, + "grad_norm": 0.9087586998939514, + "learning_rate": 8.406343087436482e-05, + "loss": 1.8405, + "step": 1822 + }, + { + "epoch": 0.4792590402995369, + "grad_norm": 0.7700434923171997, + "learning_rate": 8.404590853338007e-05, + "loss": 1.7943, + "step": 1824 + }, + { + "epoch": 0.47978454363319867, + "grad_norm": 0.9100522398948669, + "learning_rate": 8.40283861923953e-05, + "loss": 1.8021, + "step": 1826 + }, + { + "epoch": 0.48031004696686047, + "grad_norm": 1.0668331384658813, + "learning_rate": 8.401086385141055e-05, + "loss": 1.7966, + "step": 1828 + }, + { + "epoch": 0.4808355503005222, + "grad_norm": 0.9680224061012268, + "learning_rate": 8.39933415104258e-05, + "loss": 1.7945, + "step": 1830 + }, + { + "epoch": 0.48136105363418397, + "grad_norm": 1.0217275619506836, + "learning_rate": 8.397581916944104e-05, + "loss": 1.821, + "step": 1832 + }, + { + "epoch": 0.48188655696784577, + "grad_norm": 0.854264497756958, + "learning_rate": 8.395829682845629e-05, + "loss": 1.809, + "step": 1834 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.9226179718971252, + "learning_rate": 8.394077448747152e-05, + "loss": 1.8021, + "step": 1836 + }, + { + "epoch": 0.4829375636351693, + "grad_norm": 1.205917239189148, + "learning_rate": 8.392325214648677e-05, + "loss": 1.7912, + "step": 1838 + }, + { + "epoch": 0.48346306696883107, + "grad_norm": 1.0180691480636597, + "learning_rate": 8.390572980550202e-05, + "loss": 1.8032, + "step": 1840 + }, + { + "epoch": 0.48398857030249287, + "grad_norm": 0.9707418084144592, + "learning_rate": 8.388820746451727e-05, + "loss": 1.8038, + "step": 1842 + }, + { + "epoch": 0.4845140736361546, + "grad_norm": 1.21326744556427, + "learning_rate": 8.387068512353251e-05, + "loss": 1.8264, + "step": 1844 + }, + { + "epoch": 0.4850395769698164, + "grad_norm": 0.8171116709709167, + "learning_rate": 8.385316278254776e-05, + "loss": 1.8037, + "step": 1846 + }, + { + "epoch": 0.48556508030347817, + "grad_norm": 0.9211159348487854, + "learning_rate": 8.3835640441563e-05, + "loss": 1.8085, + "step": 1848 + }, + { + "epoch": 0.48609058363714, + "grad_norm": 1.2191238403320312, + "learning_rate": 8.381811810057824e-05, + "loss": 1.7982, + "step": 1850 + }, + { + "epoch": 0.4866160869708017, + "grad_norm": 0.8354179263114929, + "learning_rate": 8.380059575959348e-05, + "loss": 1.7862, + "step": 1852 + }, + { + "epoch": 0.48714159030446347, + "grad_norm": 0.7723087072372437, + "learning_rate": 8.378307341860872e-05, + "loss": 1.7629, + "step": 1854 + }, + { + "epoch": 0.48766709363812527, + "grad_norm": 1.1398297548294067, + "learning_rate": 8.376555107762397e-05, + "loss": 1.8118, + "step": 1856 + }, + { + "epoch": 0.488192596971787, + "grad_norm": 0.904444694519043, + "learning_rate": 8.374802873663922e-05, + "loss": 1.8417, + "step": 1858 + }, + { + "epoch": 0.4887181003054488, + "grad_norm": 1.8874095678329468, + "learning_rate": 8.373050639565447e-05, + "loss": 1.7985, + "step": 1860 + }, + { + "epoch": 0.48924360363911057, + "grad_norm": 1.9317784309387207, + "learning_rate": 8.37129840546697e-05, + "loss": 1.7974, + "step": 1862 + }, + { + "epoch": 0.4897691069727724, + "grad_norm": 0.9307408332824707, + "learning_rate": 8.369546171368495e-05, + "loss": 1.8137, + "step": 1864 + }, + { + "epoch": 0.4902946103064341, + "grad_norm": 0.7302669882774353, + "learning_rate": 8.36779393727002e-05, + "loss": 1.7555, + "step": 1866 + }, + { + "epoch": 0.4908201136400959, + "grad_norm": 0.9116623997688293, + "learning_rate": 8.366041703171544e-05, + "loss": 1.8122, + "step": 1868 + }, + { + "epoch": 0.49134561697375767, + "grad_norm": 0.8545958995819092, + "learning_rate": 8.364289469073069e-05, + "loss": 1.8192, + "step": 1870 + }, + { + "epoch": 0.4918711203074195, + "grad_norm": 0.8263342380523682, + "learning_rate": 8.362537234974594e-05, + "loss": 1.8059, + "step": 1872 + }, + { + "epoch": 0.4923966236410812, + "grad_norm": 1.1116405725479126, + "learning_rate": 8.360785000876117e-05, + "loss": 1.819, + "step": 1874 + }, + { + "epoch": 0.49292212697474297, + "grad_norm": 0.8676914572715759, + "learning_rate": 8.359032766777641e-05, + "loss": 1.801, + "step": 1876 + }, + { + "epoch": 0.49344763030840477, + "grad_norm": 0.7704113125801086, + "learning_rate": 8.357280532679165e-05, + "loss": 1.7834, + "step": 1878 + }, + { + "epoch": 0.4939731336420665, + "grad_norm": 1.1784600019454956, + "learning_rate": 8.35552829858069e-05, + "loss": 1.8283, + "step": 1880 + }, + { + "epoch": 0.4944986369757283, + "grad_norm": 1.0245088338851929, + "learning_rate": 8.353776064482215e-05, + "loss": 1.8172, + "step": 1882 + }, + { + "epoch": 0.49502414030939007, + "grad_norm": 0.9373153448104858, + "learning_rate": 8.35202383038374e-05, + "loss": 1.786, + "step": 1884 + }, + { + "epoch": 0.4955496436430519, + "grad_norm": 1.362306833267212, + "learning_rate": 8.350271596285264e-05, + "loss": 1.8157, + "step": 1886 + }, + { + "epoch": 0.4960751469767136, + "grad_norm": 0.9665769934654236, + "learning_rate": 8.348519362186788e-05, + "loss": 1.7878, + "step": 1888 + }, + { + "epoch": 0.4966006503103754, + "grad_norm": 0.8806145191192627, + "learning_rate": 8.346767128088313e-05, + "loss": 1.8323, + "step": 1890 + }, + { + "epoch": 0.49712615364403717, + "grad_norm": 1.2314025163650513, + "learning_rate": 8.345014893989837e-05, + "loss": 1.8569, + "step": 1892 + }, + { + "epoch": 0.497651656977699, + "grad_norm": 1.8855247497558594, + "learning_rate": 8.343262659891362e-05, + "loss": 1.8152, + "step": 1894 + }, + { + "epoch": 0.4981771603113607, + "grad_norm": 1.1224102973937988, + "learning_rate": 8.341510425792887e-05, + "loss": 1.8022, + "step": 1896 + }, + { + "epoch": 0.49870266364502247, + "grad_norm": 0.7415306568145752, + "learning_rate": 8.339758191694412e-05, + "loss": 1.7825, + "step": 1898 + }, + { + "epoch": 0.4992281669786843, + "grad_norm": 1.4165035486221313, + "learning_rate": 8.338005957595935e-05, + "loss": 1.8148, + "step": 1900 + }, + { + "epoch": 0.499753670312346, + "grad_norm": 0.7428869605064392, + "learning_rate": 8.33625372349746e-05, + "loss": 1.8034, + "step": 1902 + }, + { + "epoch": 0.5002791736460078, + "grad_norm": 0.6850984692573547, + "learning_rate": 8.334501489398983e-05, + "loss": 1.7891, + "step": 1904 + }, + { + "epoch": 0.5008046769796696, + "grad_norm": 0.7525566816329956, + "learning_rate": 8.332749255300508e-05, + "loss": 1.8046, + "step": 1906 + }, + { + "epoch": 0.5013301803133313, + "grad_norm": 0.8629323840141296, + "learning_rate": 8.330997021202033e-05, + "loss": 1.82, + "step": 1908 + }, + { + "epoch": 0.5018556836469932, + "grad_norm": 0.885529637336731, + "learning_rate": 8.329244787103557e-05, + "loss": 1.7576, + "step": 1910 + }, + { + "epoch": 0.5023811869806549, + "grad_norm": 0.7622796893119812, + "learning_rate": 8.327492553005082e-05, + "loss": 1.8287, + "step": 1912 + }, + { + "epoch": 0.5029066903143167, + "grad_norm": 0.8797925710678101, + "learning_rate": 8.325740318906607e-05, + "loss": 1.8291, + "step": 1914 + }, + { + "epoch": 0.5034321936479784, + "grad_norm": 0.8444175124168396, + "learning_rate": 8.32398808480813e-05, + "loss": 1.8036, + "step": 1916 + }, + { + "epoch": 0.5039576969816403, + "grad_norm": 0.9204092025756836, + "learning_rate": 8.322235850709655e-05, + "loss": 1.8346, + "step": 1918 + }, + { + "epoch": 0.504483200315302, + "grad_norm": 0.7571083307266235, + "learning_rate": 8.32048361661118e-05, + "loss": 1.7909, + "step": 1920 + }, + { + "epoch": 0.5050087036489638, + "grad_norm": 0.861875593662262, + "learning_rate": 8.318731382512705e-05, + "loss": 1.8077, + "step": 1922 + }, + { + "epoch": 0.5055342069826255, + "grad_norm": 0.8385462164878845, + "learning_rate": 8.31697914841423e-05, + "loss": 1.7738, + "step": 1924 + }, + { + "epoch": 0.5060597103162873, + "grad_norm": 0.9641711115837097, + "learning_rate": 8.315226914315754e-05, + "loss": 1.7992, + "step": 1926 + }, + { + "epoch": 0.5065852136499491, + "grad_norm": 1.5050584077835083, + "learning_rate": 8.313474680217278e-05, + "loss": 1.7867, + "step": 1928 + }, + { + "epoch": 0.5071107169836109, + "grad_norm": 1.0225639343261719, + "learning_rate": 8.311722446118801e-05, + "loss": 1.7787, + "step": 1930 + }, + { + "epoch": 0.5076362203172726, + "grad_norm": 0.9370511174201965, + "learning_rate": 8.309970212020326e-05, + "loss": 1.7945, + "step": 1932 + }, + { + "epoch": 0.5081617236509344, + "grad_norm": 1.1602392196655273, + "learning_rate": 8.30821797792185e-05, + "loss": 1.8071, + "step": 1934 + }, + { + "epoch": 0.5086872269845962, + "grad_norm": 1.095885157585144, + "learning_rate": 8.306465743823375e-05, + "loss": 1.8048, + "step": 1936 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 1.124812364578247, + "learning_rate": 8.3047135097249e-05, + "loss": 1.8092, + "step": 1938 + }, + { + "epoch": 0.5097382336519197, + "grad_norm": 0.7736151218414307, + "learning_rate": 8.302961275626425e-05, + "loss": 1.8173, + "step": 1940 + }, + { + "epoch": 0.5102637369855815, + "grad_norm": 1.199781894683838, + "learning_rate": 8.301209041527948e-05, + "loss": 1.7725, + "step": 1942 + }, + { + "epoch": 0.5107892403192432, + "grad_norm": 0.772127091884613, + "learning_rate": 8.299456807429473e-05, + "loss": 1.8158, + "step": 1944 + }, + { + "epoch": 0.5113147436529051, + "grad_norm": 0.8498915433883667, + "learning_rate": 8.297704573330998e-05, + "loss": 1.7641, + "step": 1946 + }, + { + "epoch": 0.5118402469865668, + "grad_norm": 1.0101765394210815, + "learning_rate": 8.295952339232522e-05, + "loss": 1.8214, + "step": 1948 + }, + { + "epoch": 0.5123657503202286, + "grad_norm": 0.8255197405815125, + "learning_rate": 8.294200105134047e-05, + "loss": 1.7684, + "step": 1950 + }, + { + "epoch": 0.5128912536538903, + "grad_norm": 0.884064257144928, + "learning_rate": 8.292447871035572e-05, + "loss": 1.7953, + "step": 1952 + }, + { + "epoch": 0.5134167569875522, + "grad_norm": 1.1106665134429932, + "learning_rate": 8.290695636937095e-05, + "loss": 1.8086, + "step": 1954 + }, + { + "epoch": 0.5139422603212139, + "grad_norm": 0.8464235663414001, + "learning_rate": 8.288943402838619e-05, + "loss": 1.8182, + "step": 1956 + }, + { + "epoch": 0.5144677636548757, + "grad_norm": 0.8477456569671631, + "learning_rate": 8.287191168740144e-05, + "loss": 1.7845, + "step": 1958 + }, + { + "epoch": 0.5149932669885374, + "grad_norm": 0.8464748859405518, + "learning_rate": 8.285438934641668e-05, + "loss": 1.7325, + "step": 1960 + }, + { + "epoch": 0.5155187703221993, + "grad_norm": 0.8044800758361816, + "learning_rate": 8.283686700543193e-05, + "loss": 1.7715, + "step": 1962 + }, + { + "epoch": 0.516044273655861, + "grad_norm": 0.8983359336853027, + "learning_rate": 8.281934466444718e-05, + "loss": 1.8041, + "step": 1964 + }, + { + "epoch": 0.5165697769895228, + "grad_norm": 1.2751051187515259, + "learning_rate": 8.280182232346243e-05, + "loss": 1.8038, + "step": 1966 + }, + { + "epoch": 0.5170952803231845, + "grad_norm": 0.9036842584609985, + "learning_rate": 8.278429998247766e-05, + "loss": 1.7891, + "step": 1968 + }, + { + "epoch": 0.5176207836568463, + "grad_norm": 0.8021928668022156, + "learning_rate": 8.276677764149291e-05, + "loss": 1.8139, + "step": 1970 + }, + { + "epoch": 0.5181462869905081, + "grad_norm": 0.8150444030761719, + "learning_rate": 8.274925530050815e-05, + "loss": 1.7707, + "step": 1972 + }, + { + "epoch": 0.5186717903241699, + "grad_norm": 0.7655881643295288, + "learning_rate": 8.27317329595234e-05, + "loss": 1.822, + "step": 1974 + }, + { + "epoch": 0.5191972936578316, + "grad_norm": 1.1329301595687866, + "learning_rate": 8.271421061853865e-05, + "loss": 1.7675, + "step": 1976 + }, + { + "epoch": 0.5197227969914934, + "grad_norm": 1.0726947784423828, + "learning_rate": 8.269668827755388e-05, + "loss": 1.8203, + "step": 1978 + }, + { + "epoch": 0.5202483003251552, + "grad_norm": 0.7917013168334961, + "learning_rate": 8.267916593656913e-05, + "loss": 1.8326, + "step": 1980 + }, + { + "epoch": 0.520773803658817, + "grad_norm": 0.955825686454773, + "learning_rate": 8.266164359558437e-05, + "loss": 1.7993, + "step": 1982 + }, + { + "epoch": 0.5212993069924787, + "grad_norm": 1.0151026248931885, + "learning_rate": 8.264412125459961e-05, + "loss": 1.7764, + "step": 1984 + }, + { + "epoch": 0.5218248103261405, + "grad_norm": 1.11459219455719, + "learning_rate": 8.262659891361486e-05, + "loss": 1.8025, + "step": 1986 + }, + { + "epoch": 0.5223503136598023, + "grad_norm": 0.9148701429367065, + "learning_rate": 8.260907657263011e-05, + "loss": 1.8218, + "step": 1988 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.7594835758209229, + "learning_rate": 8.259155423164536e-05, + "loss": 1.8266, + "step": 1990 + }, + { + "epoch": 0.5234013203271258, + "grad_norm": 1.0429881811141968, + "learning_rate": 8.25740318906606e-05, + "loss": 1.8223, + "step": 1992 + }, + { + "epoch": 0.5239268236607876, + "grad_norm": 0.9307808876037598, + "learning_rate": 8.255650954967584e-05, + "loss": 1.8322, + "step": 1994 + }, + { + "epoch": 0.5244523269944493, + "grad_norm": 0.7135612964630127, + "learning_rate": 8.253898720869108e-05, + "loss": 1.8104, + "step": 1996 + }, + { + "epoch": 0.5249778303281112, + "grad_norm": 0.8590657711029053, + "learning_rate": 8.252146486770633e-05, + "loss": 1.7808, + "step": 1998 + }, + { + "epoch": 0.5255033336617729, + "grad_norm": 0.9131346940994263, + "learning_rate": 8.250394252672158e-05, + "loss": 1.7846, + "step": 2000 + }, + { + "epoch": 0.5255033336617729, + "eval_loss": 1.7627384662628174, + "eval_runtime": 487.1945, + "eval_samples_per_second": 249.98, + "eval_steps_per_second": 31.248, + "step": 2000 + }, + { + "epoch": 0.5260288369954347, + "grad_norm": 0.9146105647087097, + "learning_rate": 8.248642018573683e-05, + "loss": 1.8062, + "step": 2002 + }, + { + "epoch": 0.5265543403290964, + "grad_norm": 0.8270906209945679, + "learning_rate": 8.246889784475206e-05, + "loss": 1.7992, + "step": 2004 + }, + { + "epoch": 0.5270798436627583, + "grad_norm": 0.7374973297119141, + "learning_rate": 8.245137550376731e-05, + "loss": 1.8103, + "step": 2006 + }, + { + "epoch": 0.52760534699642, + "grad_norm": 0.748988926410675, + "learning_rate": 8.243385316278254e-05, + "loss": 1.8284, + "step": 2008 + }, + { + "epoch": 0.5281308503300818, + "grad_norm": 0.7624616622924805, + "learning_rate": 8.241633082179779e-05, + "loss": 1.7947, + "step": 2010 + }, + { + "epoch": 0.5286563536637435, + "grad_norm": 0.8599483966827393, + "learning_rate": 8.239880848081304e-05, + "loss": 1.7732, + "step": 2012 + }, + { + "epoch": 0.5291818569974053, + "grad_norm": 0.8122212886810303, + "learning_rate": 8.238128613982829e-05, + "loss": 1.8128, + "step": 2014 + }, + { + "epoch": 0.5297073603310671, + "grad_norm": 0.8008537292480469, + "learning_rate": 8.236376379884353e-05, + "loss": 1.8028, + "step": 2016 + }, + { + "epoch": 0.5302328636647289, + "grad_norm": 0.8155772686004639, + "learning_rate": 8.234624145785878e-05, + "loss": 1.8124, + "step": 2018 + }, + { + "epoch": 0.5307583669983906, + "grad_norm": 0.9014889001846313, + "learning_rate": 8.232871911687401e-05, + "loss": 1.8056, + "step": 2020 + }, + { + "epoch": 0.5312838703320524, + "grad_norm": 0.7495489716529846, + "learning_rate": 8.231119677588926e-05, + "loss": 1.7891, + "step": 2022 + }, + { + "epoch": 0.5318093736657142, + "grad_norm": 0.6256895065307617, + "learning_rate": 8.229367443490451e-05, + "loss": 1.7671, + "step": 2024 + }, + { + "epoch": 0.532334876999376, + "grad_norm": 0.9852064847946167, + "learning_rate": 8.227615209391976e-05, + "loss": 1.7945, + "step": 2026 + }, + { + "epoch": 0.5328603803330377, + "grad_norm": 0.9755546450614929, + "learning_rate": 8.2258629752935e-05, + "loss": 1.7999, + "step": 2028 + }, + { + "epoch": 0.5333858836666995, + "grad_norm": 0.7656162977218628, + "learning_rate": 8.224110741195024e-05, + "loss": 1.7869, + "step": 2030 + }, + { + "epoch": 0.5339113870003613, + "grad_norm": 0.9118050336837769, + "learning_rate": 8.222358507096549e-05, + "loss": 1.8278, + "step": 2032 + }, + { + "epoch": 0.5344368903340231, + "grad_norm": 0.6935544013977051, + "learning_rate": 8.220606272998072e-05, + "loss": 1.8328, + "step": 2034 + }, + { + "epoch": 0.5349623936676848, + "grad_norm": 0.9424465298652649, + "learning_rate": 8.218854038899597e-05, + "loss": 1.7714, + "step": 2036 + }, + { + "epoch": 0.5354878970013466, + "grad_norm": 0.7186869978904724, + "learning_rate": 8.217101804801122e-05, + "loss": 1.8039, + "step": 2038 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.6855552792549133, + "learning_rate": 8.215349570702646e-05, + "loss": 1.7952, + "step": 2040 + }, + { + "epoch": 0.5365389036686702, + "grad_norm": 0.6616165637969971, + "learning_rate": 8.213597336604171e-05, + "loss": 1.7949, + "step": 2042 + }, + { + "epoch": 0.5370644070023319, + "grad_norm": 0.7983983755111694, + "learning_rate": 8.211845102505696e-05, + "loss": 1.8027, + "step": 2044 + }, + { + "epoch": 0.5375899103359937, + "grad_norm": 0.7051743865013123, + "learning_rate": 8.210092868407219e-05, + "loss": 1.8062, + "step": 2046 + }, + { + "epoch": 0.5381154136696554, + "grad_norm": 0.7773457169532776, + "learning_rate": 8.208340634308744e-05, + "loss": 1.7559, + "step": 2048 + }, + { + "epoch": 0.5386409170033173, + "grad_norm": 1.415022373199463, + "learning_rate": 8.206588400210269e-05, + "loss": 1.8263, + "step": 2050 + }, + { + "epoch": 0.539166420336979, + "grad_norm": 0.8128517270088196, + "learning_rate": 8.204836166111794e-05, + "loss": 1.8131, + "step": 2052 + }, + { + "epoch": 0.5396919236706408, + "grad_norm": 0.8620209097862244, + "learning_rate": 8.203083932013317e-05, + "loss": 1.7608, + "step": 2054 + }, + { + "epoch": 0.5402174270043025, + "grad_norm": 0.7338502407073975, + "learning_rate": 8.201331697914842e-05, + "loss": 1.8006, + "step": 2056 + }, + { + "epoch": 0.5407429303379643, + "grad_norm": 0.8432105183601379, + "learning_rate": 8.199579463816366e-05, + "loss": 1.7716, + "step": 2058 + }, + { + "epoch": 0.5412684336716261, + "grad_norm": 0.600993812084198, + "learning_rate": 8.19782722971789e-05, + "loss": 1.7858, + "step": 2060 + }, + { + "epoch": 0.5417939370052879, + "grad_norm": 0.7413330674171448, + "learning_rate": 8.196074995619415e-05, + "loss": 1.7714, + "step": 2062 + }, + { + "epoch": 0.5423194403389496, + "grad_norm": 0.7860500812530518, + "learning_rate": 8.19432276152094e-05, + "loss": 1.8065, + "step": 2064 + }, + { + "epoch": 0.5428449436726114, + "grad_norm": 0.8075912594795227, + "learning_rate": 8.192570527422464e-05, + "loss": 1.7946, + "step": 2066 + }, + { + "epoch": 0.5433704470062732, + "grad_norm": 0.7681949734687805, + "learning_rate": 8.190818293323989e-05, + "loss": 1.8246, + "step": 2068 + }, + { + "epoch": 0.543895950339935, + "grad_norm": 0.7330343127250671, + "learning_rate": 8.189066059225514e-05, + "loss": 1.7884, + "step": 2070 + }, + { + "epoch": 0.5444214536735967, + "grad_norm": 0.8136972784996033, + "learning_rate": 8.187313825127037e-05, + "loss": 1.7783, + "step": 2072 + }, + { + "epoch": 0.5449469570072585, + "grad_norm": 0.9508219361305237, + "learning_rate": 8.185561591028562e-05, + "loss": 1.822, + "step": 2074 + }, + { + "epoch": 0.5454724603409203, + "grad_norm": 1.0187771320343018, + "learning_rate": 8.183809356930087e-05, + "loss": 1.7858, + "step": 2076 + }, + { + "epoch": 0.5459979636745821, + "grad_norm": 0.872322678565979, + "learning_rate": 8.182057122831611e-05, + "loss": 1.7715, + "step": 2078 + }, + { + "epoch": 0.5465234670082438, + "grad_norm": 0.9391134977340698, + "learning_rate": 8.180304888733135e-05, + "loss": 1.8014, + "step": 2080 + }, + { + "epoch": 0.5470489703419056, + "grad_norm": 0.7798128128051758, + "learning_rate": 8.17855265463466e-05, + "loss": 1.7803, + "step": 2082 + }, + { + "epoch": 0.5475744736755673, + "grad_norm": 0.9748620390892029, + "learning_rate": 8.176800420536184e-05, + "loss": 1.8054, + "step": 2084 + }, + { + "epoch": 0.5480999770092292, + "grad_norm": 0.9456170201301575, + "learning_rate": 8.175048186437708e-05, + "loss": 1.8084, + "step": 2086 + }, + { + "epoch": 0.5486254803428909, + "grad_norm": 1.295296549797058, + "learning_rate": 8.173295952339232e-05, + "loss": 1.8044, + "step": 2088 + }, + { + "epoch": 0.5491509836765527, + "grad_norm": 0.7323461174964905, + "learning_rate": 8.171543718240757e-05, + "loss": 1.787, + "step": 2090 + }, + { + "epoch": 0.5496764870102144, + "grad_norm": 1.0989707708358765, + "learning_rate": 8.169791484142282e-05, + "loss": 1.7789, + "step": 2092 + }, + { + "epoch": 0.5502019903438763, + "grad_norm": 0.9566003680229187, + "learning_rate": 8.168039250043807e-05, + "loss": 1.7832, + "step": 2094 + }, + { + "epoch": 0.550727493677538, + "grad_norm": 0.8293377757072449, + "learning_rate": 8.166287015945331e-05, + "loss": 1.8081, + "step": 2096 + }, + { + "epoch": 0.5512529970111998, + "grad_norm": 0.9931288361549377, + "learning_rate": 8.164534781846855e-05, + "loss": 1.81, + "step": 2098 + }, + { + "epoch": 0.5517785003448615, + "grad_norm": 0.7140156626701355, + "learning_rate": 8.16278254774838e-05, + "loss": 1.7824, + "step": 2100 + }, + { + "epoch": 0.5523040036785233, + "grad_norm": 0.8644457459449768, + "learning_rate": 8.161030313649904e-05, + "loss": 1.8058, + "step": 2102 + }, + { + "epoch": 0.5528295070121851, + "grad_norm": 0.9265533685684204, + "learning_rate": 8.159278079551429e-05, + "loss": 1.7759, + "step": 2104 + }, + { + "epoch": 0.5533550103458469, + "grad_norm": 0.7051352262496948, + "learning_rate": 8.157525845452952e-05, + "loss": 1.7986, + "step": 2106 + }, + { + "epoch": 0.5538805136795086, + "grad_norm": 0.8235836625099182, + "learning_rate": 8.155773611354477e-05, + "loss": 1.8038, + "step": 2108 + }, + { + "epoch": 0.5544060170131704, + "grad_norm": 0.7595791816711426, + "learning_rate": 8.154021377256002e-05, + "loss": 1.8321, + "step": 2110 + }, + { + "epoch": 0.5549315203468322, + "grad_norm": 0.7582104802131653, + "learning_rate": 8.152269143157525e-05, + "loss": 1.8297, + "step": 2112 + }, + { + "epoch": 0.555457023680494, + "grad_norm": 0.6678735613822937, + "learning_rate": 8.15051690905905e-05, + "loss": 1.7559, + "step": 2114 + }, + { + "epoch": 0.5559825270141557, + "grad_norm": 0.8775631189346313, + "learning_rate": 8.148764674960575e-05, + "loss": 1.8064, + "step": 2116 + }, + { + "epoch": 0.5565080303478175, + "grad_norm": 0.8533100485801697, + "learning_rate": 8.1470124408621e-05, + "loss": 1.7946, + "step": 2118 + }, + { + "epoch": 0.5570335336814793, + "grad_norm": 0.6834318041801453, + "learning_rate": 8.145260206763624e-05, + "loss": 1.7833, + "step": 2120 + }, + { + "epoch": 0.5575590370151411, + "grad_norm": 0.9778022766113281, + "learning_rate": 8.143507972665149e-05, + "loss": 1.8012, + "step": 2122 + }, + { + "epoch": 0.5580845403488028, + "grad_norm": 0.7117162942886353, + "learning_rate": 8.141755738566673e-05, + "loss": 1.8246, + "step": 2124 + }, + { + "epoch": 0.5586100436824646, + "grad_norm": 0.8154255747795105, + "learning_rate": 8.140003504468197e-05, + "loss": 1.7692, + "step": 2126 + }, + { + "epoch": 0.5591355470161263, + "grad_norm": 0.7914754152297974, + "learning_rate": 8.138251270369722e-05, + "loss": 1.7623, + "step": 2128 + }, + { + "epoch": 0.5596610503497882, + "grad_norm": 0.657900333404541, + "learning_rate": 8.136499036271247e-05, + "loss": 1.7882, + "step": 2130 + }, + { + "epoch": 0.5601865536834499, + "grad_norm": 0.6770361065864563, + "learning_rate": 8.13474680217277e-05, + "loss": 1.7737, + "step": 2132 + }, + { + "epoch": 0.5607120570171117, + "grad_norm": 0.9176309108734131, + "learning_rate": 8.132994568074295e-05, + "loss": 1.7706, + "step": 2134 + }, + { + "epoch": 0.5612375603507734, + "grad_norm": 1.037473201751709, + "learning_rate": 8.13124233397582e-05, + "loss": 1.8172, + "step": 2136 + }, + { + "epoch": 0.5617630636844353, + "grad_norm": 1.409900426864624, + "learning_rate": 8.129490099877343e-05, + "loss": 1.7831, + "step": 2138 + }, + { + "epoch": 0.562288567018097, + "grad_norm": 0.7014243602752686, + "learning_rate": 8.127737865778868e-05, + "loss": 1.7844, + "step": 2140 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.7743593454360962, + "learning_rate": 8.125985631680393e-05, + "loss": 1.7591, + "step": 2142 + }, + { + "epoch": 0.5633395736854205, + "grad_norm": 0.8568558096885681, + "learning_rate": 8.124233397581917e-05, + "loss": 1.8341, + "step": 2144 + }, + { + "epoch": 0.5638650770190824, + "grad_norm": 0.792972981929779, + "learning_rate": 8.122481163483442e-05, + "loss": 1.7639, + "step": 2146 + }, + { + "epoch": 0.5643905803527441, + "grad_norm": 0.983025312423706, + "learning_rate": 8.120728929384967e-05, + "loss": 1.7952, + "step": 2148 + }, + { + "epoch": 0.5649160836864059, + "grad_norm": 0.7422555088996887, + "learning_rate": 8.11897669528649e-05, + "loss": 1.7856, + "step": 2150 + }, + { + "epoch": 0.5654415870200676, + "grad_norm": 0.6807308793067932, + "learning_rate": 8.117224461188015e-05, + "loss": 1.841, + "step": 2152 + }, + { + "epoch": 0.5659670903537294, + "grad_norm": 0.845778226852417, + "learning_rate": 8.11547222708954e-05, + "loss": 1.7815, + "step": 2154 + }, + { + "epoch": 0.5664925936873912, + "grad_norm": 0.8135868906974792, + "learning_rate": 8.113719992991063e-05, + "loss": 1.8014, + "step": 2156 + }, + { + "epoch": 0.567018097021053, + "grad_norm": 0.7737998366355896, + "learning_rate": 8.111967758892588e-05, + "loss": 1.7906, + "step": 2158 + }, + { + "epoch": 0.5675436003547147, + "grad_norm": 0.8078686594963074, + "learning_rate": 8.110215524794113e-05, + "loss": 1.7612, + "step": 2160 + }, + { + "epoch": 0.5680691036883765, + "grad_norm": 0.8356254696846008, + "learning_rate": 8.108463290695637e-05, + "loss": 1.7736, + "step": 2162 + }, + { + "epoch": 0.5685946070220383, + "grad_norm": 0.7324886322021484, + "learning_rate": 8.106711056597162e-05, + "loss": 1.8031, + "step": 2164 + }, + { + "epoch": 0.5691201103557001, + "grad_norm": 0.6843339800834656, + "learning_rate": 8.104958822498686e-05, + "loss": 1.7733, + "step": 2166 + }, + { + "epoch": 0.5696456136893618, + "grad_norm": 1.1155132055282593, + "learning_rate": 8.10320658840021e-05, + "loss": 1.8057, + "step": 2168 + }, + { + "epoch": 0.5701711170230236, + "grad_norm": 0.944843053817749, + "learning_rate": 8.101454354301735e-05, + "loss": 1.7848, + "step": 2170 + }, + { + "epoch": 0.5706966203566853, + "grad_norm": 0.7923617959022522, + "learning_rate": 8.09970212020326e-05, + "loss": 1.7636, + "step": 2172 + }, + { + "epoch": 0.5712221236903472, + "grad_norm": 0.7884588837623596, + "learning_rate": 8.097949886104785e-05, + "loss": 1.7856, + "step": 2174 + }, + { + "epoch": 0.5717476270240089, + "grad_norm": 0.8496378064155579, + "learning_rate": 8.09619765200631e-05, + "loss": 1.8124, + "step": 2176 + }, + { + "epoch": 0.5722731303576707, + "grad_norm": 0.6795907020568848, + "learning_rate": 8.094445417907833e-05, + "loss": 1.825, + "step": 2178 + }, + { + "epoch": 0.5727986336913324, + "grad_norm": 1.0210143327713013, + "learning_rate": 8.092693183809358e-05, + "loss": 1.8193, + "step": 2180 + }, + { + "epoch": 0.5733241370249943, + "grad_norm": 0.9265308380126953, + "learning_rate": 8.090940949710881e-05, + "loss": 1.7955, + "step": 2182 + }, + { + "epoch": 0.573849640358656, + "grad_norm": 0.7709974646568298, + "learning_rate": 8.089188715612406e-05, + "loss": 1.7748, + "step": 2184 + }, + { + "epoch": 0.5743751436923178, + "grad_norm": 1.015137791633606, + "learning_rate": 8.08743648151393e-05, + "loss": 1.7767, + "step": 2186 + }, + { + "epoch": 0.5749006470259795, + "grad_norm": 0.7084217071533203, + "learning_rate": 8.085684247415455e-05, + "loss": 1.7737, + "step": 2188 + }, + { + "epoch": 0.5754261503596414, + "grad_norm": 0.7948693633079529, + "learning_rate": 8.08393201331698e-05, + "loss": 1.8202, + "step": 2190 + }, + { + "epoch": 0.5759516536933031, + "grad_norm": 0.921947181224823, + "learning_rate": 8.082179779218503e-05, + "loss": 1.7906, + "step": 2192 + }, + { + "epoch": 0.5764771570269649, + "grad_norm": 1.3730195760726929, + "learning_rate": 8.080427545120028e-05, + "loss": 1.7645, + "step": 2194 + }, + { + "epoch": 0.5770026603606266, + "grad_norm": 0.7949815392494202, + "learning_rate": 8.078675311021553e-05, + "loss": 1.7519, + "step": 2196 + }, + { + "epoch": 0.5775281636942884, + "grad_norm": 0.8247926831245422, + "learning_rate": 8.076923076923078e-05, + "loss": 1.7875, + "step": 2198 + }, + { + "epoch": 0.5780536670279502, + "grad_norm": 0.8016488552093506, + "learning_rate": 8.075170842824602e-05, + "loss": 1.7832, + "step": 2200 + }, + { + "epoch": 0.578579170361612, + "grad_norm": 0.8356485366821289, + "learning_rate": 8.073418608726127e-05, + "loss": 1.7964, + "step": 2202 + }, + { + "epoch": 0.5791046736952737, + "grad_norm": 0.6765563488006592, + "learning_rate": 8.07166637462765e-05, + "loss": 1.7864, + "step": 2204 + }, + { + "epoch": 0.5796301770289355, + "grad_norm": 0.6917481422424316, + "learning_rate": 8.069914140529175e-05, + "loss": 1.7662, + "step": 2206 + }, + { + "epoch": 0.5801556803625973, + "grad_norm": 0.7653847336769104, + "learning_rate": 8.068161906430699e-05, + "loss": 1.7586, + "step": 2208 + }, + { + "epoch": 0.5806811836962591, + "grad_norm": 0.7687065005302429, + "learning_rate": 8.066409672332224e-05, + "loss": 1.8408, + "step": 2210 + }, + { + "epoch": 0.5812066870299208, + "grad_norm": 1.3475931882858276, + "learning_rate": 8.064657438233748e-05, + "loss": 1.7714, + "step": 2212 + }, + { + "epoch": 0.5817321903635826, + "grad_norm": 0.9233903288841248, + "learning_rate": 8.062905204135273e-05, + "loss": 1.7366, + "step": 2214 + }, + { + "epoch": 0.5822576936972443, + "grad_norm": 1.3624159097671509, + "learning_rate": 8.061152970036798e-05, + "loss": 1.8115, + "step": 2216 + }, + { + "epoch": 0.5827831970309062, + "grad_norm": 1.2570765018463135, + "learning_rate": 8.059400735938321e-05, + "loss": 1.8475, + "step": 2218 + }, + { + "epoch": 0.583308700364568, + "grad_norm": 1.3805052042007446, + "learning_rate": 8.057648501839846e-05, + "loss": 1.7814, + "step": 2220 + }, + { + "epoch": 0.5838342036982297, + "grad_norm": 1.1308529376983643, + "learning_rate": 8.055896267741371e-05, + "loss": 1.7531, + "step": 2222 + }, + { + "epoch": 0.5843597070318914, + "grad_norm": 0.8926995992660522, + "learning_rate": 8.054144033642895e-05, + "loss": 1.7973, + "step": 2224 + }, + { + "epoch": 0.5848852103655533, + "grad_norm": 1.3856247663497925, + "learning_rate": 8.05239179954442e-05, + "loss": 1.8255, + "step": 2226 + }, + { + "epoch": 0.585410713699215, + "grad_norm": 1.143256664276123, + "learning_rate": 8.050639565445945e-05, + "loss": 1.7983, + "step": 2228 + }, + { + "epoch": 0.5859362170328768, + "grad_norm": 0.7704371809959412, + "learning_rate": 8.048887331347468e-05, + "loss": 1.787, + "step": 2230 + }, + { + "epoch": 0.5864617203665385, + "grad_norm": 0.8830547332763672, + "learning_rate": 8.047135097248993e-05, + "loss": 1.8016, + "step": 2232 + }, + { + "epoch": 0.5869872237002004, + "grad_norm": 1.3716325759887695, + "learning_rate": 8.045382863150517e-05, + "loss": 1.8184, + "step": 2234 + }, + { + "epoch": 0.5875127270338621, + "grad_norm": 0.6707213521003723, + "learning_rate": 8.043630629052041e-05, + "loss": 1.7814, + "step": 2236 + }, + { + "epoch": 0.5880382303675239, + "grad_norm": 0.8658749461174011, + "learning_rate": 8.041878394953566e-05, + "loss": 1.7797, + "step": 2238 + }, + { + "epoch": 0.5885637337011856, + "grad_norm": 0.8602432608604431, + "learning_rate": 8.040126160855091e-05, + "loss": 1.7646, + "step": 2240 + }, + { + "epoch": 0.5890892370348474, + "grad_norm": 0.6430072784423828, + "learning_rate": 8.038373926756616e-05, + "loss": 1.7872, + "step": 2242 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.8540019989013672, + "learning_rate": 8.036621692658139e-05, + "loss": 1.7961, + "step": 2244 + }, + { + "epoch": 0.590140243702171, + "grad_norm": 0.7394554615020752, + "learning_rate": 8.034869458559664e-05, + "loss": 1.814, + "step": 2246 + }, + { + "epoch": 0.5906657470358327, + "grad_norm": 0.6837593913078308, + "learning_rate": 8.033117224461188e-05, + "loss": 1.7913, + "step": 2248 + }, + { + "epoch": 0.5911912503694945, + "grad_norm": 0.7128705382347107, + "learning_rate": 8.031364990362713e-05, + "loss": 1.7655, + "step": 2250 + }, + { + "epoch": 0.5917167537031564, + "grad_norm": 0.6362258791923523, + "learning_rate": 8.029612756264238e-05, + "loss": 1.8152, + "step": 2252 + }, + { + "epoch": 0.5922422570368181, + "grad_norm": 0.8071087002754211, + "learning_rate": 8.027860522165763e-05, + "loss": 1.774, + "step": 2254 + }, + { + "epoch": 0.5927677603704798, + "grad_norm": 0.713575005531311, + "learning_rate": 8.026108288067286e-05, + "loss": 1.7543, + "step": 2256 + }, + { + "epoch": 0.5932932637041416, + "grad_norm": 1.3430129289627075, + "learning_rate": 8.02435605396881e-05, + "loss": 1.8361, + "step": 2258 + }, + { + "epoch": 0.5938187670378033, + "grad_norm": 0.7131674885749817, + "learning_rate": 8.022603819870334e-05, + "loss": 1.7964, + "step": 2260 + }, + { + "epoch": 0.5943442703714652, + "grad_norm": 0.8277941942214966, + "learning_rate": 8.020851585771859e-05, + "loss": 1.7872, + "step": 2262 + }, + { + "epoch": 0.594869773705127, + "grad_norm": 1.3098149299621582, + "learning_rate": 8.019099351673384e-05, + "loss": 1.7578, + "step": 2264 + }, + { + "epoch": 0.5953952770387887, + "grad_norm": 0.7214488983154297, + "learning_rate": 8.017347117574909e-05, + "loss": 1.8064, + "step": 2266 + }, + { + "epoch": 0.5959207803724504, + "grad_norm": 0.6886647343635559, + "learning_rate": 8.015594883476433e-05, + "loss": 1.8442, + "step": 2268 + }, + { + "epoch": 0.5964462837061123, + "grad_norm": 0.7065162062644958, + "learning_rate": 8.013842649377957e-05, + "loss": 1.7493, + "step": 2270 + }, + { + "epoch": 0.596971787039774, + "grad_norm": 0.647866427898407, + "learning_rate": 8.012090415279481e-05, + "loss": 1.7716, + "step": 2272 + }, + { + "epoch": 0.5974972903734358, + "grad_norm": 1.0947537422180176, + "learning_rate": 8.010338181181006e-05, + "loss": 1.7847, + "step": 2274 + }, + { + "epoch": 0.5980227937070975, + "grad_norm": 0.7569209337234497, + "learning_rate": 8.008585947082531e-05, + "loss": 1.8247, + "step": 2276 + }, + { + "epoch": 0.5985482970407594, + "grad_norm": 0.8344804644584656, + "learning_rate": 8.006833712984056e-05, + "loss": 1.7553, + "step": 2278 + }, + { + "epoch": 0.5990738003744212, + "grad_norm": 0.7028648257255554, + "learning_rate": 8.00508147888558e-05, + "loss": 1.7851, + "step": 2280 + }, + { + "epoch": 0.5995993037080829, + "grad_norm": 0.8421052098274231, + "learning_rate": 8.003329244787104e-05, + "loss": 1.7872, + "step": 2282 + }, + { + "epoch": 0.6001248070417446, + "grad_norm": 0.6548582911491394, + "learning_rate": 8.001577010688627e-05, + "loss": 1.7568, + "step": 2284 + }, + { + "epoch": 0.6006503103754064, + "grad_norm": 0.7493446469306946, + "learning_rate": 7.999824776590152e-05, + "loss": 1.766, + "step": 2286 + }, + { + "epoch": 0.6011758137090683, + "grad_norm": 0.6293224096298218, + "learning_rate": 7.998072542491677e-05, + "loss": 1.7701, + "step": 2288 + }, + { + "epoch": 0.60170131704273, + "grad_norm": 0.8307104706764221, + "learning_rate": 7.996320308393202e-05, + "loss": 1.7783, + "step": 2290 + }, + { + "epoch": 0.6022268203763917, + "grad_norm": 0.696723997592926, + "learning_rate": 7.994568074294726e-05, + "loss": 1.8009, + "step": 2292 + }, + { + "epoch": 0.6027523237100535, + "grad_norm": 0.6556825637817383, + "learning_rate": 7.992815840196251e-05, + "loss": 1.7693, + "step": 2294 + }, + { + "epoch": 0.6032778270437154, + "grad_norm": 0.9338749051094055, + "learning_rate": 7.991063606097774e-05, + "loss": 1.7614, + "step": 2296 + }, + { + "epoch": 0.6038033303773771, + "grad_norm": 0.8844968676567078, + "learning_rate": 7.989311371999299e-05, + "loss": 1.7809, + "step": 2298 + }, + { + "epoch": 0.6043288337110388, + "grad_norm": 0.6624906063079834, + "learning_rate": 7.987559137900824e-05, + "loss": 1.7607, + "step": 2300 + }, + { + "epoch": 0.6048543370447006, + "grad_norm": 0.7565969824790955, + "learning_rate": 7.985806903802349e-05, + "loss": 1.7374, + "step": 2302 + }, + { + "epoch": 0.6053798403783625, + "grad_norm": 1.3834869861602783, + "learning_rate": 7.984054669703873e-05, + "loss": 1.8123, + "step": 2304 + }, + { + "epoch": 0.6059053437120242, + "grad_norm": 0.9589686989784241, + "learning_rate": 7.982302435605398e-05, + "loss": 1.7766, + "step": 2306 + }, + { + "epoch": 0.606430847045686, + "grad_norm": 1.1011096239089966, + "learning_rate": 7.980550201506922e-05, + "loss": 1.7737, + "step": 2308 + }, + { + "epoch": 0.6069563503793477, + "grad_norm": 1.1299936771392822, + "learning_rate": 7.978797967408445e-05, + "loss": 1.8042, + "step": 2310 + }, + { + "epoch": 0.6074818537130094, + "grad_norm": 0.7697176933288574, + "learning_rate": 7.97704573330997e-05, + "loss": 1.742, + "step": 2312 + }, + { + "epoch": 0.6080073570466713, + "grad_norm": 0.991256833076477, + "learning_rate": 7.975293499211495e-05, + "loss": 1.8302, + "step": 2314 + }, + { + "epoch": 0.608532860380333, + "grad_norm": 0.7879564166069031, + "learning_rate": 7.97354126511302e-05, + "loss": 1.7624, + "step": 2316 + }, + { + "epoch": 0.6090583637139948, + "grad_norm": 0.745040774345398, + "learning_rate": 7.971789031014544e-05, + "loss": 1.807, + "step": 2318 + }, + { + "epoch": 0.6095838670476565, + "grad_norm": 0.9064005613327026, + "learning_rate": 7.970036796916069e-05, + "loss": 1.7371, + "step": 2320 + }, + { + "epoch": 0.6101093703813184, + "grad_norm": 0.9049443602561951, + "learning_rate": 7.968284562817592e-05, + "loss": 1.8108, + "step": 2322 + }, + { + "epoch": 0.6106348737149802, + "grad_norm": 0.8754010200500488, + "learning_rate": 7.966532328719117e-05, + "loss": 1.7804, + "step": 2324 + }, + { + "epoch": 0.6111603770486419, + "grad_norm": 0.7384723424911499, + "learning_rate": 7.964780094620642e-05, + "loss": 1.7683, + "step": 2326 + }, + { + "epoch": 0.6116858803823036, + "grad_norm": 0.739629328250885, + "learning_rate": 7.963027860522167e-05, + "loss": 1.7826, + "step": 2328 + }, + { + "epoch": 0.6122113837159654, + "grad_norm": 0.824571967124939, + "learning_rate": 7.961275626423691e-05, + "loss": 1.7686, + "step": 2330 + }, + { + "epoch": 0.6127368870496273, + "grad_norm": 0.9385930299758911, + "learning_rate": 7.959523392325216e-05, + "loss": 1.7893, + "step": 2332 + }, + { + "epoch": 0.613262390383289, + "grad_norm": 0.7331735491752625, + "learning_rate": 7.95777115822674e-05, + "loss": 1.7835, + "step": 2334 + }, + { + "epoch": 0.6137878937169507, + "grad_norm": 0.6689608693122864, + "learning_rate": 7.956018924128263e-05, + "loss": 1.755, + "step": 2336 + }, + { + "epoch": 0.6143133970506125, + "grad_norm": 0.7041564583778381, + "learning_rate": 7.954266690029788e-05, + "loss": 1.7593, + "step": 2338 + }, + { + "epoch": 0.6148389003842744, + "grad_norm": 0.818277895450592, + "learning_rate": 7.952514455931312e-05, + "loss": 1.8126, + "step": 2340 + }, + { + "epoch": 0.6153644037179361, + "grad_norm": 1.1610822677612305, + "learning_rate": 7.950762221832837e-05, + "loss": 1.8324, + "step": 2342 + }, + { + "epoch": 0.6158899070515979, + "grad_norm": 0.9594135284423828, + "learning_rate": 7.949009987734362e-05, + "loss": 1.7777, + "step": 2344 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.9519843459129333, + "learning_rate": 7.947257753635887e-05, + "loss": 1.7905, + "step": 2346 + }, + { + "epoch": 0.6169409137189215, + "grad_norm": 0.7015102505683899, + "learning_rate": 7.94550551953741e-05, + "loss": 1.7753, + "step": 2348 + }, + { + "epoch": 0.6174664170525832, + "grad_norm": 0.6667357087135315, + "learning_rate": 7.943753285438935e-05, + "loss": 1.7925, + "step": 2350 + }, + { + "epoch": 0.617991920386245, + "grad_norm": 0.6917058229446411, + "learning_rate": 7.94200105134046e-05, + "loss": 1.7737, + "step": 2352 + }, + { + "epoch": 0.6185174237199067, + "grad_norm": 0.8922196626663208, + "learning_rate": 7.940248817241984e-05, + "loss": 1.77, + "step": 2354 + }, + { + "epoch": 0.6190429270535684, + "grad_norm": 0.8834502696990967, + "learning_rate": 7.938496583143509e-05, + "loss": 1.7626, + "step": 2356 + }, + { + "epoch": 0.6195684303872303, + "grad_norm": 0.6824911832809448, + "learning_rate": 7.936744349045034e-05, + "loss": 1.7694, + "step": 2358 + }, + { + "epoch": 0.620093933720892, + "grad_norm": 0.8218874931335449, + "learning_rate": 7.934992114946557e-05, + "loss": 1.753, + "step": 2360 + }, + { + "epoch": 0.6206194370545538, + "grad_norm": 0.808892011642456, + "learning_rate": 7.93323988084808e-05, + "loss": 1.7619, + "step": 2362 + }, + { + "epoch": 0.6211449403882155, + "grad_norm": 0.783028781414032, + "learning_rate": 7.931487646749605e-05, + "loss": 1.8033, + "step": 2364 + }, + { + "epoch": 0.6216704437218774, + "grad_norm": 0.8071235418319702, + "learning_rate": 7.92973541265113e-05, + "loss": 1.7536, + "step": 2366 + }, + { + "epoch": 0.6221959470555392, + "grad_norm": 0.7900059819221497, + "learning_rate": 7.927983178552655e-05, + "loss": 1.7715, + "step": 2368 + }, + { + "epoch": 0.6227214503892009, + "grad_norm": 0.8198074102401733, + "learning_rate": 7.92623094445418e-05, + "loss": 1.7898, + "step": 2370 + }, + { + "epoch": 0.6232469537228627, + "grad_norm": 0.6880433559417725, + "learning_rate": 7.924478710355704e-05, + "loss": 1.7617, + "step": 2372 + }, + { + "epoch": 0.6237724570565244, + "grad_norm": 0.7786495685577393, + "learning_rate": 7.922726476257228e-05, + "loss": 1.7574, + "step": 2374 + }, + { + "epoch": 0.6242979603901863, + "grad_norm": 0.8043944239616394, + "learning_rate": 7.920974242158753e-05, + "loss": 1.797, + "step": 2376 + }, + { + "epoch": 0.624823463723848, + "grad_norm": 0.9602116942405701, + "learning_rate": 7.919222008060277e-05, + "loss": 1.7835, + "step": 2378 + }, + { + "epoch": 0.6253489670575098, + "grad_norm": 0.6723161339759827, + "learning_rate": 7.917469773961802e-05, + "loss": 1.751, + "step": 2380 + }, + { + "epoch": 0.6258744703911715, + "grad_norm": 0.7045361399650574, + "learning_rate": 7.915717539863327e-05, + "loss": 1.7788, + "step": 2382 + }, + { + "epoch": 0.6263999737248334, + "grad_norm": 0.7056633234024048, + "learning_rate": 7.913965305764852e-05, + "loss": 1.7581, + "step": 2384 + }, + { + "epoch": 0.6269254770584951, + "grad_norm": 0.8192391395568848, + "learning_rate": 7.912213071666375e-05, + "loss": 1.751, + "step": 2386 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.8521485924720764, + "learning_rate": 7.910460837567898e-05, + "loss": 1.811, + "step": 2388 + }, + { + "epoch": 0.6279764837258186, + "grad_norm": 0.7382224202156067, + "learning_rate": 7.908708603469423e-05, + "loss": 1.7848, + "step": 2390 + }, + { + "epoch": 0.6285019870594805, + "grad_norm": 0.6544625163078308, + "learning_rate": 7.906956369370948e-05, + "loss": 1.7902, + "step": 2392 + }, + { + "epoch": 0.6290274903931422, + "grad_norm": 0.7634027600288391, + "learning_rate": 7.905204135272473e-05, + "loss": 1.79, + "step": 2394 + }, + { + "epoch": 0.629552993726804, + "grad_norm": 1.0319316387176514, + "learning_rate": 7.903451901173997e-05, + "loss": 1.8045, + "step": 2396 + }, + { + "epoch": 0.6300784970604657, + "grad_norm": 0.6364408731460571, + "learning_rate": 7.901699667075522e-05, + "loss": 1.7666, + "step": 2398 + }, + { + "epoch": 0.6306040003941275, + "grad_norm": 0.919385552406311, + "learning_rate": 7.899947432977046e-05, + "loss": 1.7625, + "step": 2400 + }, + { + "epoch": 0.6306040003941275, + "eval_loss": 1.7536036968231201, + "eval_runtime": 487.1509, + "eval_samples_per_second": 250.003, + "eval_steps_per_second": 31.251, + "step": 2400 + }, + { + "epoch": 0.6311295037277893, + "grad_norm": 0.9996768832206726, + "learning_rate": 7.89819519887857e-05, + "loss": 1.8133, + "step": 2402 + }, + { + "epoch": 0.6316550070614511, + "grad_norm": 0.7376594543457031, + "learning_rate": 7.896442964780095e-05, + "loss": 1.8095, + "step": 2404 + }, + { + "epoch": 0.6321805103951128, + "grad_norm": 0.970077633857727, + "learning_rate": 7.89469073068162e-05, + "loss": 1.7818, + "step": 2406 + }, + { + "epoch": 0.6327060137287746, + "grad_norm": 0.8934677839279175, + "learning_rate": 7.892938496583145e-05, + "loss": 1.7905, + "step": 2408 + }, + { + "epoch": 0.6332315170624364, + "grad_norm": 0.8888778686523438, + "learning_rate": 7.891186262484669e-05, + "loss": 1.7646, + "step": 2410 + }, + { + "epoch": 0.6337570203960982, + "grad_norm": 0.7317706942558289, + "learning_rate": 7.889434028386193e-05, + "loss": 1.8039, + "step": 2412 + }, + { + "epoch": 0.6342825237297599, + "grad_norm": 0.768997848033905, + "learning_rate": 7.887681794287717e-05, + "loss": 1.8152, + "step": 2414 + }, + { + "epoch": 0.6348080270634217, + "grad_norm": 0.8444989323616028, + "learning_rate": 7.885929560189241e-05, + "loss": 1.7889, + "step": 2416 + }, + { + "epoch": 0.6353335303970834, + "grad_norm": 0.7109376788139343, + "learning_rate": 7.884177326090766e-05, + "loss": 1.7729, + "step": 2418 + }, + { + "epoch": 0.6358590337307453, + "grad_norm": 0.630806565284729, + "learning_rate": 7.88242509199229e-05, + "loss": 1.7954, + "step": 2420 + }, + { + "epoch": 0.636384537064407, + "grad_norm": 0.6395316123962402, + "learning_rate": 7.880672857893815e-05, + "loss": 1.7797, + "step": 2422 + }, + { + "epoch": 0.6369100403980688, + "grad_norm": 1.0122566223144531, + "learning_rate": 7.87892062379534e-05, + "loss": 1.7841, + "step": 2424 + }, + { + "epoch": 0.6374355437317305, + "grad_norm": 0.7840449810028076, + "learning_rate": 7.877168389696865e-05, + "loss": 1.7888, + "step": 2426 + }, + { + "epoch": 0.6379610470653924, + "grad_norm": 0.7690210342407227, + "learning_rate": 7.875416155598388e-05, + "loss": 1.8122, + "step": 2428 + }, + { + "epoch": 0.6384865503990541, + "grad_norm": 0.7418575286865234, + "learning_rate": 7.873663921499913e-05, + "loss": 1.8056, + "step": 2430 + }, + { + "epoch": 0.6390120537327159, + "grad_norm": 0.8744335770606995, + "learning_rate": 7.871911687401438e-05, + "loss": 1.7914, + "step": 2432 + }, + { + "epoch": 0.6395375570663776, + "grad_norm": 0.9662806391716003, + "learning_rate": 7.870159453302962e-05, + "loss": 1.7712, + "step": 2434 + }, + { + "epoch": 0.6400630604000395, + "grad_norm": 1.1848862171173096, + "learning_rate": 7.868407219204487e-05, + "loss": 1.7627, + "step": 2436 + }, + { + "epoch": 0.6405885637337012, + "grad_norm": 0.7062596082687378, + "learning_rate": 7.86665498510601e-05, + "loss": 1.7691, + "step": 2438 + }, + { + "epoch": 0.641114067067363, + "grad_norm": 0.9317710399627686, + "learning_rate": 7.864902751007535e-05, + "loss": 1.7697, + "step": 2440 + }, + { + "epoch": 0.6416395704010247, + "grad_norm": 0.8188003301620483, + "learning_rate": 7.863150516909059e-05, + "loss": 1.7934, + "step": 2442 + }, + { + "epoch": 0.6421650737346865, + "grad_norm": 1.0256218910217285, + "learning_rate": 7.861398282810583e-05, + "loss": 1.8175, + "step": 2444 + }, + { + "epoch": 0.6426905770683483, + "grad_norm": 0.7887519598007202, + "learning_rate": 7.859646048712108e-05, + "loss": 1.7842, + "step": 2446 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.906284749507904, + "learning_rate": 7.857893814613633e-05, + "loss": 1.7407, + "step": 2448 + }, + { + "epoch": 0.6437415837356718, + "grad_norm": 1.160643458366394, + "learning_rate": 7.856141580515158e-05, + "loss": 1.7891, + "step": 2450 + }, + { + "epoch": 0.6442670870693336, + "grad_norm": 0.7816227078437805, + "learning_rate": 7.854389346416682e-05, + "loss": 1.802, + "step": 2452 + }, + { + "epoch": 0.6447925904029954, + "grad_norm": 0.6715728640556335, + "learning_rate": 7.852637112318206e-05, + "loss": 1.754, + "step": 2454 + }, + { + "epoch": 0.6453180937366572, + "grad_norm": 0.7650018334388733, + "learning_rate": 7.85088487821973e-05, + "loss": 1.7936, + "step": 2456 + }, + { + "epoch": 0.6458435970703189, + "grad_norm": 0.8200324773788452, + "learning_rate": 7.849132644121255e-05, + "loss": 1.7879, + "step": 2458 + }, + { + "epoch": 0.6463691004039807, + "grad_norm": 0.7117056846618652, + "learning_rate": 7.84738041002278e-05, + "loss": 1.7869, + "step": 2460 + }, + { + "epoch": 0.6468946037376425, + "grad_norm": 0.7513619661331177, + "learning_rate": 7.845628175924303e-05, + "loss": 1.7929, + "step": 2462 + }, + { + "epoch": 0.6474201070713043, + "grad_norm": 0.6291913986206055, + "learning_rate": 7.843875941825828e-05, + "loss": 1.7597, + "step": 2464 + }, + { + "epoch": 0.647945610404966, + "grad_norm": 0.7736865878105164, + "learning_rate": 7.842123707727353e-05, + "loss": 1.8084, + "step": 2466 + }, + { + "epoch": 0.6484711137386278, + "grad_norm": 0.9892921447753906, + "learning_rate": 7.840371473628876e-05, + "loss": 1.7724, + "step": 2468 + }, + { + "epoch": 0.6489966170722895, + "grad_norm": 0.5670979619026184, + "learning_rate": 7.838619239530401e-05, + "loss": 1.7806, + "step": 2470 + }, + { + "epoch": 0.6495221204059514, + "grad_norm": 0.728164792060852, + "learning_rate": 7.836867005431926e-05, + "loss": 1.7687, + "step": 2472 + }, + { + "epoch": 0.6500476237396131, + "grad_norm": 1.0032124519348145, + "learning_rate": 7.835114771333451e-05, + "loss": 1.7992, + "step": 2474 + }, + { + "epoch": 0.6505731270732749, + "grad_norm": 0.70088130235672, + "learning_rate": 7.833362537234975e-05, + "loss": 1.7664, + "step": 2476 + }, + { + "epoch": 0.6510986304069366, + "grad_norm": 0.8623471856117249, + "learning_rate": 7.8316103031365e-05, + "loss": 1.8202, + "step": 2478 + }, + { + "epoch": 0.6516241337405985, + "grad_norm": 0.8462334275245667, + "learning_rate": 7.829858069038024e-05, + "loss": 1.8086, + "step": 2480 + }, + { + "epoch": 0.6521496370742602, + "grad_norm": 0.6222977638244629, + "learning_rate": 7.828105834939548e-05, + "loss": 1.7396, + "step": 2482 + }, + { + "epoch": 0.652675140407922, + "grad_norm": 0.7277782559394836, + "learning_rate": 7.826353600841073e-05, + "loss": 1.787, + "step": 2484 + }, + { + "epoch": 0.6532006437415837, + "grad_norm": 0.6451889276504517, + "learning_rate": 7.824601366742598e-05, + "loss": 1.7687, + "step": 2486 + }, + { + "epoch": 0.6537261470752455, + "grad_norm": 0.6663830280303955, + "learning_rate": 7.822849132644121e-05, + "loss": 1.7627, + "step": 2488 + }, + { + "epoch": 0.6542516504089073, + "grad_norm": 0.7570757269859314, + "learning_rate": 7.821096898545646e-05, + "loss": 1.7423, + "step": 2490 + }, + { + "epoch": 0.6547771537425691, + "grad_norm": 0.5967277884483337, + "learning_rate": 7.819344664447171e-05, + "loss": 1.7834, + "step": 2492 + }, + { + "epoch": 0.6553026570762308, + "grad_norm": 0.6351729035377502, + "learning_rate": 7.817592430348694e-05, + "loss": 1.7541, + "step": 2494 + }, + { + "epoch": 0.6558281604098926, + "grad_norm": 0.9284831881523132, + "learning_rate": 7.815840196250219e-05, + "loss": 1.7658, + "step": 2496 + }, + { + "epoch": 0.6563536637435544, + "grad_norm": 0.754885196685791, + "learning_rate": 7.814087962151744e-05, + "loss": 1.7602, + "step": 2498 + }, + { + "epoch": 0.6568791670772162, + "grad_norm": 0.7284504771232605, + "learning_rate": 7.812335728053268e-05, + "loss": 1.7983, + "step": 2500 + }, + { + "epoch": 0.6574046704108779, + "grad_norm": 0.6399169564247131, + "learning_rate": 7.810583493954793e-05, + "loss": 1.7851, + "step": 2502 + }, + { + "epoch": 0.6579301737445397, + "grad_norm": 1.2791913747787476, + "learning_rate": 7.808831259856318e-05, + "loss": 1.8401, + "step": 2504 + }, + { + "epoch": 0.6584556770782015, + "grad_norm": 0.8345859050750732, + "learning_rate": 7.807079025757841e-05, + "loss": 1.7636, + "step": 2506 + }, + { + "epoch": 0.6589811804118633, + "grad_norm": 0.6579688191413879, + "learning_rate": 7.805326791659366e-05, + "loss": 1.815, + "step": 2508 + }, + { + "epoch": 0.659506683745525, + "grad_norm": 0.744471549987793, + "learning_rate": 7.803574557560891e-05, + "loss": 1.787, + "step": 2510 + }, + { + "epoch": 0.6600321870791868, + "grad_norm": 0.6032891869544983, + "learning_rate": 7.801822323462416e-05, + "loss": 1.7615, + "step": 2512 + }, + { + "epoch": 0.6605576904128485, + "grad_norm": 0.6453471183776855, + "learning_rate": 7.800070089363939e-05, + "loss": 1.7519, + "step": 2514 + }, + { + "epoch": 0.6610831937465104, + "grad_norm": 0.6828714609146118, + "learning_rate": 7.798317855265464e-05, + "loss": 1.7563, + "step": 2516 + }, + { + "epoch": 0.6616086970801721, + "grad_norm": 0.7225235104560852, + "learning_rate": 7.796565621166989e-05, + "loss": 1.7685, + "step": 2518 + }, + { + "epoch": 0.6621342004138339, + "grad_norm": 0.7070510387420654, + "learning_rate": 7.794813387068512e-05, + "loss": 1.7816, + "step": 2520 + }, + { + "epoch": 0.6626597037474956, + "grad_norm": 0.7075088620185852, + "learning_rate": 7.793061152970037e-05, + "loss": 1.7918, + "step": 2522 + }, + { + "epoch": 0.6631852070811575, + "grad_norm": 0.8367542028427124, + "learning_rate": 7.791308918871561e-05, + "loss": 1.7769, + "step": 2524 + }, + { + "epoch": 0.6637107104148192, + "grad_norm": 0.7209259867668152, + "learning_rate": 7.789556684773086e-05, + "loss": 1.7997, + "step": 2526 + }, + { + "epoch": 0.664236213748481, + "grad_norm": 0.9258558750152588, + "learning_rate": 7.787804450674611e-05, + "loss": 1.7706, + "step": 2528 + }, + { + "epoch": 0.6647617170821427, + "grad_norm": 0.7765336632728577, + "learning_rate": 7.786052216576136e-05, + "loss": 1.7774, + "step": 2530 + }, + { + "epoch": 0.6652872204158045, + "grad_norm": 0.8592368960380554, + "learning_rate": 7.784299982477659e-05, + "loss": 1.7496, + "step": 2532 + }, + { + "epoch": 0.6658127237494663, + "grad_norm": 0.7928656339645386, + "learning_rate": 7.782547748379184e-05, + "loss": 1.7735, + "step": 2534 + }, + { + "epoch": 0.6663382270831281, + "grad_norm": 0.7750053405761719, + "learning_rate": 7.780795514280709e-05, + "loss": 1.7732, + "step": 2536 + }, + { + "epoch": 0.6668637304167898, + "grad_norm": 1.0451760292053223, + "learning_rate": 7.779043280182233e-05, + "loss": 1.7658, + "step": 2538 + }, + { + "epoch": 0.6673892337504516, + "grad_norm": 0.6935849189758301, + "learning_rate": 7.777291046083757e-05, + "loss": 1.7494, + "step": 2540 + }, + { + "epoch": 0.6679147370841134, + "grad_norm": 0.9037797451019287, + "learning_rate": 7.775538811985282e-05, + "loss": 1.7664, + "step": 2542 + }, + { + "epoch": 0.6684402404177752, + "grad_norm": 0.6649421453475952, + "learning_rate": 7.773786577886806e-05, + "loss": 1.7503, + "step": 2544 + }, + { + "epoch": 0.6689657437514369, + "grad_norm": 0.6808927059173584, + "learning_rate": 7.77203434378833e-05, + "loss": 1.7481, + "step": 2546 + }, + { + "epoch": 0.6694912470850987, + "grad_norm": 0.697090208530426, + "learning_rate": 7.770282109689854e-05, + "loss": 1.7843, + "step": 2548 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.9084567427635193, + "learning_rate": 7.768529875591379e-05, + "loss": 1.7643, + "step": 2550 + }, + { + "epoch": 0.6705422537524223, + "grad_norm": 0.6704044938087463, + "learning_rate": 7.766777641492904e-05, + "loss": 1.7687, + "step": 2552 + }, + { + "epoch": 0.671067757086084, + "grad_norm": 0.7542858719825745, + "learning_rate": 7.765025407394429e-05, + "loss": 1.771, + "step": 2554 + }, + { + "epoch": 0.6715932604197458, + "grad_norm": 0.8199552893638611, + "learning_rate": 7.763273173295953e-05, + "loss": 1.8042, + "step": 2556 + }, + { + "epoch": 0.6721187637534075, + "grad_norm": 0.8224305510520935, + "learning_rate": 7.761520939197477e-05, + "loss": 1.7625, + "step": 2558 + }, + { + "epoch": 0.6726442670870694, + "grad_norm": 0.7174823880195618, + "learning_rate": 7.759768705099002e-05, + "loss": 1.7857, + "step": 2560 + }, + { + "epoch": 0.6731697704207311, + "grad_norm": 0.6753933429718018, + "learning_rate": 7.758016471000526e-05, + "loss": 1.7732, + "step": 2562 + }, + { + "epoch": 0.6736952737543929, + "grad_norm": 0.985587477684021, + "learning_rate": 7.75626423690205e-05, + "loss": 1.797, + "step": 2564 + }, + { + "epoch": 0.6742207770880546, + "grad_norm": 0.892760694026947, + "learning_rate": 7.754512002803575e-05, + "loss": 1.7881, + "step": 2566 + }, + { + "epoch": 0.6747462804217165, + "grad_norm": 0.7832928895950317, + "learning_rate": 7.752759768705099e-05, + "loss": 1.781, + "step": 2568 + }, + { + "epoch": 0.6752717837553782, + "grad_norm": 0.7372546195983887, + "learning_rate": 7.751007534606624e-05, + "loss": 1.7774, + "step": 2570 + }, + { + "epoch": 0.67579728708904, + "grad_norm": 0.7375915050506592, + "learning_rate": 7.749255300508147e-05, + "loss": 1.8019, + "step": 2572 + }, + { + "epoch": 0.6763227904227017, + "grad_norm": 0.77280193567276, + "learning_rate": 7.747503066409672e-05, + "loss": 1.7743, + "step": 2574 + }, + { + "epoch": 0.6768482937563635, + "grad_norm": 0.7224514484405518, + "learning_rate": 7.745750832311197e-05, + "loss": 1.7741, + "step": 2576 + }, + { + "epoch": 0.6773737970900253, + "grad_norm": 0.6910998821258545, + "learning_rate": 7.743998598212722e-05, + "loss": 1.7893, + "step": 2578 + }, + { + "epoch": 0.6778993004236871, + "grad_norm": 0.61247718334198, + "learning_rate": 7.742246364114247e-05, + "loss": 1.7628, + "step": 2580 + }, + { + "epoch": 0.6784248037573488, + "grad_norm": 0.6819799542427063, + "learning_rate": 7.740494130015771e-05, + "loss": 1.7355, + "step": 2582 + }, + { + "epoch": 0.6789503070910106, + "grad_norm": 0.6858199834823608, + "learning_rate": 7.738741895917295e-05, + "loss": 1.772, + "step": 2584 + }, + { + "epoch": 0.6794758104246724, + "grad_norm": 0.7759047150611877, + "learning_rate": 7.73698966181882e-05, + "loss": 1.7586, + "step": 2586 + }, + { + "epoch": 0.6800013137583342, + "grad_norm": 0.6968771815299988, + "learning_rate": 7.735237427720344e-05, + "loss": 1.7886, + "step": 2588 + }, + { + "epoch": 0.6805268170919959, + "grad_norm": 0.7313429117202759, + "learning_rate": 7.733485193621868e-05, + "loss": 1.7884, + "step": 2590 + }, + { + "epoch": 0.6810523204256577, + "grad_norm": 0.6731216311454773, + "learning_rate": 7.731732959523392e-05, + "loss": 1.8149, + "step": 2592 + }, + { + "epoch": 0.6815778237593195, + "grad_norm": 0.6619842648506165, + "learning_rate": 7.729980725424917e-05, + "loss": 1.7993, + "step": 2594 + }, + { + "epoch": 0.6821033270929813, + "grad_norm": 0.6684291362762451, + "learning_rate": 7.728228491326442e-05, + "loss": 1.7386, + "step": 2596 + }, + { + "epoch": 0.682628830426643, + "grad_norm": 0.6439480781555176, + "learning_rate": 7.726476257227965e-05, + "loss": 1.7711, + "step": 2598 + }, + { + "epoch": 0.6831543337603048, + "grad_norm": 0.9060842990875244, + "learning_rate": 7.72472402312949e-05, + "loss": 1.7901, + "step": 2600 + }, + { + "epoch": 0.6836798370939665, + "grad_norm": 0.8999879360198975, + "learning_rate": 7.722971789031015e-05, + "loss": 1.755, + "step": 2602 + }, + { + "epoch": 0.6842053404276284, + "grad_norm": 0.931769609451294, + "learning_rate": 7.72121955493254e-05, + "loss": 1.7682, + "step": 2604 + }, + { + "epoch": 0.6847308437612901, + "grad_norm": 0.6413145065307617, + "learning_rate": 7.719467320834064e-05, + "loss": 1.7774, + "step": 2606 + }, + { + "epoch": 0.6852563470949519, + "grad_norm": 0.7932469248771667, + "learning_rate": 7.717715086735589e-05, + "loss": 1.7702, + "step": 2608 + }, + { + "epoch": 0.6857818504286136, + "grad_norm": 0.9040171504020691, + "learning_rate": 7.715962852637112e-05, + "loss": 1.7799, + "step": 2610 + }, + { + "epoch": 0.6863073537622755, + "grad_norm": 0.8220160603523254, + "learning_rate": 7.714210618538637e-05, + "loss": 1.776, + "step": 2612 + }, + { + "epoch": 0.6868328570959372, + "grad_norm": 0.727171778678894, + "learning_rate": 7.712458384440162e-05, + "loss": 1.7711, + "step": 2614 + }, + { + "epoch": 0.687358360429599, + "grad_norm": 0.877223789691925, + "learning_rate": 7.710706150341685e-05, + "loss": 1.7487, + "step": 2616 + }, + { + "epoch": 0.6878838637632607, + "grad_norm": 0.7638031244277954, + "learning_rate": 7.70895391624321e-05, + "loss": 1.7925, + "step": 2618 + }, + { + "epoch": 0.6884093670969226, + "grad_norm": 0.8082540035247803, + "learning_rate": 7.707201682144735e-05, + "loss": 1.756, + "step": 2620 + }, + { + "epoch": 0.6889348704305843, + "grad_norm": 0.8931254148483276, + "learning_rate": 7.70544944804626e-05, + "loss": 1.7783, + "step": 2622 + }, + { + "epoch": 0.6894603737642461, + "grad_norm": 0.760176420211792, + "learning_rate": 7.703697213947783e-05, + "loss": 1.7603, + "step": 2624 + }, + { + "epoch": 0.6899858770979078, + "grad_norm": 0.7421066761016846, + "learning_rate": 7.701944979849308e-05, + "loss": 1.7872, + "step": 2626 + }, + { + "epoch": 0.6905113804315696, + "grad_norm": 0.7226428389549255, + "learning_rate": 7.700192745750833e-05, + "loss": 1.7947, + "step": 2628 + }, + { + "epoch": 0.6910368837652314, + "grad_norm": 0.7266933917999268, + "learning_rate": 7.698440511652357e-05, + "loss": 1.7512, + "step": 2630 + }, + { + "epoch": 0.6915623870988932, + "grad_norm": 0.6181286573410034, + "learning_rate": 7.696688277553882e-05, + "loss": 1.7382, + "step": 2632 + }, + { + "epoch": 0.6920878904325549, + "grad_norm": 0.7923069000244141, + "learning_rate": 7.694936043455407e-05, + "loss": 1.772, + "step": 2634 + }, + { + "epoch": 0.6926133937662167, + "grad_norm": 0.7019143104553223, + "learning_rate": 7.69318380935693e-05, + "loss": 1.7768, + "step": 2636 + }, + { + "epoch": 0.6931388970998785, + "grad_norm": 0.6440560817718506, + "learning_rate": 7.691431575258455e-05, + "loss": 1.784, + "step": 2638 + }, + { + "epoch": 0.6936644004335403, + "grad_norm": 0.6912879943847656, + "learning_rate": 7.689679341159978e-05, + "loss": 1.7714, + "step": 2640 + }, + { + "epoch": 0.694189903767202, + "grad_norm": 0.7078375816345215, + "learning_rate": 7.687927107061503e-05, + "loss": 1.7774, + "step": 2642 + }, + { + "epoch": 0.6947154071008638, + "grad_norm": 0.744601309299469, + "learning_rate": 7.686174872963028e-05, + "loss": 1.781, + "step": 2644 + }, + { + "epoch": 0.6952409104345255, + "grad_norm": 0.6456273198127747, + "learning_rate": 7.684422638864553e-05, + "loss": 1.7704, + "step": 2646 + }, + { + "epoch": 0.6957664137681874, + "grad_norm": 1.024562954902649, + "learning_rate": 7.682670404766077e-05, + "loss": 1.7578, + "step": 2648 + }, + { + "epoch": 0.6962919171018491, + "grad_norm": 0.7917899489402771, + "learning_rate": 7.680918170667601e-05, + "loss": 1.7853, + "step": 2650 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.8094285130500793, + "learning_rate": 7.679165936569126e-05, + "loss": 1.7531, + "step": 2652 + }, + { + "epoch": 0.6973429237691726, + "grad_norm": 0.5724372863769531, + "learning_rate": 7.67741370247065e-05, + "loss": 1.7458, + "step": 2654 + }, + { + "epoch": 0.6978684271028345, + "grad_norm": 0.8631569743156433, + "learning_rate": 7.675661468372175e-05, + "loss": 1.7559, + "step": 2656 + }, + { + "epoch": 0.6983939304364962, + "grad_norm": 0.8773946762084961, + "learning_rate": 7.6739092342737e-05, + "loss": 1.7658, + "step": 2658 + }, + { + "epoch": 0.698919433770158, + "grad_norm": 0.7718110084533691, + "learning_rate": 7.672157000175225e-05, + "loss": 1.8134, + "step": 2660 + }, + { + "epoch": 0.6994449371038197, + "grad_norm": 0.7242169380187988, + "learning_rate": 7.670404766076748e-05, + "loss": 1.7982, + "step": 2662 + }, + { + "epoch": 0.6999704404374816, + "grad_norm": 0.8516372442245483, + "learning_rate": 7.668652531978273e-05, + "loss": 1.7878, + "step": 2664 + }, + { + "epoch": 0.7004959437711433, + "grad_norm": 0.7234377861022949, + "learning_rate": 7.666900297879796e-05, + "loss": 1.808, + "step": 2666 + }, + { + "epoch": 0.7010214471048051, + "grad_norm": 0.6989150643348694, + "learning_rate": 7.665148063781321e-05, + "loss": 1.7998, + "step": 2668 + }, + { + "epoch": 0.7015469504384668, + "grad_norm": 0.8885288834571838, + "learning_rate": 7.663395829682846e-05, + "loss": 1.7768, + "step": 2670 + }, + { + "epoch": 0.7020724537721286, + "grad_norm": 0.7263723015785217, + "learning_rate": 7.66164359558437e-05, + "loss": 1.7789, + "step": 2672 + }, + { + "epoch": 0.7025979571057904, + "grad_norm": 0.7461345195770264, + "learning_rate": 7.659891361485895e-05, + "loss": 1.7583, + "step": 2674 + }, + { + "epoch": 0.7031234604394522, + "grad_norm": 1.028860330581665, + "learning_rate": 7.65813912738742e-05, + "loss": 1.7666, + "step": 2676 + }, + { + "epoch": 0.7036489637731139, + "grad_norm": 0.7666818499565125, + "learning_rate": 7.656386893288943e-05, + "loss": 1.7621, + "step": 2678 + }, + { + "epoch": 0.7041744671067757, + "grad_norm": 0.8842204809188843, + "learning_rate": 7.654634659190468e-05, + "loss": 1.7664, + "step": 2680 + }, + { + "epoch": 0.7046999704404375, + "grad_norm": 1.2368742227554321, + "learning_rate": 7.652882425091993e-05, + "loss": 1.7572, + "step": 2682 + }, + { + "epoch": 0.7052254737740993, + "grad_norm": 0.9390943646430969, + "learning_rate": 7.651130190993518e-05, + "loss": 1.7482, + "step": 2684 + }, + { + "epoch": 0.705750977107761, + "grad_norm": 0.7266187071800232, + "learning_rate": 7.649377956895042e-05, + "loss": 1.7961, + "step": 2686 + }, + { + "epoch": 0.7062764804414228, + "grad_norm": 0.6799050569534302, + "learning_rate": 7.647625722796567e-05, + "loss": 1.7712, + "step": 2688 + }, + { + "epoch": 0.7068019837750845, + "grad_norm": 0.8757466077804565, + "learning_rate": 7.64587348869809e-05, + "loss": 1.7721, + "step": 2690 + }, + { + "epoch": 0.7073274871087464, + "grad_norm": 0.8347486257553101, + "learning_rate": 7.644121254599614e-05, + "loss": 1.7481, + "step": 2692 + }, + { + "epoch": 0.7078529904424081, + "grad_norm": 0.7270652055740356, + "learning_rate": 7.642369020501139e-05, + "loss": 1.7818, + "step": 2694 + }, + { + "epoch": 0.7083784937760699, + "grad_norm": 0.563240110874176, + "learning_rate": 7.640616786402663e-05, + "loss": 1.769, + "step": 2696 + }, + { + "epoch": 0.7089039971097316, + "grad_norm": 0.8982949256896973, + "learning_rate": 7.638864552304188e-05, + "loss": 1.7806, + "step": 2698 + }, + { + "epoch": 0.7094295004433935, + "grad_norm": 0.723839282989502, + "learning_rate": 7.637112318205713e-05, + "loss": 1.7947, + "step": 2700 + }, + { + "epoch": 0.7099550037770552, + "grad_norm": 0.7281327247619629, + "learning_rate": 7.635360084107238e-05, + "loss": 1.7763, + "step": 2702 + }, + { + "epoch": 0.710480507110717, + "grad_norm": 0.8034355044364929, + "learning_rate": 7.633607850008761e-05, + "loss": 1.7894, + "step": 2704 + }, + { + "epoch": 0.7110060104443787, + "grad_norm": 0.7230488061904907, + "learning_rate": 7.631855615910286e-05, + "loss": 1.7715, + "step": 2706 + }, + { + "epoch": 0.7115315137780406, + "grad_norm": 0.9625870585441589, + "learning_rate": 7.63010338181181e-05, + "loss": 1.7966, + "step": 2708 + }, + { + "epoch": 0.7120570171117023, + "grad_norm": 0.6187042593955994, + "learning_rate": 7.628351147713335e-05, + "loss": 1.791, + "step": 2710 + }, + { + "epoch": 0.7125825204453641, + "grad_norm": 0.7607492208480835, + "learning_rate": 7.62659891361486e-05, + "loss": 1.8201, + "step": 2712 + }, + { + "epoch": 0.7131080237790258, + "grad_norm": 0.8220197558403015, + "learning_rate": 7.624846679516385e-05, + "loss": 1.7512, + "step": 2714 + }, + { + "epoch": 0.7136335271126876, + "grad_norm": 0.7219741344451904, + "learning_rate": 7.623094445417908e-05, + "loss": 1.775, + "step": 2716 + }, + { + "epoch": 0.7141590304463494, + "grad_norm": 0.9267223477363586, + "learning_rate": 7.621342211319432e-05, + "loss": 1.7833, + "step": 2718 + }, + { + "epoch": 0.7146845337800112, + "grad_norm": 0.6037401556968689, + "learning_rate": 7.619589977220956e-05, + "loss": 1.7956, + "step": 2720 + }, + { + "epoch": 0.7152100371136729, + "grad_norm": 0.7281090021133423, + "learning_rate": 7.617837743122481e-05, + "loss": 1.7341, + "step": 2722 + }, + { + "epoch": 0.7157355404473347, + "grad_norm": 0.8907596468925476, + "learning_rate": 7.616085509024006e-05, + "loss": 1.754, + "step": 2724 + }, + { + "epoch": 0.7162610437809965, + "grad_norm": 0.7699323296546936, + "learning_rate": 7.61433327492553e-05, + "loss": 1.7591, + "step": 2726 + }, + { + "epoch": 0.7167865471146583, + "grad_norm": 0.7458289265632629, + "learning_rate": 7.612581040827055e-05, + "loss": 1.8033, + "step": 2728 + }, + { + "epoch": 0.71731205044832, + "grad_norm": 1.1340159177780151, + "learning_rate": 7.610828806728579e-05, + "loss": 1.7821, + "step": 2730 + }, + { + "epoch": 0.7178375537819818, + "grad_norm": 0.6701779365539551, + "learning_rate": 7.609076572630104e-05, + "loss": 1.7681, + "step": 2732 + }, + { + "epoch": 0.7183630571156436, + "grad_norm": 0.7182425260543823, + "learning_rate": 7.607324338531628e-05, + "loss": 1.7735, + "step": 2734 + }, + { + "epoch": 0.7188885604493054, + "grad_norm": 0.8409538865089417, + "learning_rate": 7.605572104433153e-05, + "loss": 1.7535, + "step": 2736 + }, + { + "epoch": 0.7194140637829671, + "grad_norm": 0.7824596166610718, + "learning_rate": 7.603819870334678e-05, + "loss": 1.7853, + "step": 2738 + }, + { + "epoch": 0.7199395671166289, + "grad_norm": 1.0866069793701172, + "learning_rate": 7.602067636236203e-05, + "loss": 1.7729, + "step": 2740 + }, + { + "epoch": 0.7204650704502906, + "grad_norm": 0.7940483093261719, + "learning_rate": 7.600315402137726e-05, + "loss": 1.7643, + "step": 2742 + }, + { + "epoch": 0.7209905737839525, + "grad_norm": 1.0263162851333618, + "learning_rate": 7.59856316803925e-05, + "loss": 1.7533, + "step": 2744 + }, + { + "epoch": 0.7215160771176142, + "grad_norm": 0.7818773984909058, + "learning_rate": 7.596810933940774e-05, + "loss": 1.7526, + "step": 2746 + }, + { + "epoch": 0.722041580451276, + "grad_norm": 0.7003962397575378, + "learning_rate": 7.595058699842299e-05, + "loss": 1.7635, + "step": 2748 + }, + { + "epoch": 0.7225670837849377, + "grad_norm": 0.7112312316894531, + "learning_rate": 7.593306465743824e-05, + "loss": 1.7608, + "step": 2750 + }, + { + "epoch": 0.7230925871185996, + "grad_norm": 0.8331362009048462, + "learning_rate": 7.591554231645348e-05, + "loss": 1.7899, + "step": 2752 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.7973011136054993, + "learning_rate": 7.589801997546873e-05, + "loss": 1.757, + "step": 2754 + }, + { + "epoch": 0.7241435937859231, + "grad_norm": 0.6794710159301758, + "learning_rate": 7.588049763448397e-05, + "loss": 1.7516, + "step": 2756 + }, + { + "epoch": 0.7246690971195848, + "grad_norm": 0.9882004857063293, + "learning_rate": 7.586297529349921e-05, + "loss": 1.7499, + "step": 2758 + }, + { + "epoch": 0.7251946004532466, + "grad_norm": 0.6301134824752808, + "learning_rate": 7.584545295251446e-05, + "loss": 1.7785, + "step": 2760 + }, + { + "epoch": 0.7257201037869084, + "grad_norm": 0.8858461976051331, + "learning_rate": 7.582793061152971e-05, + "loss": 1.7956, + "step": 2762 + }, + { + "epoch": 0.7262456071205702, + "grad_norm": 0.7689917087554932, + "learning_rate": 7.581040827054496e-05, + "loss": 1.7588, + "step": 2764 + }, + { + "epoch": 0.7267711104542319, + "grad_norm": 0.7277519702911377, + "learning_rate": 7.57928859295602e-05, + "loss": 1.7878, + "step": 2766 + }, + { + "epoch": 0.7272966137878937, + "grad_norm": 0.7690980434417725, + "learning_rate": 7.577536358857544e-05, + "loss": 1.7738, + "step": 2768 + }, + { + "epoch": 0.7278221171215555, + "grad_norm": 0.9652357697486877, + "learning_rate": 7.575784124759067e-05, + "loss": 1.8106, + "step": 2770 + }, + { + "epoch": 0.7283476204552173, + "grad_norm": 0.7968404293060303, + "learning_rate": 7.574031890660592e-05, + "loss": 1.7581, + "step": 2772 + }, + { + "epoch": 0.728873123788879, + "grad_norm": 0.7790765762329102, + "learning_rate": 7.572279656562117e-05, + "loss": 1.7257, + "step": 2774 + }, + { + "epoch": 0.7293986271225408, + "grad_norm": 0.93352872133255, + "learning_rate": 7.570527422463641e-05, + "loss": 1.7343, + "step": 2776 + }, + { + "epoch": 0.7299241304562026, + "grad_norm": 0.8516173958778381, + "learning_rate": 7.568775188365166e-05, + "loss": 1.7442, + "step": 2778 + }, + { + "epoch": 0.7304496337898644, + "grad_norm": 0.6152732968330383, + "learning_rate": 7.567022954266691e-05, + "loss": 1.7601, + "step": 2780 + }, + { + "epoch": 0.7309751371235261, + "grad_norm": 0.6822034120559692, + "learning_rate": 7.565270720168214e-05, + "loss": 1.7915, + "step": 2782 + }, + { + "epoch": 0.7315006404571879, + "grad_norm": 0.6148113012313843, + "learning_rate": 7.563518486069739e-05, + "loss": 1.7642, + "step": 2784 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.9175143241882324, + "learning_rate": 7.561766251971264e-05, + "loss": 1.7588, + "step": 2786 + }, + { + "epoch": 0.7325516471245115, + "grad_norm": 0.6481335759162903, + "learning_rate": 7.560014017872789e-05, + "loss": 1.7838, + "step": 2788 + }, + { + "epoch": 0.7330771504581732, + "grad_norm": 0.712563693523407, + "learning_rate": 7.558261783774313e-05, + "loss": 1.7449, + "step": 2790 + }, + { + "epoch": 0.733602653791835, + "grad_norm": 0.7027430534362793, + "learning_rate": 7.556509549675838e-05, + "loss": 1.7544, + "step": 2792 + }, + { + "epoch": 0.7341281571254967, + "grad_norm": 0.7059524059295654, + "learning_rate": 7.554757315577362e-05, + "loss": 1.7766, + "step": 2794 + }, + { + "epoch": 0.7346536604591586, + "grad_norm": 0.7603355050086975, + "learning_rate": 7.553005081478885e-05, + "loss": 1.7778, + "step": 2796 + }, + { + "epoch": 0.7351791637928203, + "grad_norm": 0.8169555068016052, + "learning_rate": 7.55125284738041e-05, + "loss": 1.768, + "step": 2798 + }, + { + "epoch": 0.7357046671264821, + "grad_norm": 0.7924453616142273, + "learning_rate": 7.549500613281934e-05, + "loss": 1.7617, + "step": 2800 + }, + { + "epoch": 0.7357046671264821, + "eval_loss": 1.736789345741272, + "eval_runtime": 487.2587, + "eval_samples_per_second": 249.947, + "eval_steps_per_second": 31.244, + "step": 2800 + }, + { + "epoch": 0.7362301704601438, + "grad_norm": 0.6122123003005981, + "learning_rate": 7.547748379183459e-05, + "loss": 1.7861, + "step": 2802 + }, + { + "epoch": 0.7367556737938056, + "grad_norm": 0.7799514532089233, + "learning_rate": 7.545996145084984e-05, + "loss": 1.7612, + "step": 2804 + }, + { + "epoch": 0.7372811771274674, + "grad_norm": 0.6509242057800293, + "learning_rate": 7.544243910986509e-05, + "loss": 1.8034, + "step": 2806 + }, + { + "epoch": 0.7378066804611292, + "grad_norm": 0.6152432560920715, + "learning_rate": 7.542491676888032e-05, + "loss": 1.7565, + "step": 2808 + }, + { + "epoch": 0.7383321837947909, + "grad_norm": 0.6861807703971863, + "learning_rate": 7.540739442789557e-05, + "loss": 1.7622, + "step": 2810 + }, + { + "epoch": 0.7388576871284527, + "grad_norm": 0.6434677839279175, + "learning_rate": 7.538987208691082e-05, + "loss": 1.7778, + "step": 2812 + }, + { + "epoch": 0.7393831904621145, + "grad_norm": 0.8894173502922058, + "learning_rate": 7.537234974592606e-05, + "loss": 1.7921, + "step": 2814 + }, + { + "epoch": 0.7399086937957763, + "grad_norm": 0.6787139177322388, + "learning_rate": 7.535482740494131e-05, + "loss": 1.76, + "step": 2816 + }, + { + "epoch": 0.740434197129438, + "grad_norm": 0.6434260010719299, + "learning_rate": 7.533730506395656e-05, + "loss": 1.7702, + "step": 2818 + }, + { + "epoch": 0.7409597004630998, + "grad_norm": 0.6635193228721619, + "learning_rate": 7.531978272297179e-05, + "loss": 1.7631, + "step": 2820 + }, + { + "epoch": 0.7414852037967616, + "grad_norm": 0.6992824077606201, + "learning_rate": 7.530226038198703e-05, + "loss": 1.8017, + "step": 2822 + }, + { + "epoch": 0.7420107071304234, + "grad_norm": 0.6800288558006287, + "learning_rate": 7.528473804100227e-05, + "loss": 1.764, + "step": 2824 + }, + { + "epoch": 0.7425362104640851, + "grad_norm": 1.0732684135437012, + "learning_rate": 7.526721570001752e-05, + "loss": 1.7818, + "step": 2826 + }, + { + "epoch": 0.7430617137977469, + "grad_norm": 0.7014878392219543, + "learning_rate": 7.524969335903277e-05, + "loss": 1.7774, + "step": 2828 + }, + { + "epoch": 0.7435872171314086, + "grad_norm": 0.5895276069641113, + "learning_rate": 7.523217101804802e-05, + "loss": 1.7622, + "step": 2830 + }, + { + "epoch": 0.7441127204650705, + "grad_norm": 0.8901596069335938, + "learning_rate": 7.521464867706326e-05, + "loss": 1.7579, + "step": 2832 + }, + { + "epoch": 0.7446382237987322, + "grad_norm": 0.9512175917625427, + "learning_rate": 7.51971263360785e-05, + "loss": 1.7481, + "step": 2834 + }, + { + "epoch": 0.745163727132394, + "grad_norm": 0.8138533234596252, + "learning_rate": 7.517960399509375e-05, + "loss": 1.7612, + "step": 2836 + }, + { + "epoch": 0.7456892304660557, + "grad_norm": 0.7234623432159424, + "learning_rate": 7.5162081654109e-05, + "loss": 1.7587, + "step": 2838 + }, + { + "epoch": 0.7462147337997176, + "grad_norm": 1.0871793031692505, + "learning_rate": 7.514455931312424e-05, + "loss": 1.7368, + "step": 2840 + }, + { + "epoch": 0.7467402371333793, + "grad_norm": 0.7515408992767334, + "learning_rate": 7.512703697213949e-05, + "loss": 1.8096, + "step": 2842 + }, + { + "epoch": 0.7472657404670411, + "grad_norm": 0.6623795032501221, + "learning_rate": 7.510951463115472e-05, + "loss": 1.7683, + "step": 2844 + }, + { + "epoch": 0.7477912438007028, + "grad_norm": 0.648363471031189, + "learning_rate": 7.509199229016997e-05, + "loss": 1.8195, + "step": 2846 + }, + { + "epoch": 0.7483167471343646, + "grad_norm": 0.7021984457969666, + "learning_rate": 7.50744699491852e-05, + "loss": 1.7918, + "step": 2848 + }, + { + "epoch": 0.7488422504680264, + "grad_norm": 0.7864859104156494, + "learning_rate": 7.505694760820045e-05, + "loss": 1.8118, + "step": 2850 + }, + { + "epoch": 0.7493677538016882, + "grad_norm": 0.6326330900192261, + "learning_rate": 7.50394252672157e-05, + "loss": 1.7781, + "step": 2852 + }, + { + "epoch": 0.7498932571353499, + "grad_norm": 0.6461377143859863, + "learning_rate": 7.502190292623095e-05, + "loss": 1.77, + "step": 2854 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.6641056537628174, + "learning_rate": 7.50043805852462e-05, + "loss": 1.7442, + "step": 2856 + }, + { + "epoch": 0.7509442638026735, + "grad_norm": 0.6882733702659607, + "learning_rate": 7.498685824426144e-05, + "loss": 1.7583, + "step": 2858 + }, + { + "epoch": 0.7514697671363353, + "grad_norm": 0.9753492474555969, + "learning_rate": 7.496933590327668e-05, + "loss": 1.7557, + "step": 2860 + }, + { + "epoch": 0.751995270469997, + "grad_norm": 0.6425254940986633, + "learning_rate": 7.495181356229192e-05, + "loss": 1.8, + "step": 2862 + }, + { + "epoch": 0.7525207738036588, + "grad_norm": 0.7045446038246155, + "learning_rate": 7.493429122130717e-05, + "loss": 1.7532, + "step": 2864 + }, + { + "epoch": 0.7530462771373206, + "grad_norm": 0.6855698227882385, + "learning_rate": 7.491676888032242e-05, + "loss": 1.7652, + "step": 2866 + }, + { + "epoch": 0.7535717804709824, + "grad_norm": 0.7373823523521423, + "learning_rate": 7.489924653933767e-05, + "loss": 1.7896, + "step": 2868 + }, + { + "epoch": 0.7540972838046441, + "grad_norm": 0.772221565246582, + "learning_rate": 7.48817241983529e-05, + "loss": 1.7712, + "step": 2870 + }, + { + "epoch": 0.7546227871383059, + "grad_norm": 0.7220898270606995, + "learning_rate": 7.486420185736815e-05, + "loss": 1.7298, + "step": 2872 + }, + { + "epoch": 0.7551482904719676, + "grad_norm": 0.6807803511619568, + "learning_rate": 7.484667951638338e-05, + "loss": 1.7767, + "step": 2874 + }, + { + "epoch": 0.7556737938056295, + "grad_norm": 0.7383838891983032, + "learning_rate": 7.482915717539863e-05, + "loss": 1.7722, + "step": 2876 + }, + { + "epoch": 0.7561992971392912, + "grad_norm": 0.7259317636489868, + "learning_rate": 7.481163483441388e-05, + "loss": 1.7623, + "step": 2878 + }, + { + "epoch": 0.756724800472953, + "grad_norm": 0.7161348462104797, + "learning_rate": 7.479411249342913e-05, + "loss": 1.7379, + "step": 2880 + }, + { + "epoch": 0.7572503038066147, + "grad_norm": 0.5866290330886841, + "learning_rate": 7.477659015244437e-05, + "loss": 1.7687, + "step": 2882 + }, + { + "epoch": 0.7577758071402766, + "grad_norm": 0.6606796979904175, + "learning_rate": 7.475906781145962e-05, + "loss": 1.7833, + "step": 2884 + }, + { + "epoch": 0.7583013104739383, + "grad_norm": 0.6400638818740845, + "learning_rate": 7.474154547047485e-05, + "loss": 1.75, + "step": 2886 + }, + { + "epoch": 0.7588268138076001, + "grad_norm": 0.6338980793952942, + "learning_rate": 7.47240231294901e-05, + "loss": 1.7836, + "step": 2888 + }, + { + "epoch": 0.7593523171412618, + "grad_norm": 0.6896232962608337, + "learning_rate": 7.470650078850535e-05, + "loss": 1.7473, + "step": 2890 + }, + { + "epoch": 0.7598778204749237, + "grad_norm": 0.7504851222038269, + "learning_rate": 7.46889784475206e-05, + "loss": 1.7587, + "step": 2892 + }, + { + "epoch": 0.7604033238085854, + "grad_norm": 0.6796631217002869, + "learning_rate": 7.467145610653584e-05, + "loss": 1.7469, + "step": 2894 + }, + { + "epoch": 0.7609288271422472, + "grad_norm": 0.6032044887542725, + "learning_rate": 7.465393376555108e-05, + "loss": 1.7651, + "step": 2896 + }, + { + "epoch": 0.7614543304759089, + "grad_norm": 0.6399370431900024, + "learning_rate": 7.463641142456633e-05, + "loss": 1.8041, + "step": 2898 + }, + { + "epoch": 0.7619798338095707, + "grad_norm": 0.6167407631874084, + "learning_rate": 7.461888908358156e-05, + "loss": 1.7743, + "step": 2900 + }, + { + "epoch": 0.7625053371432325, + "grad_norm": 0.5767862200737, + "learning_rate": 7.460136674259681e-05, + "loss": 1.7598, + "step": 2902 + }, + { + "epoch": 0.7630308404768943, + "grad_norm": 0.6222682595252991, + "learning_rate": 7.458384440161206e-05, + "loss": 1.7659, + "step": 2904 + }, + { + "epoch": 0.763556343810556, + "grad_norm": 0.6252115368843079, + "learning_rate": 7.45663220606273e-05, + "loss": 1.7671, + "step": 2906 + }, + { + "epoch": 0.7640818471442178, + "grad_norm": 0.6266006231307983, + "learning_rate": 7.454879971964255e-05, + "loss": 1.7786, + "step": 2908 + }, + { + "epoch": 0.7646073504778796, + "grad_norm": 0.7761850357055664, + "learning_rate": 7.45312773786578e-05, + "loss": 1.7185, + "step": 2910 + }, + { + "epoch": 0.7651328538115414, + "grad_norm": 0.6215353608131409, + "learning_rate": 7.451375503767303e-05, + "loss": 1.7478, + "step": 2912 + }, + { + "epoch": 0.7656583571452031, + "grad_norm": 0.7758198380470276, + "learning_rate": 7.449623269668828e-05, + "loss": 1.7437, + "step": 2914 + }, + { + "epoch": 0.7661838604788649, + "grad_norm": 0.5999752879142761, + "learning_rate": 7.447871035570353e-05, + "loss": 1.7534, + "step": 2916 + }, + { + "epoch": 0.7667093638125266, + "grad_norm": 0.6463642120361328, + "learning_rate": 7.446118801471877e-05, + "loss": 1.7589, + "step": 2918 + }, + { + "epoch": 0.7672348671461885, + "grad_norm": 0.7435876727104187, + "learning_rate": 7.444366567373402e-05, + "loss": 1.8053, + "step": 2920 + }, + { + "epoch": 0.7677603704798502, + "grad_norm": 0.7085327506065369, + "learning_rate": 7.442614333274926e-05, + "loss": 1.748, + "step": 2922 + }, + { + "epoch": 0.768285873813512, + "grad_norm": 0.6690971255302429, + "learning_rate": 7.44086209917645e-05, + "loss": 1.808, + "step": 2924 + }, + { + "epoch": 0.7688113771471737, + "grad_norm": 0.652035653591156, + "learning_rate": 7.439109865077975e-05, + "loss": 1.7491, + "step": 2926 + }, + { + "epoch": 0.7693368804808356, + "grad_norm": 0.6042243242263794, + "learning_rate": 7.437357630979499e-05, + "loss": 1.751, + "step": 2928 + }, + { + "epoch": 0.7698623838144973, + "grad_norm": 0.5847947597503662, + "learning_rate": 7.435605396881023e-05, + "loss": 1.7539, + "step": 2930 + }, + { + "epoch": 0.7703878871481591, + "grad_norm": 0.690543532371521, + "learning_rate": 7.433853162782548e-05, + "loss": 1.7448, + "step": 2932 + }, + { + "epoch": 0.7709133904818208, + "grad_norm": 0.7835954427719116, + "learning_rate": 7.432100928684073e-05, + "loss": 1.7352, + "step": 2934 + }, + { + "epoch": 0.7714388938154827, + "grad_norm": 0.8784381747245789, + "learning_rate": 7.430348694585598e-05, + "loss": 1.7802, + "step": 2936 + }, + { + "epoch": 0.7719643971491444, + "grad_norm": 0.8104349970817566, + "learning_rate": 7.428596460487122e-05, + "loss": 1.755, + "step": 2938 + }, + { + "epoch": 0.7724899004828062, + "grad_norm": 0.6043236255645752, + "learning_rate": 7.426844226388646e-05, + "loss": 1.8118, + "step": 2940 + }, + { + "epoch": 0.7730154038164679, + "grad_norm": 1.0382120609283447, + "learning_rate": 7.42509199229017e-05, + "loss": 1.7832, + "step": 2942 + }, + { + "epoch": 0.7735409071501297, + "grad_norm": 0.6753326654434204, + "learning_rate": 7.423339758191695e-05, + "loss": 1.7886, + "step": 2944 + }, + { + "epoch": 0.7740664104837915, + "grad_norm": 1.226515531539917, + "learning_rate": 7.421587524093219e-05, + "loss": 1.7406, + "step": 2946 + }, + { + "epoch": 0.7745919138174533, + "grad_norm": 0.7555555105209351, + "learning_rate": 7.419835289994743e-05, + "loss": 1.726, + "step": 2948 + }, + { + "epoch": 0.775117417151115, + "grad_norm": 1.073789119720459, + "learning_rate": 7.418083055896268e-05, + "loss": 1.7905, + "step": 2950 + }, + { + "epoch": 0.7756429204847768, + "grad_norm": 0.883283793926239, + "learning_rate": 7.416330821797793e-05, + "loss": 1.7516, + "step": 2952 + }, + { + "epoch": 0.7761684238184386, + "grad_norm": 0.8123281598091125, + "learning_rate": 7.414578587699316e-05, + "loss": 1.7689, + "step": 2954 + }, + { + "epoch": 0.7766939271521004, + "grad_norm": 0.6473522782325745, + "learning_rate": 7.412826353600841e-05, + "loss": 1.7966, + "step": 2956 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.6950981616973877, + "learning_rate": 7.411074119502366e-05, + "loss": 1.7494, + "step": 2958 + }, + { + "epoch": 0.7777449338194239, + "grad_norm": 0.7919045686721802, + "learning_rate": 7.40932188540389e-05, + "loss": 1.7992, + "step": 2960 + }, + { + "epoch": 0.7782704371530856, + "grad_norm": 0.8082287311553955, + "learning_rate": 7.407569651305415e-05, + "loss": 1.758, + "step": 2962 + }, + { + "epoch": 0.7787959404867475, + "grad_norm": 0.634069561958313, + "learning_rate": 7.40581741720694e-05, + "loss": 1.7855, + "step": 2964 + }, + { + "epoch": 0.7793214438204092, + "grad_norm": 1.0830199718475342, + "learning_rate": 7.404065183108463e-05, + "loss": 1.7509, + "step": 2966 + }, + { + "epoch": 0.779846947154071, + "grad_norm": 0.6961039304733276, + "learning_rate": 7.402312949009988e-05, + "loss": 1.7491, + "step": 2968 + }, + { + "epoch": 0.7803724504877327, + "grad_norm": 0.7842292189598083, + "learning_rate": 7.400560714911513e-05, + "loss": 1.736, + "step": 2970 + }, + { + "epoch": 0.7808979538213946, + "grad_norm": 0.6680390238761902, + "learning_rate": 7.398808480813036e-05, + "loss": 1.7594, + "step": 2972 + }, + { + "epoch": 0.7814234571550563, + "grad_norm": 0.8992615342140198, + "learning_rate": 7.397056246714561e-05, + "loss": 1.7811, + "step": 2974 + }, + { + "epoch": 0.7819489604887181, + "grad_norm": 0.6779314279556274, + "learning_rate": 7.395304012616086e-05, + "loss": 1.7675, + "step": 2976 + }, + { + "epoch": 0.7824744638223798, + "grad_norm": 0.5508474111557007, + "learning_rate": 7.39355177851761e-05, + "loss": 1.8152, + "step": 2978 + }, + { + "epoch": 0.7829999671560417, + "grad_norm": 0.6748946905136108, + "learning_rate": 7.391799544419134e-05, + "loss": 1.7596, + "step": 2980 + }, + { + "epoch": 0.7835254704897034, + "grad_norm": 0.5707883834838867, + "learning_rate": 7.390047310320659e-05, + "loss": 1.765, + "step": 2982 + }, + { + "epoch": 0.7840509738233652, + "grad_norm": 0.6725517511367798, + "learning_rate": 7.388295076222184e-05, + "loss": 1.7528, + "step": 2984 + }, + { + "epoch": 0.7845764771570269, + "grad_norm": 0.6516979336738586, + "learning_rate": 7.386542842123708e-05, + "loss": 1.7613, + "step": 2986 + }, + { + "epoch": 0.7851019804906887, + "grad_norm": 0.6247593760490417, + "learning_rate": 7.384790608025233e-05, + "loss": 1.7519, + "step": 2988 + }, + { + "epoch": 0.7856274838243505, + "grad_norm": 0.6806461811065674, + "learning_rate": 7.383038373926758e-05, + "loss": 1.7501, + "step": 2990 + }, + { + "epoch": 0.7861529871580123, + "grad_norm": 0.8175075054168701, + "learning_rate": 7.381286139828281e-05, + "loss": 1.7543, + "step": 2992 + }, + { + "epoch": 0.786678490491674, + "grad_norm": 0.8868039846420288, + "learning_rate": 7.379533905729806e-05, + "loss": 1.7285, + "step": 2994 + }, + { + "epoch": 0.7872039938253358, + "grad_norm": 0.9624283909797668, + "learning_rate": 7.377781671631331e-05, + "loss": 1.7679, + "step": 2996 + }, + { + "epoch": 0.7877294971589976, + "grad_norm": 0.8716928958892822, + "learning_rate": 7.376029437532854e-05, + "loss": 1.7642, + "step": 2998 + }, + { + "epoch": 0.7882550004926594, + "grad_norm": 0.6427202224731445, + "learning_rate": 7.374277203434379e-05, + "loss": 1.777, + "step": 3000 + }, + { + "epoch": 0.7887805038263211, + "grad_norm": 0.6569937467575073, + "learning_rate": 7.372524969335904e-05, + "loss": 1.7414, + "step": 3002 + }, + { + "epoch": 0.7893060071599829, + "grad_norm": 1.117759346961975, + "learning_rate": 7.370772735237428e-05, + "loss": 1.7709, + "step": 3004 + }, + { + "epoch": 0.7898315104936446, + "grad_norm": 0.6267141103744507, + "learning_rate": 7.369020501138952e-05, + "loss": 1.7871, + "step": 3006 + }, + { + "epoch": 0.7903570138273065, + "grad_norm": 0.8071964979171753, + "learning_rate": 7.367268267040477e-05, + "loss": 1.7471, + "step": 3008 + }, + { + "epoch": 0.7908825171609682, + "grad_norm": 0.702384352684021, + "learning_rate": 7.365516032942001e-05, + "loss": 1.7434, + "step": 3010 + }, + { + "epoch": 0.79140802049463, + "grad_norm": 0.6770474314689636, + "learning_rate": 7.363763798843526e-05, + "loss": 1.782, + "step": 3012 + }, + { + "epoch": 0.7919335238282917, + "grad_norm": 0.6293635964393616, + "learning_rate": 7.362011564745051e-05, + "loss": 1.7682, + "step": 3014 + }, + { + "epoch": 0.7924590271619536, + "grad_norm": 0.8292271494865417, + "learning_rate": 7.360259330646576e-05, + "loss": 1.7729, + "step": 3016 + }, + { + "epoch": 0.7929845304956153, + "grad_norm": 0.6828389167785645, + "learning_rate": 7.358507096548099e-05, + "loss": 1.7521, + "step": 3018 + }, + { + "epoch": 0.7935100338292771, + "grad_norm": 0.6849939823150635, + "learning_rate": 7.356754862449624e-05, + "loss": 1.7619, + "step": 3020 + }, + { + "epoch": 0.7940355371629388, + "grad_norm": 0.720439612865448, + "learning_rate": 7.355002628351149e-05, + "loss": 1.7725, + "step": 3022 + }, + { + "epoch": 0.7945610404966007, + "grad_norm": 0.6729586124420166, + "learning_rate": 7.353250394252672e-05, + "loss": 1.7711, + "step": 3024 + }, + { + "epoch": 0.7950865438302624, + "grad_norm": 1.0918195247650146, + "learning_rate": 7.351498160154197e-05, + "loss": 1.7764, + "step": 3026 + }, + { + "epoch": 0.7956120471639242, + "grad_norm": 0.658743143081665, + "learning_rate": 7.349745926055721e-05, + "loss": 1.7641, + "step": 3028 + }, + { + "epoch": 0.7961375504975859, + "grad_norm": 0.5973094701766968, + "learning_rate": 7.347993691957246e-05, + "loss": 1.7554, + "step": 3030 + }, + { + "epoch": 0.7966630538312477, + "grad_norm": 0.6641756296157837, + "learning_rate": 7.34624145785877e-05, + "loss": 1.7656, + "step": 3032 + }, + { + "epoch": 0.7971885571649096, + "grad_norm": 0.7535148859024048, + "learning_rate": 7.344489223760294e-05, + "loss": 1.7521, + "step": 3034 + }, + { + "epoch": 0.7977140604985713, + "grad_norm": 0.9680157899856567, + "learning_rate": 7.342736989661819e-05, + "loss": 1.7794, + "step": 3036 + }, + { + "epoch": 0.798239563832233, + "grad_norm": 0.7855157852172852, + "learning_rate": 7.340984755563344e-05, + "loss": 1.7853, + "step": 3038 + }, + { + "epoch": 0.7987650671658948, + "grad_norm": 0.6831044554710388, + "learning_rate": 7.339232521464869e-05, + "loss": 1.7443, + "step": 3040 + }, + { + "epoch": 0.7992905704995567, + "grad_norm": 0.7435096502304077, + "learning_rate": 7.337480287366393e-05, + "loss": 1.7585, + "step": 3042 + }, + { + "epoch": 0.7998160738332184, + "grad_norm": 0.6614308953285217, + "learning_rate": 7.335728053267917e-05, + "loss": 1.7872, + "step": 3044 + }, + { + "epoch": 0.8003415771668801, + "grad_norm": 0.5782405138015747, + "learning_rate": 7.333975819169442e-05, + "loss": 1.7504, + "step": 3046 + }, + { + "epoch": 0.8008670805005419, + "grad_norm": 0.6519070267677307, + "learning_rate": 7.332223585070965e-05, + "loss": 1.7396, + "step": 3048 + }, + { + "epoch": 0.8013925838342038, + "grad_norm": 0.5866365432739258, + "learning_rate": 7.33047135097249e-05, + "loss": 1.7591, + "step": 3050 + }, + { + "epoch": 0.8019180871678655, + "grad_norm": 0.6096078753471375, + "learning_rate": 7.328719116874014e-05, + "loss": 1.7871, + "step": 3052 + }, + { + "epoch": 0.8024435905015272, + "grad_norm": 0.6846382021903992, + "learning_rate": 7.326966882775539e-05, + "loss": 1.7773, + "step": 3054 + }, + { + "epoch": 0.802969093835189, + "grad_norm": 0.6193353533744812, + "learning_rate": 7.325214648677064e-05, + "loss": 1.753, + "step": 3056 + }, + { + "epoch": 0.8034945971688507, + "grad_norm": 0.6320629119873047, + "learning_rate": 7.323462414578587e-05, + "loss": 1.7679, + "step": 3058 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.5982667803764343, + "learning_rate": 7.321710180480112e-05, + "loss": 1.7307, + "step": 3060 + }, + { + "epoch": 0.8045456038361744, + "grad_norm": 0.7248689532279968, + "learning_rate": 7.319957946381637e-05, + "loss": 1.7933, + "step": 3062 + }, + { + "epoch": 0.8050711071698361, + "grad_norm": 0.7433560490608215, + "learning_rate": 7.318205712283162e-05, + "loss": 1.7393, + "step": 3064 + }, + { + "epoch": 0.8055966105034978, + "grad_norm": 0.6755779981613159, + "learning_rate": 7.316453478184686e-05, + "loss": 1.7589, + "step": 3066 + }, + { + "epoch": 0.8061221138371597, + "grad_norm": 0.6949239373207092, + "learning_rate": 7.314701244086211e-05, + "loss": 1.7677, + "step": 3068 + }, + { + "epoch": 0.8066476171708215, + "grad_norm": 0.6781786680221558, + "learning_rate": 7.312949009987735e-05, + "loss": 1.7027, + "step": 3070 + }, + { + "epoch": 0.8071731205044832, + "grad_norm": 0.683310866355896, + "learning_rate": 7.311196775889259e-05, + "loss": 1.7205, + "step": 3072 + }, + { + "epoch": 0.807698623838145, + "grad_norm": 0.5861634016036987, + "learning_rate": 7.309444541790783e-05, + "loss": 1.7412, + "step": 3074 + }, + { + "epoch": 0.8082241271718067, + "grad_norm": 0.7344016432762146, + "learning_rate": 7.307692307692307e-05, + "loss": 1.7988, + "step": 3076 + }, + { + "epoch": 0.8087496305054686, + "grad_norm": 0.5996577143669128, + "learning_rate": 7.305940073593832e-05, + "loss": 1.765, + "step": 3078 + }, + { + "epoch": 0.8092751338391303, + "grad_norm": 0.5766566395759583, + "learning_rate": 7.304187839495357e-05, + "loss": 1.8002, + "step": 3080 + }, + { + "epoch": 0.809800637172792, + "grad_norm": 0.6364811658859253, + "learning_rate": 7.302435605396882e-05, + "loss": 1.7687, + "step": 3082 + }, + { + "epoch": 0.8103261405064538, + "grad_norm": 0.839227557182312, + "learning_rate": 7.300683371298405e-05, + "loss": 1.7501, + "step": 3084 + }, + { + "epoch": 0.8108516438401157, + "grad_norm": 0.6285102367401123, + "learning_rate": 7.29893113719993e-05, + "loss": 1.7503, + "step": 3086 + }, + { + "epoch": 0.8113771471737774, + "grad_norm": 0.6087677478790283, + "learning_rate": 7.297178903101455e-05, + "loss": 1.7873, + "step": 3088 + }, + { + "epoch": 0.8119026505074391, + "grad_norm": 0.9094337821006775, + "learning_rate": 7.29542666900298e-05, + "loss": 1.7501, + "step": 3090 + }, + { + "epoch": 0.8124281538411009, + "grad_norm": 0.6166443228721619, + "learning_rate": 7.293674434904504e-05, + "loss": 1.7809, + "step": 3092 + }, + { + "epoch": 0.8129536571747628, + "grad_norm": 0.6993762850761414, + "learning_rate": 7.291922200806029e-05, + "loss": 1.776, + "step": 3094 + }, + { + "epoch": 0.8134791605084245, + "grad_norm": 0.6359695792198181, + "learning_rate": 7.290169966707552e-05, + "loss": 1.7177, + "step": 3096 + }, + { + "epoch": 0.8140046638420863, + "grad_norm": 0.8014838695526123, + "learning_rate": 7.288417732609077e-05, + "loss": 1.7488, + "step": 3098 + }, + { + "epoch": 0.814530167175748, + "grad_norm": 0.6601728200912476, + "learning_rate": 7.2866654985106e-05, + "loss": 1.8006, + "step": 3100 + }, + { + "epoch": 0.8150556705094097, + "grad_norm": 0.9497177004814148, + "learning_rate": 7.284913264412125e-05, + "loss": 1.8234, + "step": 3102 + }, + { + "epoch": 0.8155811738430716, + "grad_norm": 0.7122120261192322, + "learning_rate": 7.28316103031365e-05, + "loss": 1.7306, + "step": 3104 + }, + { + "epoch": 0.8161066771767334, + "grad_norm": 0.7118192911148071, + "learning_rate": 7.281408796215175e-05, + "loss": 1.7178, + "step": 3106 + }, + { + "epoch": 0.8166321805103951, + "grad_norm": 0.727682888507843, + "learning_rate": 7.2796565621167e-05, + "loss": 1.7857, + "step": 3108 + }, + { + "epoch": 0.8171576838440568, + "grad_norm": 0.6266892552375793, + "learning_rate": 7.277904328018223e-05, + "loss": 1.7531, + "step": 3110 + }, + { + "epoch": 0.8176831871777187, + "grad_norm": 1.1099108457565308, + "learning_rate": 7.276152093919748e-05, + "loss": 1.7579, + "step": 3112 + }, + { + "epoch": 0.8182086905113805, + "grad_norm": 0.7087392807006836, + "learning_rate": 7.274399859821272e-05, + "loss": 1.7393, + "step": 3114 + }, + { + "epoch": 0.8187341938450422, + "grad_norm": 0.9023381471633911, + "learning_rate": 7.272647625722797e-05, + "loss": 1.7488, + "step": 3116 + }, + { + "epoch": 0.819259697178704, + "grad_norm": 0.6250995993614197, + "learning_rate": 7.270895391624322e-05, + "loss": 1.7567, + "step": 3118 + }, + { + "epoch": 0.8197852005123657, + "grad_norm": 0.7343935370445251, + "learning_rate": 7.269143157525847e-05, + "loss": 1.7801, + "step": 3120 + }, + { + "epoch": 0.8203107038460276, + "grad_norm": 0.7513467073440552, + "learning_rate": 7.26739092342737e-05, + "loss": 1.7638, + "step": 3122 + }, + { + "epoch": 0.8208362071796893, + "grad_norm": 0.7709615230560303, + "learning_rate": 7.265638689328895e-05, + "loss": 1.7477, + "step": 3124 + }, + { + "epoch": 0.821361710513351, + "grad_norm": 0.6068372130393982, + "learning_rate": 7.263886455230418e-05, + "loss": 1.7597, + "step": 3126 + }, + { + "epoch": 0.8218872138470128, + "grad_norm": 0.6252472400665283, + "learning_rate": 7.262134221131943e-05, + "loss": 1.7413, + "step": 3128 + }, + { + "epoch": 0.8224127171806747, + "grad_norm": 0.6601640582084656, + "learning_rate": 7.260381987033468e-05, + "loss": 1.7298, + "step": 3130 + }, + { + "epoch": 0.8229382205143364, + "grad_norm": 0.6942901015281677, + "learning_rate": 7.258629752934993e-05, + "loss": 1.7788, + "step": 3132 + }, + { + "epoch": 0.8234637238479982, + "grad_norm": 0.7625568509101868, + "learning_rate": 7.256877518836517e-05, + "loss": 1.752, + "step": 3134 + }, + { + "epoch": 0.8239892271816599, + "grad_norm": 0.7857789993286133, + "learning_rate": 7.25512528473804e-05, + "loss": 1.7903, + "step": 3136 + }, + { + "epoch": 0.8245147305153218, + "grad_norm": 0.76043301820755, + "learning_rate": 7.253373050639565e-05, + "loss": 1.7619, + "step": 3138 + }, + { + "epoch": 0.8250402338489835, + "grad_norm": 0.7142301201820374, + "learning_rate": 7.25162081654109e-05, + "loss": 1.7328, + "step": 3140 + }, + { + "epoch": 0.8255657371826453, + "grad_norm": 0.8170753717422485, + "learning_rate": 7.249868582442615e-05, + "loss": 1.7779, + "step": 3142 + }, + { + "epoch": 0.826091240516307, + "grad_norm": 0.6277784109115601, + "learning_rate": 7.24811634834414e-05, + "loss": 1.7421, + "step": 3144 + }, + { + "epoch": 0.8266167438499687, + "grad_norm": 0.8075504899024963, + "learning_rate": 7.246364114245664e-05, + "loss": 1.7504, + "step": 3146 + }, + { + "epoch": 0.8271422471836306, + "grad_norm": 0.5615305304527283, + "learning_rate": 7.244611880147188e-05, + "loss": 1.7537, + "step": 3148 + }, + { + "epoch": 0.8276677505172924, + "grad_norm": 0.7479259967803955, + "learning_rate": 7.242859646048711e-05, + "loss": 1.7801, + "step": 3150 + }, + { + "epoch": 0.8281932538509541, + "grad_norm": 0.7294136881828308, + "learning_rate": 7.241107411950236e-05, + "loss": 1.7752, + "step": 3152 + }, + { + "epoch": 0.8287187571846159, + "grad_norm": 0.6307454705238342, + "learning_rate": 7.239355177851761e-05, + "loss": 1.8088, + "step": 3154 + }, + { + "epoch": 0.8292442605182777, + "grad_norm": 0.641791582107544, + "learning_rate": 7.237602943753286e-05, + "loss": 1.758, + "step": 3156 + }, + { + "epoch": 0.8297697638519395, + "grad_norm": 0.7693712711334229, + "learning_rate": 7.23585070965481e-05, + "loss": 1.7816, + "step": 3158 + }, + { + "epoch": 0.8302952671856012, + "grad_norm": 0.6319524049758911, + "learning_rate": 7.234098475556335e-05, + "loss": 1.7644, + "step": 3160 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.6433089375495911, + "learning_rate": 7.232346241457858e-05, + "loss": 1.7876, + "step": 3162 + }, + { + "epoch": 0.8313462738529247, + "grad_norm": 0.5543965697288513, + "learning_rate": 7.230594007359383e-05, + "loss": 1.7678, + "step": 3164 + }, + { + "epoch": 0.8318717771865866, + "grad_norm": 0.7125136852264404, + "learning_rate": 7.228841773260908e-05, + "loss": 1.7816, + "step": 3166 + }, + { + "epoch": 0.8323972805202483, + "grad_norm": 0.6893459558486938, + "learning_rate": 7.227089539162433e-05, + "loss": 1.7556, + "step": 3168 + }, + { + "epoch": 0.83292278385391, + "grad_norm": 0.7028675675392151, + "learning_rate": 7.225337305063957e-05, + "loss": 1.752, + "step": 3170 + }, + { + "epoch": 0.8334482871875718, + "grad_norm": 0.6112826466560364, + "learning_rate": 7.223585070965482e-05, + "loss": 1.7367, + "step": 3172 + }, + { + "epoch": 0.8339737905212337, + "grad_norm": 0.6377979516983032, + "learning_rate": 7.221832836867006e-05, + "loss": 1.7452, + "step": 3174 + }, + { + "epoch": 0.8344992938548954, + "grad_norm": 0.6925122141838074, + "learning_rate": 7.22008060276853e-05, + "loss": 1.7776, + "step": 3176 + }, + { + "epoch": 0.8350247971885572, + "grad_norm": 0.6226949095726013, + "learning_rate": 7.218328368670054e-05, + "loss": 1.748, + "step": 3178 + }, + { + "epoch": 0.8355503005222189, + "grad_norm": 0.7138300538063049, + "learning_rate": 7.216576134571579e-05, + "loss": 1.7662, + "step": 3180 + }, + { + "epoch": 0.8360758038558808, + "grad_norm": 0.6248802542686462, + "learning_rate": 7.214823900473103e-05, + "loss": 1.7652, + "step": 3182 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.6834786534309387, + "learning_rate": 7.213071666374628e-05, + "loss": 1.7537, + "step": 3184 + }, + { + "epoch": 0.8371268105232043, + "grad_norm": 0.6411855220794678, + "learning_rate": 7.211319432276153e-05, + "loss": 1.7582, + "step": 3186 + }, + { + "epoch": 0.837652313856866, + "grad_norm": 0.6991042494773865, + "learning_rate": 7.209567198177678e-05, + "loss": 1.7643, + "step": 3188 + }, + { + "epoch": 0.8381778171905278, + "grad_norm": 0.6581319570541382, + "learning_rate": 7.207814964079201e-05, + "loss": 1.7283, + "step": 3190 + }, + { + "epoch": 0.8387033205241896, + "grad_norm": 0.6132378578186035, + "learning_rate": 7.206062729980726e-05, + "loss": 1.7724, + "step": 3192 + }, + { + "epoch": 0.8392288238578514, + "grad_norm": 0.6994782090187073, + "learning_rate": 7.20431049588225e-05, + "loss": 1.7765, + "step": 3194 + }, + { + "epoch": 0.8397543271915131, + "grad_norm": 0.7247324585914612, + "learning_rate": 7.202558261783775e-05, + "loss": 1.7665, + "step": 3196 + }, + { + "epoch": 0.8402798305251749, + "grad_norm": 0.8014911413192749, + "learning_rate": 7.2008060276853e-05, + "loss": 1.7471, + "step": 3198 + }, + { + "epoch": 0.8408053338588367, + "grad_norm": 0.7040480971336365, + "learning_rate": 7.199053793586825e-05, + "loss": 1.7527, + "step": 3200 + }, + { + "epoch": 0.8408053338588367, + "eval_loss": 1.7256534099578857, + "eval_runtime": 487.1811, + "eval_samples_per_second": 249.987, + "eval_steps_per_second": 31.249, + "step": 3200 + }, + { + "epoch": 0.8413308371924985, + "grad_norm": 0.8999149799346924, + "learning_rate": 7.197301559488348e-05, + "loss": 1.7836, + "step": 3202 + }, + { + "epoch": 0.8418563405261602, + "grad_norm": 0.6944252252578735, + "learning_rate": 7.195549325389872e-05, + "loss": 1.7336, + "step": 3204 + }, + { + "epoch": 0.842381843859822, + "grad_norm": 0.5566868185997009, + "learning_rate": 7.193797091291396e-05, + "loss": 1.7681, + "step": 3206 + }, + { + "epoch": 0.8429073471934838, + "grad_norm": 0.769062340259552, + "learning_rate": 7.192044857192921e-05, + "loss": 1.7477, + "step": 3208 + }, + { + "epoch": 0.8434328505271456, + "grad_norm": 0.6222507953643799, + "learning_rate": 7.190292623094446e-05, + "loss": 1.7576, + "step": 3210 + }, + { + "epoch": 0.8439583538608073, + "grad_norm": 0.6584329009056091, + "learning_rate": 7.18854038899597e-05, + "loss": 1.7658, + "step": 3212 + }, + { + "epoch": 0.8444838571944691, + "grad_norm": 0.6822264194488525, + "learning_rate": 7.186788154897495e-05, + "loss": 1.7709, + "step": 3214 + }, + { + "epoch": 0.8450093605281308, + "grad_norm": 0.6141505241394043, + "learning_rate": 7.185035920799019e-05, + "loss": 1.776, + "step": 3216 + }, + { + "epoch": 0.8455348638617927, + "grad_norm": 0.5974141359329224, + "learning_rate": 7.183283686700543e-05, + "loss": 1.7246, + "step": 3218 + }, + { + "epoch": 0.8460603671954544, + "grad_norm": 0.6883708238601685, + "learning_rate": 7.181531452602068e-05, + "loss": 1.7609, + "step": 3220 + }, + { + "epoch": 0.8465858705291162, + "grad_norm": 0.6883281469345093, + "learning_rate": 7.179779218503593e-05, + "loss": 1.7315, + "step": 3222 + }, + { + "epoch": 0.8471113738627779, + "grad_norm": 0.6654126048088074, + "learning_rate": 7.178026984405118e-05, + "loss": 1.7469, + "step": 3224 + }, + { + "epoch": 0.8476368771964398, + "grad_norm": 0.8119237422943115, + "learning_rate": 7.176274750306641e-05, + "loss": 1.7882, + "step": 3226 + }, + { + "epoch": 0.8481623805301015, + "grad_norm": 0.7793521285057068, + "learning_rate": 7.174522516208166e-05, + "loss": 1.7826, + "step": 3228 + }, + { + "epoch": 0.8486878838637633, + "grad_norm": 0.8359899520874023, + "learning_rate": 7.172770282109689e-05, + "loss": 1.7612, + "step": 3230 + }, + { + "epoch": 0.849213387197425, + "grad_norm": 0.5970791578292847, + "learning_rate": 7.171018048011214e-05, + "loss": 1.7498, + "step": 3232 + }, + { + "epoch": 0.8497388905310868, + "grad_norm": 0.6869466304779053, + "learning_rate": 7.169265813912739e-05, + "loss": 1.7787, + "step": 3234 + }, + { + "epoch": 0.8502643938647486, + "grad_norm": 0.6755763292312622, + "learning_rate": 7.167513579814264e-05, + "loss": 1.7883, + "step": 3236 + }, + { + "epoch": 0.8507898971984104, + "grad_norm": 0.8182030320167542, + "learning_rate": 7.165761345715788e-05, + "loss": 1.7821, + "step": 3238 + }, + { + "epoch": 0.8513154005320721, + "grad_norm": 0.6454432010650635, + "learning_rate": 7.164009111617313e-05, + "loss": 1.7527, + "step": 3240 + }, + { + "epoch": 0.8518409038657339, + "grad_norm": 0.7202356457710266, + "learning_rate": 7.162256877518836e-05, + "loss": 1.7821, + "step": 3242 + }, + { + "epoch": 0.8523664071993957, + "grad_norm": 0.7236865758895874, + "learning_rate": 7.160504643420361e-05, + "loss": 1.7579, + "step": 3244 + }, + { + "epoch": 0.8528919105330575, + "grad_norm": 0.6406378746032715, + "learning_rate": 7.158752409321886e-05, + "loss": 1.7459, + "step": 3246 + }, + { + "epoch": 0.8534174138667192, + "grad_norm": 0.755293607711792, + "learning_rate": 7.157000175223411e-05, + "loss": 1.7638, + "step": 3248 + }, + { + "epoch": 0.853942917200381, + "grad_norm": 0.7986418604850769, + "learning_rate": 7.155247941124936e-05, + "loss": 1.7686, + "step": 3250 + }, + { + "epoch": 0.8544684205340428, + "grad_norm": 0.7703737616539001, + "learning_rate": 7.153495707026459e-05, + "loss": 1.7372, + "step": 3252 + }, + { + "epoch": 0.8549939238677046, + "grad_norm": 0.6074422597885132, + "learning_rate": 7.151743472927984e-05, + "loss": 1.7765, + "step": 3254 + }, + { + "epoch": 0.8555194272013663, + "grad_norm": 0.7662899494171143, + "learning_rate": 7.149991238829507e-05, + "loss": 1.7563, + "step": 3256 + }, + { + "epoch": 0.8560449305350281, + "grad_norm": 0.7859123945236206, + "learning_rate": 7.148239004731032e-05, + "loss": 1.7462, + "step": 3258 + }, + { + "epoch": 0.8565704338686898, + "grad_norm": 0.5845335721969604, + "learning_rate": 7.146486770632557e-05, + "loss": 1.7524, + "step": 3260 + }, + { + "epoch": 0.8570959372023517, + "grad_norm": 0.6083472967147827, + "learning_rate": 7.144734536534081e-05, + "loss": 1.7705, + "step": 3262 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.779712438583374, + "learning_rate": 7.142982302435606e-05, + "loss": 1.7768, + "step": 3264 + }, + { + "epoch": 0.8581469438696752, + "grad_norm": 0.7343006134033203, + "learning_rate": 7.141230068337131e-05, + "loss": 1.7489, + "step": 3266 + }, + { + "epoch": 0.8586724472033369, + "grad_norm": 0.578289270401001, + "learning_rate": 7.139477834238654e-05, + "loss": 1.7436, + "step": 3268 + }, + { + "epoch": 0.8591979505369988, + "grad_norm": 0.6657706499099731, + "learning_rate": 7.137725600140179e-05, + "loss": 1.7674, + "step": 3270 + }, + { + "epoch": 0.8597234538706605, + "grad_norm": 0.7936644554138184, + "learning_rate": 7.135973366041704e-05, + "loss": 1.7482, + "step": 3272 + }, + { + "epoch": 0.8602489572043223, + "grad_norm": 0.5892787575721741, + "learning_rate": 7.134221131943229e-05, + "loss": 1.7448, + "step": 3274 + }, + { + "epoch": 0.860774460537984, + "grad_norm": 0.6341478228569031, + "learning_rate": 7.132468897844753e-05, + "loss": 1.7613, + "step": 3276 + }, + { + "epoch": 0.8612999638716458, + "grad_norm": 0.684622585773468, + "learning_rate": 7.130716663746277e-05, + "loss": 1.7685, + "step": 3278 + }, + { + "epoch": 0.8618254672053076, + "grad_norm": 0.5765745639801025, + "learning_rate": 7.128964429647801e-05, + "loss": 1.7889, + "step": 3280 + }, + { + "epoch": 0.8623509705389694, + "grad_norm": 0.5164791941642761, + "learning_rate": 7.127212195549325e-05, + "loss": 1.7573, + "step": 3282 + }, + { + "epoch": 0.8628764738726311, + "grad_norm": 0.5809277892112732, + "learning_rate": 7.12545996145085e-05, + "loss": 1.8003, + "step": 3284 + }, + { + "epoch": 0.8634019772062929, + "grad_norm": 0.6859455108642578, + "learning_rate": 7.123707727352374e-05, + "loss": 1.726, + "step": 3286 + }, + { + "epoch": 0.8639274805399547, + "grad_norm": 0.665773332118988, + "learning_rate": 7.121955493253899e-05, + "loss": 1.7629, + "step": 3288 + }, + { + "epoch": 0.8644529838736165, + "grad_norm": 0.7473315596580505, + "learning_rate": 7.120203259155424e-05, + "loss": 1.7538, + "step": 3290 + }, + { + "epoch": 0.8649784872072782, + "grad_norm": 0.7632318735122681, + "learning_rate": 7.118451025056949e-05, + "loss": 1.7755, + "step": 3292 + }, + { + "epoch": 0.86550399054094, + "grad_norm": 0.7813208699226379, + "learning_rate": 7.116698790958472e-05, + "loss": 1.7835, + "step": 3294 + }, + { + "epoch": 0.8660294938746018, + "grad_norm": 0.9293962121009827, + "learning_rate": 7.114946556859997e-05, + "loss": 1.7332, + "step": 3296 + }, + { + "epoch": 0.8665549972082636, + "grad_norm": 0.9213319420814514, + "learning_rate": 7.113194322761522e-05, + "loss": 1.709, + "step": 3298 + }, + { + "epoch": 0.8670805005419253, + "grad_norm": 0.5981359481811523, + "learning_rate": 7.111442088663046e-05, + "loss": 1.7406, + "step": 3300 + }, + { + "epoch": 0.8676060038755871, + "grad_norm": 0.645785927772522, + "learning_rate": 7.109689854564571e-05, + "loss": 1.7659, + "step": 3302 + }, + { + "epoch": 0.8681315072092488, + "grad_norm": 0.9009891748428345, + "learning_rate": 7.107937620466094e-05, + "loss": 1.7335, + "step": 3304 + }, + { + "epoch": 0.8686570105429107, + "grad_norm": 0.7231364250183105, + "learning_rate": 7.106185386367619e-05, + "loss": 1.7378, + "step": 3306 + }, + { + "epoch": 0.8691825138765724, + "grad_norm": 0.8442168235778809, + "learning_rate": 7.104433152269143e-05, + "loss": 1.7722, + "step": 3308 + }, + { + "epoch": 0.8697080172102342, + "grad_norm": 0.7314670085906982, + "learning_rate": 7.102680918170667e-05, + "loss": 1.7594, + "step": 3310 + }, + { + "epoch": 0.8702335205438959, + "grad_norm": 0.6394951343536377, + "learning_rate": 7.100928684072192e-05, + "loss": 1.7693, + "step": 3312 + }, + { + "epoch": 0.8707590238775578, + "grad_norm": 0.612392008304596, + "learning_rate": 7.099176449973717e-05, + "loss": 1.7777, + "step": 3314 + }, + { + "epoch": 0.8712845272112195, + "grad_norm": 0.6011560559272766, + "learning_rate": 7.097424215875242e-05, + "loss": 1.7429, + "step": 3316 + }, + { + "epoch": 0.8718100305448813, + "grad_norm": 0.9169111847877502, + "learning_rate": 7.095671981776766e-05, + "loss": 1.7723, + "step": 3318 + }, + { + "epoch": 0.872335533878543, + "grad_norm": 0.5939310193061829, + "learning_rate": 7.09391974767829e-05, + "loss": 1.718, + "step": 3320 + }, + { + "epoch": 0.8728610372122048, + "grad_norm": 0.8067646026611328, + "learning_rate": 7.092167513579815e-05, + "loss": 1.7679, + "step": 3322 + }, + { + "epoch": 0.8733865405458666, + "grad_norm": 0.6771594882011414, + "learning_rate": 7.090415279481339e-05, + "loss": 1.7405, + "step": 3324 + }, + { + "epoch": 0.8739120438795284, + "grad_norm": 0.7373068332672119, + "learning_rate": 7.088663045382864e-05, + "loss": 1.7363, + "step": 3326 + }, + { + "epoch": 0.8744375472131901, + "grad_norm": 0.5647407174110413, + "learning_rate": 7.086910811284387e-05, + "loss": 1.8027, + "step": 3328 + }, + { + "epoch": 0.8749630505468519, + "grad_norm": 0.5459885001182556, + "learning_rate": 7.085158577185912e-05, + "loss": 1.7588, + "step": 3330 + }, + { + "epoch": 0.8754885538805137, + "grad_norm": 0.6479038596153259, + "learning_rate": 7.083406343087437e-05, + "loss": 1.8066, + "step": 3332 + }, + { + "epoch": 0.8760140572141755, + "grad_norm": 0.6865916848182678, + "learning_rate": 7.08165410898896e-05, + "loss": 1.7605, + "step": 3334 + }, + { + "epoch": 0.8765395605478372, + "grad_norm": 0.6146122813224792, + "learning_rate": 7.079901874890485e-05, + "loss": 1.7711, + "step": 3336 + }, + { + "epoch": 0.877065063881499, + "grad_norm": 0.6822938323020935, + "learning_rate": 7.07814964079201e-05, + "loss": 1.7355, + "step": 3338 + }, + { + "epoch": 0.8775905672151608, + "grad_norm": 0.8722227215766907, + "learning_rate": 7.076397406693535e-05, + "loss": 1.7405, + "step": 3340 + }, + { + "epoch": 0.8781160705488226, + "grad_norm": 0.6625831127166748, + "learning_rate": 7.07464517259506e-05, + "loss": 1.748, + "step": 3342 + }, + { + "epoch": 0.8786415738824843, + "grad_norm": 0.5351794362068176, + "learning_rate": 7.072892938496584e-05, + "loss": 1.747, + "step": 3344 + }, + { + "epoch": 0.8791670772161461, + "grad_norm": 0.7087706923484802, + "learning_rate": 7.071140704398108e-05, + "loss": 1.7475, + "step": 3346 + }, + { + "epoch": 0.8796925805498078, + "grad_norm": 0.6210707426071167, + "learning_rate": 7.069388470299632e-05, + "loss": 1.7666, + "step": 3348 + }, + { + "epoch": 0.8802180838834697, + "grad_norm": 0.5860678553581238, + "learning_rate": 7.067636236201157e-05, + "loss": 1.728, + "step": 3350 + }, + { + "epoch": 0.8807435872171314, + "grad_norm": 0.5733693242073059, + "learning_rate": 7.065884002102682e-05, + "loss": 1.8006, + "step": 3352 + }, + { + "epoch": 0.8812690905507932, + "grad_norm": 0.6897549629211426, + "learning_rate": 7.064131768004205e-05, + "loss": 1.7615, + "step": 3354 + }, + { + "epoch": 0.8817945938844549, + "grad_norm": 0.8119019865989685, + "learning_rate": 7.06237953390573e-05, + "loss": 1.7428, + "step": 3356 + }, + { + "epoch": 0.8823200972181168, + "grad_norm": 0.6543797254562378, + "learning_rate": 7.060627299807255e-05, + "loss": 1.7592, + "step": 3358 + }, + { + "epoch": 0.8828456005517785, + "grad_norm": 0.6914211511611938, + "learning_rate": 7.058875065708778e-05, + "loss": 1.7549, + "step": 3360 + }, + { + "epoch": 0.8833711038854403, + "grad_norm": 0.7868301868438721, + "learning_rate": 7.057122831610303e-05, + "loss": 1.7475, + "step": 3362 + }, + { + "epoch": 0.883896607219102, + "grad_norm": 0.7719436287879944, + "learning_rate": 7.055370597511828e-05, + "loss": 1.7549, + "step": 3364 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.6737284064292908, + "learning_rate": 7.053618363413352e-05, + "loss": 1.7392, + "step": 3366 + }, + { + "epoch": 0.8849476138864256, + "grad_norm": 1.0450035333633423, + "learning_rate": 7.051866129314877e-05, + "loss": 1.736, + "step": 3368 + }, + { + "epoch": 0.8854731172200874, + "grad_norm": 0.6924079060554504, + "learning_rate": 7.050113895216402e-05, + "loss": 1.7431, + "step": 3370 + }, + { + "epoch": 0.8859986205537491, + "grad_norm": 0.689011812210083, + "learning_rate": 7.048361661117925e-05, + "loss": 1.7567, + "step": 3372 + }, + { + "epoch": 0.8865241238874109, + "grad_norm": 0.6025006771087646, + "learning_rate": 7.04660942701945e-05, + "loss": 1.7339, + "step": 3374 + }, + { + "epoch": 0.8870496272210727, + "grad_norm": 0.6246035695075989, + "learning_rate": 7.044857192920975e-05, + "loss": 1.7728, + "step": 3376 + }, + { + "epoch": 0.8875751305547345, + "grad_norm": 0.92098069190979, + "learning_rate": 7.0431049588225e-05, + "loss": 1.7452, + "step": 3378 + }, + { + "epoch": 0.8881006338883962, + "grad_norm": 0.6286795139312744, + "learning_rate": 7.041352724724023e-05, + "loss": 1.752, + "step": 3380 + }, + { + "epoch": 0.888626137222058, + "grad_norm": 0.5665922164916992, + "learning_rate": 7.039600490625548e-05, + "loss": 1.7561, + "step": 3382 + }, + { + "epoch": 0.8891516405557198, + "grad_norm": 0.7606804966926575, + "learning_rate": 7.037848256527073e-05, + "loss": 1.7903, + "step": 3384 + }, + { + "epoch": 0.8896771438893816, + "grad_norm": 0.8399646282196045, + "learning_rate": 7.036096022428596e-05, + "loss": 1.8045, + "step": 3386 + }, + { + "epoch": 0.8902026472230433, + "grad_norm": 0.6141343116760254, + "learning_rate": 7.03434378833012e-05, + "loss": 1.7665, + "step": 3388 + }, + { + "epoch": 0.8907281505567051, + "grad_norm": 0.7554699778556824, + "learning_rate": 7.032591554231645e-05, + "loss": 1.7219, + "step": 3390 + }, + { + "epoch": 0.8912536538903668, + "grad_norm": 0.7068594694137573, + "learning_rate": 7.03083932013317e-05, + "loss": 1.7689, + "step": 3392 + }, + { + "epoch": 0.8917791572240287, + "grad_norm": 0.8859004378318787, + "learning_rate": 7.029087086034695e-05, + "loss": 1.7261, + "step": 3394 + }, + { + "epoch": 0.8923046605576904, + "grad_norm": 0.7646594047546387, + "learning_rate": 7.02733485193622e-05, + "loss": 1.7813, + "step": 3396 + }, + { + "epoch": 0.8928301638913522, + "grad_norm": 0.7105104923248291, + "learning_rate": 7.025582617837743e-05, + "loss": 1.7528, + "step": 3398 + }, + { + "epoch": 0.8933556672250139, + "grad_norm": 0.5750377178192139, + "learning_rate": 7.023830383739268e-05, + "loss": 1.7324, + "step": 3400 + }, + { + "epoch": 0.8938811705586758, + "grad_norm": 0.6942424178123474, + "learning_rate": 7.022078149640793e-05, + "loss": 1.7352, + "step": 3402 + }, + { + "epoch": 0.8944066738923375, + "grad_norm": 0.7094271779060364, + "learning_rate": 7.020325915542317e-05, + "loss": 1.7449, + "step": 3404 + }, + { + "epoch": 0.8949321772259993, + "grad_norm": 0.6620864868164062, + "learning_rate": 7.018573681443841e-05, + "loss": 1.7718, + "step": 3406 + }, + { + "epoch": 0.895457680559661, + "grad_norm": 0.6577492952346802, + "learning_rate": 7.016821447345366e-05, + "loss": 1.745, + "step": 3408 + }, + { + "epoch": 0.8959831838933229, + "grad_norm": 0.6020835041999817, + "learning_rate": 7.01506921324689e-05, + "loss": 1.7486, + "step": 3410 + }, + { + "epoch": 0.8965086872269846, + "grad_norm": 0.6429753303527832, + "learning_rate": 7.013316979148414e-05, + "loss": 1.7656, + "step": 3412 + }, + { + "epoch": 0.8970341905606464, + "grad_norm": 0.6767374277114868, + "learning_rate": 7.011564745049938e-05, + "loss": 1.7193, + "step": 3414 + }, + { + "epoch": 0.8975596938943081, + "grad_norm": 0.7432959079742432, + "learning_rate": 7.009812510951463e-05, + "loss": 1.7336, + "step": 3416 + }, + { + "epoch": 0.8980851972279699, + "grad_norm": 0.6830999851226807, + "learning_rate": 7.008060276852988e-05, + "loss": 1.7438, + "step": 3418 + }, + { + "epoch": 0.8986107005616317, + "grad_norm": 0.5865710377693176, + "learning_rate": 7.006308042754513e-05, + "loss": 1.8067, + "step": 3420 + }, + { + "epoch": 0.8991362038952935, + "grad_norm": 1.0097007751464844, + "learning_rate": 7.004555808656037e-05, + "loss": 1.7458, + "step": 3422 + }, + { + "epoch": 0.8996617072289552, + "grad_norm": 0.7260006666183472, + "learning_rate": 7.002803574557561e-05, + "loss": 1.7707, + "step": 3424 + }, + { + "epoch": 0.900187210562617, + "grad_norm": 0.7570676803588867, + "learning_rate": 7.001051340459086e-05, + "loss": 1.7631, + "step": 3426 + }, + { + "epoch": 0.9007127138962788, + "grad_norm": 1.079424500465393, + "learning_rate": 6.99929910636061e-05, + "loss": 1.7318, + "step": 3428 + }, + { + "epoch": 0.9012382172299406, + "grad_norm": 0.637510359287262, + "learning_rate": 6.997546872262134e-05, + "loss": 1.7427, + "step": 3430 + }, + { + "epoch": 0.9017637205636023, + "grad_norm": 0.9443916082382202, + "learning_rate": 6.995794638163659e-05, + "loss": 1.7182, + "step": 3432 + }, + { + "epoch": 0.9022892238972641, + "grad_norm": 0.7777066826820374, + "learning_rate": 6.994042404065183e-05, + "loss": 1.7543, + "step": 3434 + }, + { + "epoch": 0.9028147272309258, + "grad_norm": 0.6093029975891113, + "learning_rate": 6.992290169966708e-05, + "loss": 1.7501, + "step": 3436 + }, + { + "epoch": 0.9033402305645877, + "grad_norm": 0.6203290820121765, + "learning_rate": 6.990537935868233e-05, + "loss": 1.8313, + "step": 3438 + }, + { + "epoch": 0.9038657338982494, + "grad_norm": 0.6452112793922424, + "learning_rate": 6.988785701769756e-05, + "loss": 1.7691, + "step": 3440 + }, + { + "epoch": 0.9043912372319112, + "grad_norm": 0.6220587491989136, + "learning_rate": 6.987033467671281e-05, + "loss": 1.7458, + "step": 3442 + }, + { + "epoch": 0.9049167405655729, + "grad_norm": 0.6492230296134949, + "learning_rate": 6.985281233572806e-05, + "loss": 1.7624, + "step": 3444 + }, + { + "epoch": 0.9054422438992348, + "grad_norm": 0.8058958053588867, + "learning_rate": 6.98352899947433e-05, + "loss": 1.7341, + "step": 3446 + }, + { + "epoch": 0.9059677472328965, + "grad_norm": 0.765034019947052, + "learning_rate": 6.981776765375855e-05, + "loss": 1.7312, + "step": 3448 + }, + { + "epoch": 0.9064932505665583, + "grad_norm": 0.9301319122314453, + "learning_rate": 6.98002453127738e-05, + "loss": 1.7785, + "step": 3450 + }, + { + "epoch": 0.90701875390022, + "grad_norm": 0.723552942276001, + "learning_rate": 6.978272297178903e-05, + "loss": 1.7547, + "step": 3452 + }, + { + "epoch": 0.9075442572338819, + "grad_norm": 0.8970544338226318, + "learning_rate": 6.976520063080428e-05, + "loss": 1.7424, + "step": 3454 + }, + { + "epoch": 0.9080697605675436, + "grad_norm": 0.6986632943153381, + "learning_rate": 6.974767828981952e-05, + "loss": 1.76, + "step": 3456 + }, + { + "epoch": 0.9085952639012054, + "grad_norm": 0.6767635941505432, + "learning_rate": 6.973015594883476e-05, + "loss": 1.7517, + "step": 3458 + }, + { + "epoch": 0.9091207672348671, + "grad_norm": 0.7813493013381958, + "learning_rate": 6.971263360785001e-05, + "loss": 1.7303, + "step": 3460 + }, + { + "epoch": 0.9096462705685289, + "grad_norm": 0.7122093439102173, + "learning_rate": 6.969511126686526e-05, + "loss": 1.7735, + "step": 3462 + }, + { + "epoch": 0.9101717739021907, + "grad_norm": 0.9538240432739258, + "learning_rate": 6.96775889258805e-05, + "loss": 1.7279, + "step": 3464 + }, + { + "epoch": 0.9106972772358525, + "grad_norm": 0.9474038481712341, + "learning_rate": 6.966006658489574e-05, + "loss": 1.7636, + "step": 3466 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.6011683344841003, + "learning_rate": 6.964254424391099e-05, + "loss": 1.7414, + "step": 3468 + }, + { + "epoch": 0.911748283903176, + "grad_norm": 0.7591129541397095, + "learning_rate": 6.962502190292623e-05, + "loss": 1.7654, + "step": 3470 + }, + { + "epoch": 0.9122737872368378, + "grad_norm": 0.8848958015441895, + "learning_rate": 6.960749956194148e-05, + "loss": 1.7581, + "step": 3472 + }, + { + "epoch": 0.9127992905704996, + "grad_norm": 0.7266464233398438, + "learning_rate": 6.958997722095673e-05, + "loss": 1.769, + "step": 3474 + }, + { + "epoch": 0.9133247939041613, + "grad_norm": 0.8937695026397705, + "learning_rate": 6.957245487997198e-05, + "loss": 1.7574, + "step": 3476 + }, + { + "epoch": 0.9138502972378231, + "grad_norm": 0.6543369293212891, + "learning_rate": 6.955493253898721e-05, + "loss": 1.7587, + "step": 3478 + }, + { + "epoch": 0.9143758005714848, + "grad_norm": 1.0419197082519531, + "learning_rate": 6.953741019800246e-05, + "loss": 1.7311, + "step": 3480 + }, + { + "epoch": 0.9149013039051467, + "grad_norm": 0.6583675742149353, + "learning_rate": 6.951988785701769e-05, + "loss": 1.7415, + "step": 3482 + }, + { + "epoch": 0.9154268072388084, + "grad_norm": 0.7123555541038513, + "learning_rate": 6.950236551603294e-05, + "loss": 1.7227, + "step": 3484 + }, + { + "epoch": 0.9159523105724702, + "grad_norm": 0.606636106967926, + "learning_rate": 6.948484317504819e-05, + "loss": 1.7555, + "step": 3486 + }, + { + "epoch": 0.9164778139061319, + "grad_norm": 1.0369200706481934, + "learning_rate": 6.946732083406344e-05, + "loss": 1.7494, + "step": 3488 + }, + { + "epoch": 0.9170033172397938, + "grad_norm": 0.6828787922859192, + "learning_rate": 6.944979849307868e-05, + "loss": 1.7302, + "step": 3490 + }, + { + "epoch": 0.9175288205734555, + "grad_norm": 0.7840218544006348, + "learning_rate": 6.943227615209392e-05, + "loss": 1.734, + "step": 3492 + }, + { + "epoch": 0.9180543239071173, + "grad_norm": 0.6639379858970642, + "learning_rate": 6.941475381110916e-05, + "loss": 1.7197, + "step": 3494 + }, + { + "epoch": 0.918579827240779, + "grad_norm": 0.6590544581413269, + "learning_rate": 6.939723147012441e-05, + "loss": 1.758, + "step": 3496 + }, + { + "epoch": 0.9191053305744409, + "grad_norm": 0.5985316038131714, + "learning_rate": 6.937970912913966e-05, + "loss": 1.7545, + "step": 3498 + }, + { + "epoch": 0.9196308339081026, + "grad_norm": 0.6269810795783997, + "learning_rate": 6.936218678815491e-05, + "loss": 1.7476, + "step": 3500 + }, + { + "epoch": 0.9201563372417644, + "grad_norm": 0.7149941921234131, + "learning_rate": 6.934466444717016e-05, + "loss": 1.7629, + "step": 3502 + }, + { + "epoch": 0.9206818405754261, + "grad_norm": 0.641220211982727, + "learning_rate": 6.932714210618539e-05, + "loss": 1.7613, + "step": 3504 + }, + { + "epoch": 0.9212073439090879, + "grad_norm": 0.7803055644035339, + "learning_rate": 6.930961976520064e-05, + "loss": 1.7401, + "step": 3506 + }, + { + "epoch": 0.9217328472427497, + "grad_norm": 0.6284143924713135, + "learning_rate": 6.929209742421587e-05, + "loss": 1.7267, + "step": 3508 + }, + { + "epoch": 0.9222583505764115, + "grad_norm": 0.594203531742096, + "learning_rate": 6.927457508323112e-05, + "loss": 1.7553, + "step": 3510 + }, + { + "epoch": 0.9227838539100732, + "grad_norm": 0.7015509009361267, + "learning_rate": 6.925705274224637e-05, + "loss": 1.7122, + "step": 3512 + }, + { + "epoch": 0.923309357243735, + "grad_norm": 0.6864806413650513, + "learning_rate": 6.923953040126161e-05, + "loss": 1.7099, + "step": 3514 + }, + { + "epoch": 0.9238348605773968, + "grad_norm": 0.7701146602630615, + "learning_rate": 6.922200806027686e-05, + "loss": 1.7508, + "step": 3516 + }, + { + "epoch": 0.9243603639110586, + "grad_norm": 0.6888076663017273, + "learning_rate": 6.92044857192921e-05, + "loss": 1.7455, + "step": 3518 + }, + { + "epoch": 0.9248858672447203, + "grad_norm": 0.6871370077133179, + "learning_rate": 6.918696337830734e-05, + "loss": 1.7597, + "step": 3520 + }, + { + "epoch": 0.9254113705783821, + "grad_norm": 1.1456379890441895, + "learning_rate": 6.916944103732259e-05, + "loss": 1.7585, + "step": 3522 + }, + { + "epoch": 0.9259368739120439, + "grad_norm": 0.6293717622756958, + "learning_rate": 6.915191869633784e-05, + "loss": 1.7279, + "step": 3524 + }, + { + "epoch": 0.9264623772457057, + "grad_norm": 0.6523435711860657, + "learning_rate": 6.913439635535309e-05, + "loss": 1.7486, + "step": 3526 + }, + { + "epoch": 0.9269878805793674, + "grad_norm": 0.6591430306434631, + "learning_rate": 6.911687401436833e-05, + "loss": 1.7544, + "step": 3528 + }, + { + "epoch": 0.9275133839130292, + "grad_norm": 0.7993916273117065, + "learning_rate": 6.909935167338357e-05, + "loss": 1.7468, + "step": 3530 + }, + { + "epoch": 0.9280388872466909, + "grad_norm": 0.5962069630622864, + "learning_rate": 6.90818293323988e-05, + "loss": 1.73, + "step": 3532 + }, + { + "epoch": 0.9285643905803528, + "grad_norm": 0.7057903409004211, + "learning_rate": 6.906430699141405e-05, + "loss": 1.7613, + "step": 3534 + }, + { + "epoch": 0.9290898939140145, + "grad_norm": 0.8017992377281189, + "learning_rate": 6.90467846504293e-05, + "loss": 1.7576, + "step": 3536 + }, + { + "epoch": 0.9296153972476763, + "grad_norm": 0.717413067817688, + "learning_rate": 6.902926230944454e-05, + "loss": 1.6982, + "step": 3538 + }, + { + "epoch": 0.930140900581338, + "grad_norm": 0.7504727840423584, + "learning_rate": 6.901173996845979e-05, + "loss": 1.7657, + "step": 3540 + }, + { + "epoch": 0.9306664039149999, + "grad_norm": 1.548189401626587, + "learning_rate": 6.899421762747504e-05, + "loss": 1.7614, + "step": 3542 + }, + { + "epoch": 0.9311919072486616, + "grad_norm": 0.7025728821754456, + "learning_rate": 6.897669528649027e-05, + "loss": 1.748, + "step": 3544 + }, + { + "epoch": 0.9317174105823234, + "grad_norm": 0.8109622001647949, + "learning_rate": 6.895917294550552e-05, + "loss": 1.7346, + "step": 3546 + }, + { + "epoch": 0.9322429139159851, + "grad_norm": 0.6339906454086304, + "learning_rate": 6.894165060452077e-05, + "loss": 1.7594, + "step": 3548 + }, + { + "epoch": 0.9327684172496469, + "grad_norm": 0.7020452618598938, + "learning_rate": 6.892412826353602e-05, + "loss": 1.7461, + "step": 3550 + }, + { + "epoch": 0.9332939205833087, + "grad_norm": 0.755463182926178, + "learning_rate": 6.890660592255126e-05, + "loss": 1.7522, + "step": 3552 + }, + { + "epoch": 0.9338194239169705, + "grad_norm": 0.8311240673065186, + "learning_rate": 6.888908358156651e-05, + "loss": 1.7438, + "step": 3554 + }, + { + "epoch": 0.9343449272506322, + "grad_norm": 0.8520714044570923, + "learning_rate": 6.887156124058174e-05, + "loss": 1.7354, + "step": 3556 + }, + { + "epoch": 0.934870430584294, + "grad_norm": 0.5846522450447083, + "learning_rate": 6.885403889959698e-05, + "loss": 1.7784, + "step": 3558 + }, + { + "epoch": 0.9353959339179558, + "grad_norm": 0.6541776657104492, + "learning_rate": 6.883651655861223e-05, + "loss": 1.7437, + "step": 3560 + }, + { + "epoch": 0.9359214372516176, + "grad_norm": 0.8012898564338684, + "learning_rate": 6.881899421762747e-05, + "loss": 1.7559, + "step": 3562 + }, + { + "epoch": 0.9364469405852793, + "grad_norm": 0.6521446704864502, + "learning_rate": 6.880147187664272e-05, + "loss": 1.7359, + "step": 3564 + }, + { + "epoch": 0.9369724439189411, + "grad_norm": 0.6322072744369507, + "learning_rate": 6.878394953565797e-05, + "loss": 1.7682, + "step": 3566 + }, + { + "epoch": 0.9374979472526029, + "grad_norm": 0.8116897940635681, + "learning_rate": 6.876642719467322e-05, + "loss": 1.7303, + "step": 3568 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.6734224557876587, + "learning_rate": 6.874890485368845e-05, + "loss": 1.7101, + "step": 3570 + }, + { + "epoch": 0.9385489539199264, + "grad_norm": 0.7724919319152832, + "learning_rate": 6.87313825127037e-05, + "loss": 1.754, + "step": 3572 + }, + { + "epoch": 0.9390744572535882, + "grad_norm": 0.6816089749336243, + "learning_rate": 6.871386017171895e-05, + "loss": 1.7592, + "step": 3574 + }, + { + "epoch": 0.9395999605872499, + "grad_norm": 0.6413043737411499, + "learning_rate": 6.869633783073419e-05, + "loss": 1.7446, + "step": 3576 + }, + { + "epoch": 0.9401254639209118, + "grad_norm": 0.8377385139465332, + "learning_rate": 6.867881548974944e-05, + "loss": 1.7528, + "step": 3578 + }, + { + "epoch": 0.9406509672545735, + "grad_norm": 0.7805231809616089, + "learning_rate": 6.866129314876469e-05, + "loss": 1.8016, + "step": 3580 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.5867004990577698, + "learning_rate": 6.864377080777992e-05, + "loss": 1.7655, + "step": 3582 + }, + { + "epoch": 0.941701973921897, + "grad_norm": 0.6897924542427063, + "learning_rate": 6.862624846679516e-05, + "loss": 1.7547, + "step": 3584 + }, + { + "epoch": 0.9422274772555589, + "grad_norm": 0.8123984336853027, + "learning_rate": 6.86087261258104e-05, + "loss": 1.7478, + "step": 3586 + }, + { + "epoch": 0.9427529805892206, + "grad_norm": 0.6391046643257141, + "learning_rate": 6.859120378482565e-05, + "loss": 1.7409, + "step": 3588 + }, + { + "epoch": 0.9432784839228824, + "grad_norm": 0.7633985280990601, + "learning_rate": 6.85736814438409e-05, + "loss": 1.742, + "step": 3590 + }, + { + "epoch": 0.9438039872565441, + "grad_norm": 0.6767701506614685, + "learning_rate": 6.855615910285615e-05, + "loss": 1.723, + "step": 3592 + }, + { + "epoch": 0.9443294905902059, + "grad_norm": 0.7079693675041199, + "learning_rate": 6.85386367618714e-05, + "loss": 1.7801, + "step": 3594 + }, + { + "epoch": 0.9448549939238677, + "grad_norm": 0.6894028782844543, + "learning_rate": 6.852111442088663e-05, + "loss": 1.7508, + "step": 3596 + }, + { + "epoch": 0.9453804972575295, + "grad_norm": 0.6596964597702026, + "learning_rate": 6.850359207990188e-05, + "loss": 1.7529, + "step": 3598 + }, + { + "epoch": 0.9459060005911912, + "grad_norm": 0.6702237725257874, + "learning_rate": 6.848606973891712e-05, + "loss": 1.7714, + "step": 3600 + }, + { + "epoch": 0.9459060005911912, + "eval_loss": 1.7171640396118164, + "eval_runtime": 487.1544, + "eval_samples_per_second": 250.001, + "eval_steps_per_second": 31.251, + "step": 3600 + }, + { + "epoch": 0.946431503924853, + "grad_norm": 0.5935298800468445, + "learning_rate": 6.846854739793237e-05, + "loss": 1.7409, + "step": 3602 + }, + { + "epoch": 0.9469570072585148, + "grad_norm": 0.844529926776886, + "learning_rate": 6.845102505694762e-05, + "loss": 1.7598, + "step": 3604 + }, + { + "epoch": 0.9474825105921766, + "grad_norm": 0.6452075839042664, + "learning_rate": 6.843350271596287e-05, + "loss": 1.7641, + "step": 3606 + }, + { + "epoch": 0.9480080139258383, + "grad_norm": 0.7184598445892334, + "learning_rate": 6.84159803749781e-05, + "loss": 1.7575, + "step": 3608 + }, + { + "epoch": 0.9485335172595001, + "grad_norm": 0.6628120541572571, + "learning_rate": 6.839845803399333e-05, + "loss": 1.7427, + "step": 3610 + }, + { + "epoch": 0.9490590205931619, + "grad_norm": 0.6025474667549133, + "learning_rate": 6.838093569300858e-05, + "loss": 1.7132, + "step": 3612 + }, + { + "epoch": 0.9495845239268237, + "grad_norm": 0.6190858483314514, + "learning_rate": 6.836341335202383e-05, + "loss": 1.7773, + "step": 3614 + }, + { + "epoch": 0.9501100272604854, + "grad_norm": 0.6773670315742493, + "learning_rate": 6.834589101103908e-05, + "loss": 1.7139, + "step": 3616 + }, + { + "epoch": 0.9506355305941472, + "grad_norm": 0.6356403827667236, + "learning_rate": 6.832836867005432e-05, + "loss": 1.7366, + "step": 3618 + }, + { + "epoch": 0.9511610339278089, + "grad_norm": 0.7546253800392151, + "learning_rate": 6.831084632906957e-05, + "loss": 1.7313, + "step": 3620 + }, + { + "epoch": 0.9516865372614708, + "grad_norm": 0.6066844463348389, + "learning_rate": 6.82933239880848e-05, + "loss": 1.7141, + "step": 3622 + }, + { + "epoch": 0.9522120405951325, + "grad_norm": 0.5842781066894531, + "learning_rate": 6.827580164710005e-05, + "loss": 1.7677, + "step": 3624 + }, + { + "epoch": 0.9527375439287943, + "grad_norm": 0.6750701069831848, + "learning_rate": 6.82582793061153e-05, + "loss": 1.7065, + "step": 3626 + }, + { + "epoch": 0.953263047262456, + "grad_norm": 0.5796899199485779, + "learning_rate": 6.824075696513055e-05, + "loss": 1.7272, + "step": 3628 + }, + { + "epoch": 0.9537885505961179, + "grad_norm": 0.7888158559799194, + "learning_rate": 6.82232346241458e-05, + "loss": 1.7441, + "step": 3630 + }, + { + "epoch": 0.9543140539297796, + "grad_norm": 0.5940207839012146, + "learning_rate": 6.820571228316104e-05, + "loss": 1.7261, + "step": 3632 + }, + { + "epoch": 0.9548395572634414, + "grad_norm": 0.7521408200263977, + "learning_rate": 6.818818994217628e-05, + "loss": 1.7826, + "step": 3634 + }, + { + "epoch": 0.9553650605971031, + "grad_norm": 0.6173054575920105, + "learning_rate": 6.817066760119151e-05, + "loss": 1.7113, + "step": 3636 + }, + { + "epoch": 0.9558905639307649, + "grad_norm": 0.6263679265975952, + "learning_rate": 6.815314526020676e-05, + "loss": 1.6943, + "step": 3638 + }, + { + "epoch": 0.9564160672644267, + "grad_norm": 0.6220889687538147, + "learning_rate": 6.8135622919222e-05, + "loss": 1.7146, + "step": 3640 + }, + { + "epoch": 0.9569415705980885, + "grad_norm": 0.6978549957275391, + "learning_rate": 6.811810057823725e-05, + "loss": 1.791, + "step": 3642 + }, + { + "epoch": 0.9574670739317502, + "grad_norm": 0.6536712050437927, + "learning_rate": 6.81005782372525e-05, + "loss": 1.7397, + "step": 3644 + }, + { + "epoch": 0.957992577265412, + "grad_norm": 0.6998410820960999, + "learning_rate": 6.808305589626775e-05, + "loss": 1.736, + "step": 3646 + }, + { + "epoch": 0.9585180805990738, + "grad_norm": 0.6182027459144592, + "learning_rate": 6.806553355528298e-05, + "loss": 1.7462, + "step": 3648 + }, + { + "epoch": 0.9590435839327356, + "grad_norm": 0.7864842414855957, + "learning_rate": 6.804801121429823e-05, + "loss": 1.7495, + "step": 3650 + }, + { + "epoch": 0.9595690872663973, + "grad_norm": 0.6455209255218506, + "learning_rate": 6.803048887331348e-05, + "loss": 1.7324, + "step": 3652 + }, + { + "epoch": 0.9600945906000591, + "grad_norm": 0.6225829720497131, + "learning_rate": 6.801296653232873e-05, + "loss": 1.748, + "step": 3654 + }, + { + "epoch": 0.9606200939337209, + "grad_norm": 0.6527931094169617, + "learning_rate": 6.799544419134397e-05, + "loss": 1.7133, + "step": 3656 + }, + { + "epoch": 0.9611455972673827, + "grad_norm": 0.619773805141449, + "learning_rate": 6.797792185035922e-05, + "loss": 1.7876, + "step": 3658 + }, + { + "epoch": 0.9616711006010444, + "grad_norm": 0.681759774684906, + "learning_rate": 6.796039950937446e-05, + "loss": 1.7069, + "step": 3660 + }, + { + "epoch": 0.9621966039347062, + "grad_norm": 0.746255099773407, + "learning_rate": 6.794287716838969e-05, + "loss": 1.7784, + "step": 3662 + }, + { + "epoch": 0.9627221072683679, + "grad_norm": 0.5940551161766052, + "learning_rate": 6.792535482740494e-05, + "loss": 1.7495, + "step": 3664 + }, + { + "epoch": 0.9632476106020298, + "grad_norm": 0.6246922016143799, + "learning_rate": 6.790783248642018e-05, + "loss": 1.7758, + "step": 3666 + }, + { + "epoch": 0.9637731139356915, + "grad_norm": 0.6583105325698853, + "learning_rate": 6.789031014543543e-05, + "loss": 1.7431, + "step": 3668 + }, + { + "epoch": 0.9642986172693533, + "grad_norm": 0.6988399624824524, + "learning_rate": 6.787278780445068e-05, + "loss": 1.7269, + "step": 3670 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.60069739818573, + "learning_rate": 6.785526546346593e-05, + "loss": 1.7466, + "step": 3672 + }, + { + "epoch": 0.9653496239366769, + "grad_norm": 0.615967869758606, + "learning_rate": 6.783774312248116e-05, + "loss": 1.7208, + "step": 3674 + }, + { + "epoch": 0.9658751272703386, + "grad_norm": 0.7129417061805725, + "learning_rate": 6.782022078149641e-05, + "loss": 1.7441, + "step": 3676 + }, + { + "epoch": 0.9664006306040004, + "grad_norm": 0.6287668347358704, + "learning_rate": 6.780269844051166e-05, + "loss": 1.7413, + "step": 3678 + }, + { + "epoch": 0.9669261339376621, + "grad_norm": 0.7385637760162354, + "learning_rate": 6.77851760995269e-05, + "loss": 1.7569, + "step": 3680 + }, + { + "epoch": 0.967451637271324, + "grad_norm": 0.6797763109207153, + "learning_rate": 6.776765375854215e-05, + "loss": 1.7417, + "step": 3682 + }, + { + "epoch": 0.9679771406049857, + "grad_norm": 0.667385458946228, + "learning_rate": 6.77501314175574e-05, + "loss": 1.7554, + "step": 3684 + }, + { + "epoch": 0.9685026439386475, + "grad_norm": 0.7547754645347595, + "learning_rate": 6.773260907657263e-05, + "loss": 1.6905, + "step": 3686 + }, + { + "epoch": 0.9690281472723092, + "grad_norm": 0.6124489903450012, + "learning_rate": 6.771508673558788e-05, + "loss": 1.7276, + "step": 3688 + }, + { + "epoch": 0.969553650605971, + "grad_norm": 0.6314408779144287, + "learning_rate": 6.769756439460311e-05, + "loss": 1.7375, + "step": 3690 + }, + { + "epoch": 0.9700791539396328, + "grad_norm": 1.1786826848983765, + "learning_rate": 6.768004205361836e-05, + "loss": 1.7797, + "step": 3692 + }, + { + "epoch": 0.9706046572732946, + "grad_norm": 0.6495144963264465, + "learning_rate": 6.766251971263361e-05, + "loss": 1.7767, + "step": 3694 + }, + { + "epoch": 0.9711301606069563, + "grad_norm": 0.8259857892990112, + "learning_rate": 6.764499737164886e-05, + "loss": 1.7568, + "step": 3696 + }, + { + "epoch": 0.9716556639406181, + "grad_norm": 0.9718241095542908, + "learning_rate": 6.76274750306641e-05, + "loss": 1.7382, + "step": 3698 + }, + { + "epoch": 0.97218116727428, + "grad_norm": 0.7467637062072754, + "learning_rate": 6.760995268967935e-05, + "loss": 1.7168, + "step": 3700 + }, + { + "epoch": 0.9727066706079417, + "grad_norm": 0.6963891386985779, + "learning_rate": 6.759243034869459e-05, + "loss": 1.75, + "step": 3702 + }, + { + "epoch": 0.9732321739416034, + "grad_norm": 0.5882383584976196, + "learning_rate": 6.757490800770983e-05, + "loss": 1.7664, + "step": 3704 + }, + { + "epoch": 0.9737576772752652, + "grad_norm": 0.5909221768379211, + "learning_rate": 6.755738566672508e-05, + "loss": 1.7569, + "step": 3706 + }, + { + "epoch": 0.9742831806089269, + "grad_norm": 0.6651691794395447, + "learning_rate": 6.753986332574033e-05, + "loss": 1.7332, + "step": 3708 + }, + { + "epoch": 0.9748086839425888, + "grad_norm": 0.6473085880279541, + "learning_rate": 6.752234098475558e-05, + "loss": 1.7668, + "step": 3710 + }, + { + "epoch": 0.9753341872762505, + "grad_norm": 0.6437013745307922, + "learning_rate": 6.750481864377081e-05, + "loss": 1.747, + "step": 3712 + }, + { + "epoch": 0.9758596906099123, + "grad_norm": 0.6409528255462646, + "learning_rate": 6.748729630278606e-05, + "loss": 1.7105, + "step": 3714 + }, + { + "epoch": 0.976385193943574, + "grad_norm": 0.7331600785255432, + "learning_rate": 6.746977396180129e-05, + "loss": 1.7328, + "step": 3716 + }, + { + "epoch": 0.9769106972772359, + "grad_norm": 0.7643489837646484, + "learning_rate": 6.745225162081654e-05, + "loss": 1.7464, + "step": 3718 + }, + { + "epoch": 0.9774362006108976, + "grad_norm": 0.8822628259658813, + "learning_rate": 6.743472927983179e-05, + "loss": 1.7681, + "step": 3720 + }, + { + "epoch": 0.9779617039445594, + "grad_norm": 0.9610887765884399, + "learning_rate": 6.741720693884703e-05, + "loss": 1.7329, + "step": 3722 + }, + { + "epoch": 0.9784872072782211, + "grad_norm": 0.7626636028289795, + "learning_rate": 6.739968459786228e-05, + "loss": 1.7161, + "step": 3724 + }, + { + "epoch": 0.979012710611883, + "grad_norm": 0.6538355946540833, + "learning_rate": 6.738216225687753e-05, + "loss": 1.7716, + "step": 3726 + }, + { + "epoch": 0.9795382139455447, + "grad_norm": 0.7273457050323486, + "learning_rate": 6.736463991589276e-05, + "loss": 1.7367, + "step": 3728 + }, + { + "epoch": 0.9800637172792065, + "grad_norm": 0.7992196679115295, + "learning_rate": 6.734711757490801e-05, + "loss": 1.7294, + "step": 3730 + }, + { + "epoch": 0.9805892206128682, + "grad_norm": 0.6318385601043701, + "learning_rate": 6.732959523392326e-05, + "loss": 1.7498, + "step": 3732 + }, + { + "epoch": 0.98111472394653, + "grad_norm": 0.8519952893257141, + "learning_rate": 6.73120728929385e-05, + "loss": 1.7366, + "step": 3734 + }, + { + "epoch": 0.9816402272801918, + "grad_norm": 0.7661817073822021, + "learning_rate": 6.729455055195374e-05, + "loss": 1.7565, + "step": 3736 + }, + { + "epoch": 0.9821657306138536, + "grad_norm": 0.5940839052200317, + "learning_rate": 6.727702821096899e-05, + "loss": 1.7384, + "step": 3738 + }, + { + "epoch": 0.9826912339475153, + "grad_norm": 0.6861841082572937, + "learning_rate": 6.725950586998424e-05, + "loss": 1.7129, + "step": 3740 + }, + { + "epoch": 0.9832167372811771, + "grad_norm": 0.9989137649536133, + "learning_rate": 6.724198352899947e-05, + "loss": 1.7758, + "step": 3742 + }, + { + "epoch": 0.983742240614839, + "grad_norm": 0.6522616147994995, + "learning_rate": 6.722446118801472e-05, + "loss": 1.7114, + "step": 3744 + }, + { + "epoch": 0.9842677439485007, + "grad_norm": 0.6522443294525146, + "learning_rate": 6.720693884702996e-05, + "loss": 1.7375, + "step": 3746 + }, + { + "epoch": 0.9847932472821624, + "grad_norm": 0.5641542077064514, + "learning_rate": 6.718941650604521e-05, + "loss": 1.7371, + "step": 3748 + }, + { + "epoch": 0.9853187506158242, + "grad_norm": 0.5788604021072388, + "learning_rate": 6.717189416506046e-05, + "loss": 1.7184, + "step": 3750 + }, + { + "epoch": 0.9858442539494859, + "grad_norm": 0.5921624302864075, + "learning_rate": 6.715437182407571e-05, + "loss": 1.7304, + "step": 3752 + }, + { + "epoch": 0.9863697572831478, + "grad_norm": 0.6481342911720276, + "learning_rate": 6.713684948309094e-05, + "loss": 1.7838, + "step": 3754 + }, + { + "epoch": 0.9868952606168095, + "grad_norm": 0.6901116967201233, + "learning_rate": 6.711932714210619e-05, + "loss": 1.7189, + "step": 3756 + }, + { + "epoch": 0.9874207639504713, + "grad_norm": 0.7430614233016968, + "learning_rate": 6.710180480112144e-05, + "loss": 1.7532, + "step": 3758 + }, + { + "epoch": 0.987946267284133, + "grad_norm": 0.8305982947349548, + "learning_rate": 6.708428246013668e-05, + "loss": 1.7417, + "step": 3760 + }, + { + "epoch": 0.9884717706177949, + "grad_norm": 0.801034152507782, + "learning_rate": 6.706676011915192e-05, + "loss": 1.7058, + "step": 3762 + }, + { + "epoch": 0.9889972739514566, + "grad_norm": 0.6561713218688965, + "learning_rate": 6.704923777816717e-05, + "loss": 1.7347, + "step": 3764 + }, + { + "epoch": 0.9895227772851184, + "grad_norm": 0.6591217517852783, + "learning_rate": 6.703171543718241e-05, + "loss": 1.7693, + "step": 3766 + }, + { + "epoch": 0.9900482806187801, + "grad_norm": 0.6448448300361633, + "learning_rate": 6.701419309619765e-05, + "loss": 1.7327, + "step": 3768 + }, + { + "epoch": 0.990573783952442, + "grad_norm": 0.7278120517730713, + "learning_rate": 6.69966707552129e-05, + "loss": 1.7049, + "step": 3770 + }, + { + "epoch": 0.9910992872861037, + "grad_norm": 0.5780648589134216, + "learning_rate": 6.697914841422814e-05, + "loss": 1.7643, + "step": 3772 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.599757969379425, + "learning_rate": 6.696162607324339e-05, + "loss": 1.7419, + "step": 3774 + }, + { + "epoch": 0.9921502939534272, + "grad_norm": 0.6809077858924866, + "learning_rate": 6.694410373225864e-05, + "loss": 1.7638, + "step": 3776 + }, + { + "epoch": 0.992675797287089, + "grad_norm": 0.7125533223152161, + "learning_rate": 6.692658139127389e-05, + "loss": 1.7598, + "step": 3778 + }, + { + "epoch": 0.9932013006207508, + "grad_norm": 0.7388641834259033, + "learning_rate": 6.690905905028912e-05, + "loss": 1.7484, + "step": 3780 + }, + { + "epoch": 0.9937268039544126, + "grad_norm": 0.622369647026062, + "learning_rate": 6.689153670930437e-05, + "loss": 1.7587, + "step": 3782 + }, + { + "epoch": 0.9942523072880743, + "grad_norm": 0.7785460948944092, + "learning_rate": 6.687401436831961e-05, + "loss": 1.7575, + "step": 3784 + }, + { + "epoch": 0.9947778106217361, + "grad_norm": 0.6789509057998657, + "learning_rate": 6.685649202733486e-05, + "loss": 1.7346, + "step": 3786 + }, + { + "epoch": 0.995303313955398, + "grad_norm": 0.6360666155815125, + "learning_rate": 6.68389696863501e-05, + "loss": 1.7838, + "step": 3788 + }, + { + "epoch": 0.9958288172890597, + "grad_norm": 0.6261754035949707, + "learning_rate": 6.682144734536534e-05, + "loss": 1.7563, + "step": 3790 + }, + { + "epoch": 0.9963543206227214, + "grad_norm": 0.6586030125617981, + "learning_rate": 6.680392500438059e-05, + "loss": 1.7034, + "step": 3792 + }, + { + "epoch": 0.9968798239563832, + "grad_norm": 0.7084933519363403, + "learning_rate": 6.678640266339583e-05, + "loss": 1.7995, + "step": 3794 + }, + { + "epoch": 0.9974053272900449, + "grad_norm": 0.8571730256080627, + "learning_rate": 6.676888032241107e-05, + "loss": 1.7874, + "step": 3796 + }, + { + "epoch": 0.9979308306237068, + "grad_norm": 0.6537512540817261, + "learning_rate": 6.675135798142632e-05, + "loss": 1.7672, + "step": 3798 + }, + { + "epoch": 0.9984563339573685, + "grad_norm": 0.6276223659515381, + "learning_rate": 6.673383564044157e-05, + "loss": 1.7106, + "step": 3800 + }, + { + "epoch": 0.9989818372910303, + "grad_norm": 0.6775636076927185, + "learning_rate": 6.671631329945682e-05, + "loss": 1.7228, + "step": 3802 + }, + { + "epoch": 0.999507340624692, + "grad_norm": 0.776142954826355, + "learning_rate": 6.669879095847206e-05, + "loss": 1.7674, + "step": 3804 + }, + { + "epoch": 1.000032843958354, + "grad_norm": 0.6469699740409851, + "learning_rate": 6.66812686174873e-05, + "loss": 1.7201, + "step": 3806 + }, + { + "epoch": 1.0005583472920156, + "grad_norm": 0.6889809966087341, + "learning_rate": 6.666374627650254e-05, + "loss": 1.6785, + "step": 3808 + }, + { + "epoch": 1.0010838506256774, + "grad_norm": 0.5873193740844727, + "learning_rate": 6.664622393551779e-05, + "loss": 1.6643, + "step": 3810 + }, + { + "epoch": 1.0016093539593391, + "grad_norm": 0.6216626167297363, + "learning_rate": 6.662870159453303e-05, + "loss": 1.6835, + "step": 3812 + }, + { + "epoch": 1.002134857293001, + "grad_norm": 0.621909499168396, + "learning_rate": 6.661117925354827e-05, + "loss": 1.6972, + "step": 3814 + }, + { + "epoch": 1.0026603606266626, + "grad_norm": 0.6056811213493347, + "learning_rate": 6.659365691256352e-05, + "loss": 1.7327, + "step": 3816 + }, + { + "epoch": 1.0031858639603246, + "grad_norm": 0.6430692076683044, + "learning_rate": 6.657613457157877e-05, + "loss": 1.7151, + "step": 3818 + }, + { + "epoch": 1.0037113672939864, + "grad_norm": 0.6576054692268372, + "learning_rate": 6.6558612230594e-05, + "loss": 1.7058, + "step": 3820 + }, + { + "epoch": 1.004236870627648, + "grad_norm": 0.666572093963623, + "learning_rate": 6.654108988960925e-05, + "loss": 1.6745, + "step": 3822 + }, + { + "epoch": 1.0047623739613099, + "grad_norm": 0.873776376247406, + "learning_rate": 6.65235675486245e-05, + "loss": 1.7123, + "step": 3824 + }, + { + "epoch": 1.0052878772949716, + "grad_norm": 0.6568595767021179, + "learning_rate": 6.650604520763975e-05, + "loss": 1.6813, + "step": 3826 + }, + { + "epoch": 1.0058133806286333, + "grad_norm": 0.7071711421012878, + "learning_rate": 6.648852286665499e-05, + "loss": 1.6926, + "step": 3828 + }, + { + "epoch": 1.006338883962295, + "grad_norm": 0.756188154220581, + "learning_rate": 6.647100052567024e-05, + "loss": 1.6939, + "step": 3830 + }, + { + "epoch": 1.0068643872959568, + "grad_norm": 0.6261985301971436, + "learning_rate": 6.645347818468547e-05, + "loss": 1.6729, + "step": 3832 + }, + { + "epoch": 1.0073898906296186, + "grad_norm": 0.6476467847824097, + "learning_rate": 6.643595584370072e-05, + "loss": 1.6905, + "step": 3834 + }, + { + "epoch": 1.0079153939632806, + "grad_norm": 0.7380629777908325, + "learning_rate": 6.641843350271597e-05, + "loss": 1.6695, + "step": 3836 + }, + { + "epoch": 1.0084408972969423, + "grad_norm": 0.7090455293655396, + "learning_rate": 6.64009111617312e-05, + "loss": 1.7271, + "step": 3838 + }, + { + "epoch": 1.008966400630604, + "grad_norm": 0.5697006583213806, + "learning_rate": 6.638338882074645e-05, + "loss": 1.6869, + "step": 3840 + }, + { + "epoch": 1.0094919039642658, + "grad_norm": 0.5765069127082825, + "learning_rate": 6.63658664797617e-05, + "loss": 1.7133, + "step": 3842 + }, + { + "epoch": 1.0100174072979275, + "grad_norm": 0.7223833799362183, + "learning_rate": 6.634834413877695e-05, + "loss": 1.6566, + "step": 3844 + }, + { + "epoch": 1.0105429106315893, + "grad_norm": 0.6132098436355591, + "learning_rate": 6.633082179779218e-05, + "loss": 1.7448, + "step": 3846 + }, + { + "epoch": 1.011068413965251, + "grad_norm": 0.8053067922592163, + "learning_rate": 6.631329945680743e-05, + "loss": 1.6833, + "step": 3848 + }, + { + "epoch": 1.0115939172989128, + "grad_norm": 0.6919856667518616, + "learning_rate": 6.629577711582268e-05, + "loss": 1.7138, + "step": 3850 + }, + { + "epoch": 1.0121194206325745, + "grad_norm": 0.590743899345398, + "learning_rate": 6.627825477483792e-05, + "loss": 1.7058, + "step": 3852 + }, + { + "epoch": 1.0126449239662365, + "grad_norm": 0.6320709586143494, + "learning_rate": 6.626073243385317e-05, + "loss": 1.6988, + "step": 3854 + }, + { + "epoch": 1.0131704272998983, + "grad_norm": 0.5564618706703186, + "learning_rate": 6.624321009286842e-05, + "loss": 1.7098, + "step": 3856 + }, + { + "epoch": 1.01369593063356, + "grad_norm": 0.5974157452583313, + "learning_rate": 6.622568775188365e-05, + "loss": 1.7168, + "step": 3858 + }, + { + "epoch": 1.0142214339672218, + "grad_norm": 0.6708089709281921, + "learning_rate": 6.62081654108989e-05, + "loss": 1.6849, + "step": 3860 + }, + { + "epoch": 1.0147469373008835, + "grad_norm": 0.6684040427207947, + "learning_rate": 6.619064306991415e-05, + "loss": 1.7061, + "step": 3862 + }, + { + "epoch": 1.0152724406345452, + "grad_norm": 0.6342937350273132, + "learning_rate": 6.617312072892938e-05, + "loss": 1.689, + "step": 3864 + }, + { + "epoch": 1.015797943968207, + "grad_norm": 0.5644361972808838, + "learning_rate": 6.615559838794463e-05, + "loss": 1.6784, + "step": 3866 + }, + { + "epoch": 1.0163234473018687, + "grad_norm": 0.7777919173240662, + "learning_rate": 6.613807604695988e-05, + "loss": 1.6912, + "step": 3868 + }, + { + "epoch": 1.0168489506355305, + "grad_norm": 0.7786663770675659, + "learning_rate": 6.612055370597512e-05, + "loss": 1.6968, + "step": 3870 + }, + { + "epoch": 1.0173744539691925, + "grad_norm": 0.7163161039352417, + "learning_rate": 6.610303136499036e-05, + "loss": 1.7118, + "step": 3872 + }, + { + "epoch": 1.0178999573028542, + "grad_norm": 0.731606662273407, + "learning_rate": 6.60855090240056e-05, + "loss": 1.6944, + "step": 3874 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.6335828900337219, + "learning_rate": 6.606798668302085e-05, + "loss": 1.7006, + "step": 3876 + }, + { + "epoch": 1.0189509639701777, + "grad_norm": 0.7113467454910278, + "learning_rate": 6.60504643420361e-05, + "loss": 1.694, + "step": 3878 + }, + { + "epoch": 1.0194764673038395, + "grad_norm": 0.84892338514328, + "learning_rate": 6.603294200105135e-05, + "loss": 1.7098, + "step": 3880 + }, + { + "epoch": 1.0200019706375012, + "grad_norm": 0.6938359141349792, + "learning_rate": 6.60154196600666e-05, + "loss": 1.6912, + "step": 3882 + }, + { + "epoch": 1.020527473971163, + "grad_norm": 0.6478989124298096, + "learning_rate": 6.599789731908183e-05, + "loss": 1.6992, + "step": 3884 + }, + { + "epoch": 1.0210529773048247, + "grad_norm": 0.7370628118515015, + "learning_rate": 6.598037497809708e-05, + "loss": 1.6763, + "step": 3886 + }, + { + "epoch": 1.0215784806384864, + "grad_norm": 0.6913176774978638, + "learning_rate": 6.596285263711233e-05, + "loss": 1.6808, + "step": 3888 + }, + { + "epoch": 1.0221039839721484, + "grad_norm": 0.6637833118438721, + "learning_rate": 6.594533029612756e-05, + "loss": 1.679, + "step": 3890 + }, + { + "epoch": 1.0226294873058102, + "grad_norm": 0.7522826194763184, + "learning_rate": 6.59278079551428e-05, + "loss": 1.6729, + "step": 3892 + }, + { + "epoch": 1.023154990639472, + "grad_norm": 0.5955492258071899, + "learning_rate": 6.591028561415805e-05, + "loss": 1.6915, + "step": 3894 + }, + { + "epoch": 1.0236804939731337, + "grad_norm": 0.6156378388404846, + "learning_rate": 6.58927632731733e-05, + "loss": 1.6887, + "step": 3896 + }, + { + "epoch": 1.0242059973067954, + "grad_norm": 0.5954993963241577, + "learning_rate": 6.587524093218854e-05, + "loss": 1.7028, + "step": 3898 + }, + { + "epoch": 1.0247315006404571, + "grad_norm": 0.6089223623275757, + "learning_rate": 6.585771859120378e-05, + "loss": 1.7172, + "step": 3900 + }, + { + "epoch": 1.025257003974119, + "grad_norm": 0.6109156012535095, + "learning_rate": 6.584019625021903e-05, + "loss": 1.6883, + "step": 3902 + }, + { + "epoch": 1.0257825073077806, + "grad_norm": 0.769751787185669, + "learning_rate": 6.582267390923428e-05, + "loss": 1.7029, + "step": 3904 + }, + { + "epoch": 1.0263080106414426, + "grad_norm": 0.579433023929596, + "learning_rate": 6.580515156824953e-05, + "loss": 1.6728, + "step": 3906 + }, + { + "epoch": 1.0268335139751044, + "grad_norm": 0.6194645166397095, + "learning_rate": 6.578762922726477e-05, + "loss": 1.6949, + "step": 3908 + }, + { + "epoch": 1.027359017308766, + "grad_norm": 0.5495603680610657, + "learning_rate": 6.577010688628001e-05, + "loss": 1.7059, + "step": 3910 + }, + { + "epoch": 1.0278845206424279, + "grad_norm": 0.7045862674713135, + "learning_rate": 6.575258454529526e-05, + "loss": 1.7098, + "step": 3912 + }, + { + "epoch": 1.0284100239760896, + "grad_norm": 0.6708394885063171, + "learning_rate": 6.573506220431049e-05, + "loss": 1.6671, + "step": 3914 + }, + { + "epoch": 1.0289355273097514, + "grad_norm": 0.6526671051979065, + "learning_rate": 6.571753986332574e-05, + "loss": 1.6645, + "step": 3916 + }, + { + "epoch": 1.029461030643413, + "grad_norm": 0.5542386174201965, + "learning_rate": 6.570001752234098e-05, + "loss": 1.6694, + "step": 3918 + }, + { + "epoch": 1.0299865339770748, + "grad_norm": 0.6871373057365417, + "learning_rate": 6.568249518135623e-05, + "loss": 1.7054, + "step": 3920 + }, + { + "epoch": 1.0305120373107366, + "grad_norm": 0.6650441288948059, + "learning_rate": 6.566497284037148e-05, + "loss": 1.7034, + "step": 3922 + }, + { + "epoch": 1.0310375406443986, + "grad_norm": 1.027212142944336, + "learning_rate": 6.564745049938671e-05, + "loss": 1.6684, + "step": 3924 + }, + { + "epoch": 1.0315630439780603, + "grad_norm": 0.7262475490570068, + "learning_rate": 6.562992815840196e-05, + "loss": 1.6994, + "step": 3926 + }, + { + "epoch": 1.032088547311722, + "grad_norm": 0.697229266166687, + "learning_rate": 6.561240581741721e-05, + "loss": 1.6627, + "step": 3928 + }, + { + "epoch": 1.0326140506453838, + "grad_norm": 0.6965095400810242, + "learning_rate": 6.559488347643246e-05, + "loss": 1.7078, + "step": 3930 + }, + { + "epoch": 1.0331395539790456, + "grad_norm": 0.6743383407592773, + "learning_rate": 6.55773611354477e-05, + "loss": 1.6954, + "step": 3932 + }, + { + "epoch": 1.0336650573127073, + "grad_norm": 0.7088636159896851, + "learning_rate": 6.555983879446295e-05, + "loss": 1.7079, + "step": 3934 + }, + { + "epoch": 1.034190560646369, + "grad_norm": 0.9612395763397217, + "learning_rate": 6.554231645347819e-05, + "loss": 1.7034, + "step": 3936 + }, + { + "epoch": 1.0347160639800308, + "grad_norm": 0.644659161567688, + "learning_rate": 6.552479411249343e-05, + "loss": 1.6736, + "step": 3938 + }, + { + "epoch": 1.0352415673136925, + "grad_norm": 0.6379082202911377, + "learning_rate": 6.550727177150867e-05, + "loss": 1.6999, + "step": 3940 + }, + { + "epoch": 1.0357670706473545, + "grad_norm": 0.8248457908630371, + "learning_rate": 6.548974943052391e-05, + "loss": 1.7085, + "step": 3942 + }, + { + "epoch": 1.0362925739810163, + "grad_norm": 0.9295015335083008, + "learning_rate": 6.547222708953916e-05, + "loss": 1.6797, + "step": 3944 + }, + { + "epoch": 1.036818077314678, + "grad_norm": 0.614661693572998, + "learning_rate": 6.545470474855441e-05, + "loss": 1.6928, + "step": 3946 + }, + { + "epoch": 1.0373435806483398, + "grad_norm": 0.721056342124939, + "learning_rate": 6.543718240756966e-05, + "loss": 1.6783, + "step": 3948 + }, + { + "epoch": 1.0378690839820015, + "grad_norm": 1.0733940601348877, + "learning_rate": 6.54196600665849e-05, + "loss": 1.7053, + "step": 3950 + }, + { + "epoch": 1.0383945873156633, + "grad_norm": 0.6472097039222717, + "learning_rate": 6.540213772560014e-05, + "loss": 1.7327, + "step": 3952 + }, + { + "epoch": 1.038920090649325, + "grad_norm": 0.7506822943687439, + "learning_rate": 6.538461538461539e-05, + "loss": 1.6703, + "step": 3954 + }, + { + "epoch": 1.0394455939829867, + "grad_norm": 0.8442516326904297, + "learning_rate": 6.536709304363063e-05, + "loss": 1.695, + "step": 3956 + }, + { + "epoch": 1.0399710973166485, + "grad_norm": 0.7090259790420532, + "learning_rate": 6.534957070264588e-05, + "loss": 1.6954, + "step": 3958 + }, + { + "epoch": 1.0404966006503105, + "grad_norm": 0.8459334969520569, + "learning_rate": 6.533204836166113e-05, + "loss": 1.6729, + "step": 3960 + }, + { + "epoch": 1.0410221039839722, + "grad_norm": 0.8890243172645569, + "learning_rate": 6.531452602067638e-05, + "loss": 1.6778, + "step": 3962 + }, + { + "epoch": 1.041547607317634, + "grad_norm": 0.9002764821052551, + "learning_rate": 6.529700367969161e-05, + "loss": 1.6651, + "step": 3964 + }, + { + "epoch": 1.0420731106512957, + "grad_norm": 0.7871319055557251, + "learning_rate": 6.527948133870684e-05, + "loss": 1.7182, + "step": 3966 + }, + { + "epoch": 1.0425986139849575, + "grad_norm": 1.2501089572906494, + "learning_rate": 6.526195899772209e-05, + "loss": 1.7056, + "step": 3968 + }, + { + "epoch": 1.0431241173186192, + "grad_norm": 0.8261802792549133, + "learning_rate": 6.524443665673734e-05, + "loss": 1.7063, + "step": 3970 + }, + { + "epoch": 1.043649620652281, + "grad_norm": 0.8968937993049622, + "learning_rate": 6.522691431575259e-05, + "loss": 1.6993, + "step": 3972 + }, + { + "epoch": 1.0441751239859427, + "grad_norm": 0.8691303730010986, + "learning_rate": 6.520939197476783e-05, + "loss": 1.6731, + "step": 3974 + }, + { + "epoch": 1.0447006273196044, + "grad_norm": 0.9870227575302124, + "learning_rate": 6.519186963378308e-05, + "loss": 1.6689, + "step": 3976 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.6180405616760254, + "learning_rate": 6.517434729279832e-05, + "loss": 1.69, + "step": 3978 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.6060590147972107, + "learning_rate": 6.515682495181356e-05, + "loss": 1.7048, + "step": 3980 + }, + { + "epoch": 1.04627713732059, + "grad_norm": 0.6996809244155884, + "learning_rate": 6.513930261082881e-05, + "loss": 1.693, + "step": 3982 + }, + { + "epoch": 1.0468026406542517, + "grad_norm": 0.8016669154167175, + "learning_rate": 6.512178026984406e-05, + "loss": 1.7072, + "step": 3984 + }, + { + "epoch": 1.0473281439879134, + "grad_norm": 0.6650173664093018, + "learning_rate": 6.51042579288593e-05, + "loss": 1.6923, + "step": 3986 + }, + { + "epoch": 1.0478536473215752, + "grad_norm": 0.921610951423645, + "learning_rate": 6.508673558787455e-05, + "loss": 1.6857, + "step": 3988 + }, + { + "epoch": 1.048379150655237, + "grad_norm": 0.5630477666854858, + "learning_rate": 6.506921324688979e-05, + "loss": 1.6831, + "step": 3990 + }, + { + "epoch": 1.0489046539888986, + "grad_norm": 0.7276068329811096, + "learning_rate": 6.505169090590502e-05, + "loss": 1.6785, + "step": 3992 + }, + { + "epoch": 1.0494301573225606, + "grad_norm": 0.7698312997817993, + "learning_rate": 6.503416856492027e-05, + "loss": 1.6842, + "step": 3994 + }, + { + "epoch": 1.0499556606562224, + "grad_norm": 0.8240602016448975, + "learning_rate": 6.501664622393552e-05, + "loss": 1.6815, + "step": 3996 + }, + { + "epoch": 1.0504811639898841, + "grad_norm": 0.6555003523826599, + "learning_rate": 6.499912388295076e-05, + "loss": 1.6668, + "step": 3998 + }, + { + "epoch": 1.0510066673235459, + "grad_norm": 0.7370006442070007, + "learning_rate": 6.498160154196601e-05, + "loss": 1.6993, + "step": 4000 + }, + { + "epoch": 1.0510066673235459, + "eval_loss": 1.7162247896194458, + "eval_runtime": 487.2234, + "eval_samples_per_second": 249.965, + "eval_steps_per_second": 31.246, + "step": 4000 + }, + { + "epoch": 1.0515321706572076, + "grad_norm": 0.7542751431465149, + "learning_rate": 6.496407920098126e-05, + "loss": 1.6854, + "step": 4002 + }, + { + "epoch": 1.0520576739908694, + "grad_norm": 0.7216155529022217, + "learning_rate": 6.49465568599965e-05, + "loss": 1.7008, + "step": 4004 + }, + { + "epoch": 1.052583177324531, + "grad_norm": 0.6681018471717834, + "learning_rate": 6.492903451901174e-05, + "loss": 1.703, + "step": 4006 + }, + { + "epoch": 1.0531086806581929, + "grad_norm": 0.8611218929290771, + "learning_rate": 6.491151217802699e-05, + "loss": 1.6686, + "step": 4008 + }, + { + "epoch": 1.0536341839918546, + "grad_norm": 0.6838074922561646, + "learning_rate": 6.489398983704224e-05, + "loss": 1.7053, + "step": 4010 + }, + { + "epoch": 1.0541596873255166, + "grad_norm": 0.6425184607505798, + "learning_rate": 6.487646749605748e-05, + "loss": 1.6621, + "step": 4012 + }, + { + "epoch": 1.0546851906591783, + "grad_norm": 0.8689895272254944, + "learning_rate": 6.485894515507273e-05, + "loss": 1.6693, + "step": 4014 + }, + { + "epoch": 1.05521069399284, + "grad_norm": 0.7122433185577393, + "learning_rate": 6.484142281408797e-05, + "loss": 1.6936, + "step": 4016 + }, + { + "epoch": 1.0557361973265018, + "grad_norm": 0.7124624252319336, + "learning_rate": 6.48239004731032e-05, + "loss": 1.6915, + "step": 4018 + }, + { + "epoch": 1.0562617006601636, + "grad_norm": 0.6151629686355591, + "learning_rate": 6.480637813211845e-05, + "loss": 1.6601, + "step": 4020 + }, + { + "epoch": 1.0567872039938253, + "grad_norm": 0.5656692385673523, + "learning_rate": 6.47888557911337e-05, + "loss": 1.6651, + "step": 4022 + }, + { + "epoch": 1.057312707327487, + "grad_norm": 0.6214647889137268, + "learning_rate": 6.477133345014894e-05, + "loss": 1.7278, + "step": 4024 + }, + { + "epoch": 1.0578382106611488, + "grad_norm": 0.7187774777412415, + "learning_rate": 6.475381110916419e-05, + "loss": 1.682, + "step": 4026 + }, + { + "epoch": 1.0583637139948106, + "grad_norm": 0.602172315120697, + "learning_rate": 6.473628876817944e-05, + "loss": 1.6712, + "step": 4028 + }, + { + "epoch": 1.0588892173284725, + "grad_norm": 0.7032018303871155, + "learning_rate": 6.471876642719467e-05, + "loss": 1.7056, + "step": 4030 + }, + { + "epoch": 1.0594147206621343, + "grad_norm": 0.720413088798523, + "learning_rate": 6.470124408620992e-05, + "loss": 1.6658, + "step": 4032 + }, + { + "epoch": 1.059940223995796, + "grad_norm": 0.7536730170249939, + "learning_rate": 6.468372174522517e-05, + "loss": 1.6624, + "step": 4034 + }, + { + "epoch": 1.0604657273294578, + "grad_norm": 0.6960994005203247, + "learning_rate": 6.466619940424041e-05, + "loss": 1.6805, + "step": 4036 + }, + { + "epoch": 1.0609912306631195, + "grad_norm": 0.6354637145996094, + "learning_rate": 6.464867706325566e-05, + "loss": 1.6969, + "step": 4038 + }, + { + "epoch": 1.0615167339967813, + "grad_norm": 0.659994900226593, + "learning_rate": 6.463115472227091e-05, + "loss": 1.7076, + "step": 4040 + }, + { + "epoch": 1.062042237330443, + "grad_norm": 0.6840848326683044, + "learning_rate": 6.461363238128614e-05, + "loss": 1.7255, + "step": 4042 + }, + { + "epoch": 1.0625677406641048, + "grad_norm": 0.8560720086097717, + "learning_rate": 6.459611004030138e-05, + "loss": 1.6797, + "step": 4044 + }, + { + "epoch": 1.0630932439977667, + "grad_norm": 0.5914828181266785, + "learning_rate": 6.457858769931663e-05, + "loss": 1.6948, + "step": 4046 + }, + { + "epoch": 1.0636187473314285, + "grad_norm": 0.9306698441505432, + "learning_rate": 6.456106535833187e-05, + "loss": 1.6749, + "step": 4048 + }, + { + "epoch": 1.0641442506650902, + "grad_norm": 0.6383719444274902, + "learning_rate": 6.454354301734712e-05, + "loss": 1.7245, + "step": 4050 + }, + { + "epoch": 1.064669753998752, + "grad_norm": 0.6287466287612915, + "learning_rate": 6.452602067636237e-05, + "loss": 1.6935, + "step": 4052 + }, + { + "epoch": 1.0651952573324137, + "grad_norm": 0.7017025947570801, + "learning_rate": 6.450849833537762e-05, + "loss": 1.6792, + "step": 4054 + }, + { + "epoch": 1.0657207606660755, + "grad_norm": 0.6088765859603882, + "learning_rate": 6.449097599439285e-05, + "loss": 1.6926, + "step": 4056 + }, + { + "epoch": 1.0662462639997372, + "grad_norm": 0.6329763531684875, + "learning_rate": 6.44734536534081e-05, + "loss": 1.7088, + "step": 4058 + }, + { + "epoch": 1.066771767333399, + "grad_norm": 0.6406139731407166, + "learning_rate": 6.445593131242334e-05, + "loss": 1.6847, + "step": 4060 + }, + { + "epoch": 1.0672972706670607, + "grad_norm": 0.5512668490409851, + "learning_rate": 6.443840897143859e-05, + "loss": 1.6572, + "step": 4062 + }, + { + "epoch": 1.0678227740007227, + "grad_norm": 0.6363273859024048, + "learning_rate": 6.442088663045384e-05, + "loss": 1.7079, + "step": 4064 + }, + { + "epoch": 1.0683482773343844, + "grad_norm": 0.5485667586326599, + "learning_rate": 6.440336428946909e-05, + "loss": 1.7357, + "step": 4066 + }, + { + "epoch": 1.0688737806680462, + "grad_norm": 0.6467545628547668, + "learning_rate": 6.438584194848432e-05, + "loss": 1.6527, + "step": 4068 + }, + { + "epoch": 1.069399284001708, + "grad_norm": 0.6813017129898071, + "learning_rate": 6.436831960749956e-05, + "loss": 1.6907, + "step": 4070 + }, + { + "epoch": 1.0699247873353697, + "grad_norm": 0.7942006587982178, + "learning_rate": 6.43507972665148e-05, + "loss": 1.6908, + "step": 4072 + }, + { + "epoch": 1.0704502906690314, + "grad_norm": 0.7022045254707336, + "learning_rate": 6.433327492553005e-05, + "loss": 1.6829, + "step": 4074 + }, + { + "epoch": 1.0709757940026932, + "grad_norm": 0.581889271736145, + "learning_rate": 6.43157525845453e-05, + "loss": 1.6625, + "step": 4076 + }, + { + "epoch": 1.071501297336355, + "grad_norm": 0.979885458946228, + "learning_rate": 6.429823024356055e-05, + "loss": 1.6818, + "step": 4078 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.6776348352432251, + "learning_rate": 6.428070790257579e-05, + "loss": 1.6806, + "step": 4080 + }, + { + "epoch": 1.0725523040036786, + "grad_norm": 0.8267763257026672, + "learning_rate": 6.426318556159103e-05, + "loss": 1.6728, + "step": 4082 + }, + { + "epoch": 1.0730778073373404, + "grad_norm": 0.6556562781333923, + "learning_rate": 6.424566322060627e-05, + "loss": 1.6894, + "step": 4084 + }, + { + "epoch": 1.0736033106710021, + "grad_norm": 0.809785783290863, + "learning_rate": 6.422814087962152e-05, + "loss": 1.6918, + "step": 4086 + }, + { + "epoch": 1.0741288140046639, + "grad_norm": 0.5993219017982483, + "learning_rate": 6.421061853863677e-05, + "loss": 1.673, + "step": 4088 + }, + { + "epoch": 1.0746543173383256, + "grad_norm": 0.6594780087471008, + "learning_rate": 6.419309619765202e-05, + "loss": 1.7099, + "step": 4090 + }, + { + "epoch": 1.0751798206719874, + "grad_norm": 0.8069416284561157, + "learning_rate": 6.417557385666726e-05, + "loss": 1.7032, + "step": 4092 + }, + { + "epoch": 1.075705324005649, + "grad_norm": 0.6383103728294373, + "learning_rate": 6.41580515156825e-05, + "loss": 1.6763, + "step": 4094 + }, + { + "epoch": 1.0762308273393109, + "grad_norm": 0.622296929359436, + "learning_rate": 6.414052917469773e-05, + "loss": 1.6717, + "step": 4096 + }, + { + "epoch": 1.0767563306729726, + "grad_norm": 0.5699704885482788, + "learning_rate": 6.412300683371298e-05, + "loss": 1.6684, + "step": 4098 + }, + { + "epoch": 1.0772818340066346, + "grad_norm": 0.653029203414917, + "learning_rate": 6.410548449272823e-05, + "loss": 1.6898, + "step": 4100 + }, + { + "epoch": 1.0778073373402963, + "grad_norm": 0.5569880604743958, + "learning_rate": 6.408796215174348e-05, + "loss": 1.6897, + "step": 4102 + }, + { + "epoch": 1.078332840673958, + "grad_norm": 0.8268890380859375, + "learning_rate": 6.407043981075872e-05, + "loss": 1.691, + "step": 4104 + }, + { + "epoch": 1.0788583440076198, + "grad_norm": 0.6143434643745422, + "learning_rate": 6.405291746977397e-05, + "loss": 1.7033, + "step": 4106 + }, + { + "epoch": 1.0793838473412816, + "grad_norm": 0.6650940775871277, + "learning_rate": 6.40353951287892e-05, + "loss": 1.7149, + "step": 4108 + }, + { + "epoch": 1.0799093506749433, + "grad_norm": 0.9318827986717224, + "learning_rate": 6.401787278780445e-05, + "loss": 1.7007, + "step": 4110 + }, + { + "epoch": 1.080434854008605, + "grad_norm": 0.870758056640625, + "learning_rate": 6.40003504468197e-05, + "loss": 1.6928, + "step": 4112 + }, + { + "epoch": 1.0809603573422668, + "grad_norm": 0.5847840309143066, + "learning_rate": 6.398282810583495e-05, + "loss": 1.7171, + "step": 4114 + }, + { + "epoch": 1.0814858606759286, + "grad_norm": 0.6585195064544678, + "learning_rate": 6.39653057648502e-05, + "loss": 1.6983, + "step": 4116 + }, + { + "epoch": 1.0820113640095905, + "grad_norm": 0.6032067537307739, + "learning_rate": 6.394778342386543e-05, + "loss": 1.7266, + "step": 4118 + }, + { + "epoch": 1.0825368673432523, + "grad_norm": 0.7615604996681213, + "learning_rate": 6.393026108288068e-05, + "loss": 1.6821, + "step": 4120 + }, + { + "epoch": 1.083062370676914, + "grad_norm": 0.6143482327461243, + "learning_rate": 6.391273874189591e-05, + "loss": 1.6857, + "step": 4122 + }, + { + "epoch": 1.0835878740105758, + "grad_norm": 0.7933993935585022, + "learning_rate": 6.389521640091116e-05, + "loss": 1.6717, + "step": 4124 + }, + { + "epoch": 1.0841133773442375, + "grad_norm": 0.6543474793434143, + "learning_rate": 6.38776940599264e-05, + "loss": 1.6959, + "step": 4126 + }, + { + "epoch": 1.0846388806778993, + "grad_norm": 0.6127591133117676, + "learning_rate": 6.386017171894165e-05, + "loss": 1.6885, + "step": 4128 + }, + { + "epoch": 1.085164384011561, + "grad_norm": 0.8532068133354187, + "learning_rate": 6.38426493779569e-05, + "loss": 1.6763, + "step": 4130 + }, + { + "epoch": 1.0856898873452228, + "grad_norm": 0.5886075496673584, + "learning_rate": 6.382512703697215e-05, + "loss": 1.6857, + "step": 4132 + }, + { + "epoch": 1.0862153906788845, + "grad_norm": 0.6927480697631836, + "learning_rate": 6.380760469598738e-05, + "loss": 1.6978, + "step": 4134 + }, + { + "epoch": 1.0867408940125465, + "grad_norm": 0.5445473790168762, + "learning_rate": 6.379008235500263e-05, + "loss": 1.6985, + "step": 4136 + }, + { + "epoch": 1.0872663973462082, + "grad_norm": 0.6567670702934265, + "learning_rate": 6.377256001401788e-05, + "loss": 1.6897, + "step": 4138 + }, + { + "epoch": 1.08779190067987, + "grad_norm": 0.6687731742858887, + "learning_rate": 6.375503767303312e-05, + "loss": 1.688, + "step": 4140 + }, + { + "epoch": 1.0883174040135317, + "grad_norm": 0.575955331325531, + "learning_rate": 6.373751533204837e-05, + "loss": 1.7204, + "step": 4142 + }, + { + "epoch": 1.0888429073471935, + "grad_norm": 0.6957133412361145, + "learning_rate": 6.37199929910636e-05, + "loss": 1.6945, + "step": 4144 + }, + { + "epoch": 1.0893684106808552, + "grad_norm": 0.7448277473449707, + "learning_rate": 6.370247065007885e-05, + "loss": 1.6953, + "step": 4146 + }, + { + "epoch": 1.089893914014517, + "grad_norm": 0.6629153490066528, + "learning_rate": 6.368494830909409e-05, + "loss": 1.703, + "step": 4148 + }, + { + "epoch": 1.0904194173481787, + "grad_norm": 0.7909244298934937, + "learning_rate": 6.366742596810934e-05, + "loss": 1.6915, + "step": 4150 + }, + { + "epoch": 1.0909449206818405, + "grad_norm": 0.5901594161987305, + "learning_rate": 6.364990362712458e-05, + "loss": 1.6747, + "step": 4152 + }, + { + "epoch": 1.0914704240155024, + "grad_norm": 0.6351743340492249, + "learning_rate": 6.363238128613983e-05, + "loss": 1.6776, + "step": 4154 + }, + { + "epoch": 1.0919959273491642, + "grad_norm": 0.7577309608459473, + "learning_rate": 6.361485894515508e-05, + "loss": 1.6941, + "step": 4156 + }, + { + "epoch": 1.092521430682826, + "grad_norm": 0.8337988257408142, + "learning_rate": 6.359733660417033e-05, + "loss": 1.6985, + "step": 4158 + }, + { + "epoch": 1.0930469340164877, + "grad_norm": 0.6406223177909851, + "learning_rate": 6.357981426318556e-05, + "loss": 1.6687, + "step": 4160 + }, + { + "epoch": 1.0935724373501494, + "grad_norm": 0.5720377564430237, + "learning_rate": 6.356229192220081e-05, + "loss": 1.6844, + "step": 4162 + }, + { + "epoch": 1.0940979406838112, + "grad_norm": 0.5843801498413086, + "learning_rate": 6.354476958121606e-05, + "loss": 1.7099, + "step": 4164 + }, + { + "epoch": 1.094623444017473, + "grad_norm": 0.7225820422172546, + "learning_rate": 6.35272472402313e-05, + "loss": 1.6941, + "step": 4166 + }, + { + "epoch": 1.0951489473511347, + "grad_norm": 0.6221319437026978, + "learning_rate": 6.350972489924655e-05, + "loss": 1.7102, + "step": 4168 + }, + { + "epoch": 1.0956744506847966, + "grad_norm": 0.6108186841011047, + "learning_rate": 6.349220255826178e-05, + "loss": 1.688, + "step": 4170 + }, + { + "epoch": 1.0961999540184584, + "grad_norm": 0.6956847906112671, + "learning_rate": 6.347468021727703e-05, + "loss": 1.67, + "step": 4172 + }, + { + "epoch": 1.0967254573521201, + "grad_norm": 0.616248369216919, + "learning_rate": 6.345715787629227e-05, + "loss": 1.7036, + "step": 4174 + }, + { + "epoch": 1.0972509606857819, + "grad_norm": 0.6128188371658325, + "learning_rate": 6.343963553530751e-05, + "loss": 1.6961, + "step": 4176 + }, + { + "epoch": 1.0977764640194436, + "grad_norm": 0.6509534120559692, + "learning_rate": 6.342211319432276e-05, + "loss": 1.7298, + "step": 4178 + }, + { + "epoch": 1.0983019673531054, + "grad_norm": 0.8290244340896606, + "learning_rate": 6.340459085333801e-05, + "loss": 1.7022, + "step": 4180 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.8365241289138794, + "learning_rate": 6.338706851235326e-05, + "loss": 1.7145, + "step": 4182 + }, + { + "epoch": 1.0993529740204289, + "grad_norm": 0.6763657927513123, + "learning_rate": 6.33695461713685e-05, + "loss": 1.7356, + "step": 4184 + }, + { + "epoch": 1.0998784773540906, + "grad_norm": 0.6510267853736877, + "learning_rate": 6.335202383038374e-05, + "loss": 1.7102, + "step": 4186 + }, + { + "epoch": 1.1004039806877526, + "grad_norm": 0.8549639582633972, + "learning_rate": 6.333450148939899e-05, + "loss": 1.7083, + "step": 4188 + }, + { + "epoch": 1.1009294840214143, + "grad_norm": 0.5909300446510315, + "learning_rate": 6.331697914841423e-05, + "loss": 1.7224, + "step": 4190 + }, + { + "epoch": 1.101454987355076, + "grad_norm": 0.5720754265785217, + "learning_rate": 6.329945680742948e-05, + "loss": 1.6936, + "step": 4192 + }, + { + "epoch": 1.1019804906887378, + "grad_norm": 0.8893141150474548, + "learning_rate": 6.328193446644473e-05, + "loss": 1.6726, + "step": 4194 + }, + { + "epoch": 1.1025059940223996, + "grad_norm": 0.5539205074310303, + "learning_rate": 6.326441212545996e-05, + "loss": 1.6461, + "step": 4196 + }, + { + "epoch": 1.1030314973560613, + "grad_norm": 0.8009784817695618, + "learning_rate": 6.324688978447521e-05, + "loss": 1.7247, + "step": 4198 + }, + { + "epoch": 1.103557000689723, + "grad_norm": 0.6527197957038879, + "learning_rate": 6.322936744349046e-05, + "loss": 1.7007, + "step": 4200 + }, + { + "epoch": 1.1040825040233848, + "grad_norm": 0.6934798955917358, + "learning_rate": 6.321184510250569e-05, + "loss": 1.6944, + "step": 4202 + }, + { + "epoch": 1.1046080073570468, + "grad_norm": 0.6967670917510986, + "learning_rate": 6.319432276152094e-05, + "loss": 1.6937, + "step": 4204 + }, + { + "epoch": 1.1051335106907085, + "grad_norm": 0.7582138776779175, + "learning_rate": 6.317680042053619e-05, + "loss": 1.703, + "step": 4206 + }, + { + "epoch": 1.1056590140243703, + "grad_norm": 0.6267343759536743, + "learning_rate": 6.315927807955143e-05, + "loss": 1.6888, + "step": 4208 + }, + { + "epoch": 1.106184517358032, + "grad_norm": 0.6019650101661682, + "learning_rate": 6.314175573856668e-05, + "loss": 1.7076, + "step": 4210 + }, + { + "epoch": 1.1067100206916938, + "grad_norm": 0.5411569476127625, + "learning_rate": 6.312423339758193e-05, + "loss": 1.6816, + "step": 4212 + }, + { + "epoch": 1.1072355240253555, + "grad_norm": 1.0304468870162964, + "learning_rate": 6.310671105659716e-05, + "loss": 1.7147, + "step": 4214 + }, + { + "epoch": 1.1077610273590173, + "grad_norm": 0.6413879990577698, + "learning_rate": 6.308918871561241e-05, + "loss": 1.6949, + "step": 4216 + }, + { + "epoch": 1.108286530692679, + "grad_norm": 0.6234250664710999, + "learning_rate": 6.307166637462766e-05, + "loss": 1.6998, + "step": 4218 + }, + { + "epoch": 1.1088120340263408, + "grad_norm": 0.5468757152557373, + "learning_rate": 6.305414403364289e-05, + "loss": 1.7209, + "step": 4220 + }, + { + "epoch": 1.1093375373600027, + "grad_norm": 0.7098813056945801, + "learning_rate": 6.303662169265814e-05, + "loss": 1.6913, + "step": 4222 + }, + { + "epoch": 1.1098630406936645, + "grad_norm": 0.7469329237937927, + "learning_rate": 6.301909935167339e-05, + "loss": 1.6945, + "step": 4224 + }, + { + "epoch": 1.1103885440273262, + "grad_norm": 0.634135901927948, + "learning_rate": 6.300157701068863e-05, + "loss": 1.6777, + "step": 4226 + }, + { + "epoch": 1.110914047360988, + "grad_norm": 0.670148491859436, + "learning_rate": 6.298405466970387e-05, + "loss": 1.6921, + "step": 4228 + }, + { + "epoch": 1.1114395506946497, + "grad_norm": 0.6443579792976379, + "learning_rate": 6.296653232871912e-05, + "loss": 1.7338, + "step": 4230 + }, + { + "epoch": 1.1119650540283115, + "grad_norm": 0.6890257000923157, + "learning_rate": 6.294900998773436e-05, + "loss": 1.7463, + "step": 4232 + }, + { + "epoch": 1.1124905573619732, + "grad_norm": 0.5529821515083313, + "learning_rate": 6.293148764674961e-05, + "loss": 1.6557, + "step": 4234 + }, + { + "epoch": 1.113016060695635, + "grad_norm": 0.7622890472412109, + "learning_rate": 6.291396530576486e-05, + "loss": 1.7009, + "step": 4236 + }, + { + "epoch": 1.1135415640292967, + "grad_norm": 0.6255015730857849, + "learning_rate": 6.28964429647801e-05, + "loss": 1.6922, + "step": 4238 + }, + { + "epoch": 1.1140670673629587, + "grad_norm": 0.5990403890609741, + "learning_rate": 6.287892062379534e-05, + "loss": 1.7081, + "step": 4240 + }, + { + "epoch": 1.1145925706966204, + "grad_norm": 0.6255038976669312, + "learning_rate": 6.286139828281059e-05, + "loss": 1.6878, + "step": 4242 + }, + { + "epoch": 1.1151180740302822, + "grad_norm": 0.6530934572219849, + "learning_rate": 6.284387594182584e-05, + "loss": 1.6948, + "step": 4244 + }, + { + "epoch": 1.115643577363944, + "grad_norm": 0.6111620664596558, + "learning_rate": 6.282635360084107e-05, + "loss": 1.6752, + "step": 4246 + }, + { + "epoch": 1.1161690806976057, + "grad_norm": 0.7106771469116211, + "learning_rate": 6.280883125985632e-05, + "loss": 1.6941, + "step": 4248 + }, + { + "epoch": 1.1166945840312674, + "grad_norm": 0.6513094902038574, + "learning_rate": 6.279130891887156e-05, + "loss": 1.6937, + "step": 4250 + }, + { + "epoch": 1.1172200873649292, + "grad_norm": 0.6989749670028687, + "learning_rate": 6.277378657788681e-05, + "loss": 1.7048, + "step": 4252 + }, + { + "epoch": 1.117745590698591, + "grad_norm": 0.6800375580787659, + "learning_rate": 6.275626423690205e-05, + "loss": 1.6896, + "step": 4254 + }, + { + "epoch": 1.1182710940322527, + "grad_norm": 0.5913258790969849, + "learning_rate": 6.27387418959173e-05, + "loss": 1.7218, + "step": 4256 + }, + { + "epoch": 1.1187965973659146, + "grad_norm": 0.6066367030143738, + "learning_rate": 6.272121955493254e-05, + "loss": 1.7018, + "step": 4258 + }, + { + "epoch": 1.1193221006995764, + "grad_norm": 0.6343132853507996, + "learning_rate": 6.270369721394779e-05, + "loss": 1.6942, + "step": 4260 + }, + { + "epoch": 1.1198476040332381, + "grad_norm": 0.72137451171875, + "learning_rate": 6.268617487296304e-05, + "loss": 1.6996, + "step": 4262 + }, + { + "epoch": 1.1203731073668999, + "grad_norm": 0.6029708981513977, + "learning_rate": 6.266865253197828e-05, + "loss": 1.7288, + "step": 4264 + }, + { + "epoch": 1.1208986107005616, + "grad_norm": 0.5431277751922607, + "learning_rate": 6.265113019099352e-05, + "loss": 1.677, + "step": 4266 + }, + { + "epoch": 1.1214241140342234, + "grad_norm": 0.5661784410476685, + "learning_rate": 6.263360785000877e-05, + "loss": 1.7043, + "step": 4268 + }, + { + "epoch": 1.1219496173678851, + "grad_norm": 0.611405074596405, + "learning_rate": 6.261608550902401e-05, + "loss": 1.6822, + "step": 4270 + }, + { + "epoch": 1.1224751207015469, + "grad_norm": 0.6432753205299377, + "learning_rate": 6.259856316803925e-05, + "loss": 1.7392, + "step": 4272 + }, + { + "epoch": 1.1230006240352086, + "grad_norm": 0.6208961606025696, + "learning_rate": 6.25810408270545e-05, + "loss": 1.649, + "step": 4274 + }, + { + "epoch": 1.1235261273688706, + "grad_norm": 0.628790020942688, + "learning_rate": 6.256351848606974e-05, + "loss": 1.6912, + "step": 4276 + }, + { + "epoch": 1.1240516307025323, + "grad_norm": 0.6716508269309998, + "learning_rate": 6.254599614508499e-05, + "loss": 1.7001, + "step": 4278 + }, + { + "epoch": 1.124577134036194, + "grad_norm": 0.7140381932258606, + "learning_rate": 6.252847380410022e-05, + "loss": 1.6913, + "step": 4280 + }, + { + "epoch": 1.1251026373698558, + "grad_norm": 0.6927066445350647, + "learning_rate": 6.251095146311547e-05, + "loss": 1.7051, + "step": 4282 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.6016809940338135, + "learning_rate": 6.249342912213072e-05, + "loss": 1.6734, + "step": 4284 + }, + { + "epoch": 1.1261536440371793, + "grad_norm": 0.6632773876190186, + "learning_rate": 6.247590678114597e-05, + "loss": 1.6911, + "step": 4286 + }, + { + "epoch": 1.126679147370841, + "grad_norm": 0.6406089663505554, + "learning_rate": 6.245838444016121e-05, + "loss": 1.6867, + "step": 4288 + }, + { + "epoch": 1.1272046507045028, + "grad_norm": 0.6394585371017456, + "learning_rate": 6.244086209917646e-05, + "loss": 1.7197, + "step": 4290 + }, + { + "epoch": 1.1277301540381646, + "grad_norm": 0.9273669719696045, + "learning_rate": 6.24233397581917e-05, + "loss": 1.6678, + "step": 4292 + }, + { + "epoch": 1.1282556573718265, + "grad_norm": 0.703338086605072, + "learning_rate": 6.240581741720694e-05, + "loss": 1.6839, + "step": 4294 + }, + { + "epoch": 1.1287811607054883, + "grad_norm": 0.788475751876831, + "learning_rate": 6.238829507622218e-05, + "loss": 1.6906, + "step": 4296 + }, + { + "epoch": 1.12930666403915, + "grad_norm": 0.9603837728500366, + "learning_rate": 6.237077273523742e-05, + "loss": 1.6735, + "step": 4298 + }, + { + "epoch": 1.1298321673728118, + "grad_norm": 0.7163801789283752, + "learning_rate": 6.235325039425267e-05, + "loss": 1.6773, + "step": 4300 + }, + { + "epoch": 1.1303576707064735, + "grad_norm": 0.8321335911750793, + "learning_rate": 6.233572805326792e-05, + "loss": 1.6836, + "step": 4302 + }, + { + "epoch": 1.1308831740401353, + "grad_norm": 0.6714113354682922, + "learning_rate": 6.231820571228317e-05, + "loss": 1.6949, + "step": 4304 + }, + { + "epoch": 1.131408677373797, + "grad_norm": 0.9358461499214172, + "learning_rate": 6.23006833712984e-05, + "loss": 1.6965, + "step": 4306 + }, + { + "epoch": 1.1319341807074588, + "grad_norm": 0.8378779292106628, + "learning_rate": 6.228316103031365e-05, + "loss": 1.697, + "step": 4308 + }, + { + "epoch": 1.1324596840411205, + "grad_norm": 0.5909398198127747, + "learning_rate": 6.22656386893289e-05, + "loss": 1.7349, + "step": 4310 + }, + { + "epoch": 1.1329851873747825, + "grad_norm": 0.5922428965568542, + "learning_rate": 6.224811634834414e-05, + "loss": 1.6598, + "step": 4312 + }, + { + "epoch": 1.1335106907084442, + "grad_norm": 0.7783740162849426, + "learning_rate": 6.223059400735939e-05, + "loss": 1.6822, + "step": 4314 + }, + { + "epoch": 1.134036194042106, + "grad_norm": 0.5940402746200562, + "learning_rate": 6.221307166637464e-05, + "loss": 1.7093, + "step": 4316 + }, + { + "epoch": 1.1345616973757677, + "grad_norm": 0.7679793834686279, + "learning_rate": 6.219554932538987e-05, + "loss": 1.6886, + "step": 4318 + }, + { + "epoch": 1.1350872007094295, + "grad_norm": 0.624814510345459, + "learning_rate": 6.217802698440512e-05, + "loss": 1.684, + "step": 4320 + }, + { + "epoch": 1.1356127040430912, + "grad_norm": 0.87510085105896, + "learning_rate": 6.216050464342036e-05, + "loss": 1.7228, + "step": 4322 + }, + { + "epoch": 1.136138207376753, + "grad_norm": 0.6720306277275085, + "learning_rate": 6.21429823024356e-05, + "loss": 1.6675, + "step": 4324 + }, + { + "epoch": 1.136663710710415, + "grad_norm": 0.6201871037483215, + "learning_rate": 6.212545996145085e-05, + "loss": 1.708, + "step": 4326 + }, + { + "epoch": 1.1371892140440765, + "grad_norm": 0.6067994832992554, + "learning_rate": 6.21079376204661e-05, + "loss": 1.6825, + "step": 4328 + }, + { + "epoch": 1.1377147173777384, + "grad_norm": 0.5989435315132141, + "learning_rate": 6.209041527948135e-05, + "loss": 1.6796, + "step": 4330 + }, + { + "epoch": 1.1382402207114002, + "grad_norm": 0.6668772101402283, + "learning_rate": 6.207289293849658e-05, + "loss": 1.6989, + "step": 4332 + }, + { + "epoch": 1.138765724045062, + "grad_norm": 0.6205788850784302, + "learning_rate": 6.205537059751183e-05, + "loss": 1.6787, + "step": 4334 + }, + { + "epoch": 1.1392912273787237, + "grad_norm": 0.7359316349029541, + "learning_rate": 6.203784825652707e-05, + "loss": 1.6806, + "step": 4336 + }, + { + "epoch": 1.1398167307123854, + "grad_norm": 0.5981895923614502, + "learning_rate": 6.202032591554232e-05, + "loss": 1.6804, + "step": 4338 + }, + { + "epoch": 1.1403422340460472, + "grad_norm": 0.6678347587585449, + "learning_rate": 6.200280357455757e-05, + "loss": 1.6966, + "step": 4340 + }, + { + "epoch": 1.140867737379709, + "grad_norm": 0.691186249256134, + "learning_rate": 6.198528123357282e-05, + "loss": 1.6746, + "step": 4342 + }, + { + "epoch": 1.141393240713371, + "grad_norm": 0.6158877611160278, + "learning_rate": 6.196775889258805e-05, + "loss": 1.6968, + "step": 4344 + }, + { + "epoch": 1.1419187440470326, + "grad_norm": 0.6359262466430664, + "learning_rate": 6.19502365516033e-05, + "loss": 1.6768, + "step": 4346 + }, + { + "epoch": 1.1424442473806944, + "grad_norm": 0.5825957655906677, + "learning_rate": 6.193271421061853e-05, + "loss": 1.6953, + "step": 4348 + }, + { + "epoch": 1.1429697507143561, + "grad_norm": 0.6011431217193604, + "learning_rate": 6.191519186963378e-05, + "loss": 1.6895, + "step": 4350 + }, + { + "epoch": 1.1434952540480179, + "grad_norm": 0.5862739086151123, + "learning_rate": 6.189766952864903e-05, + "loss": 1.6716, + "step": 4352 + }, + { + "epoch": 1.1440207573816796, + "grad_norm": 0.6541095972061157, + "learning_rate": 6.188014718766428e-05, + "loss": 1.6748, + "step": 4354 + }, + { + "epoch": 1.1445462607153414, + "grad_norm": 0.6046082973480225, + "learning_rate": 6.186262484667952e-05, + "loss": 1.6677, + "step": 4356 + }, + { + "epoch": 1.1450717640490031, + "grad_norm": 0.5532662868499756, + "learning_rate": 6.184510250569476e-05, + "loss": 1.6567, + "step": 4358 + }, + { + "epoch": 1.1455972673826649, + "grad_norm": 0.6711378693580627, + "learning_rate": 6.182758016471e-05, + "loss": 1.6731, + "step": 4360 + }, + { + "epoch": 1.1461227707163268, + "grad_norm": 0.6071760654449463, + "learning_rate": 6.181005782372525e-05, + "loss": 1.6749, + "step": 4362 + }, + { + "epoch": 1.1466482740499886, + "grad_norm": 0.6723074913024902, + "learning_rate": 6.17925354827405e-05, + "loss": 1.6757, + "step": 4364 + }, + { + "epoch": 1.1471737773836503, + "grad_norm": 0.649804949760437, + "learning_rate": 6.177501314175575e-05, + "loss": 1.6809, + "step": 4366 + }, + { + "epoch": 1.147699280717312, + "grad_norm": 0.6812226176261902, + "learning_rate": 6.1757490800771e-05, + "loss": 1.694, + "step": 4368 + }, + { + "epoch": 1.1482247840509738, + "grad_norm": 0.6040016412734985, + "learning_rate": 6.173996845978623e-05, + "loss": 1.7262, + "step": 4370 + }, + { + "epoch": 1.1487502873846356, + "grad_norm": 0.652336597442627, + "learning_rate": 6.172244611880148e-05, + "loss": 1.6702, + "step": 4372 + }, + { + "epoch": 1.1492757907182973, + "grad_norm": 0.5667666792869568, + "learning_rate": 6.170492377781671e-05, + "loss": 1.6873, + "step": 4374 + }, + { + "epoch": 1.149801294051959, + "grad_norm": 0.6907036304473877, + "learning_rate": 6.168740143683196e-05, + "loss": 1.7076, + "step": 4376 + }, + { + "epoch": 1.1503267973856208, + "grad_norm": 0.5834628939628601, + "learning_rate": 6.16698790958472e-05, + "loss": 1.6956, + "step": 4378 + }, + { + "epoch": 1.1508523007192828, + "grad_norm": 0.6255402565002441, + "learning_rate": 6.165235675486245e-05, + "loss": 1.7165, + "step": 4380 + }, + { + "epoch": 1.1513778040529445, + "grad_norm": 0.6630857586860657, + "learning_rate": 6.16348344138777e-05, + "loss": 1.6898, + "step": 4382 + }, + { + "epoch": 1.1519033073866063, + "grad_norm": 0.6802171468734741, + "learning_rate": 6.161731207289293e-05, + "loss": 1.6946, + "step": 4384 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.5508185029029846, + "learning_rate": 6.159978973190818e-05, + "loss": 1.676, + "step": 4386 + }, + { + "epoch": 1.1529543140539298, + "grad_norm": 0.5373386740684509, + "learning_rate": 6.158226739092343e-05, + "loss": 1.6595, + "step": 4388 + }, + { + "epoch": 1.1534798173875915, + "grad_norm": 0.5505905747413635, + "learning_rate": 6.156474504993868e-05, + "loss": 1.679, + "step": 4390 + }, + { + "epoch": 1.1540053207212533, + "grad_norm": 0.5813027620315552, + "learning_rate": 6.154722270895392e-05, + "loss": 1.6824, + "step": 4392 + }, + { + "epoch": 1.154530824054915, + "grad_norm": 0.6369917392730713, + "learning_rate": 6.152970036796917e-05, + "loss": 1.6704, + "step": 4394 + }, + { + "epoch": 1.1550563273885768, + "grad_norm": 0.5745425224304199, + "learning_rate": 6.15121780269844e-05, + "loss": 1.6583, + "step": 4396 + }, + { + "epoch": 1.1555818307222387, + "grad_norm": 0.6166395545005798, + "learning_rate": 6.149465568599964e-05, + "loss": 1.6866, + "step": 4398 + }, + { + "epoch": 1.1561073340559005, + "grad_norm": 0.5606328248977661, + "learning_rate": 6.147713334501489e-05, + "loss": 1.6844, + "step": 4400 + }, + { + "epoch": 1.1561073340559005, + "eval_loss": 1.7070621252059937, + "eval_runtime": 487.2485, + "eval_samples_per_second": 249.953, + "eval_steps_per_second": 31.245, + "step": 4400 + }, + { + "epoch": 1.1566328373895622, + "grad_norm": 0.6324471235275269, + "learning_rate": 6.145961100403014e-05, + "loss": 1.6756, + "step": 4402 + }, + { + "epoch": 1.157158340723224, + "grad_norm": 0.66776043176651, + "learning_rate": 6.144208866304538e-05, + "loss": 1.7205, + "step": 4404 + }, + { + "epoch": 1.1576838440568857, + "grad_norm": 0.5929023623466492, + "learning_rate": 6.142456632206063e-05, + "loss": 1.6974, + "step": 4406 + }, + { + "epoch": 1.1582093473905475, + "grad_norm": 0.6727093458175659, + "learning_rate": 6.140704398107588e-05, + "loss": 1.6482, + "step": 4408 + }, + { + "epoch": 1.1587348507242092, + "grad_norm": 0.5299540758132935, + "learning_rate": 6.138952164009111e-05, + "loss": 1.712, + "step": 4410 + }, + { + "epoch": 1.159260354057871, + "grad_norm": 0.6730796098709106, + "learning_rate": 6.137199929910636e-05, + "loss": 1.661, + "step": 4412 + }, + { + "epoch": 1.1597858573915327, + "grad_norm": 0.6617345213890076, + "learning_rate": 6.135447695812161e-05, + "loss": 1.706, + "step": 4414 + }, + { + "epoch": 1.1603113607251947, + "grad_norm": 0.5500732660293579, + "learning_rate": 6.133695461713686e-05, + "loss": 1.7226, + "step": 4416 + }, + { + "epoch": 1.1608368640588564, + "grad_norm": 0.6082700490951538, + "learning_rate": 6.13194322761521e-05, + "loss": 1.6724, + "step": 4418 + }, + { + "epoch": 1.1613623673925182, + "grad_norm": 0.6681337356567383, + "learning_rate": 6.130190993516735e-05, + "loss": 1.6395, + "step": 4420 + }, + { + "epoch": 1.16188787072618, + "grad_norm": 0.6328011155128479, + "learning_rate": 6.128438759418258e-05, + "loss": 1.6802, + "step": 4422 + }, + { + "epoch": 1.1624133740598417, + "grad_norm": 0.6065677404403687, + "learning_rate": 6.126686525319782e-05, + "loss": 1.6797, + "step": 4424 + }, + { + "epoch": 1.1629388773935034, + "grad_norm": 0.5718259811401367, + "learning_rate": 6.124934291221307e-05, + "loss": 1.6686, + "step": 4426 + }, + { + "epoch": 1.1634643807271652, + "grad_norm": 0.5725675821304321, + "learning_rate": 6.123182057122831e-05, + "loss": 1.7017, + "step": 4428 + }, + { + "epoch": 1.163989884060827, + "grad_norm": 0.5983812808990479, + "learning_rate": 6.121429823024356e-05, + "loss": 1.6779, + "step": 4430 + }, + { + "epoch": 1.1645153873944887, + "grad_norm": 0.5576106309890747, + "learning_rate": 6.119677588925881e-05, + "loss": 1.6843, + "step": 4432 + }, + { + "epoch": 1.1650408907281506, + "grad_norm": 0.9009653329849243, + "learning_rate": 6.117925354827406e-05, + "loss": 1.6473, + "step": 4434 + }, + { + "epoch": 1.1655663940618124, + "grad_norm": 0.7119090557098389, + "learning_rate": 6.116173120728929e-05, + "loss": 1.6738, + "step": 4436 + }, + { + "epoch": 1.1660918973954741, + "grad_norm": 0.8985000848770142, + "learning_rate": 6.114420886630454e-05, + "loss": 1.6753, + "step": 4438 + }, + { + "epoch": 1.166617400729136, + "grad_norm": 0.6928833723068237, + "learning_rate": 6.112668652531979e-05, + "loss": 1.6922, + "step": 4440 + }, + { + "epoch": 1.1671429040627976, + "grad_norm": 0.5745196342468262, + "learning_rate": 6.110916418433503e-05, + "loss": 1.6845, + "step": 4442 + }, + { + "epoch": 1.1676684073964594, + "grad_norm": 0.6172153949737549, + "learning_rate": 6.109164184335028e-05, + "loss": 1.7065, + "step": 4444 + }, + { + "epoch": 1.1681939107301211, + "grad_norm": 0.606926441192627, + "learning_rate": 6.107411950236553e-05, + "loss": 1.6673, + "step": 4446 + }, + { + "epoch": 1.1687194140637829, + "grad_norm": 0.6731804609298706, + "learning_rate": 6.105659716138076e-05, + "loss": 1.7006, + "step": 4448 + }, + { + "epoch": 1.1692449173974446, + "grad_norm": 0.8230140805244446, + "learning_rate": 6.103907482039601e-05, + "loss": 1.694, + "step": 4450 + }, + { + "epoch": 1.1697704207311066, + "grad_norm": 0.809855580329895, + "learning_rate": 6.102155247941125e-05, + "loss": 1.699, + "step": 4452 + }, + { + "epoch": 1.1702959240647683, + "grad_norm": 0.6014560461044312, + "learning_rate": 6.100403013842649e-05, + "loss": 1.7296, + "step": 4454 + }, + { + "epoch": 1.17082142739843, + "grad_norm": 0.6129987835884094, + "learning_rate": 6.098650779744174e-05, + "loss": 1.6596, + "step": 4456 + }, + { + "epoch": 1.1713469307320918, + "grad_norm": 0.8771330118179321, + "learning_rate": 6.0968985456456986e-05, + "loss": 1.6907, + "step": 4458 + }, + { + "epoch": 1.1718724340657536, + "grad_norm": 0.5508044362068176, + "learning_rate": 6.095146311547223e-05, + "loss": 1.7086, + "step": 4460 + }, + { + "epoch": 1.1723979373994153, + "grad_norm": 0.7334650158882141, + "learning_rate": 6.0933940774487474e-05, + "loss": 1.6956, + "step": 4462 + }, + { + "epoch": 1.172923440733077, + "grad_norm": 0.7739654183387756, + "learning_rate": 6.091641843350272e-05, + "loss": 1.6785, + "step": 4464 + }, + { + "epoch": 1.1734489440667388, + "grad_norm": 0.6686828136444092, + "learning_rate": 6.089889609251796e-05, + "loss": 1.6571, + "step": 4466 + }, + { + "epoch": 1.1739744474004006, + "grad_norm": 0.6591079831123352, + "learning_rate": 6.088137375153321e-05, + "loss": 1.6661, + "step": 4468 + }, + { + "epoch": 1.1744999507340625, + "grad_norm": 0.520408570766449, + "learning_rate": 6.086385141054846e-05, + "loss": 1.7076, + "step": 4470 + }, + { + "epoch": 1.1750254540677243, + "grad_norm": 0.7765664458274841, + "learning_rate": 6.08463290695637e-05, + "loss": 1.6801, + "step": 4472 + }, + { + "epoch": 1.175550957401386, + "grad_norm": 0.6031415462493896, + "learning_rate": 6.0828806728578946e-05, + "loss": 1.6999, + "step": 4474 + }, + { + "epoch": 1.1760764607350478, + "grad_norm": 0.694837749004364, + "learning_rate": 6.081128438759418e-05, + "loss": 1.7186, + "step": 4476 + }, + { + "epoch": 1.1766019640687095, + "grad_norm": 0.5900973081588745, + "learning_rate": 6.079376204660943e-05, + "loss": 1.6832, + "step": 4478 + }, + { + "epoch": 1.1771274674023713, + "grad_norm": 0.5364487171173096, + "learning_rate": 6.077623970562467e-05, + "loss": 1.6756, + "step": 4480 + }, + { + "epoch": 1.177652970736033, + "grad_norm": 0.5482999086380005, + "learning_rate": 6.0758717364639916e-05, + "loss": 1.697, + "step": 4482 + }, + { + "epoch": 1.178178474069695, + "grad_norm": 0.7080986499786377, + "learning_rate": 6.0741195023655164e-05, + "loss": 1.7187, + "step": 4484 + }, + { + "epoch": 1.1787039774033565, + "grad_norm": 0.6446167230606079, + "learning_rate": 6.0723672682670405e-05, + "loss": 1.6753, + "step": 4486 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.6965610384941101, + "learning_rate": 6.070615034168565e-05, + "loss": 1.689, + "step": 4488 + }, + { + "epoch": 1.1797549840706802, + "grad_norm": 0.5724831819534302, + "learning_rate": 6.06886280007009e-05, + "loss": 1.6872, + "step": 4490 + }, + { + "epoch": 1.180280487404342, + "grad_norm": 0.7344499230384827, + "learning_rate": 6.067110565971614e-05, + "loss": 1.6421, + "step": 4492 + }, + { + "epoch": 1.1808059907380037, + "grad_norm": 0.663684070110321, + "learning_rate": 6.065358331873139e-05, + "loss": 1.6909, + "step": 4494 + }, + { + "epoch": 1.1813314940716655, + "grad_norm": 0.5467018485069275, + "learning_rate": 6.0636060977746636e-05, + "loss": 1.6707, + "step": 4496 + }, + { + "epoch": 1.1818569974053272, + "grad_norm": 0.5809999704360962, + "learning_rate": 6.0618538636761876e-05, + "loss": 1.692, + "step": 4498 + }, + { + "epoch": 1.182382500738989, + "grad_norm": 0.6746166944503784, + "learning_rate": 6.060101629577711e-05, + "loss": 1.6944, + "step": 4500 + }, + { + "epoch": 1.182908004072651, + "grad_norm": 0.5990844368934631, + "learning_rate": 6.058349395479236e-05, + "loss": 1.703, + "step": 4502 + }, + { + "epoch": 1.1834335074063127, + "grad_norm": 0.7285276651382446, + "learning_rate": 6.0565971613807606e-05, + "loss": 1.7037, + "step": 4504 + }, + { + "epoch": 1.1839590107399744, + "grad_norm": 0.5541601181030273, + "learning_rate": 6.0548449272822846e-05, + "loss": 1.712, + "step": 4506 + }, + { + "epoch": 1.1844845140736362, + "grad_norm": 0.5341848134994507, + "learning_rate": 6.0530926931838094e-05, + "loss": 1.6848, + "step": 4508 + }, + { + "epoch": 1.185010017407298, + "grad_norm": 0.5783855319023132, + "learning_rate": 6.051340459085334e-05, + "loss": 1.6941, + "step": 4510 + }, + { + "epoch": 1.1855355207409597, + "grad_norm": 0.7894533276557922, + "learning_rate": 6.049588224986858e-05, + "loss": 1.699, + "step": 4512 + }, + { + "epoch": 1.1860610240746214, + "grad_norm": 0.6631985306739807, + "learning_rate": 6.047835990888383e-05, + "loss": 1.7063, + "step": 4514 + }, + { + "epoch": 1.1865865274082832, + "grad_norm": 0.5982935428619385, + "learning_rate": 6.046083756789908e-05, + "loss": 1.6921, + "step": 4516 + }, + { + "epoch": 1.187112030741945, + "grad_norm": 0.5341594219207764, + "learning_rate": 6.044331522691432e-05, + "loss": 1.7252, + "step": 4518 + }, + { + "epoch": 1.187637534075607, + "grad_norm": 0.5512247681617737, + "learning_rate": 6.0425792885929566e-05, + "loss": 1.7033, + "step": 4520 + }, + { + "epoch": 1.1881630374092687, + "grad_norm": 0.5994440913200378, + "learning_rate": 6.040827054494481e-05, + "loss": 1.7003, + "step": 4522 + }, + { + "epoch": 1.1886885407429304, + "grad_norm": 0.6756665706634521, + "learning_rate": 6.0390748203960054e-05, + "loss": 1.6817, + "step": 4524 + }, + { + "epoch": 1.1892140440765921, + "grad_norm": 0.6200160980224609, + "learning_rate": 6.037322586297529e-05, + "loss": 1.7068, + "step": 4526 + }, + { + "epoch": 1.189739547410254, + "grad_norm": 0.5417330861091614, + "learning_rate": 6.0355703521990536e-05, + "loss": 1.7199, + "step": 4528 + }, + { + "epoch": 1.1902650507439156, + "grad_norm": 0.6741480231285095, + "learning_rate": 6.033818118100578e-05, + "loss": 1.6577, + "step": 4530 + }, + { + "epoch": 1.1907905540775774, + "grad_norm": 0.667620062828064, + "learning_rate": 6.0320658840021024e-05, + "loss": 1.6835, + "step": 4532 + }, + { + "epoch": 1.1913160574112391, + "grad_norm": 0.783929705619812, + "learning_rate": 6.030313649903627e-05, + "loss": 1.7028, + "step": 4534 + }, + { + "epoch": 1.1918415607449009, + "grad_norm": 0.6510748267173767, + "learning_rate": 6.028561415805152e-05, + "loss": 1.6668, + "step": 4536 + }, + { + "epoch": 1.1923670640785629, + "grad_norm": 0.5531290769577026, + "learning_rate": 6.026809181706676e-05, + "loss": 1.6691, + "step": 4538 + }, + { + "epoch": 1.1928925674122246, + "grad_norm": 0.5906832218170166, + "learning_rate": 6.025056947608201e-05, + "loss": 1.6689, + "step": 4540 + }, + { + "epoch": 1.1934180707458864, + "grad_norm": 0.580287516117096, + "learning_rate": 6.0233047135097255e-05, + "loss": 1.7105, + "step": 4542 + }, + { + "epoch": 1.193943574079548, + "grad_norm": 0.6183664798736572, + "learning_rate": 6.0215524794112496e-05, + "loss": 1.7059, + "step": 4544 + }, + { + "epoch": 1.1944690774132098, + "grad_norm": 0.5283176898956299, + "learning_rate": 6.019800245312774e-05, + "loss": 1.701, + "step": 4546 + }, + { + "epoch": 1.1949945807468716, + "grad_norm": 0.5322774052619934, + "learning_rate": 6.018048011214299e-05, + "loss": 1.6725, + "step": 4548 + }, + { + "epoch": 1.1955200840805333, + "grad_norm": 0.6731706857681274, + "learning_rate": 6.016295777115823e-05, + "loss": 1.6892, + "step": 4550 + }, + { + "epoch": 1.196045587414195, + "grad_norm": 0.5544896125793457, + "learning_rate": 6.0145435430173466e-05, + "loss": 1.6925, + "step": 4552 + }, + { + "epoch": 1.1965710907478568, + "grad_norm": 0.6916193962097168, + "learning_rate": 6.012791308918871e-05, + "loss": 1.7058, + "step": 4554 + }, + { + "epoch": 1.1970965940815188, + "grad_norm": 0.6361691355705261, + "learning_rate": 6.011039074820396e-05, + "loss": 1.6505, + "step": 4556 + }, + { + "epoch": 1.1976220974151806, + "grad_norm": 0.5423670411109924, + "learning_rate": 6.00928684072192e-05, + "loss": 1.6935, + "step": 4558 + }, + { + "epoch": 1.1981476007488423, + "grad_norm": 0.737524688243866, + "learning_rate": 6.007534606623445e-05, + "loss": 1.7217, + "step": 4560 + }, + { + "epoch": 1.198673104082504, + "grad_norm": 0.5811054110527039, + "learning_rate": 6.00578237252497e-05, + "loss": 1.6936, + "step": 4562 + }, + { + "epoch": 1.1991986074161658, + "grad_norm": 0.554607093334198, + "learning_rate": 6.004030138426494e-05, + "loss": 1.6963, + "step": 4564 + }, + { + "epoch": 1.1997241107498275, + "grad_norm": 0.6123262643814087, + "learning_rate": 6.0022779043280185e-05, + "loss": 1.6998, + "step": 4566 + }, + { + "epoch": 1.2002496140834893, + "grad_norm": 0.7441146969795227, + "learning_rate": 6.000525670229543e-05, + "loss": 1.6764, + "step": 4568 + }, + { + "epoch": 1.200775117417151, + "grad_norm": 0.6230175495147705, + "learning_rate": 5.9987734361310673e-05, + "loss": 1.6832, + "step": 4570 + }, + { + "epoch": 1.2013006207508128, + "grad_norm": 0.5986462831497192, + "learning_rate": 5.997021202032592e-05, + "loss": 1.6863, + "step": 4572 + }, + { + "epoch": 1.2018261240844748, + "grad_norm": 0.7910165786743164, + "learning_rate": 5.995268967934117e-05, + "loss": 1.6787, + "step": 4574 + }, + { + "epoch": 1.2023516274181365, + "grad_norm": 0.5927668213844299, + "learning_rate": 5.993516733835641e-05, + "loss": 1.6826, + "step": 4576 + }, + { + "epoch": 1.2028771307517983, + "grad_norm": 0.6229565143585205, + "learning_rate": 5.991764499737165e-05, + "loss": 1.6938, + "step": 4578 + }, + { + "epoch": 1.20340263408546, + "grad_norm": 0.5543463826179504, + "learning_rate": 5.990012265638689e-05, + "loss": 1.6557, + "step": 4580 + }, + { + "epoch": 1.2039281374191217, + "grad_norm": 0.6359471678733826, + "learning_rate": 5.988260031540214e-05, + "loss": 1.6984, + "step": 4582 + }, + { + "epoch": 1.2044536407527835, + "grad_norm": 0.6143317222595215, + "learning_rate": 5.9865077974417386e-05, + "loss": 1.6618, + "step": 4584 + }, + { + "epoch": 1.2049791440864452, + "grad_norm": 0.5732699036598206, + "learning_rate": 5.984755563343263e-05, + "loss": 1.7093, + "step": 4586 + }, + { + "epoch": 1.205504647420107, + "grad_norm": 0.6090834140777588, + "learning_rate": 5.9830033292447874e-05, + "loss": 1.6769, + "step": 4588 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.5685290098190308, + "learning_rate": 5.981251095146312e-05, + "loss": 1.689, + "step": 4590 + }, + { + "epoch": 1.2065556540874307, + "grad_norm": 0.641834557056427, + "learning_rate": 5.979498861047836e-05, + "loss": 1.6734, + "step": 4592 + }, + { + "epoch": 1.2070811574210925, + "grad_norm": 0.6050872206687927, + "learning_rate": 5.977746626949361e-05, + "loss": 1.6887, + "step": 4594 + }, + { + "epoch": 1.2076066607547542, + "grad_norm": 0.8347049951553345, + "learning_rate": 5.975994392850886e-05, + "loss": 1.7161, + "step": 4596 + }, + { + "epoch": 1.208132164088416, + "grad_norm": 0.5846490859985352, + "learning_rate": 5.97424215875241e-05, + "loss": 1.7121, + "step": 4598 + }, + { + "epoch": 1.2086576674220777, + "grad_norm": 0.8007673621177673, + "learning_rate": 5.9724899246539346e-05, + "loss": 1.6825, + "step": 4600 + }, + { + "epoch": 1.2091831707557394, + "grad_norm": 0.7300782203674316, + "learning_rate": 5.970737690555458e-05, + "loss": 1.6842, + "step": 4602 + }, + { + "epoch": 1.2097086740894012, + "grad_norm": 0.5872364044189453, + "learning_rate": 5.968985456456983e-05, + "loss": 1.6854, + "step": 4604 + }, + { + "epoch": 1.210234177423063, + "grad_norm": 0.6040078997612, + "learning_rate": 5.967233222358507e-05, + "loss": 1.7254, + "step": 4606 + }, + { + "epoch": 1.2107596807567247, + "grad_norm": 0.6352037787437439, + "learning_rate": 5.9654809882600316e-05, + "loss": 1.6559, + "step": 4608 + }, + { + "epoch": 1.2112851840903867, + "grad_norm": 0.8045578002929688, + "learning_rate": 5.9637287541615564e-05, + "loss": 1.6984, + "step": 4610 + }, + { + "epoch": 1.2118106874240484, + "grad_norm": 0.5676342248916626, + "learning_rate": 5.9619765200630805e-05, + "loss": 1.6881, + "step": 4612 + }, + { + "epoch": 1.2123361907577102, + "grad_norm": 0.5795140862464905, + "learning_rate": 5.960224285964605e-05, + "loss": 1.6856, + "step": 4614 + }, + { + "epoch": 1.212861694091372, + "grad_norm": 1.0082522630691528, + "learning_rate": 5.95847205186613e-05, + "loss": 1.7146, + "step": 4616 + }, + { + "epoch": 1.2133871974250336, + "grad_norm": 0.5609422922134399, + "learning_rate": 5.956719817767654e-05, + "loss": 1.708, + "step": 4618 + }, + { + "epoch": 1.2139127007586954, + "grad_norm": 0.6195114850997925, + "learning_rate": 5.954967583669179e-05, + "loss": 1.6505, + "step": 4620 + }, + { + "epoch": 1.2144382040923571, + "grad_norm": 0.7452388405799866, + "learning_rate": 5.9532153495707036e-05, + "loss": 1.6737, + "step": 4622 + }, + { + "epoch": 1.214963707426019, + "grad_norm": 0.624179482460022, + "learning_rate": 5.9514631154722276e-05, + "loss": 1.6642, + "step": 4624 + }, + { + "epoch": 1.2154892107596806, + "grad_norm": 0.5954496264457703, + "learning_rate": 5.9497108813737524e-05, + "loss": 1.6683, + "step": 4626 + }, + { + "epoch": 1.2160147140933426, + "grad_norm": 0.6502910852432251, + "learning_rate": 5.947958647275276e-05, + "loss": 1.6515, + "step": 4628 + }, + { + "epoch": 1.2165402174270044, + "grad_norm": 0.6815327405929565, + "learning_rate": 5.9462064131768005e-05, + "loss": 1.7016, + "step": 4630 + }, + { + "epoch": 1.217065720760666, + "grad_norm": 0.6848923563957214, + "learning_rate": 5.9444541790783246e-05, + "loss": 1.6792, + "step": 4632 + }, + { + "epoch": 1.2175912240943279, + "grad_norm": 0.5819631218910217, + "learning_rate": 5.9427019449798494e-05, + "loss": 1.6822, + "step": 4634 + }, + { + "epoch": 1.2181167274279896, + "grad_norm": 0.5571796894073486, + "learning_rate": 5.940949710881374e-05, + "loss": 1.6894, + "step": 4636 + }, + { + "epoch": 1.2186422307616513, + "grad_norm": 0.7528451681137085, + "learning_rate": 5.939197476782898e-05, + "loss": 1.6863, + "step": 4638 + }, + { + "epoch": 1.219167734095313, + "grad_norm": 0.5486443042755127, + "learning_rate": 5.937445242684423e-05, + "loss": 1.7017, + "step": 4640 + }, + { + "epoch": 1.219693237428975, + "grad_norm": 0.7798599004745483, + "learning_rate": 5.935693008585948e-05, + "loss": 1.7108, + "step": 4642 + }, + { + "epoch": 1.2202187407626366, + "grad_norm": 0.6270305514335632, + "learning_rate": 5.933940774487472e-05, + "loss": 1.6979, + "step": 4644 + }, + { + "epoch": 1.2207442440962986, + "grad_norm": 0.5500916838645935, + "learning_rate": 5.9321885403889966e-05, + "loss": 1.6832, + "step": 4646 + }, + { + "epoch": 1.2212697474299603, + "grad_norm": 0.6205405592918396, + "learning_rate": 5.930436306290521e-05, + "loss": 1.7544, + "step": 4648 + }, + { + "epoch": 1.221795250763622, + "grad_norm": 0.6169667840003967, + "learning_rate": 5.9286840721920454e-05, + "loss": 1.7024, + "step": 4650 + }, + { + "epoch": 1.2223207540972838, + "grad_norm": 0.7890437841415405, + "learning_rate": 5.92693183809357e-05, + "loss": 1.6707, + "step": 4652 + }, + { + "epoch": 1.2228462574309455, + "grad_norm": 0.6405115723609924, + "learning_rate": 5.9251796039950936e-05, + "loss": 1.6503, + "step": 4654 + }, + { + "epoch": 1.2233717607646073, + "grad_norm": 0.6368988156318665, + "learning_rate": 5.923427369896618e-05, + "loss": 1.6963, + "step": 4656 + }, + { + "epoch": 1.223897264098269, + "grad_norm": 0.7032740712165833, + "learning_rate": 5.9216751357981424e-05, + "loss": 1.6675, + "step": 4658 + }, + { + "epoch": 1.224422767431931, + "grad_norm": 0.6669613718986511, + "learning_rate": 5.919922901699667e-05, + "loss": 1.7002, + "step": 4660 + }, + { + "epoch": 1.2249482707655928, + "grad_norm": 0.6690141558647156, + "learning_rate": 5.918170667601192e-05, + "loss": 1.6499, + "step": 4662 + }, + { + "epoch": 1.2254737740992545, + "grad_norm": 0.5619009733200073, + "learning_rate": 5.916418433502716e-05, + "loss": 1.7039, + "step": 4664 + }, + { + "epoch": 1.2259992774329163, + "grad_norm": 0.6794602274894714, + "learning_rate": 5.914666199404241e-05, + "loss": 1.6732, + "step": 4666 + }, + { + "epoch": 1.226524780766578, + "grad_norm": 0.8924415707588196, + "learning_rate": 5.9129139653057655e-05, + "loss": 1.6833, + "step": 4668 + }, + { + "epoch": 1.2270502841002398, + "grad_norm": 0.6746704578399658, + "learning_rate": 5.9111617312072896e-05, + "loss": 1.6816, + "step": 4670 + }, + { + "epoch": 1.2275757874339015, + "grad_norm": 0.5335586071014404, + "learning_rate": 5.909409497108814e-05, + "loss": 1.7214, + "step": 4672 + }, + { + "epoch": 1.2281012907675632, + "grad_norm": 0.6099651455879211, + "learning_rate": 5.907657263010339e-05, + "loss": 1.6652, + "step": 4674 + }, + { + "epoch": 1.228626794101225, + "grad_norm": 0.6068762540817261, + "learning_rate": 5.905905028911863e-05, + "loss": 1.6563, + "step": 4676 + }, + { + "epoch": 1.229152297434887, + "grad_norm": 0.561000645160675, + "learning_rate": 5.904152794813388e-05, + "loss": 1.6927, + "step": 4678 + }, + { + "epoch": 1.2296778007685487, + "grad_norm": 0.5757449269294739, + "learning_rate": 5.902400560714911e-05, + "loss": 1.7042, + "step": 4680 + }, + { + "epoch": 1.2302033041022105, + "grad_norm": 0.5956955552101135, + "learning_rate": 5.900648326616436e-05, + "loss": 1.6785, + "step": 4682 + }, + { + "epoch": 1.2307288074358722, + "grad_norm": 0.5795134902000427, + "learning_rate": 5.89889609251796e-05, + "loss": 1.6774, + "step": 4684 + }, + { + "epoch": 1.231254310769534, + "grad_norm": 0.8627724051475525, + "learning_rate": 5.897143858419485e-05, + "loss": 1.7131, + "step": 4686 + }, + { + "epoch": 1.2317798141031957, + "grad_norm": 0.5390210747718811, + "learning_rate": 5.89539162432101e-05, + "loss": 1.7153, + "step": 4688 + }, + { + "epoch": 1.2323053174368575, + "grad_norm": 0.8949792981147766, + "learning_rate": 5.893639390222534e-05, + "loss": 1.689, + "step": 4690 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.6706136465072632, + "learning_rate": 5.8918871561240585e-05, + "loss": 1.71, + "step": 4692 + }, + { + "epoch": 1.233356324104181, + "grad_norm": 0.5577556490898132, + "learning_rate": 5.890134922025583e-05, + "loss": 1.6462, + "step": 4694 + }, + { + "epoch": 1.233881827437843, + "grad_norm": 0.6497912406921387, + "learning_rate": 5.8883826879271073e-05, + "loss": 1.6806, + "step": 4696 + }, + { + "epoch": 1.2344073307715047, + "grad_norm": 0.5242089629173279, + "learning_rate": 5.886630453828632e-05, + "loss": 1.7029, + "step": 4698 + }, + { + "epoch": 1.2349328341051664, + "grad_norm": 0.9607106447219849, + "learning_rate": 5.884878219730157e-05, + "loss": 1.6864, + "step": 4700 + }, + { + "epoch": 1.2354583374388282, + "grad_norm": 0.9370645880699158, + "learning_rate": 5.883125985631681e-05, + "loss": 1.6973, + "step": 4702 + }, + { + "epoch": 1.23598384077249, + "grad_norm": 0.7093036770820618, + "learning_rate": 5.881373751533204e-05, + "loss": 1.7237, + "step": 4704 + }, + { + "epoch": 1.2365093441061517, + "grad_norm": 0.6128284931182861, + "learning_rate": 5.879621517434729e-05, + "loss": 1.6673, + "step": 4706 + }, + { + "epoch": 1.2370348474398134, + "grad_norm": 1.1151705980300903, + "learning_rate": 5.877869283336254e-05, + "loss": 1.6668, + "step": 4708 + }, + { + "epoch": 1.2375603507734751, + "grad_norm": 0.7388045191764832, + "learning_rate": 5.876117049237778e-05, + "loss": 1.6923, + "step": 4710 + }, + { + "epoch": 1.238085854107137, + "grad_norm": 0.539486289024353, + "learning_rate": 5.874364815139303e-05, + "loss": 1.6779, + "step": 4712 + }, + { + "epoch": 1.2386113574407989, + "grad_norm": 0.7663931250572205, + "learning_rate": 5.8726125810408274e-05, + "loss": 1.686, + "step": 4714 + }, + { + "epoch": 1.2391368607744606, + "grad_norm": 0.6578073501586914, + "learning_rate": 5.8708603469423515e-05, + "loss": 1.7054, + "step": 4716 + }, + { + "epoch": 1.2396623641081224, + "grad_norm": 0.5744708776473999, + "learning_rate": 5.869108112843876e-05, + "loss": 1.6785, + "step": 4718 + }, + { + "epoch": 1.240187867441784, + "grad_norm": 0.7532958388328552, + "learning_rate": 5.867355878745401e-05, + "loss": 1.6743, + "step": 4720 + }, + { + "epoch": 1.2407133707754459, + "grad_norm": 0.5337387323379517, + "learning_rate": 5.865603644646925e-05, + "loss": 1.7303, + "step": 4722 + }, + { + "epoch": 1.2412388741091076, + "grad_norm": 0.5469831228256226, + "learning_rate": 5.86385141054845e-05, + "loss": 1.6566, + "step": 4724 + }, + { + "epoch": 1.2417643774427694, + "grad_norm": 0.5634649395942688, + "learning_rate": 5.8620991764499746e-05, + "loss": 1.6868, + "step": 4726 + }, + { + "epoch": 1.242289880776431, + "grad_norm": 0.7761363983154297, + "learning_rate": 5.860346942351499e-05, + "loss": 1.6792, + "step": 4728 + }, + { + "epoch": 1.2428153841100928, + "grad_norm": 0.6965597867965698, + "learning_rate": 5.858594708253022e-05, + "loss": 1.715, + "step": 4730 + }, + { + "epoch": 1.2433408874437548, + "grad_norm": 0.6957322359085083, + "learning_rate": 5.856842474154547e-05, + "loss": 1.6739, + "step": 4732 + }, + { + "epoch": 1.2438663907774166, + "grad_norm": 0.5814897418022156, + "learning_rate": 5.8550902400560716e-05, + "loss": 1.6988, + "step": 4734 + }, + { + "epoch": 1.2443918941110783, + "grad_norm": 0.9491192698478699, + "learning_rate": 5.853338005957596e-05, + "loss": 1.6994, + "step": 4736 + }, + { + "epoch": 1.24491739744474, + "grad_norm": 0.7140454649925232, + "learning_rate": 5.8515857718591205e-05, + "loss": 1.6665, + "step": 4738 + }, + { + "epoch": 1.2454429007784018, + "grad_norm": 0.5758277773857117, + "learning_rate": 5.849833537760645e-05, + "loss": 1.6842, + "step": 4740 + }, + { + "epoch": 1.2459684041120636, + "grad_norm": 0.5660161972045898, + "learning_rate": 5.848081303662169e-05, + "loss": 1.6844, + "step": 4742 + }, + { + "epoch": 1.2464939074457253, + "grad_norm": 0.7871302962303162, + "learning_rate": 5.846329069563694e-05, + "loss": 1.6983, + "step": 4744 + }, + { + "epoch": 1.247019410779387, + "grad_norm": 0.5468810200691223, + "learning_rate": 5.844576835465219e-05, + "loss": 1.6561, + "step": 4746 + }, + { + "epoch": 1.2475449141130488, + "grad_norm": 0.545019268989563, + "learning_rate": 5.842824601366743e-05, + "loss": 1.6704, + "step": 4748 + }, + { + "epoch": 1.2480704174467108, + "grad_norm": 0.5376728177070618, + "learning_rate": 5.8410723672682676e-05, + "loss": 1.6373, + "step": 4750 + }, + { + "epoch": 1.2485959207803725, + "grad_norm": 0.6872471570968628, + "learning_rate": 5.8393201331697924e-05, + "loss": 1.6629, + "step": 4752 + }, + { + "epoch": 1.2491214241140343, + "grad_norm": 0.6468409299850464, + "learning_rate": 5.8375678990713165e-05, + "loss": 1.6785, + "step": 4754 + }, + { + "epoch": 1.249646927447696, + "grad_norm": 0.7774249315261841, + "learning_rate": 5.83581566497284e-05, + "loss": 1.642, + "step": 4756 + }, + { + "epoch": 1.2501724307813578, + "grad_norm": 0.6062743067741394, + "learning_rate": 5.8340634308743646e-05, + "loss": 1.7039, + "step": 4758 + }, + { + "epoch": 1.2506979341150195, + "grad_norm": 0.6170024871826172, + "learning_rate": 5.8323111967758894e-05, + "loss": 1.7296, + "step": 4760 + }, + { + "epoch": 1.2512234374486813, + "grad_norm": 0.5989388823509216, + "learning_rate": 5.8305589626774135e-05, + "loss": 1.6776, + "step": 4762 + }, + { + "epoch": 1.2517489407823432, + "grad_norm": 0.6566706895828247, + "learning_rate": 5.828806728578938e-05, + "loss": 1.7016, + "step": 4764 + }, + { + "epoch": 1.2522744441160047, + "grad_norm": 0.6715628504753113, + "learning_rate": 5.827054494480463e-05, + "loss": 1.6946, + "step": 4766 + }, + { + "epoch": 1.2527999474496667, + "grad_norm": 0.7730180621147156, + "learning_rate": 5.825302260381987e-05, + "loss": 1.7043, + "step": 4768 + }, + { + "epoch": 1.2533254507833285, + "grad_norm": 0.5837029814720154, + "learning_rate": 5.823550026283512e-05, + "loss": 1.7019, + "step": 4770 + }, + { + "epoch": 1.2538509541169902, + "grad_norm": 0.6363146901130676, + "learning_rate": 5.8217977921850366e-05, + "loss": 1.6965, + "step": 4772 + }, + { + "epoch": 1.254376457450652, + "grad_norm": 0.6856462359428406, + "learning_rate": 5.8200455580865606e-05, + "loss": 1.6899, + "step": 4774 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 0.7007546424865723, + "learning_rate": 5.8182933239880854e-05, + "loss": 1.6673, + "step": 4776 + }, + { + "epoch": 1.2554274641179755, + "grad_norm": 0.6396967172622681, + "learning_rate": 5.81654108988961e-05, + "loss": 1.6826, + "step": 4778 + }, + { + "epoch": 1.2559529674516372, + "grad_norm": 0.6191834211349487, + "learning_rate": 5.814788855791134e-05, + "loss": 1.6848, + "step": 4780 + }, + { + "epoch": 1.2564784707852992, + "grad_norm": 0.7049083709716797, + "learning_rate": 5.8130366216926576e-05, + "loss": 1.6785, + "step": 4782 + }, + { + "epoch": 1.2570039741189607, + "grad_norm": 0.5608694553375244, + "learning_rate": 5.8112843875941824e-05, + "loss": 1.7138, + "step": 4784 + }, + { + "epoch": 1.2575294774526227, + "grad_norm": 0.5731740593910217, + "learning_rate": 5.809532153495707e-05, + "loss": 1.6912, + "step": 4786 + }, + { + "epoch": 1.2580549807862844, + "grad_norm": 0.5545026063919067, + "learning_rate": 5.807779919397231e-05, + "loss": 1.6621, + "step": 4788 + }, + { + "epoch": 1.2585804841199462, + "grad_norm": 0.5389024019241333, + "learning_rate": 5.806027685298756e-05, + "loss": 1.6956, + "step": 4790 + }, + { + "epoch": 1.259105987453608, + "grad_norm": 0.5764096975326538, + "learning_rate": 5.804275451200281e-05, + "loss": 1.6556, + "step": 4792 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.5584897398948669, + "learning_rate": 5.802523217101805e-05, + "loss": 1.6837, + "step": 4794 + }, + { + "epoch": 1.2601569941209314, + "grad_norm": 0.6685154438018799, + "learning_rate": 5.8007709830033296e-05, + "loss": 1.6343, + "step": 4796 + }, + { + "epoch": 1.2606824974545932, + "grad_norm": 0.6008186936378479, + "learning_rate": 5.799018748904854e-05, + "loss": 1.6674, + "step": 4798 + }, + { + "epoch": 1.2612080007882551, + "grad_norm": 0.6519007086753845, + "learning_rate": 5.7972665148063784e-05, + "loss": 1.6898, + "step": 4800 + }, + { + "epoch": 1.2612080007882551, + "eval_loss": 1.700656533241272, + "eval_runtime": 487.1424, + "eval_samples_per_second": 250.007, + "eval_steps_per_second": 31.252, + "step": 4800 + }, + { + "epoch": 1.2617335041219166, + "grad_norm": 0.5863731503486633, + "learning_rate": 5.795514280707903e-05, + "loss": 1.6239, + "step": 4802 + }, + { + "epoch": 1.2622590074555786, + "grad_norm": 0.6118308901786804, + "learning_rate": 5.793762046609428e-05, + "loss": 1.6997, + "step": 4804 + }, + { + "epoch": 1.2627845107892404, + "grad_norm": 0.7632036209106445, + "learning_rate": 5.792009812510951e-05, + "loss": 1.6626, + "step": 4806 + }, + { + "epoch": 1.2633100141229021, + "grad_norm": 0.593360960483551, + "learning_rate": 5.7902575784124754e-05, + "loss": 1.6919, + "step": 4808 + }, + { + "epoch": 1.2638355174565639, + "grad_norm": 0.8928617238998413, + "learning_rate": 5.788505344314e-05, + "loss": 1.7171, + "step": 4810 + }, + { + "epoch": 1.2643610207902256, + "grad_norm": 0.6598185896873474, + "learning_rate": 5.786753110215525e-05, + "loss": 1.6581, + "step": 4812 + }, + { + "epoch": 1.2648865241238874, + "grad_norm": 0.6565474271774292, + "learning_rate": 5.785000876117049e-05, + "loss": 1.6763, + "step": 4814 + }, + { + "epoch": 1.265412027457549, + "grad_norm": 0.796642541885376, + "learning_rate": 5.783248642018574e-05, + "loss": 1.6925, + "step": 4816 + }, + { + "epoch": 1.265937530791211, + "grad_norm": 0.517112135887146, + "learning_rate": 5.7814964079200985e-05, + "loss": 1.7095, + "step": 4818 + }, + { + "epoch": 1.2664630341248726, + "grad_norm": 0.6834176182746887, + "learning_rate": 5.7797441738216226e-05, + "loss": 1.6919, + "step": 4820 + }, + { + "epoch": 1.2669885374585346, + "grad_norm": 0.5728265643119812, + "learning_rate": 5.7779919397231473e-05, + "loss": 1.6796, + "step": 4822 + }, + { + "epoch": 1.2675140407921963, + "grad_norm": 0.8350217342376709, + "learning_rate": 5.776239705624672e-05, + "loss": 1.6946, + "step": 4824 + }, + { + "epoch": 1.268039544125858, + "grad_norm": 0.7911032438278198, + "learning_rate": 5.774487471526196e-05, + "loss": 1.705, + "step": 4826 + }, + { + "epoch": 1.2685650474595198, + "grad_norm": 0.6383318305015564, + "learning_rate": 5.772735237427721e-05, + "loss": 1.6945, + "step": 4828 + }, + { + "epoch": 1.2690905507931816, + "grad_norm": 0.7389723062515259, + "learning_rate": 5.770983003329246e-05, + "loss": 1.6889, + "step": 4830 + }, + { + "epoch": 1.2696160541268433, + "grad_norm": 0.5114458203315735, + "learning_rate": 5.769230769230769e-05, + "loss": 1.69, + "step": 4832 + }, + { + "epoch": 1.270141557460505, + "grad_norm": 0.7528431415557861, + "learning_rate": 5.767478535132294e-05, + "loss": 1.6829, + "step": 4834 + }, + { + "epoch": 1.270667060794167, + "grad_norm": 0.6436894536018372, + "learning_rate": 5.765726301033818e-05, + "loss": 1.6968, + "step": 4836 + }, + { + "epoch": 1.2711925641278286, + "grad_norm": 0.601234495639801, + "learning_rate": 5.763974066935343e-05, + "loss": 1.6826, + "step": 4838 + }, + { + "epoch": 1.2717180674614905, + "grad_norm": 0.6918678283691406, + "learning_rate": 5.7622218328368674e-05, + "loss": 1.698, + "step": 4840 + }, + { + "epoch": 1.2722435707951523, + "grad_norm": 0.5951560139656067, + "learning_rate": 5.7604695987383915e-05, + "loss": 1.7462, + "step": 4842 + }, + { + "epoch": 1.272769074128814, + "grad_norm": 0.8118217587471008, + "learning_rate": 5.758717364639916e-05, + "loss": 1.6818, + "step": 4844 + }, + { + "epoch": 1.2732945774624758, + "grad_norm": 0.5995089411735535, + "learning_rate": 5.756965130541441e-05, + "loss": 1.7061, + "step": 4846 + }, + { + "epoch": 1.2738200807961375, + "grad_norm": 0.5712124705314636, + "learning_rate": 5.755212896442965e-05, + "loss": 1.6635, + "step": 4848 + }, + { + "epoch": 1.2743455841297993, + "grad_norm": 0.975643515586853, + "learning_rate": 5.75346066234449e-05, + "loss": 1.6871, + "step": 4850 + }, + { + "epoch": 1.274871087463461, + "grad_norm": 0.6375470161437988, + "learning_rate": 5.7517084282460146e-05, + "loss": 1.6946, + "step": 4852 + }, + { + "epoch": 1.275396590797123, + "grad_norm": 0.6874341368675232, + "learning_rate": 5.749956194147539e-05, + "loss": 1.6987, + "step": 4854 + }, + { + "epoch": 1.2759220941307847, + "grad_norm": 0.5625795125961304, + "learning_rate": 5.7482039600490635e-05, + "loss": 1.6946, + "step": 4856 + }, + { + "epoch": 1.2764475974644465, + "grad_norm": 0.7773122191429138, + "learning_rate": 5.746451725950587e-05, + "loss": 1.6922, + "step": 4858 + }, + { + "epoch": 1.2769731007981082, + "grad_norm": 0.6410396099090576, + "learning_rate": 5.7446994918521116e-05, + "loss": 1.676, + "step": 4860 + }, + { + "epoch": 1.27749860413177, + "grad_norm": 0.5476670861244202, + "learning_rate": 5.742947257753636e-05, + "loss": 1.7063, + "step": 4862 + }, + { + "epoch": 1.2780241074654317, + "grad_norm": 0.5646496415138245, + "learning_rate": 5.7411950236551604e-05, + "loss": 1.6657, + "step": 4864 + }, + { + "epoch": 1.2785496107990935, + "grad_norm": 0.7577764987945557, + "learning_rate": 5.739442789556685e-05, + "loss": 1.6869, + "step": 4866 + }, + { + "epoch": 1.2790751141327552, + "grad_norm": 0.5627464652061462, + "learning_rate": 5.737690555458209e-05, + "loss": 1.6608, + "step": 4868 + }, + { + "epoch": 1.279600617466417, + "grad_norm": 0.5386861562728882, + "learning_rate": 5.735938321359734e-05, + "loss": 1.6826, + "step": 4870 + }, + { + "epoch": 1.280126120800079, + "grad_norm": 0.8496121168136597, + "learning_rate": 5.734186087261259e-05, + "loss": 1.6771, + "step": 4872 + }, + { + "epoch": 1.2806516241337407, + "grad_norm": 0.6117552518844604, + "learning_rate": 5.732433853162783e-05, + "loss": 1.6957, + "step": 4874 + }, + { + "epoch": 1.2811771274674024, + "grad_norm": 0.7759193181991577, + "learning_rate": 5.7306816190643076e-05, + "loss": 1.6396, + "step": 4876 + }, + { + "epoch": 1.2817026308010642, + "grad_norm": 0.6560829281806946, + "learning_rate": 5.7289293849658324e-05, + "loss": 1.6983, + "step": 4878 + }, + { + "epoch": 1.282228134134726, + "grad_norm": 0.6807093024253845, + "learning_rate": 5.7271771508673565e-05, + "loss": 1.6545, + "step": 4880 + }, + { + "epoch": 1.2827536374683877, + "grad_norm": 0.6546826958656311, + "learning_rate": 5.72542491676888e-05, + "loss": 1.7138, + "step": 4882 + }, + { + "epoch": 1.2832791408020494, + "grad_norm": 0.5959972739219666, + "learning_rate": 5.7236726826704046e-05, + "loss": 1.6567, + "step": 4884 + }, + { + "epoch": 1.2838046441357112, + "grad_norm": 0.6360346078872681, + "learning_rate": 5.7219204485719294e-05, + "loss": 1.6764, + "step": 4886 + }, + { + "epoch": 1.284330147469373, + "grad_norm": 0.7171315550804138, + "learning_rate": 5.7201682144734535e-05, + "loss": 1.6489, + "step": 4888 + }, + { + "epoch": 1.2848556508030349, + "grad_norm": 0.6501976251602173, + "learning_rate": 5.718415980374978e-05, + "loss": 1.7136, + "step": 4890 + }, + { + "epoch": 1.2853811541366966, + "grad_norm": 0.7134879231452942, + "learning_rate": 5.716663746276503e-05, + "loss": 1.6789, + "step": 4892 + }, + { + "epoch": 1.2859066574703584, + "grad_norm": 0.6386318206787109, + "learning_rate": 5.714911512178027e-05, + "loss": 1.6877, + "step": 4894 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.5816686749458313, + "learning_rate": 5.713159278079552e-05, + "loss": 1.7148, + "step": 4896 + }, + { + "epoch": 1.2869576641376819, + "grad_norm": 0.49736329913139343, + "learning_rate": 5.7114070439810766e-05, + "loss": 1.6591, + "step": 4898 + }, + { + "epoch": 1.2874831674713436, + "grad_norm": 0.5454695820808411, + "learning_rate": 5.7096548098826006e-05, + "loss": 1.7062, + "step": 4900 + }, + { + "epoch": 1.2880086708050054, + "grad_norm": 0.5578910708427429, + "learning_rate": 5.7079025757841254e-05, + "loss": 1.6773, + "step": 4902 + }, + { + "epoch": 1.288534174138667, + "grad_norm": 0.7455135583877563, + "learning_rate": 5.70615034168565e-05, + "loss": 1.6605, + "step": 4904 + }, + { + "epoch": 1.2890596774723289, + "grad_norm": 0.6031019687652588, + "learning_rate": 5.704398107587174e-05, + "loss": 1.706, + "step": 4906 + }, + { + "epoch": 1.2895851808059908, + "grad_norm": 0.6043726801872253, + "learning_rate": 5.7026458734886976e-05, + "loss": 1.6919, + "step": 4908 + }, + { + "epoch": 1.2901106841396526, + "grad_norm": 0.6083821654319763, + "learning_rate": 5.7008936393902224e-05, + "loss": 1.6752, + "step": 4910 + }, + { + "epoch": 1.2906361874733143, + "grad_norm": 0.5966817736625671, + "learning_rate": 5.699141405291747e-05, + "loss": 1.6838, + "step": 4912 + }, + { + "epoch": 1.291161690806976, + "grad_norm": 0.5941465497016907, + "learning_rate": 5.697389171193271e-05, + "loss": 1.6899, + "step": 4914 + }, + { + "epoch": 1.2916871941406378, + "grad_norm": 0.6362673044204712, + "learning_rate": 5.695636937094796e-05, + "loss": 1.684, + "step": 4916 + }, + { + "epoch": 1.2922126974742996, + "grad_norm": 0.6817765831947327, + "learning_rate": 5.693884702996321e-05, + "loss": 1.668, + "step": 4918 + }, + { + "epoch": 1.2927382008079613, + "grad_norm": 0.6063424944877625, + "learning_rate": 5.692132468897845e-05, + "loss": 1.6628, + "step": 4920 + }, + { + "epoch": 1.2932637041416233, + "grad_norm": 0.595859169960022, + "learning_rate": 5.6903802347993696e-05, + "loss": 1.6989, + "step": 4922 + }, + { + "epoch": 1.2937892074752848, + "grad_norm": 0.5616616606712341, + "learning_rate": 5.688628000700894e-05, + "loss": 1.7036, + "step": 4924 + }, + { + "epoch": 1.2943147108089468, + "grad_norm": 0.5995229482650757, + "learning_rate": 5.6868757666024184e-05, + "loss": 1.6894, + "step": 4926 + }, + { + "epoch": 1.2948402141426085, + "grad_norm": 0.5990728735923767, + "learning_rate": 5.685123532503943e-05, + "loss": 1.6612, + "step": 4928 + }, + { + "epoch": 1.2953657174762703, + "grad_norm": 0.5425363183021545, + "learning_rate": 5.683371298405468e-05, + "loss": 1.6523, + "step": 4930 + }, + { + "epoch": 1.295891220809932, + "grad_norm": 0.5370776653289795, + "learning_rate": 5.681619064306992e-05, + "loss": 1.6977, + "step": 4932 + }, + { + "epoch": 1.2964167241435938, + "grad_norm": 0.6286599636077881, + "learning_rate": 5.6798668302085154e-05, + "loss": 1.7288, + "step": 4934 + }, + { + "epoch": 1.2969422274772555, + "grad_norm": 0.575813889503479, + "learning_rate": 5.67811459611004e-05, + "loss": 1.6914, + "step": 4936 + }, + { + "epoch": 1.2974677308109173, + "grad_norm": 0.5949034690856934, + "learning_rate": 5.676362362011565e-05, + "loss": 1.6733, + "step": 4938 + }, + { + "epoch": 1.2979932341445792, + "grad_norm": 0.6935321688652039, + "learning_rate": 5.674610127913089e-05, + "loss": 1.6886, + "step": 4940 + }, + { + "epoch": 1.2985187374782408, + "grad_norm": 0.6900460124015808, + "learning_rate": 5.672857893814614e-05, + "loss": 1.6985, + "step": 4942 + }, + { + "epoch": 1.2990442408119027, + "grad_norm": 0.5892564654350281, + "learning_rate": 5.6711056597161385e-05, + "loss": 1.6837, + "step": 4944 + }, + { + "epoch": 1.2995697441455645, + "grad_norm": 0.6516983509063721, + "learning_rate": 5.6693534256176626e-05, + "loss": 1.6797, + "step": 4946 + }, + { + "epoch": 1.3000952474792262, + "grad_norm": 0.8544719219207764, + "learning_rate": 5.6676011915191873e-05, + "loss": 1.6911, + "step": 4948 + }, + { + "epoch": 1.300620750812888, + "grad_norm": 0.7540829181671143, + "learning_rate": 5.665848957420712e-05, + "loss": 1.7333, + "step": 4950 + }, + { + "epoch": 1.3011462541465497, + "grad_norm": 0.8198524117469788, + "learning_rate": 5.664096723322236e-05, + "loss": 1.7286, + "step": 4952 + }, + { + "epoch": 1.3016717574802115, + "grad_norm": 0.6572164297103882, + "learning_rate": 5.662344489223761e-05, + "loss": 1.7171, + "step": 4954 + }, + { + "epoch": 1.3021972608138732, + "grad_norm": 0.6047423481941223, + "learning_rate": 5.660592255125286e-05, + "loss": 1.6762, + "step": 4956 + }, + { + "epoch": 1.3027227641475352, + "grad_norm": 0.6690229177474976, + "learning_rate": 5.65884002102681e-05, + "loss": 1.6802, + "step": 4958 + }, + { + "epoch": 1.3032482674811967, + "grad_norm": 0.57923823595047, + "learning_rate": 5.657087786928333e-05, + "loss": 1.6864, + "step": 4960 + }, + { + "epoch": 1.3037737708148587, + "grad_norm": 0.6738536357879639, + "learning_rate": 5.655335552829858e-05, + "loss": 1.6883, + "step": 4962 + }, + { + "epoch": 1.3042992741485204, + "grad_norm": 0.5950745344161987, + "learning_rate": 5.653583318731383e-05, + "loss": 1.6688, + "step": 4964 + }, + { + "epoch": 1.3048247774821822, + "grad_norm": 0.5781100988388062, + "learning_rate": 5.651831084632907e-05, + "loss": 1.699, + "step": 4966 + }, + { + "epoch": 1.305350280815844, + "grad_norm": 0.535622775554657, + "learning_rate": 5.6500788505344315e-05, + "loss": 1.6881, + "step": 4968 + }, + { + "epoch": 1.3058757841495057, + "grad_norm": 0.6280853152275085, + "learning_rate": 5.648326616435956e-05, + "loss": 1.6802, + "step": 4970 + }, + { + "epoch": 1.3064012874831674, + "grad_norm": 0.538373589515686, + "learning_rate": 5.6465743823374803e-05, + "loss": 1.6446, + "step": 4972 + }, + { + "epoch": 1.3069267908168292, + "grad_norm": 0.5861368775367737, + "learning_rate": 5.644822148239005e-05, + "loss": 1.6667, + "step": 4974 + }, + { + "epoch": 1.3074522941504911, + "grad_norm": 0.6625964045524597, + "learning_rate": 5.64306991414053e-05, + "loss": 1.6879, + "step": 4976 + }, + { + "epoch": 1.3079777974841527, + "grad_norm": 0.6457346081733704, + "learning_rate": 5.641317680042054e-05, + "loss": 1.688, + "step": 4978 + }, + { + "epoch": 1.3085033008178146, + "grad_norm": 0.6461663246154785, + "learning_rate": 5.639565445943579e-05, + "loss": 1.6974, + "step": 4980 + }, + { + "epoch": 1.3090288041514764, + "grad_norm": 0.6430699825286865, + "learning_rate": 5.6378132118451035e-05, + "loss": 1.7059, + "step": 4982 + }, + { + "epoch": 1.3095543074851381, + "grad_norm": 0.6152409315109253, + "learning_rate": 5.636060977746627e-05, + "loss": 1.649, + "step": 4984 + }, + { + "epoch": 1.3100798108187999, + "grad_norm": 0.5380004644393921, + "learning_rate": 5.634308743648151e-05, + "loss": 1.6893, + "step": 4986 + }, + { + "epoch": 1.3106053141524616, + "grad_norm": 0.6044327020645142, + "learning_rate": 5.632556509549676e-05, + "loss": 1.714, + "step": 4988 + }, + { + "epoch": 1.3111308174861234, + "grad_norm": 0.5950967073440552, + "learning_rate": 5.6308042754512004e-05, + "loss": 1.695, + "step": 4990 + }, + { + "epoch": 1.3116563208197851, + "grad_norm": 0.5887439846992493, + "learning_rate": 5.6290520413527245e-05, + "loss": 1.6914, + "step": 4992 + }, + { + "epoch": 1.312181824153447, + "grad_norm": 0.6020603775978088, + "learning_rate": 5.627299807254249e-05, + "loss": 1.6847, + "step": 4994 + }, + { + "epoch": 1.3127073274871086, + "grad_norm": 0.6895347237586975, + "learning_rate": 5.625547573155774e-05, + "loss": 1.6888, + "step": 4996 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.7459006309509277, + "learning_rate": 5.623795339057298e-05, + "loss": 1.6828, + "step": 4998 + }, + { + "epoch": 1.3137583341544323, + "grad_norm": 0.528291642665863, + "learning_rate": 5.622043104958823e-05, + "loss": 1.6612, + "step": 5000 + }, + { + "epoch": 1.314283837488094, + "grad_norm": 0.5710020065307617, + "learning_rate": 5.6202908708603476e-05, + "loss": 1.7157, + "step": 5002 + }, + { + "epoch": 1.3148093408217558, + "grad_norm": 0.7354927062988281, + "learning_rate": 5.618538636761872e-05, + "loss": 1.6948, + "step": 5004 + }, + { + "epoch": 1.3153348441554176, + "grad_norm": 0.5487669110298157, + "learning_rate": 5.6167864026633965e-05, + "loss": 1.6546, + "step": 5006 + }, + { + "epoch": 1.3158603474890793, + "grad_norm": 0.5558237433433533, + "learning_rate": 5.615034168564921e-05, + "loss": 1.6729, + "step": 5008 + }, + { + "epoch": 1.316385850822741, + "grad_norm": 0.6180663704872131, + "learning_rate": 5.6132819344664446e-05, + "loss": 1.672, + "step": 5010 + }, + { + "epoch": 1.316911354156403, + "grad_norm": 0.5914913415908813, + "learning_rate": 5.611529700367969e-05, + "loss": 1.6727, + "step": 5012 + }, + { + "epoch": 1.3174368574900648, + "grad_norm": 0.6253445148468018, + "learning_rate": 5.6097774662694935e-05, + "loss": 1.6805, + "step": 5014 + }, + { + "epoch": 1.3179623608237265, + "grad_norm": 0.755477249622345, + "learning_rate": 5.608025232171018e-05, + "loss": 1.6646, + "step": 5016 + }, + { + "epoch": 1.3184878641573883, + "grad_norm": 0.5822807550430298, + "learning_rate": 5.606272998072542e-05, + "loss": 1.6779, + "step": 5018 + }, + { + "epoch": 1.31901336749105, + "grad_norm": 0.5383006930351257, + "learning_rate": 5.604520763974067e-05, + "loss": 1.6942, + "step": 5020 + }, + { + "epoch": 1.3195388708247118, + "grad_norm": 0.5826961398124695, + "learning_rate": 5.602768529875592e-05, + "loss": 1.6874, + "step": 5022 + }, + { + "epoch": 1.3200643741583735, + "grad_norm": 0.5648834705352783, + "learning_rate": 5.601016295777116e-05, + "loss": 1.6948, + "step": 5024 + }, + { + "epoch": 1.3205898774920353, + "grad_norm": 0.6096558570861816, + "learning_rate": 5.5992640616786406e-05, + "loss": 1.6924, + "step": 5026 + }, + { + "epoch": 1.321115380825697, + "grad_norm": 0.5826833844184875, + "learning_rate": 5.5975118275801654e-05, + "loss": 1.7116, + "step": 5028 + }, + { + "epoch": 1.321640884159359, + "grad_norm": 0.5931800007820129, + "learning_rate": 5.5957595934816895e-05, + "loss": 1.6996, + "step": 5030 + }, + { + "epoch": 1.3221663874930207, + "grad_norm": 0.6546478271484375, + "learning_rate": 5.594007359383214e-05, + "loss": 1.6639, + "step": 5032 + }, + { + "epoch": 1.3226918908266825, + "grad_norm": 0.6511355042457581, + "learning_rate": 5.592255125284739e-05, + "loss": 1.6912, + "step": 5034 + }, + { + "epoch": 1.3232173941603442, + "grad_norm": 0.57513028383255, + "learning_rate": 5.5905028911862624e-05, + "loss": 1.6479, + "step": 5036 + }, + { + "epoch": 1.323742897494006, + "grad_norm": 0.5553883910179138, + "learning_rate": 5.5887506570877865e-05, + "loss": 1.6877, + "step": 5038 + }, + { + "epoch": 1.3242684008276677, + "grad_norm": 0.6477178931236267, + "learning_rate": 5.586998422989311e-05, + "loss": 1.7209, + "step": 5040 + }, + { + "epoch": 1.3247939041613295, + "grad_norm": 0.6500737071037292, + "learning_rate": 5.585246188890836e-05, + "loss": 1.6838, + "step": 5042 + }, + { + "epoch": 1.3253194074949912, + "grad_norm": 0.499759703874588, + "learning_rate": 5.58349395479236e-05, + "loss": 1.6858, + "step": 5044 + }, + { + "epoch": 1.325844910828653, + "grad_norm": 0.6388137340545654, + "learning_rate": 5.581741720693885e-05, + "loss": 1.6836, + "step": 5046 + }, + { + "epoch": 1.326370414162315, + "grad_norm": 0.7066943049430847, + "learning_rate": 5.5799894865954096e-05, + "loss": 1.7018, + "step": 5048 + }, + { + "epoch": 1.3268959174959767, + "grad_norm": 0.5318599343299866, + "learning_rate": 5.5782372524969336e-05, + "loss": 1.676, + "step": 5050 + }, + { + "epoch": 1.3274214208296384, + "grad_norm": 0.6772581338882446, + "learning_rate": 5.5764850183984584e-05, + "loss": 1.6909, + "step": 5052 + }, + { + "epoch": 1.3279469241633002, + "grad_norm": 0.6018180847167969, + "learning_rate": 5.574732784299983e-05, + "loss": 1.6817, + "step": 5054 + }, + { + "epoch": 1.328472427496962, + "grad_norm": 0.5764731168746948, + "learning_rate": 5.572980550201507e-05, + "loss": 1.6866, + "step": 5056 + }, + { + "epoch": 1.3289979308306237, + "grad_norm": 0.613559365272522, + "learning_rate": 5.571228316103032e-05, + "loss": 1.6798, + "step": 5058 + }, + { + "epoch": 1.3295234341642854, + "grad_norm": 0.6392707228660583, + "learning_rate": 5.569476082004557e-05, + "loss": 1.6794, + "step": 5060 + }, + { + "epoch": 1.3300489374979472, + "grad_norm": 0.6719961762428284, + "learning_rate": 5.56772384790608e-05, + "loss": 1.6773, + "step": 5062 + }, + { + "epoch": 1.330574440831609, + "grad_norm": 0.6991645693778992, + "learning_rate": 5.565971613807604e-05, + "loss": 1.7275, + "step": 5064 + }, + { + "epoch": 1.3310999441652709, + "grad_norm": 0.54388827085495, + "learning_rate": 5.564219379709129e-05, + "loss": 1.6641, + "step": 5066 + }, + { + "epoch": 1.3316254474989326, + "grad_norm": 0.8866559267044067, + "learning_rate": 5.562467145610654e-05, + "loss": 1.666, + "step": 5068 + }, + { + "epoch": 1.3321509508325944, + "grad_norm": 0.5608734488487244, + "learning_rate": 5.560714911512178e-05, + "loss": 1.6793, + "step": 5070 + }, + { + "epoch": 1.3326764541662561, + "grad_norm": 0.5888034701347351, + "learning_rate": 5.5589626774137026e-05, + "loss": 1.6902, + "step": 5072 + }, + { + "epoch": 1.3332019574999179, + "grad_norm": 0.5383687019348145, + "learning_rate": 5.557210443315227e-05, + "loss": 1.6738, + "step": 5074 + }, + { + "epoch": 1.3337274608335796, + "grad_norm": 0.681666374206543, + "learning_rate": 5.5554582092167514e-05, + "loss": 1.6709, + "step": 5076 + }, + { + "epoch": 1.3342529641672414, + "grad_norm": 0.6175423264503479, + "learning_rate": 5.553705975118276e-05, + "loss": 1.6617, + "step": 5078 + }, + { + "epoch": 1.3347784675009033, + "grad_norm": 0.5964341163635254, + "learning_rate": 5.551953741019801e-05, + "loss": 1.6484, + "step": 5080 + }, + { + "epoch": 1.3353039708345649, + "grad_norm": 0.5749648809432983, + "learning_rate": 5.550201506921325e-05, + "loss": 1.6819, + "step": 5082 + }, + { + "epoch": 1.3358294741682268, + "grad_norm": 0.6597902774810791, + "learning_rate": 5.54844927282285e-05, + "loss": 1.7234, + "step": 5084 + }, + { + "epoch": 1.3363549775018886, + "grad_norm": 0.6115815043449402, + "learning_rate": 5.546697038724373e-05, + "loss": 1.683, + "step": 5086 + }, + { + "epoch": 1.3368804808355503, + "grad_norm": 0.63287353515625, + "learning_rate": 5.544944804625898e-05, + "loss": 1.6759, + "step": 5088 + }, + { + "epoch": 1.337405984169212, + "grad_norm": 0.6917816400527954, + "learning_rate": 5.543192570527423e-05, + "loss": 1.6938, + "step": 5090 + }, + { + "epoch": 1.3379314875028738, + "grad_norm": 0.5193148255348206, + "learning_rate": 5.541440336428947e-05, + "loss": 1.6918, + "step": 5092 + }, + { + "epoch": 1.3384569908365356, + "grad_norm": 0.5470016002655029, + "learning_rate": 5.5396881023304715e-05, + "loss": 1.6733, + "step": 5094 + }, + { + "epoch": 1.3389824941701973, + "grad_norm": 0.6966432332992554, + "learning_rate": 5.537935868231996e-05, + "loss": 1.6883, + "step": 5096 + }, + { + "epoch": 1.3395079975038593, + "grad_norm": 0.6289670467376709, + "learning_rate": 5.5361836341335203e-05, + "loss": 1.6666, + "step": 5098 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.5251250863075256, + "learning_rate": 5.534431400035045e-05, + "loss": 1.6477, + "step": 5100 + }, + { + "epoch": 1.3405590041711828, + "grad_norm": 0.663750171661377, + "learning_rate": 5.53267916593657e-05, + "loss": 1.6713, + "step": 5102 + }, + { + "epoch": 1.3410845075048445, + "grad_norm": 0.5892371535301208, + "learning_rate": 5.530926931838094e-05, + "loss": 1.6935, + "step": 5104 + }, + { + "epoch": 1.3416100108385063, + "grad_norm": 0.7290005683898926, + "learning_rate": 5.529174697739619e-05, + "loss": 1.6685, + "step": 5106 + }, + { + "epoch": 1.342135514172168, + "grad_norm": 0.6136685013771057, + "learning_rate": 5.5274224636411435e-05, + "loss": 1.6441, + "step": 5108 + }, + { + "epoch": 1.3426610175058298, + "grad_norm": 0.648530900478363, + "learning_rate": 5.5256702295426675e-05, + "loss": 1.6874, + "step": 5110 + }, + { + "epoch": 1.3431865208394915, + "grad_norm": 0.556920051574707, + "learning_rate": 5.523917995444191e-05, + "loss": 1.6945, + "step": 5112 + }, + { + "epoch": 1.3437120241731533, + "grad_norm": 0.6408731937408447, + "learning_rate": 5.522165761345716e-05, + "loss": 1.6719, + "step": 5114 + }, + { + "epoch": 1.3442375275068152, + "grad_norm": 0.6126033663749695, + "learning_rate": 5.5204135272472404e-05, + "loss": 1.6851, + "step": 5116 + }, + { + "epoch": 1.3447630308404768, + "grad_norm": 0.7299725413322449, + "learning_rate": 5.5186612931487645e-05, + "loss": 1.6442, + "step": 5118 + }, + { + "epoch": 1.3452885341741387, + "grad_norm": 0.5226702690124512, + "learning_rate": 5.516909059050289e-05, + "loss": 1.6779, + "step": 5120 + }, + { + "epoch": 1.3458140375078005, + "grad_norm": 0.5271426439285278, + "learning_rate": 5.515156824951814e-05, + "loss": 1.6756, + "step": 5122 + }, + { + "epoch": 1.3463395408414622, + "grad_norm": 0.5417149066925049, + "learning_rate": 5.513404590853338e-05, + "loss": 1.7041, + "step": 5124 + }, + { + "epoch": 1.346865044175124, + "grad_norm": 0.639298677444458, + "learning_rate": 5.511652356754863e-05, + "loss": 1.7372, + "step": 5126 + }, + { + "epoch": 1.3473905475087857, + "grad_norm": 0.7180479764938354, + "learning_rate": 5.5099001226563876e-05, + "loss": 1.7078, + "step": 5128 + }, + { + "epoch": 1.3479160508424475, + "grad_norm": 0.6249936819076538, + "learning_rate": 5.508147888557912e-05, + "loss": 1.672, + "step": 5130 + }, + { + "epoch": 1.3484415541761092, + "grad_norm": 0.6733811497688293, + "learning_rate": 5.5063956544594365e-05, + "loss": 1.6648, + "step": 5132 + }, + { + "epoch": 1.3489670575097712, + "grad_norm": 0.628979504108429, + "learning_rate": 5.504643420360961e-05, + "loss": 1.6724, + "step": 5134 + }, + { + "epoch": 1.3494925608434327, + "grad_norm": 0.6797083020210266, + "learning_rate": 5.502891186262485e-05, + "loss": 1.6681, + "step": 5136 + }, + { + "epoch": 1.3500180641770947, + "grad_norm": 0.5499573945999146, + "learning_rate": 5.501138952164009e-05, + "loss": 1.6877, + "step": 5138 + }, + { + "epoch": 1.3505435675107564, + "grad_norm": 0.6924422383308411, + "learning_rate": 5.4993867180655335e-05, + "loss": 1.6836, + "step": 5140 + }, + { + "epoch": 1.3510690708444182, + "grad_norm": 0.5735469460487366, + "learning_rate": 5.497634483967058e-05, + "loss": 1.6953, + "step": 5142 + }, + { + "epoch": 1.35159457417808, + "grad_norm": 0.6161128282546997, + "learning_rate": 5.495882249868582e-05, + "loss": 1.6663, + "step": 5144 + }, + { + "epoch": 1.3521200775117417, + "grad_norm": 0.5718648433685303, + "learning_rate": 5.494130015770107e-05, + "loss": 1.6785, + "step": 5146 + }, + { + "epoch": 1.3526455808454034, + "grad_norm": 0.6534953713417053, + "learning_rate": 5.492377781671632e-05, + "loss": 1.6859, + "step": 5148 + }, + { + "epoch": 1.3531710841790652, + "grad_norm": 0.7436297535896301, + "learning_rate": 5.490625547573156e-05, + "loss": 1.6833, + "step": 5150 + }, + { + "epoch": 1.3536965875127271, + "grad_norm": 0.5223855972290039, + "learning_rate": 5.4888733134746806e-05, + "loss": 1.7009, + "step": 5152 + }, + { + "epoch": 1.3542220908463887, + "grad_norm": 0.6421153545379639, + "learning_rate": 5.4871210793762054e-05, + "loss": 1.6884, + "step": 5154 + }, + { + "epoch": 1.3547475941800506, + "grad_norm": 0.5627347826957703, + "learning_rate": 5.4853688452777295e-05, + "loss": 1.6761, + "step": 5156 + }, + { + "epoch": 1.3552730975137124, + "grad_norm": 0.5863370299339294, + "learning_rate": 5.483616611179254e-05, + "loss": 1.7144, + "step": 5158 + }, + { + "epoch": 1.3557986008473741, + "grad_norm": 0.6079491376876831, + "learning_rate": 5.481864377080779e-05, + "loss": 1.6679, + "step": 5160 + }, + { + "epoch": 1.3563241041810359, + "grad_norm": 0.603889524936676, + "learning_rate": 5.480112142982303e-05, + "loss": 1.6858, + "step": 5162 + }, + { + "epoch": 1.3568496075146976, + "grad_norm": 0.7292947173118591, + "learning_rate": 5.4783599088838265e-05, + "loss": 1.6597, + "step": 5164 + }, + { + "epoch": 1.3573751108483594, + "grad_norm": 0.6873249411582947, + "learning_rate": 5.476607674785351e-05, + "loss": 1.6614, + "step": 5166 + }, + { + "epoch": 1.3579006141820211, + "grad_norm": 0.6474335789680481, + "learning_rate": 5.474855440686876e-05, + "loss": 1.6651, + "step": 5168 + }, + { + "epoch": 1.358426117515683, + "grad_norm": 0.6388329863548279, + "learning_rate": 5.4731032065884e-05, + "loss": 1.7278, + "step": 5170 + }, + { + "epoch": 1.3589516208493448, + "grad_norm": 0.6011612415313721, + "learning_rate": 5.471350972489925e-05, + "loss": 1.6685, + "step": 5172 + }, + { + "epoch": 1.3594771241830066, + "grad_norm": 0.5934230089187622, + "learning_rate": 5.4695987383914496e-05, + "loss": 1.6956, + "step": 5174 + }, + { + "epoch": 1.3600026275166683, + "grad_norm": 0.5996755361557007, + "learning_rate": 5.4678465042929736e-05, + "loss": 1.6717, + "step": 5176 + }, + { + "epoch": 1.36052813085033, + "grad_norm": 0.5604281425476074, + "learning_rate": 5.4660942701944984e-05, + "loss": 1.6535, + "step": 5178 + }, + { + "epoch": 1.3610536341839918, + "grad_norm": 0.7021388411521912, + "learning_rate": 5.464342036096023e-05, + "loss": 1.6387, + "step": 5180 + }, + { + "epoch": 1.3615791375176536, + "grad_norm": 0.599388599395752, + "learning_rate": 5.462589801997547e-05, + "loss": 1.7095, + "step": 5182 + }, + { + "epoch": 1.3621046408513153, + "grad_norm": 0.6312151551246643, + "learning_rate": 5.460837567899072e-05, + "loss": 1.7024, + "step": 5184 + }, + { + "epoch": 1.362630144184977, + "grad_norm": 0.6004204154014587, + "learning_rate": 5.459085333800597e-05, + "loss": 1.6731, + "step": 5186 + }, + { + "epoch": 1.363155647518639, + "grad_norm": 0.5637236833572388, + "learning_rate": 5.45733309970212e-05, + "loss": 1.6846, + "step": 5188 + }, + { + "epoch": 1.3636811508523008, + "grad_norm": 0.5435346961021423, + "learning_rate": 5.455580865603644e-05, + "loss": 1.697, + "step": 5190 + }, + { + "epoch": 1.3642066541859625, + "grad_norm": 0.7027901411056519, + "learning_rate": 5.453828631505169e-05, + "loss": 1.6658, + "step": 5192 + }, + { + "epoch": 1.3647321575196243, + "grad_norm": 0.5427135229110718, + "learning_rate": 5.452076397406694e-05, + "loss": 1.6802, + "step": 5194 + }, + { + "epoch": 1.365257660853286, + "grad_norm": 0.9551213979721069, + "learning_rate": 5.450324163308218e-05, + "loss": 1.6675, + "step": 5196 + }, + { + "epoch": 1.3657831641869478, + "grad_norm": 0.6727270483970642, + "learning_rate": 5.4485719292097426e-05, + "loss": 1.6909, + "step": 5198 + }, + { + "epoch": 1.3663086675206095, + "grad_norm": 0.5825324058532715, + "learning_rate": 5.446819695111267e-05, + "loss": 1.6678, + "step": 5200 + }, + { + "epoch": 1.3663086675206095, + "eval_loss": 1.6925063133239746, + "eval_runtime": 487.2699, + "eval_samples_per_second": 249.942, + "eval_steps_per_second": 31.243, + "step": 5200 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.5803776979446411, + "learning_rate": 5.4450674610127914e-05, + "loss": 1.687, + "step": 5202 + }, + { + "epoch": 1.367359674187933, + "grad_norm": 0.5400435328483582, + "learning_rate": 5.443315226914316e-05, + "loss": 1.7248, + "step": 5204 + }, + { + "epoch": 1.367885177521595, + "grad_norm": 0.5866122245788574, + "learning_rate": 5.441562992815841e-05, + "loss": 1.6612, + "step": 5206 + }, + { + "epoch": 1.3684106808552567, + "grad_norm": 0.6416072249412537, + "learning_rate": 5.439810758717365e-05, + "loss": 1.6948, + "step": 5208 + }, + { + "epoch": 1.3689361841889185, + "grad_norm": 0.5607174634933472, + "learning_rate": 5.43805852461889e-05, + "loss": 1.7079, + "step": 5210 + }, + { + "epoch": 1.3694616875225802, + "grad_norm": 0.5060856938362122, + "learning_rate": 5.4363062905204145e-05, + "loss": 1.6811, + "step": 5212 + }, + { + "epoch": 1.369987190856242, + "grad_norm": 0.5724950432777405, + "learning_rate": 5.434554056421938e-05, + "loss": 1.6251, + "step": 5214 + }, + { + "epoch": 1.3705126941899037, + "grad_norm": 0.595393717288971, + "learning_rate": 5.432801822323462e-05, + "loss": 1.6867, + "step": 5216 + }, + { + "epoch": 1.3710381975235655, + "grad_norm": 0.5957738161087036, + "learning_rate": 5.431049588224987e-05, + "loss": 1.6971, + "step": 5218 + }, + { + "epoch": 1.3715637008572272, + "grad_norm": 0.6889095306396484, + "learning_rate": 5.4292973541265115e-05, + "loss": 1.6999, + "step": 5220 + }, + { + "epoch": 1.372089204190889, + "grad_norm": 0.6807567477226257, + "learning_rate": 5.4275451200280356e-05, + "loss": 1.6674, + "step": 5222 + }, + { + "epoch": 1.372614707524551, + "grad_norm": 0.6046966314315796, + "learning_rate": 5.4257928859295603e-05, + "loss": 1.6622, + "step": 5224 + }, + { + "epoch": 1.3731402108582127, + "grad_norm": 0.6781222224235535, + "learning_rate": 5.424040651831085e-05, + "loss": 1.6974, + "step": 5226 + }, + { + "epoch": 1.3736657141918744, + "grad_norm": 0.5710211992263794, + "learning_rate": 5.422288417732609e-05, + "loss": 1.6769, + "step": 5228 + }, + { + "epoch": 1.3741912175255362, + "grad_norm": 0.6032019257545471, + "learning_rate": 5.420536183634134e-05, + "loss": 1.6615, + "step": 5230 + }, + { + "epoch": 1.374716720859198, + "grad_norm": 0.6502026915550232, + "learning_rate": 5.418783949535659e-05, + "loss": 1.6965, + "step": 5232 + }, + { + "epoch": 1.3752422241928597, + "grad_norm": 0.7333769798278809, + "learning_rate": 5.417031715437183e-05, + "loss": 1.689, + "step": 5234 + }, + { + "epoch": 1.3757677275265214, + "grad_norm": 0.5849090218544006, + "learning_rate": 5.4152794813387075e-05, + "loss": 1.6696, + "step": 5236 + }, + { + "epoch": 1.3762932308601834, + "grad_norm": 0.7076939940452576, + "learning_rate": 5.413527247240232e-05, + "loss": 1.6674, + "step": 5238 + }, + { + "epoch": 1.376818734193845, + "grad_norm": 0.686165988445282, + "learning_rate": 5.411775013141756e-05, + "loss": 1.6729, + "step": 5240 + }, + { + "epoch": 1.377344237527507, + "grad_norm": 0.7064648270606995, + "learning_rate": 5.41002277904328e-05, + "loss": 1.6532, + "step": 5242 + }, + { + "epoch": 1.3778697408611686, + "grad_norm": 0.707788348197937, + "learning_rate": 5.4082705449448045e-05, + "loss": 1.635, + "step": 5244 + }, + { + "epoch": 1.3783952441948304, + "grad_norm": 0.711150586605072, + "learning_rate": 5.406518310846329e-05, + "loss": 1.6671, + "step": 5246 + }, + { + "epoch": 1.3789207475284921, + "grad_norm": 0.627653181552887, + "learning_rate": 5.4047660767478534e-05, + "loss": 1.6887, + "step": 5248 + }, + { + "epoch": 1.3794462508621539, + "grad_norm": 0.5618575215339661, + "learning_rate": 5.403013842649378e-05, + "loss": 1.6773, + "step": 5250 + }, + { + "epoch": 1.3799717541958156, + "grad_norm": 0.6045578718185425, + "learning_rate": 5.401261608550903e-05, + "loss": 1.7151, + "step": 5252 + }, + { + "epoch": 1.3804972575294774, + "grad_norm": 0.6123912930488586, + "learning_rate": 5.399509374452427e-05, + "loss": 1.6428, + "step": 5254 + }, + { + "epoch": 1.3810227608631394, + "grad_norm": 0.7001442909240723, + "learning_rate": 5.397757140353952e-05, + "loss": 1.7063, + "step": 5256 + }, + { + "epoch": 1.3815482641968009, + "grad_norm": 0.5598956346511841, + "learning_rate": 5.3960049062554765e-05, + "loss": 1.671, + "step": 5258 + }, + { + "epoch": 1.3820737675304628, + "grad_norm": 0.5916640758514404, + "learning_rate": 5.3942526721570005e-05, + "loss": 1.6685, + "step": 5260 + }, + { + "epoch": 1.3825992708641246, + "grad_norm": 0.5688573718070984, + "learning_rate": 5.392500438058525e-05, + "loss": 1.6357, + "step": 5262 + }, + { + "epoch": 1.3831247741977863, + "grad_norm": 0.6486282348632812, + "learning_rate": 5.39074820396005e-05, + "loss": 1.6803, + "step": 5264 + }, + { + "epoch": 1.383650277531448, + "grad_norm": 0.6464768052101135, + "learning_rate": 5.3889959698615735e-05, + "loss": 1.6781, + "step": 5266 + }, + { + "epoch": 1.3841757808651098, + "grad_norm": 0.5832222104072571, + "learning_rate": 5.3872437357630975e-05, + "loss": 1.7009, + "step": 5268 + }, + { + "epoch": 1.3847012841987716, + "grad_norm": 0.6113573908805847, + "learning_rate": 5.385491501664622e-05, + "loss": 1.6855, + "step": 5270 + }, + { + "epoch": 1.3852267875324333, + "grad_norm": 0.6312457323074341, + "learning_rate": 5.383739267566147e-05, + "loss": 1.7076, + "step": 5272 + }, + { + "epoch": 1.3857522908660953, + "grad_norm": 0.6757798790931702, + "learning_rate": 5.381987033467671e-05, + "loss": 1.6796, + "step": 5274 + }, + { + "epoch": 1.3862777941997568, + "grad_norm": 0.7711691856384277, + "learning_rate": 5.380234799369196e-05, + "loss": 1.656, + "step": 5276 + }, + { + "epoch": 1.3868032975334188, + "grad_norm": 0.6918832659721375, + "learning_rate": 5.3784825652707206e-05, + "loss": 1.6723, + "step": 5278 + }, + { + "epoch": 1.3873288008670805, + "grad_norm": 0.5812153220176697, + "learning_rate": 5.376730331172245e-05, + "loss": 1.6769, + "step": 5280 + }, + { + "epoch": 1.3878543042007423, + "grad_norm": 0.642285168170929, + "learning_rate": 5.3749780970737695e-05, + "loss": 1.6962, + "step": 5282 + }, + { + "epoch": 1.388379807534404, + "grad_norm": 0.5948686003684998, + "learning_rate": 5.373225862975294e-05, + "loss": 1.6874, + "step": 5284 + }, + { + "epoch": 1.3889053108680658, + "grad_norm": 0.8630838394165039, + "learning_rate": 5.371473628876818e-05, + "loss": 1.6602, + "step": 5286 + }, + { + "epoch": 1.3894308142017275, + "grad_norm": 0.624987006187439, + "learning_rate": 5.369721394778343e-05, + "loss": 1.6674, + "step": 5288 + }, + { + "epoch": 1.3899563175353893, + "grad_norm": 0.5534707307815552, + "learning_rate": 5.3679691606798665e-05, + "loss": 1.7101, + "step": 5290 + }, + { + "epoch": 1.3904818208690513, + "grad_norm": 0.547791063785553, + "learning_rate": 5.366216926581391e-05, + "loss": 1.7103, + "step": 5292 + }, + { + "epoch": 1.3910073242027128, + "grad_norm": 0.7196782827377319, + "learning_rate": 5.364464692482915e-05, + "loss": 1.7016, + "step": 5294 + }, + { + "epoch": 1.3915328275363748, + "grad_norm": 0.5943964123725891, + "learning_rate": 5.36271245838444e-05, + "loss": 1.6937, + "step": 5296 + }, + { + "epoch": 1.3920583308700365, + "grad_norm": 0.5824291706085205, + "learning_rate": 5.360960224285965e-05, + "loss": 1.6606, + "step": 5298 + }, + { + "epoch": 1.3925838342036982, + "grad_norm": 0.6096850633621216, + "learning_rate": 5.359207990187489e-05, + "loss": 1.6732, + "step": 5300 + }, + { + "epoch": 1.39310933753736, + "grad_norm": 0.5567105412483215, + "learning_rate": 5.3574557560890136e-05, + "loss": 1.6917, + "step": 5302 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.808890700340271, + "learning_rate": 5.3557035219905384e-05, + "loss": 1.6916, + "step": 5304 + }, + { + "epoch": 1.3941603442046835, + "grad_norm": 0.602245032787323, + "learning_rate": 5.3539512878920625e-05, + "loss": 1.6565, + "step": 5306 + }, + { + "epoch": 1.3946858475383452, + "grad_norm": 0.5515073537826538, + "learning_rate": 5.352199053793587e-05, + "loss": 1.6927, + "step": 5308 + }, + { + "epoch": 1.3952113508720072, + "grad_norm": 0.6020869016647339, + "learning_rate": 5.350446819695112e-05, + "loss": 1.6813, + "step": 5310 + }, + { + "epoch": 1.3957368542056687, + "grad_norm": 0.6068871021270752, + "learning_rate": 5.348694585596636e-05, + "loss": 1.699, + "step": 5312 + }, + { + "epoch": 1.3962623575393307, + "grad_norm": 0.8839384913444519, + "learning_rate": 5.346942351498161e-05, + "loss": 1.7296, + "step": 5314 + }, + { + "epoch": 1.3967878608729924, + "grad_norm": 0.6876543164253235, + "learning_rate": 5.345190117399684e-05, + "loss": 1.6827, + "step": 5316 + }, + { + "epoch": 1.3973133642066542, + "grad_norm": 0.9712225198745728, + "learning_rate": 5.343437883301209e-05, + "loss": 1.6821, + "step": 5318 + }, + { + "epoch": 1.397838867540316, + "grad_norm": 0.644990861415863, + "learning_rate": 5.341685649202733e-05, + "loss": 1.671, + "step": 5320 + }, + { + "epoch": 1.3983643708739777, + "grad_norm": 0.7299576997756958, + "learning_rate": 5.339933415104258e-05, + "loss": 1.7054, + "step": 5322 + }, + { + "epoch": 1.3988898742076394, + "grad_norm": 0.5608115196228027, + "learning_rate": 5.3381811810057826e-05, + "loss": 1.6713, + "step": 5324 + }, + { + "epoch": 1.3994153775413012, + "grad_norm": 0.6044563055038452, + "learning_rate": 5.3364289469073067e-05, + "loss": 1.6838, + "step": 5326 + }, + { + "epoch": 1.3999408808749632, + "grad_norm": 0.5304017663002014, + "learning_rate": 5.3346767128088314e-05, + "loss": 1.6351, + "step": 5328 + }, + { + "epoch": 1.400466384208625, + "grad_norm": 0.5934461355209351, + "learning_rate": 5.332924478710356e-05, + "loss": 1.6485, + "step": 5330 + }, + { + "epoch": 1.4009918875422867, + "grad_norm": 0.5502288937568665, + "learning_rate": 5.33117224461188e-05, + "loss": 1.683, + "step": 5332 + }, + { + "epoch": 1.4015173908759484, + "grad_norm": 0.6545527577400208, + "learning_rate": 5.329420010513405e-05, + "loss": 1.6823, + "step": 5334 + }, + { + "epoch": 1.4020428942096101, + "grad_norm": 0.5551027655601501, + "learning_rate": 5.32766777641493e-05, + "loss": 1.6867, + "step": 5336 + }, + { + "epoch": 1.402568397543272, + "grad_norm": 0.6732555627822876, + "learning_rate": 5.325915542316454e-05, + "loss": 1.7031, + "step": 5338 + }, + { + "epoch": 1.4030939008769336, + "grad_norm": 0.5857707858085632, + "learning_rate": 5.3241633082179786e-05, + "loss": 1.6607, + "step": 5340 + }, + { + "epoch": 1.4036194042105954, + "grad_norm": 0.6332495212554932, + "learning_rate": 5.322411074119502e-05, + "loss": 1.6979, + "step": 5342 + }, + { + "epoch": 1.4041449075442571, + "grad_norm": 0.5486958026885986, + "learning_rate": 5.320658840021027e-05, + "loss": 1.6582, + "step": 5344 + }, + { + "epoch": 1.404670410877919, + "grad_norm": 0.5463687181472778, + "learning_rate": 5.3189066059225515e-05, + "loss": 1.7192, + "step": 5346 + }, + { + "epoch": 1.4051959142115809, + "grad_norm": 0.4964401125907898, + "learning_rate": 5.3171543718240756e-05, + "loss": 1.6486, + "step": 5348 + }, + { + "epoch": 1.4057214175452426, + "grad_norm": 0.5375365018844604, + "learning_rate": 5.3154021377256003e-05, + "loss": 1.714, + "step": 5350 + }, + { + "epoch": 1.4062469208789043, + "grad_norm": 0.5706174373626709, + "learning_rate": 5.313649903627125e-05, + "loss": 1.702, + "step": 5352 + }, + { + "epoch": 1.406772424212566, + "grad_norm": 0.6177610158920288, + "learning_rate": 5.311897669528649e-05, + "loss": 1.6439, + "step": 5354 + }, + { + "epoch": 1.4072979275462278, + "grad_norm": 0.5553119778633118, + "learning_rate": 5.310145435430174e-05, + "loss": 1.69, + "step": 5356 + }, + { + "epoch": 1.4078234308798896, + "grad_norm": 0.6558260917663574, + "learning_rate": 5.308393201331699e-05, + "loss": 1.7068, + "step": 5358 + }, + { + "epoch": 1.4083489342135513, + "grad_norm": 0.7940452098846436, + "learning_rate": 5.306640967233223e-05, + "loss": 1.6912, + "step": 5360 + }, + { + "epoch": 1.408874437547213, + "grad_norm": 0.577286958694458, + "learning_rate": 5.3048887331347475e-05, + "loss": 1.6755, + "step": 5362 + }, + { + "epoch": 1.409399940880875, + "grad_norm": 0.6552362442016602, + "learning_rate": 5.303136499036272e-05, + "loss": 1.6952, + "step": 5364 + }, + { + "epoch": 1.4099254442145368, + "grad_norm": 0.6791645884513855, + "learning_rate": 5.3013842649377964e-05, + "loss": 1.6979, + "step": 5366 + }, + { + "epoch": 1.4104509475481986, + "grad_norm": 0.5956308841705322, + "learning_rate": 5.29963203083932e-05, + "loss": 1.6742, + "step": 5368 + }, + { + "epoch": 1.4109764508818603, + "grad_norm": 0.6051899790763855, + "learning_rate": 5.2978797967408445e-05, + "loss": 1.6704, + "step": 5370 + }, + { + "epoch": 1.411501954215522, + "grad_norm": 0.6038895845413208, + "learning_rate": 5.296127562642369e-05, + "loss": 1.6894, + "step": 5372 + }, + { + "epoch": 1.4120274575491838, + "grad_norm": 0.5314784049987793, + "learning_rate": 5.2943753285438934e-05, + "loss": 1.6656, + "step": 5374 + }, + { + "epoch": 1.4125529608828455, + "grad_norm": 0.5296756029129028, + "learning_rate": 5.292623094445418e-05, + "loss": 1.6957, + "step": 5376 + }, + { + "epoch": 1.4130784642165073, + "grad_norm": 0.5364149808883667, + "learning_rate": 5.290870860346943e-05, + "loss": 1.6585, + "step": 5378 + }, + { + "epoch": 1.413603967550169, + "grad_norm": 0.5761370062828064, + "learning_rate": 5.289118626248467e-05, + "loss": 1.6692, + "step": 5380 + }, + { + "epoch": 1.414129470883831, + "grad_norm": 0.6723489165306091, + "learning_rate": 5.287366392149992e-05, + "loss": 1.6826, + "step": 5382 + }, + { + "epoch": 1.4146549742174928, + "grad_norm": 0.5427255034446716, + "learning_rate": 5.2856141580515165e-05, + "loss": 1.6612, + "step": 5384 + }, + { + "epoch": 1.4151804775511545, + "grad_norm": 0.5773686170578003, + "learning_rate": 5.2838619239530405e-05, + "loss": 1.6751, + "step": 5386 + }, + { + "epoch": 1.4157059808848163, + "grad_norm": 0.9336940050125122, + "learning_rate": 5.282109689854565e-05, + "loss": 1.6976, + "step": 5388 + }, + { + "epoch": 1.416231484218478, + "grad_norm": 0.5543003082275391, + "learning_rate": 5.28035745575609e-05, + "loss": 1.6991, + "step": 5390 + }, + { + "epoch": 1.4167569875521397, + "grad_norm": 0.5574566125869751, + "learning_rate": 5.2786052216576134e-05, + "loss": 1.6575, + "step": 5392 + }, + { + "epoch": 1.4172824908858015, + "grad_norm": 0.6602552533149719, + "learning_rate": 5.2768529875591375e-05, + "loss": 1.6748, + "step": 5394 + }, + { + "epoch": 1.4178079942194635, + "grad_norm": 0.644187331199646, + "learning_rate": 5.275100753460662e-05, + "loss": 1.6565, + "step": 5396 + }, + { + "epoch": 1.418333497553125, + "grad_norm": 0.5814189314842224, + "learning_rate": 5.273348519362187e-05, + "loss": 1.6965, + "step": 5398 + }, + { + "epoch": 1.418859000886787, + "grad_norm": 0.5712095499038696, + "learning_rate": 5.271596285263711e-05, + "loss": 1.6593, + "step": 5400 + }, + { + "epoch": 1.4193845042204487, + "grad_norm": 0.6919686198234558, + "learning_rate": 5.269844051165236e-05, + "loss": 1.692, + "step": 5402 + }, + { + "epoch": 1.4199100075541105, + "grad_norm": 0.6162757277488708, + "learning_rate": 5.2680918170667606e-05, + "loss": 1.6726, + "step": 5404 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.5998090505599976, + "learning_rate": 5.266339582968285e-05, + "loss": 1.7206, + "step": 5406 + }, + { + "epoch": 1.420961014221434, + "grad_norm": 0.6242619156837463, + "learning_rate": 5.2645873488698095e-05, + "loss": 1.6647, + "step": 5408 + }, + { + "epoch": 1.4214865175550957, + "grad_norm": 0.5882295966148376, + "learning_rate": 5.262835114771334e-05, + "loss": 1.6562, + "step": 5410 + }, + { + "epoch": 1.4220120208887574, + "grad_norm": 0.5960384607315063, + "learning_rate": 5.261082880672858e-05, + "loss": 1.6877, + "step": 5412 + }, + { + "epoch": 1.4225375242224194, + "grad_norm": 0.6366286873817444, + "learning_rate": 5.259330646574383e-05, + "loss": 1.7009, + "step": 5414 + }, + { + "epoch": 1.423063027556081, + "grad_norm": 0.581294059753418, + "learning_rate": 5.257578412475908e-05, + "loss": 1.6918, + "step": 5416 + }, + { + "epoch": 1.423588530889743, + "grad_norm": 0.6820783615112305, + "learning_rate": 5.255826178377431e-05, + "loss": 1.6602, + "step": 5418 + }, + { + "epoch": 1.4241140342234047, + "grad_norm": 0.7352914810180664, + "learning_rate": 5.254073944278955e-05, + "loss": 1.6837, + "step": 5420 + }, + { + "epoch": 1.4246395375570664, + "grad_norm": 0.5740265250205994, + "learning_rate": 5.25232171018048e-05, + "loss": 1.6741, + "step": 5422 + }, + { + "epoch": 1.4251650408907282, + "grad_norm": 0.5686757564544678, + "learning_rate": 5.250569476082005e-05, + "loss": 1.6908, + "step": 5424 + }, + { + "epoch": 1.42569054422439, + "grad_norm": 0.765792191028595, + "learning_rate": 5.248817241983529e-05, + "loss": 1.6928, + "step": 5426 + }, + { + "epoch": 1.4262160475580516, + "grad_norm": 0.605812668800354, + "learning_rate": 5.2470650078850536e-05, + "loss": 1.7009, + "step": 5428 + }, + { + "epoch": 1.4267415508917134, + "grad_norm": 0.5373610258102417, + "learning_rate": 5.2453127737865784e-05, + "loss": 1.6906, + "step": 5430 + }, + { + "epoch": 1.4272670542253754, + "grad_norm": 0.6103860139846802, + "learning_rate": 5.2435605396881025e-05, + "loss": 1.683, + "step": 5432 + }, + { + "epoch": 1.427792557559037, + "grad_norm": 0.5279322862625122, + "learning_rate": 5.241808305589627e-05, + "loss": 1.6679, + "step": 5434 + }, + { + "epoch": 1.4283180608926989, + "grad_norm": 0.6445699334144592, + "learning_rate": 5.240056071491152e-05, + "loss": 1.6908, + "step": 5436 + }, + { + "epoch": 1.4288435642263606, + "grad_norm": 0.5819953680038452, + "learning_rate": 5.238303837392676e-05, + "loss": 1.6849, + "step": 5438 + }, + { + "epoch": 1.4293690675600224, + "grad_norm": 0.564022421836853, + "learning_rate": 5.236551603294201e-05, + "loss": 1.6799, + "step": 5440 + }, + { + "epoch": 1.429894570893684, + "grad_norm": 0.5687031149864197, + "learning_rate": 5.2347993691957256e-05, + "loss": 1.6905, + "step": 5442 + }, + { + "epoch": 1.4304200742273459, + "grad_norm": 0.6546675562858582, + "learning_rate": 5.233047135097249e-05, + "loss": 1.6752, + "step": 5444 + }, + { + "epoch": 1.4309455775610076, + "grad_norm": 0.6557585597038269, + "learning_rate": 5.231294900998773e-05, + "loss": 1.6959, + "step": 5446 + }, + { + "epoch": 1.4314710808946693, + "grad_norm": 0.6117187142372131, + "learning_rate": 5.229542666900298e-05, + "loss": 1.6983, + "step": 5448 + }, + { + "epoch": 1.4319965842283313, + "grad_norm": 0.6230733394622803, + "learning_rate": 5.2277904328018226e-05, + "loss": 1.7063, + "step": 5450 + }, + { + "epoch": 1.4325220875619928, + "grad_norm": 0.5881343483924866, + "learning_rate": 5.2260381987033467e-05, + "loss": 1.6718, + "step": 5452 + }, + { + "epoch": 1.4330475908956548, + "grad_norm": 0.606519341468811, + "learning_rate": 5.2242859646048714e-05, + "loss": 1.6876, + "step": 5454 + }, + { + "epoch": 1.4335730942293166, + "grad_norm": 0.6653869152069092, + "learning_rate": 5.222533730506396e-05, + "loss": 1.6707, + "step": 5456 + }, + { + "epoch": 1.4340985975629783, + "grad_norm": 0.5549430251121521, + "learning_rate": 5.22078149640792e-05, + "loss": 1.6994, + "step": 5458 + }, + { + "epoch": 1.43462410089664, + "grad_norm": 0.7070857882499695, + "learning_rate": 5.219029262309445e-05, + "loss": 1.7043, + "step": 5460 + }, + { + "epoch": 1.4351496042303018, + "grad_norm": 0.4982629716396332, + "learning_rate": 5.21727702821097e-05, + "loss": 1.6567, + "step": 5462 + }, + { + "epoch": 1.4356751075639635, + "grad_norm": 0.6242676973342896, + "learning_rate": 5.215524794112494e-05, + "loss": 1.6898, + "step": 5464 + }, + { + "epoch": 1.4362006108976253, + "grad_norm": 0.5926311016082764, + "learning_rate": 5.2137725600140186e-05, + "loss": 1.7012, + "step": 5466 + }, + { + "epoch": 1.4367261142312873, + "grad_norm": 0.7169223427772522, + "learning_rate": 5.212020325915542e-05, + "loss": 1.6782, + "step": 5468 + }, + { + "epoch": 1.4372516175649488, + "grad_norm": 0.7072371244430542, + "learning_rate": 5.210268091817067e-05, + "loss": 1.6442, + "step": 5470 + }, + { + "epoch": 1.4377771208986108, + "grad_norm": 0.5502269268035889, + "learning_rate": 5.208515857718591e-05, + "loss": 1.6743, + "step": 5472 + }, + { + "epoch": 1.4383026242322725, + "grad_norm": 0.7821040749549866, + "learning_rate": 5.2067636236201156e-05, + "loss": 1.7106, + "step": 5474 + }, + { + "epoch": 1.4388281275659343, + "grad_norm": 0.6043164730072021, + "learning_rate": 5.2050113895216403e-05, + "loss": 1.708, + "step": 5476 + }, + { + "epoch": 1.439353630899596, + "grad_norm": 0.5633584856987, + "learning_rate": 5.2032591554231644e-05, + "loss": 1.6696, + "step": 5478 + }, + { + "epoch": 1.4398791342332578, + "grad_norm": 0.5389137864112854, + "learning_rate": 5.201506921324689e-05, + "loss": 1.669, + "step": 5480 + }, + { + "epoch": 1.4404046375669195, + "grad_norm": 0.6789732575416565, + "learning_rate": 5.199754687226214e-05, + "loss": 1.6899, + "step": 5482 + }, + { + "epoch": 1.4409301409005812, + "grad_norm": 0.6296239495277405, + "learning_rate": 5.198002453127738e-05, + "loss": 1.6631, + "step": 5484 + }, + { + "epoch": 1.4414556442342432, + "grad_norm": 0.700206458568573, + "learning_rate": 5.196250219029263e-05, + "loss": 1.6616, + "step": 5486 + }, + { + "epoch": 1.441981147567905, + "grad_norm": 0.6670316457748413, + "learning_rate": 5.1944979849307875e-05, + "loss": 1.6699, + "step": 5488 + }, + { + "epoch": 1.4425066509015667, + "grad_norm": 0.6687952876091003, + "learning_rate": 5.1927457508323116e-05, + "loss": 1.7015, + "step": 5490 + }, + { + "epoch": 1.4430321542352285, + "grad_norm": 0.6341938972473145, + "learning_rate": 5.1909935167338364e-05, + "loss": 1.6713, + "step": 5492 + }, + { + "epoch": 1.4435576575688902, + "grad_norm": 0.5659570693969727, + "learning_rate": 5.18924128263536e-05, + "loss": 1.6662, + "step": 5494 + }, + { + "epoch": 1.444083160902552, + "grad_norm": 0.559827983379364, + "learning_rate": 5.1874890485368845e-05, + "loss": 1.6901, + "step": 5496 + }, + { + "epoch": 1.4446086642362137, + "grad_norm": 0.5669519901275635, + "learning_rate": 5.1857368144384086e-05, + "loss": 1.6966, + "step": 5498 + }, + { + "epoch": 1.4451341675698754, + "grad_norm": 0.544907808303833, + "learning_rate": 5.1839845803399334e-05, + "loss": 1.6837, + "step": 5500 + }, + { + "epoch": 1.4456596709035372, + "grad_norm": 0.510985255241394, + "learning_rate": 5.182232346241458e-05, + "loss": 1.7191, + "step": 5502 + }, + { + "epoch": 1.4461851742371992, + "grad_norm": 0.5953521728515625, + "learning_rate": 5.180480112142982e-05, + "loss": 1.6899, + "step": 5504 + }, + { + "epoch": 1.446710677570861, + "grad_norm": 0.5401633381843567, + "learning_rate": 5.178727878044507e-05, + "loss": 1.6602, + "step": 5506 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.6041406393051147, + "learning_rate": 5.176975643946032e-05, + "loss": 1.6573, + "step": 5508 + }, + { + "epoch": 1.4477616842381844, + "grad_norm": 0.5650386214256287, + "learning_rate": 5.175223409847556e-05, + "loss": 1.6747, + "step": 5510 + }, + { + "epoch": 1.4482871875718462, + "grad_norm": 0.6061777472496033, + "learning_rate": 5.1734711757490805e-05, + "loss": 1.677, + "step": 5512 + }, + { + "epoch": 1.448812690905508, + "grad_norm": 0.5969398617744446, + "learning_rate": 5.171718941650605e-05, + "loss": 1.6579, + "step": 5514 + }, + { + "epoch": 1.4493381942391697, + "grad_norm": 0.5931181311607361, + "learning_rate": 5.1699667075521294e-05, + "loss": 1.6772, + "step": 5516 + }, + { + "epoch": 1.4498636975728314, + "grad_norm": 0.5793447494506836, + "learning_rate": 5.168214473453654e-05, + "loss": 1.6302, + "step": 5518 + }, + { + "epoch": 1.4503892009064931, + "grad_norm": 0.5775421857833862, + "learning_rate": 5.1664622393551775e-05, + "loss": 1.6691, + "step": 5520 + }, + { + "epoch": 1.4509147042401551, + "grad_norm": 0.606317937374115, + "learning_rate": 5.164710005256702e-05, + "loss": 1.6876, + "step": 5522 + }, + { + "epoch": 1.4514402075738169, + "grad_norm": 0.5722896456718445, + "learning_rate": 5.1629577711582264e-05, + "loss": 1.643, + "step": 5524 + }, + { + "epoch": 1.4519657109074786, + "grad_norm": 0.7023299932479858, + "learning_rate": 5.161205537059751e-05, + "loss": 1.6527, + "step": 5526 + }, + { + "epoch": 1.4524912142411404, + "grad_norm": 0.5290958285331726, + "learning_rate": 5.159453302961276e-05, + "loss": 1.6556, + "step": 5528 + }, + { + "epoch": 1.453016717574802, + "grad_norm": 0.6962506771087646, + "learning_rate": 5.1577010688628e-05, + "loss": 1.6781, + "step": 5530 + }, + { + "epoch": 1.4535422209084639, + "grad_norm": 0.5442536473274231, + "learning_rate": 5.155948834764325e-05, + "loss": 1.6712, + "step": 5532 + }, + { + "epoch": 1.4540677242421256, + "grad_norm": 0.6162612438201904, + "learning_rate": 5.1541966006658495e-05, + "loss": 1.7003, + "step": 5534 + }, + { + "epoch": 1.4545932275757874, + "grad_norm": 0.5466321110725403, + "learning_rate": 5.1524443665673735e-05, + "loss": 1.6908, + "step": 5536 + }, + { + "epoch": 1.455118730909449, + "grad_norm": 0.5761128067970276, + "learning_rate": 5.150692132468898e-05, + "loss": 1.6745, + "step": 5538 + }, + { + "epoch": 1.455644234243111, + "grad_norm": 0.5764265656471252, + "learning_rate": 5.148939898370423e-05, + "loss": 1.6308, + "step": 5540 + }, + { + "epoch": 1.4561697375767728, + "grad_norm": 0.5794236063957214, + "learning_rate": 5.147187664271947e-05, + "loss": 1.7178, + "step": 5542 + }, + { + "epoch": 1.4566952409104346, + "grad_norm": 0.8042659759521484, + "learning_rate": 5.145435430173472e-05, + "loss": 1.6589, + "step": 5544 + }, + { + "epoch": 1.4572207442440963, + "grad_norm": 0.63565993309021, + "learning_rate": 5.143683196074995e-05, + "loss": 1.6601, + "step": 5546 + }, + { + "epoch": 1.457746247577758, + "grad_norm": 0.5769701600074768, + "learning_rate": 5.14193096197652e-05, + "loss": 1.6565, + "step": 5548 + }, + { + "epoch": 1.4582717509114198, + "grad_norm": 0.6259598135948181, + "learning_rate": 5.140178727878044e-05, + "loss": 1.6664, + "step": 5550 + }, + { + "epoch": 1.4587972542450816, + "grad_norm": 1.0551050901412964, + "learning_rate": 5.138426493779569e-05, + "loss": 1.7123, + "step": 5552 + }, + { + "epoch": 1.4593227575787435, + "grad_norm": 0.5595609545707703, + "learning_rate": 5.1366742596810936e-05, + "loss": 1.6494, + "step": 5554 + }, + { + "epoch": 1.459848260912405, + "grad_norm": 0.607704222202301, + "learning_rate": 5.134922025582618e-05, + "loss": 1.6585, + "step": 5556 + }, + { + "epoch": 1.460373764246067, + "grad_norm": 0.7237640023231506, + "learning_rate": 5.1331697914841425e-05, + "loss": 1.6797, + "step": 5558 + }, + { + "epoch": 1.4608992675797288, + "grad_norm": 0.7079179286956787, + "learning_rate": 5.131417557385667e-05, + "loss": 1.6882, + "step": 5560 + }, + { + "epoch": 1.4614247709133905, + "grad_norm": 0.5847229361534119, + "learning_rate": 5.129665323287191e-05, + "loss": 1.6694, + "step": 5562 + }, + { + "epoch": 1.4619502742470523, + "grad_norm": 0.5582857728004456, + "learning_rate": 5.127913089188716e-05, + "loss": 1.6723, + "step": 5564 + }, + { + "epoch": 1.462475777580714, + "grad_norm": 0.5761253833770752, + "learning_rate": 5.126160855090241e-05, + "loss": 1.6953, + "step": 5566 + }, + { + "epoch": 1.4630012809143758, + "grad_norm": 0.6386553645133972, + "learning_rate": 5.124408620991765e-05, + "loss": 1.6842, + "step": 5568 + }, + { + "epoch": 1.4635267842480375, + "grad_norm": 0.6150875687599182, + "learning_rate": 5.122656386893288e-05, + "loss": 1.6365, + "step": 5570 + }, + { + "epoch": 1.4640522875816995, + "grad_norm": 0.632836639881134, + "learning_rate": 5.120904152794813e-05, + "loss": 1.6497, + "step": 5572 + }, + { + "epoch": 1.464577790915361, + "grad_norm": 0.713758111000061, + "learning_rate": 5.119151918696338e-05, + "loss": 1.6505, + "step": 5574 + }, + { + "epoch": 1.465103294249023, + "grad_norm": 0.5495973825454712, + "learning_rate": 5.117399684597862e-05, + "loss": 1.6852, + "step": 5576 + }, + { + "epoch": 1.4656287975826847, + "grad_norm": 0.6123538613319397, + "learning_rate": 5.1156474504993867e-05, + "loss": 1.6334, + "step": 5578 + }, + { + "epoch": 1.4661543009163465, + "grad_norm": 0.6316397786140442, + "learning_rate": 5.1138952164009114e-05, + "loss": 1.6877, + "step": 5580 + }, + { + "epoch": 1.4666798042500082, + "grad_norm": 0.6072596311569214, + "learning_rate": 5.1121429823024355e-05, + "loss": 1.6586, + "step": 5582 + }, + { + "epoch": 1.46720530758367, + "grad_norm": 0.6176102757453918, + "learning_rate": 5.11039074820396e-05, + "loss": 1.6752, + "step": 5584 + }, + { + "epoch": 1.4677308109173317, + "grad_norm": 0.6328986883163452, + "learning_rate": 5.108638514105485e-05, + "loss": 1.6886, + "step": 5586 + }, + { + "epoch": 1.4682563142509935, + "grad_norm": 0.6120469570159912, + "learning_rate": 5.10688628000701e-05, + "loss": 1.6834, + "step": 5588 + }, + { + "epoch": 1.4687818175846554, + "grad_norm": 0.6649283170700073, + "learning_rate": 5.105134045908534e-05, + "loss": 1.6399, + "step": 5590 + }, + { + "epoch": 1.469307320918317, + "grad_norm": 0.6802614331245422, + "learning_rate": 5.1033818118100586e-05, + "loss": 1.7099, + "step": 5592 + }, + { + "epoch": 1.469832824251979, + "grad_norm": 0.5907096862792969, + "learning_rate": 5.1016295777115833e-05, + "loss": 1.6647, + "step": 5594 + }, + { + "epoch": 1.4703583275856407, + "grad_norm": 0.5924010276794434, + "learning_rate": 5.099877343613107e-05, + "loss": 1.6655, + "step": 5596 + }, + { + "epoch": 1.4708838309193024, + "grad_norm": 0.5676859617233276, + "learning_rate": 5.098125109514631e-05, + "loss": 1.6651, + "step": 5598 + }, + { + "epoch": 1.4714093342529642, + "grad_norm": 0.5564827919006348, + "learning_rate": 5.0963728754161556e-05, + "loss": 1.7036, + "step": 5600 + }, + { + "epoch": 1.4714093342529642, + "eval_loss": 1.6887091398239136, + "eval_runtime": 487.1901, + "eval_samples_per_second": 249.982, + "eval_steps_per_second": 31.249, + "step": 5600 + }, + { + "epoch": 1.471934837586626, + "grad_norm": 0.6894435286521912, + "learning_rate": 5.09462064131768e-05, + "loss": 1.6565, + "step": 5602 + }, + { + "epoch": 1.4724603409202877, + "grad_norm": 0.5259155035018921, + "learning_rate": 5.0928684072192044e-05, + "loss": 1.6428, + "step": 5604 + }, + { + "epoch": 1.4729858442539494, + "grad_norm": 0.6473843455314636, + "learning_rate": 5.091116173120729e-05, + "loss": 1.6773, + "step": 5606 + }, + { + "epoch": 1.4735113475876114, + "grad_norm": 0.5902252793312073, + "learning_rate": 5.089363939022254e-05, + "loss": 1.683, + "step": 5608 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.5241686105728149, + "learning_rate": 5.087611704923778e-05, + "loss": 1.7068, + "step": 5610 + }, + { + "epoch": 1.4745623542549349, + "grad_norm": 0.5719613432884216, + "learning_rate": 5.085859470825303e-05, + "loss": 1.6997, + "step": 5612 + }, + { + "epoch": 1.4750878575885966, + "grad_norm": 0.7695503234863281, + "learning_rate": 5.0841072367268275e-05, + "loss": 1.7057, + "step": 5614 + }, + { + "epoch": 1.4756133609222584, + "grad_norm": 0.5506905913352966, + "learning_rate": 5.0823550026283516e-05, + "loss": 1.6853, + "step": 5616 + }, + { + "epoch": 1.4761388642559201, + "grad_norm": 0.5574632883071899, + "learning_rate": 5.0806027685298764e-05, + "loss": 1.7207, + "step": 5618 + }, + { + "epoch": 1.4766643675895819, + "grad_norm": 0.5256906747817993, + "learning_rate": 5.078850534431401e-05, + "loss": 1.6743, + "step": 5620 + }, + { + "epoch": 1.4771898709232436, + "grad_norm": 1.0742486715316772, + "learning_rate": 5.0770983003329245e-05, + "loss": 1.6613, + "step": 5622 + }, + { + "epoch": 1.4777153742569054, + "grad_norm": 0.6463772654533386, + "learning_rate": 5.0753460662344486e-05, + "loss": 1.6872, + "step": 5624 + }, + { + "epoch": 1.4782408775905673, + "grad_norm": 0.5381261706352234, + "learning_rate": 5.0735938321359733e-05, + "loss": 1.6418, + "step": 5626 + }, + { + "epoch": 1.4787663809242289, + "grad_norm": 0.7155745029449463, + "learning_rate": 5.071841598037498e-05, + "loss": 1.7034, + "step": 5628 + }, + { + "epoch": 1.4792918842578908, + "grad_norm": 0.8163059949874878, + "learning_rate": 5.070089363939022e-05, + "loss": 1.6595, + "step": 5630 + }, + { + "epoch": 1.4798173875915526, + "grad_norm": 0.6401390433311462, + "learning_rate": 5.068337129840547e-05, + "loss": 1.6658, + "step": 5632 + }, + { + "epoch": 1.4803428909252143, + "grad_norm": 0.5915647149085999, + "learning_rate": 5.066584895742072e-05, + "loss": 1.6495, + "step": 5634 + }, + { + "epoch": 1.480868394258876, + "grad_norm": 0.5638352632522583, + "learning_rate": 5.064832661643596e-05, + "loss": 1.6619, + "step": 5636 + }, + { + "epoch": 1.4813938975925378, + "grad_norm": 0.6860741376876831, + "learning_rate": 5.0630804275451205e-05, + "loss": 1.6827, + "step": 5638 + }, + { + "epoch": 1.4819194009261996, + "grad_norm": 0.5409403443336487, + "learning_rate": 5.061328193446645e-05, + "loss": 1.6391, + "step": 5640 + }, + { + "epoch": 1.4824449042598613, + "grad_norm": 0.6866961717605591, + "learning_rate": 5.0595759593481694e-05, + "loss": 1.6657, + "step": 5642 + }, + { + "epoch": 1.4829704075935233, + "grad_norm": 0.6035512089729309, + "learning_rate": 5.057823725249694e-05, + "loss": 1.6962, + "step": 5644 + }, + { + "epoch": 1.483495910927185, + "grad_norm": 0.539889931678772, + "learning_rate": 5.056071491151219e-05, + "loss": 1.676, + "step": 5646 + }, + { + "epoch": 1.4840214142608468, + "grad_norm": 0.5658326148986816, + "learning_rate": 5.054319257052742e-05, + "loss": 1.6876, + "step": 5648 + }, + { + "epoch": 1.4845469175945085, + "grad_norm": 0.9115592241287231, + "learning_rate": 5.0525670229542664e-05, + "loss": 1.6691, + "step": 5650 + }, + { + "epoch": 1.4850724209281703, + "grad_norm": 0.6758273839950562, + "learning_rate": 5.050814788855791e-05, + "loss": 1.6405, + "step": 5652 + }, + { + "epoch": 1.485597924261832, + "grad_norm": 0.6089572310447693, + "learning_rate": 5.049062554757316e-05, + "loss": 1.6954, + "step": 5654 + }, + { + "epoch": 1.4861234275954938, + "grad_norm": 0.5882745981216431, + "learning_rate": 5.04731032065884e-05, + "loss": 1.6768, + "step": 5656 + }, + { + "epoch": 1.4866489309291555, + "grad_norm": 0.6646392345428467, + "learning_rate": 5.045558086560365e-05, + "loss": 1.6832, + "step": 5658 + }, + { + "epoch": 1.4871744342628173, + "grad_norm": 0.7076845765113831, + "learning_rate": 5.0438058524618895e-05, + "loss": 1.6734, + "step": 5660 + }, + { + "epoch": 1.4876999375964792, + "grad_norm": 0.6591199636459351, + "learning_rate": 5.0420536183634135e-05, + "loss": 1.7321, + "step": 5662 + }, + { + "epoch": 1.488225440930141, + "grad_norm": 0.5403279066085815, + "learning_rate": 5.040301384264938e-05, + "loss": 1.6665, + "step": 5664 + }, + { + "epoch": 1.4887509442638027, + "grad_norm": 0.5921054482460022, + "learning_rate": 5.038549150166463e-05, + "loss": 1.6738, + "step": 5666 + }, + { + "epoch": 1.4892764475974645, + "grad_norm": 0.6299567818641663, + "learning_rate": 5.036796916067987e-05, + "loss": 1.6627, + "step": 5668 + }, + { + "epoch": 1.4898019509311262, + "grad_norm": 0.6079999208450317, + "learning_rate": 5.035044681969512e-05, + "loss": 1.6921, + "step": 5670 + }, + { + "epoch": 1.490327454264788, + "grad_norm": 0.5412185788154602, + "learning_rate": 5.033292447871035e-05, + "loss": 1.6779, + "step": 5672 + }, + { + "epoch": 1.4908529575984497, + "grad_norm": 0.603542685508728, + "learning_rate": 5.03154021377256e-05, + "loss": 1.6948, + "step": 5674 + }, + { + "epoch": 1.4913784609321115, + "grad_norm": 0.818874716758728, + "learning_rate": 5.029787979674084e-05, + "loss": 1.6824, + "step": 5676 + }, + { + "epoch": 1.4919039642657732, + "grad_norm": 0.5792141556739807, + "learning_rate": 5.028035745575609e-05, + "loss": 1.679, + "step": 5678 + }, + { + "epoch": 1.4924294675994352, + "grad_norm": 0.576353907585144, + "learning_rate": 5.0262835114771336e-05, + "loss": 1.6835, + "step": 5680 + }, + { + "epoch": 1.492954970933097, + "grad_norm": 0.5527434349060059, + "learning_rate": 5.024531277378658e-05, + "loss": 1.7095, + "step": 5682 + }, + { + "epoch": 1.4934804742667587, + "grad_norm": 0.5520890355110168, + "learning_rate": 5.0227790432801825e-05, + "loss": 1.6672, + "step": 5684 + }, + { + "epoch": 1.4940059776004204, + "grad_norm": 0.5043492317199707, + "learning_rate": 5.021026809181707e-05, + "loss": 1.6717, + "step": 5686 + }, + { + "epoch": 1.4945314809340822, + "grad_norm": 0.6247971653938293, + "learning_rate": 5.019274575083231e-05, + "loss": 1.6792, + "step": 5688 + }, + { + "epoch": 1.495056984267744, + "grad_norm": 0.6311819553375244, + "learning_rate": 5.017522340984756e-05, + "loss": 1.6628, + "step": 5690 + }, + { + "epoch": 1.4955824876014057, + "grad_norm": 0.5443778038024902, + "learning_rate": 5.015770106886281e-05, + "loss": 1.6362, + "step": 5692 + }, + { + "epoch": 1.4961079909350674, + "grad_norm": 0.6705971360206604, + "learning_rate": 5.014017872787805e-05, + "loss": 1.6975, + "step": 5694 + }, + { + "epoch": 1.4966334942687292, + "grad_norm": 0.6342272758483887, + "learning_rate": 5.0122656386893297e-05, + "loss": 1.6652, + "step": 5696 + }, + { + "epoch": 1.4971589976023911, + "grad_norm": 0.615489661693573, + "learning_rate": 5.010513404590853e-05, + "loss": 1.6503, + "step": 5698 + }, + { + "epoch": 1.4976845009360529, + "grad_norm": 0.5811960697174072, + "learning_rate": 5.008761170492378e-05, + "loss": 1.6369, + "step": 5700 + }, + { + "epoch": 1.4982100042697146, + "grad_norm": 0.5534473061561584, + "learning_rate": 5.007008936393902e-05, + "loss": 1.6311, + "step": 5702 + }, + { + "epoch": 1.4987355076033764, + "grad_norm": 0.6678853034973145, + "learning_rate": 5.0052567022954266e-05, + "loss": 1.6513, + "step": 5704 + }, + { + "epoch": 1.4992610109370381, + "grad_norm": 0.5876911878585815, + "learning_rate": 5.0035044681969514e-05, + "loss": 1.6847, + "step": 5706 + }, + { + "epoch": 1.4997865142706999, + "grad_norm": 0.6282883286476135, + "learning_rate": 5.0017522340984755e-05, + "loss": 1.6909, + "step": 5708 + }, + { + "epoch": 1.5003120176043616, + "grad_norm": 0.5211758613586426, + "learning_rate": 5e-05, + "loss": 1.6478, + "step": 5710 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.5276235938072205, + "learning_rate": 4.998247765901525e-05, + "loss": 1.6706, + "step": 5712 + }, + { + "epoch": 1.501363024271685, + "grad_norm": 0.6381866335868835, + "learning_rate": 4.996495531803049e-05, + "loss": 1.672, + "step": 5714 + }, + { + "epoch": 1.501888527605347, + "grad_norm": 0.5875132083892822, + "learning_rate": 4.994743297704574e-05, + "loss": 1.6611, + "step": 5716 + }, + { + "epoch": 1.5024140309390086, + "grad_norm": 0.5340953469276428, + "learning_rate": 4.992991063606098e-05, + "loss": 1.6612, + "step": 5718 + }, + { + "epoch": 1.5029395342726706, + "grad_norm": 0.6308703422546387, + "learning_rate": 4.991238829507623e-05, + "loss": 1.6988, + "step": 5720 + }, + { + "epoch": 1.5034650376063323, + "grad_norm": 0.6118327975273132, + "learning_rate": 4.989486595409147e-05, + "loss": 1.6677, + "step": 5722 + }, + { + "epoch": 1.503990540939994, + "grad_norm": 0.6005641222000122, + "learning_rate": 4.9877343613106715e-05, + "loss": 1.6629, + "step": 5724 + }, + { + "epoch": 1.5045160442736558, + "grad_norm": 0.6117488145828247, + "learning_rate": 4.985982127212196e-05, + "loss": 1.6429, + "step": 5726 + }, + { + "epoch": 1.5050415476073176, + "grad_norm": 0.7005740404129028, + "learning_rate": 4.98422989311372e-05, + "loss": 1.6896, + "step": 5728 + }, + { + "epoch": 1.5055670509409795, + "grad_norm": 0.5488820672035217, + "learning_rate": 4.9824776590152444e-05, + "loss": 1.6603, + "step": 5730 + }, + { + "epoch": 1.506092554274641, + "grad_norm": 0.6293522119522095, + "learning_rate": 4.980725424916769e-05, + "loss": 1.6465, + "step": 5732 + }, + { + "epoch": 1.506618057608303, + "grad_norm": 0.5625261664390564, + "learning_rate": 4.978973190818293e-05, + "loss": 1.6572, + "step": 5734 + }, + { + "epoch": 1.5071435609419648, + "grad_norm": 0.5663225054740906, + "learning_rate": 4.977220956719818e-05, + "loss": 1.6674, + "step": 5736 + }, + { + "epoch": 1.5076690642756265, + "grad_norm": 0.5049892663955688, + "learning_rate": 4.975468722621343e-05, + "loss": 1.6529, + "step": 5738 + }, + { + "epoch": 1.5081945676092883, + "grad_norm": 0.5977827906608582, + "learning_rate": 4.973716488522867e-05, + "loss": 1.6644, + "step": 5740 + }, + { + "epoch": 1.50872007094295, + "grad_norm": 0.5831950306892395, + "learning_rate": 4.971964254424391e-05, + "loss": 1.6659, + "step": 5742 + }, + { + "epoch": 1.5092455742766118, + "grad_norm": 0.6863638162612915, + "learning_rate": 4.970212020325916e-05, + "loss": 1.6686, + "step": 5744 + }, + { + "epoch": 1.5097710776102735, + "grad_norm": 0.5951880812644958, + "learning_rate": 4.9684597862274404e-05, + "loss": 1.6547, + "step": 5746 + }, + { + "epoch": 1.5102965809439355, + "grad_norm": 0.6561357975006104, + "learning_rate": 4.9667075521289645e-05, + "loss": 1.6984, + "step": 5748 + }, + { + "epoch": 1.510822084277597, + "grad_norm": 0.5892223715782166, + "learning_rate": 4.964955318030489e-05, + "loss": 1.6927, + "step": 5750 + }, + { + "epoch": 1.511347587611259, + "grad_norm": 0.5931475162506104, + "learning_rate": 4.963203083932014e-05, + "loss": 1.6643, + "step": 5752 + }, + { + "epoch": 1.5118730909449207, + "grad_norm": 0.5545600652694702, + "learning_rate": 4.961450849833538e-05, + "loss": 1.6818, + "step": 5754 + }, + { + "epoch": 1.5123985942785825, + "grad_norm": 0.606566309928894, + "learning_rate": 4.959698615735062e-05, + "loss": 1.6676, + "step": 5756 + }, + { + "epoch": 1.5129240976122442, + "grad_norm": 0.5394220352172852, + "learning_rate": 4.957946381636587e-05, + "loss": 1.6563, + "step": 5758 + }, + { + "epoch": 1.513449600945906, + "grad_norm": 0.5019289255142212, + "learning_rate": 4.956194147538111e-05, + "loss": 1.6536, + "step": 5760 + }, + { + "epoch": 1.5139751042795677, + "grad_norm": 0.5706535577774048, + "learning_rate": 4.954441913439636e-05, + "loss": 1.6781, + "step": 5762 + }, + { + "epoch": 1.5145006076132295, + "grad_norm": 0.7017900347709656, + "learning_rate": 4.9526896793411605e-05, + "loss": 1.7036, + "step": 5764 + }, + { + "epoch": 1.5150261109468914, + "grad_norm": 0.627980649471283, + "learning_rate": 4.9509374452426846e-05, + "loss": 1.7071, + "step": 5766 + }, + { + "epoch": 1.515551614280553, + "grad_norm": 0.5035885572433472, + "learning_rate": 4.949185211144209e-05, + "loss": 1.6837, + "step": 5768 + }, + { + "epoch": 1.516077117614215, + "grad_norm": 0.5663711428642273, + "learning_rate": 4.9474329770457334e-05, + "loss": 1.6622, + "step": 5770 + }, + { + "epoch": 1.5166026209478767, + "grad_norm": 0.5828589797019958, + "learning_rate": 4.945680742947258e-05, + "loss": 1.7211, + "step": 5772 + }, + { + "epoch": 1.5171281242815384, + "grad_norm": 0.644883930683136, + "learning_rate": 4.943928508848782e-05, + "loss": 1.6592, + "step": 5774 + }, + { + "epoch": 1.5176536276152002, + "grad_norm": 0.5231825113296509, + "learning_rate": 4.942176274750307e-05, + "loss": 1.6896, + "step": 5776 + }, + { + "epoch": 1.518179130948862, + "grad_norm": 0.6508246064186096, + "learning_rate": 4.940424040651832e-05, + "loss": 1.709, + "step": 5778 + }, + { + "epoch": 1.518704634282524, + "grad_norm": 0.7368707656860352, + "learning_rate": 4.938671806553356e-05, + "loss": 1.6449, + "step": 5780 + }, + { + "epoch": 1.5192301376161854, + "grad_norm": 0.5632103085517883, + "learning_rate": 4.93691957245488e-05, + "loss": 1.6599, + "step": 5782 + }, + { + "epoch": 1.5197556409498474, + "grad_norm": 0.5499205589294434, + "learning_rate": 4.935167338356405e-05, + "loss": 1.6798, + "step": 5784 + }, + { + "epoch": 1.520281144283509, + "grad_norm": 0.6031380295753479, + "learning_rate": 4.933415104257929e-05, + "loss": 1.7011, + "step": 5786 + }, + { + "epoch": 1.5208066476171709, + "grad_norm": 0.6366339325904846, + "learning_rate": 4.9316628701594535e-05, + "loss": 1.6722, + "step": 5788 + }, + { + "epoch": 1.5213321509508326, + "grad_norm": 0.6513427495956421, + "learning_rate": 4.929910636060978e-05, + "loss": 1.6998, + "step": 5790 + }, + { + "epoch": 1.5218576542844944, + "grad_norm": 0.5762157440185547, + "learning_rate": 4.9281584019625024e-05, + "loss": 1.7176, + "step": 5792 + }, + { + "epoch": 1.5223831576181561, + "grad_norm": 0.5817427635192871, + "learning_rate": 4.9264061678640265e-05, + "loss": 1.6686, + "step": 5794 + }, + { + "epoch": 1.5229086609518179, + "grad_norm": 0.6498470902442932, + "learning_rate": 4.924653933765551e-05, + "loss": 1.6536, + "step": 5796 + }, + { + "epoch": 1.5234341642854798, + "grad_norm": 0.5209649205207825, + "learning_rate": 4.922901699667076e-05, + "loss": 1.6711, + "step": 5798 + }, + { + "epoch": 1.5239596676191414, + "grad_norm": 0.5542787909507751, + "learning_rate": 4.9211494655686e-05, + "loss": 1.6931, + "step": 5800 + }, + { + "epoch": 1.5244851709528033, + "grad_norm": 0.5448528528213501, + "learning_rate": 4.919397231470125e-05, + "loss": 1.6952, + "step": 5802 + }, + { + "epoch": 1.5250106742864649, + "grad_norm": 0.6163997054100037, + "learning_rate": 4.9176449973716496e-05, + "loss": 1.638, + "step": 5804 + }, + { + "epoch": 1.5255361776201268, + "grad_norm": 0.5389247536659241, + "learning_rate": 4.915892763273173e-05, + "loss": 1.7243, + "step": 5806 + }, + { + "epoch": 1.5260616809537886, + "grad_norm": 0.5348747372627258, + "learning_rate": 4.914140529174698e-05, + "loss": 1.699, + "step": 5808 + }, + { + "epoch": 1.5265871842874503, + "grad_norm": 0.5715484023094177, + "learning_rate": 4.9123882950762225e-05, + "loss": 1.6672, + "step": 5810 + }, + { + "epoch": 1.527112687621112, + "grad_norm": 0.64728844165802, + "learning_rate": 4.9106360609777465e-05, + "loss": 1.66, + "step": 5812 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.5770621299743652, + "learning_rate": 4.908883826879271e-05, + "loss": 1.6614, + "step": 5814 + }, + { + "epoch": 1.5281636942884358, + "grad_norm": 0.9658989310264587, + "learning_rate": 4.907131592780796e-05, + "loss": 1.6941, + "step": 5816 + }, + { + "epoch": 1.5286891976220973, + "grad_norm": 0.6207730174064636, + "learning_rate": 4.90537935868232e-05, + "loss": 1.6654, + "step": 5818 + }, + { + "epoch": 1.5292147009557593, + "grad_norm": 0.5695872902870178, + "learning_rate": 4.903627124583844e-05, + "loss": 1.6503, + "step": 5820 + }, + { + "epoch": 1.5297402042894208, + "grad_norm": 0.7392182946205139, + "learning_rate": 4.901874890485369e-05, + "loss": 1.6731, + "step": 5822 + }, + { + "epoch": 1.5302657076230828, + "grad_norm": 0.5596084594726562, + "learning_rate": 4.900122656386894e-05, + "loss": 1.6816, + "step": 5824 + }, + { + "epoch": 1.5307912109567445, + "grad_norm": 0.5417515635490417, + "learning_rate": 4.898370422288418e-05, + "loss": 1.6556, + "step": 5826 + }, + { + "epoch": 1.5313167142904063, + "grad_norm": 0.6508921980857849, + "learning_rate": 4.8966181881899426e-05, + "loss": 1.684, + "step": 5828 + }, + { + "epoch": 1.531842217624068, + "grad_norm": 0.562555193901062, + "learning_rate": 4.894865954091467e-05, + "loss": 1.6706, + "step": 5830 + }, + { + "epoch": 1.5323677209577298, + "grad_norm": 0.6852477788925171, + "learning_rate": 4.893113719992991e-05, + "loss": 1.6604, + "step": 5832 + }, + { + "epoch": 1.5328932242913917, + "grad_norm": 0.8392126560211182, + "learning_rate": 4.8913614858945155e-05, + "loss": 1.6496, + "step": 5834 + }, + { + "epoch": 1.5334187276250533, + "grad_norm": 0.635260283946991, + "learning_rate": 4.88960925179604e-05, + "loss": 1.6715, + "step": 5836 + }, + { + "epoch": 1.5339442309587152, + "grad_norm": 0.5033748149871826, + "learning_rate": 4.887857017697565e-05, + "loss": 1.6391, + "step": 5838 + }, + { + "epoch": 1.5344697342923768, + "grad_norm": 0.6123384237289429, + "learning_rate": 4.886104783599089e-05, + "loss": 1.6689, + "step": 5840 + }, + { + "epoch": 1.5349952376260387, + "grad_norm": 0.8817549347877502, + "learning_rate": 4.884352549500614e-05, + "loss": 1.7004, + "step": 5842 + }, + { + "epoch": 1.5355207409597005, + "grad_norm": 0.5635519027709961, + "learning_rate": 4.882600315402138e-05, + "loss": 1.6854, + "step": 5844 + }, + { + "epoch": 1.5360462442933622, + "grad_norm": 0.5673317313194275, + "learning_rate": 4.880848081303662e-05, + "loss": 1.6538, + "step": 5846 + }, + { + "epoch": 1.536571747627024, + "grad_norm": 0.5565547347068787, + "learning_rate": 4.879095847205187e-05, + "loss": 1.6524, + "step": 5848 + }, + { + "epoch": 1.5370972509606857, + "grad_norm": 0.5782384276390076, + "learning_rate": 4.8773436131067115e-05, + "loss": 1.6929, + "step": 5850 + }, + { + "epoch": 1.5376227542943477, + "grad_norm": 0.6354172229766846, + "learning_rate": 4.8755913790082356e-05, + "loss": 1.6959, + "step": 5852 + }, + { + "epoch": 1.5381482576280092, + "grad_norm": 0.8500383496284485, + "learning_rate": 4.87383914490976e-05, + "loss": 1.6716, + "step": 5854 + }, + { + "epoch": 1.5386737609616712, + "grad_norm": 0.6829351782798767, + "learning_rate": 4.872086910811285e-05, + "loss": 1.619, + "step": 5856 + }, + { + "epoch": 1.5391992642953327, + "grad_norm": 0.5141264200210571, + "learning_rate": 4.870334676712809e-05, + "loss": 1.6637, + "step": 5858 + }, + { + "epoch": 1.5397247676289947, + "grad_norm": 0.5253787636756897, + "learning_rate": 4.868582442614333e-05, + "loss": 1.6297, + "step": 5860 + }, + { + "epoch": 1.5402502709626564, + "grad_norm": 0.5017191171646118, + "learning_rate": 4.866830208515858e-05, + "loss": 1.6502, + "step": 5862 + }, + { + "epoch": 1.5407757742963182, + "grad_norm": 0.5245093703269958, + "learning_rate": 4.865077974417383e-05, + "loss": 1.6876, + "step": 5864 + }, + { + "epoch": 1.54130127762998, + "grad_norm": 0.6714510321617126, + "learning_rate": 4.863325740318907e-05, + "loss": 1.6635, + "step": 5866 + }, + { + "epoch": 1.5418267809636417, + "grad_norm": 0.47801029682159424, + "learning_rate": 4.8615735062204316e-05, + "loss": 1.673, + "step": 5868 + }, + { + "epoch": 1.5423522842973036, + "grad_norm": 0.5477654337882996, + "learning_rate": 4.859821272121956e-05, + "loss": 1.7127, + "step": 5870 + }, + { + "epoch": 1.5428777876309652, + "grad_norm": 0.6024754643440247, + "learning_rate": 4.85806903802348e-05, + "loss": 1.6402, + "step": 5872 + }, + { + "epoch": 1.5434032909646271, + "grad_norm": 0.5506448149681091, + "learning_rate": 4.8563168039250045e-05, + "loss": 1.6795, + "step": 5874 + }, + { + "epoch": 1.5439287942982887, + "grad_norm": 0.6219335794448853, + "learning_rate": 4.854564569826529e-05, + "loss": 1.7043, + "step": 5876 + }, + { + "epoch": 1.5444542976319506, + "grad_norm": 0.5376720428466797, + "learning_rate": 4.8528123357280533e-05, + "loss": 1.6416, + "step": 5878 + }, + { + "epoch": 1.5449798009656124, + "grad_norm": 0.5363356471061707, + "learning_rate": 4.851060101629578e-05, + "loss": 1.6787, + "step": 5880 + }, + { + "epoch": 1.5455053042992741, + "grad_norm": 0.6602938175201416, + "learning_rate": 4.849307867531102e-05, + "loss": 1.6648, + "step": 5882 + }, + { + "epoch": 1.5460308076329359, + "grad_norm": 0.5136982798576355, + "learning_rate": 4.847555633432627e-05, + "loss": 1.6636, + "step": 5884 + }, + { + "epoch": 1.5465563109665976, + "grad_norm": 0.5922480225563049, + "learning_rate": 4.845803399334151e-05, + "loss": 1.645, + "step": 5886 + }, + { + "epoch": 1.5470818143002596, + "grad_norm": 0.6566057205200195, + "learning_rate": 4.844051165235676e-05, + "loss": 1.7023, + "step": 5888 + }, + { + "epoch": 1.5476073176339211, + "grad_norm": 0.5478838682174683, + "learning_rate": 4.8422989311372005e-05, + "loss": 1.6359, + "step": 5890 + }, + { + "epoch": 1.548132820967583, + "grad_norm": 0.5223366618156433, + "learning_rate": 4.8405466970387246e-05, + "loss": 1.6827, + "step": 5892 + }, + { + "epoch": 1.5486583243012448, + "grad_norm": 0.5406147837638855, + "learning_rate": 4.8387944629402494e-05, + "loss": 1.6776, + "step": 5894 + }, + { + "epoch": 1.5491838276349066, + "grad_norm": 0.5778117179870605, + "learning_rate": 4.8370422288417734e-05, + "loss": 1.6862, + "step": 5896 + }, + { + "epoch": 1.5497093309685683, + "grad_norm": 0.7566986083984375, + "learning_rate": 4.8352899947432975e-05, + "loss": 1.6777, + "step": 5898 + }, + { + "epoch": 1.55023483430223, + "grad_norm": 0.5816596150398254, + "learning_rate": 4.833537760644822e-05, + "loss": 1.6782, + "step": 5900 + }, + { + "epoch": 1.5507603376358918, + "grad_norm": 0.6017654538154602, + "learning_rate": 4.831785526546347e-05, + "loss": 1.6683, + "step": 5902 + }, + { + "epoch": 1.5512858409695536, + "grad_norm": 0.6033945679664612, + "learning_rate": 4.830033292447871e-05, + "loss": 1.6734, + "step": 5904 + }, + { + "epoch": 1.5518113443032155, + "grad_norm": 0.6159767508506775, + "learning_rate": 4.828281058349396e-05, + "loss": 1.6499, + "step": 5906 + }, + { + "epoch": 1.552336847636877, + "grad_norm": 0.5407187938690186, + "learning_rate": 4.82652882425092e-05, + "loss": 1.6808, + "step": 5908 + }, + { + "epoch": 1.552862350970539, + "grad_norm": 0.6374317407608032, + "learning_rate": 4.824776590152445e-05, + "loss": 1.6735, + "step": 5910 + }, + { + "epoch": 1.5533878543042008, + "grad_norm": 0.6518111228942871, + "learning_rate": 4.823024356053969e-05, + "loss": 1.6818, + "step": 5912 + }, + { + "epoch": 1.5539133576378625, + "grad_norm": 0.5397405028343201, + "learning_rate": 4.8212721219554935e-05, + "loss": 1.6484, + "step": 5914 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.5633911490440369, + "learning_rate": 4.819519887857018e-05, + "loss": 1.6438, + "step": 5916 + }, + { + "epoch": 1.554964364305186, + "grad_norm": 0.5151371359825134, + "learning_rate": 4.8177676537585424e-05, + "loss": 1.6492, + "step": 5918 + }, + { + "epoch": 1.5554898676388478, + "grad_norm": 0.5781606435775757, + "learning_rate": 4.816015419660067e-05, + "loss": 1.6881, + "step": 5920 + }, + { + "epoch": 1.5560153709725095, + "grad_norm": 0.6180148720741272, + "learning_rate": 4.814263185561591e-05, + "loss": 1.6847, + "step": 5922 + }, + { + "epoch": 1.5565408743061715, + "grad_norm": 0.5826863646507263, + "learning_rate": 4.812510951463115e-05, + "loss": 1.6514, + "step": 5924 + }, + { + "epoch": 1.557066377639833, + "grad_norm": 0.5639335513114929, + "learning_rate": 4.81075871736464e-05, + "loss": 1.6146, + "step": 5926 + }, + { + "epoch": 1.557591880973495, + "grad_norm": 0.5149716734886169, + "learning_rate": 4.809006483266165e-05, + "loss": 1.7064, + "step": 5928 + }, + { + "epoch": 1.5581173843071567, + "grad_norm": 0.5117688179016113, + "learning_rate": 4.807254249167689e-05, + "loss": 1.6757, + "step": 5930 + }, + { + "epoch": 1.5586428876408185, + "grad_norm": 0.6055058836936951, + "learning_rate": 4.8055020150692136e-05, + "loss": 1.6777, + "step": 5932 + }, + { + "epoch": 1.5591683909744802, + "grad_norm": 0.5205301642417908, + "learning_rate": 4.803749780970738e-05, + "loss": 1.6599, + "step": 5934 + }, + { + "epoch": 1.559693894308142, + "grad_norm": 0.5878111124038696, + "learning_rate": 4.8019975468722625e-05, + "loss": 1.6491, + "step": 5936 + }, + { + "epoch": 1.560219397641804, + "grad_norm": 0.6508564949035645, + "learning_rate": 4.8002453127737865e-05, + "loss": 1.7085, + "step": 5938 + }, + { + "epoch": 1.5607449009754655, + "grad_norm": 0.7473523020744324, + "learning_rate": 4.798493078675311e-05, + "loss": 1.6765, + "step": 5940 + }, + { + "epoch": 1.5612704043091274, + "grad_norm": 0.5754930377006531, + "learning_rate": 4.796740844576836e-05, + "loss": 1.6825, + "step": 5942 + }, + { + "epoch": 1.561795907642789, + "grad_norm": 0.5699328780174255, + "learning_rate": 4.79498861047836e-05, + "loss": 1.6851, + "step": 5944 + }, + { + "epoch": 1.562321410976451, + "grad_norm": 0.5697868466377258, + "learning_rate": 4.793236376379884e-05, + "loss": 1.644, + "step": 5946 + }, + { + "epoch": 1.5628469143101127, + "grad_norm": 0.5635419487953186, + "learning_rate": 4.791484142281409e-05, + "loss": 1.6765, + "step": 5948 + }, + { + "epoch": 1.5633724176437744, + "grad_norm": 0.7371823191642761, + "learning_rate": 4.789731908182933e-05, + "loss": 1.6876, + "step": 5950 + }, + { + "epoch": 1.5638979209774362, + "grad_norm": 0.4995562434196472, + "learning_rate": 4.787979674084458e-05, + "loss": 1.6593, + "step": 5952 + }, + { + "epoch": 1.564423424311098, + "grad_norm": 0.5490265488624573, + "learning_rate": 4.7862274399859826e-05, + "loss": 1.6424, + "step": 5954 + }, + { + "epoch": 1.56494892764476, + "grad_norm": 0.5627312064170837, + "learning_rate": 4.7844752058875066e-05, + "loss": 1.6849, + "step": 5956 + }, + { + "epoch": 1.5654744309784214, + "grad_norm": 0.5825894474983215, + "learning_rate": 4.7827229717890314e-05, + "loss": 1.6714, + "step": 5958 + }, + { + "epoch": 1.5659999343120834, + "grad_norm": 0.6863036751747131, + "learning_rate": 4.7809707376905555e-05, + "loss": 1.69, + "step": 5960 + }, + { + "epoch": 1.566525437645745, + "grad_norm": 0.6272795796394348, + "learning_rate": 4.77921850359208e-05, + "loss": 1.6536, + "step": 5962 + }, + { + "epoch": 1.567050940979407, + "grad_norm": 0.6180011630058289, + "learning_rate": 4.777466269493604e-05, + "loss": 1.6975, + "step": 5964 + }, + { + "epoch": 1.5675764443130686, + "grad_norm": 0.5767374038696289, + "learning_rate": 4.775714035395129e-05, + "loss": 1.6504, + "step": 5966 + }, + { + "epoch": 1.5681019476467304, + "grad_norm": 0.5190562009811401, + "learning_rate": 4.773961801296654e-05, + "loss": 1.6131, + "step": 5968 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.6157041788101196, + "learning_rate": 4.772209567198178e-05, + "loss": 1.6916, + "step": 5970 + }, + { + "epoch": 1.5691529543140539, + "grad_norm": 0.5237650275230408, + "learning_rate": 4.770457333099702e-05, + "loss": 1.6746, + "step": 5972 + }, + { + "epoch": 1.5696784576477159, + "grad_norm": 0.5459091663360596, + "learning_rate": 4.768705099001227e-05, + "loss": 1.6643, + "step": 5974 + }, + { + "epoch": 1.5702039609813774, + "grad_norm": 0.5838450193405151, + "learning_rate": 4.7669528649027515e-05, + "loss": 1.6745, + "step": 5976 + }, + { + "epoch": 1.5707294643150393, + "grad_norm": 0.5081990361213684, + "learning_rate": 4.7652006308042756e-05, + "loss": 1.6876, + "step": 5978 + }, + { + "epoch": 1.5712549676487009, + "grad_norm": 0.6588307619094849, + "learning_rate": 4.7634483967058e-05, + "loss": 1.7017, + "step": 5980 + }, + { + "epoch": 1.5717804709823628, + "grad_norm": 0.8295912146568298, + "learning_rate": 4.761696162607325e-05, + "loss": 1.6709, + "step": 5982 + }, + { + "epoch": 1.5723059743160246, + "grad_norm": 0.6136393547058105, + "learning_rate": 4.7599439285088485e-05, + "loss": 1.6989, + "step": 5984 + }, + { + "epoch": 1.5728314776496863, + "grad_norm": 0.5983635187149048, + "learning_rate": 4.758191694410373e-05, + "loss": 1.6786, + "step": 5986 + }, + { + "epoch": 1.573356980983348, + "grad_norm": 0.6584506630897522, + "learning_rate": 4.756439460311898e-05, + "loss": 1.6549, + "step": 5988 + }, + { + "epoch": 1.5738824843170098, + "grad_norm": 0.513239860534668, + "learning_rate": 4.754687226213422e-05, + "loss": 1.683, + "step": 5990 + }, + { + "epoch": 1.5744079876506718, + "grad_norm": 0.5512750744819641, + "learning_rate": 4.752934992114947e-05, + "loss": 1.703, + "step": 5992 + }, + { + "epoch": 1.5749334909843333, + "grad_norm": 0.7697680592536926, + "learning_rate": 4.7511827580164716e-05, + "loss": 1.6361, + "step": 5994 + }, + { + "epoch": 1.5754589943179953, + "grad_norm": 0.5221796035766602, + "learning_rate": 4.749430523917996e-05, + "loss": 1.6525, + "step": 5996 + }, + { + "epoch": 1.5759844976516568, + "grad_norm": 0.5528659820556641, + "learning_rate": 4.74767828981952e-05, + "loss": 1.6904, + "step": 5998 + }, + { + "epoch": 1.5765100009853188, + "grad_norm": 0.6376633644104004, + "learning_rate": 4.7459260557210445e-05, + "loss": 1.6849, + "step": 6000 + }, + { + "epoch": 1.5765100009853188, + "eval_loss": 1.6817389726638794, + "eval_runtime": 487.2585, + "eval_samples_per_second": 249.947, + "eval_steps_per_second": 31.244, + "step": 6000 + }, + { + "epoch": 1.5770355043189805, + "grad_norm": 0.5638622641563416, + "learning_rate": 4.744173821622569e-05, + "loss": 1.6566, + "step": 6002 + }, + { + "epoch": 1.5775610076526423, + "grad_norm": 0.6327787637710571, + "learning_rate": 4.7424215875240933e-05, + "loss": 1.6806, + "step": 6004 + }, + { + "epoch": 1.578086510986304, + "grad_norm": 0.6774953007698059, + "learning_rate": 4.740669353425618e-05, + "loss": 1.6806, + "step": 6006 + }, + { + "epoch": 1.5786120143199658, + "grad_norm": 0.6806586980819702, + "learning_rate": 4.738917119327143e-05, + "loss": 1.6435, + "step": 6008 + }, + { + "epoch": 1.5791375176536278, + "grad_norm": 0.5648463368415833, + "learning_rate": 4.737164885228666e-05, + "loss": 1.7014, + "step": 6010 + }, + { + "epoch": 1.5796630209872893, + "grad_norm": 0.5950681567192078, + "learning_rate": 4.735412651130191e-05, + "loss": 1.6408, + "step": 6012 + }, + { + "epoch": 1.5801885243209512, + "grad_norm": 0.6674718856811523, + "learning_rate": 4.733660417031716e-05, + "loss": 1.6451, + "step": 6014 + }, + { + "epoch": 1.5807140276546128, + "grad_norm": 0.8182973265647888, + "learning_rate": 4.73190818293324e-05, + "loss": 1.6913, + "step": 6016 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.5322865843772888, + "learning_rate": 4.7301559488347646e-05, + "loss": 1.7063, + "step": 6018 + }, + { + "epoch": 1.5817650343219365, + "grad_norm": 0.5284510850906372, + "learning_rate": 4.7284037147362894e-05, + "loss": 1.6556, + "step": 6020 + }, + { + "epoch": 1.5822905376555982, + "grad_norm": 0.6167967915534973, + "learning_rate": 4.7266514806378134e-05, + "loss": 1.6455, + "step": 6022 + }, + { + "epoch": 1.58281604098926, + "grad_norm": 0.5635978579521179, + "learning_rate": 4.7248992465393375e-05, + "loss": 1.6768, + "step": 6024 + }, + { + "epoch": 1.5833415443229217, + "grad_norm": 0.6442639231681824, + "learning_rate": 4.723147012440862e-05, + "loss": 1.6707, + "step": 6026 + }, + { + "epoch": 1.5838670476565837, + "grad_norm": 0.5623191595077515, + "learning_rate": 4.721394778342387e-05, + "loss": 1.711, + "step": 6028 + }, + { + "epoch": 1.5843925509902452, + "grad_norm": 0.5593612194061279, + "learning_rate": 4.719642544243911e-05, + "loss": 1.6713, + "step": 6030 + }, + { + "epoch": 1.5849180543239072, + "grad_norm": 0.7203925251960754, + "learning_rate": 4.717890310145436e-05, + "loss": 1.6842, + "step": 6032 + }, + { + "epoch": 1.5854435576575687, + "grad_norm": 0.5466020703315735, + "learning_rate": 4.7161380760469606e-05, + "loss": 1.6921, + "step": 6034 + }, + { + "epoch": 1.5859690609912307, + "grad_norm": 0.5434938669204712, + "learning_rate": 4.714385841948484e-05, + "loss": 1.6605, + "step": 6036 + }, + { + "epoch": 1.5864945643248924, + "grad_norm": 0.5552278757095337, + "learning_rate": 4.712633607850009e-05, + "loss": 1.707, + "step": 6038 + }, + { + "epoch": 1.5870200676585542, + "grad_norm": 0.6083989143371582, + "learning_rate": 4.7108813737515335e-05, + "loss": 1.6888, + "step": 6040 + }, + { + "epoch": 1.587545570992216, + "grad_norm": 0.5356237292289734, + "learning_rate": 4.7091291396530576e-05, + "loss": 1.6478, + "step": 6042 + }, + { + "epoch": 1.5880710743258777, + "grad_norm": 0.4878697097301483, + "learning_rate": 4.7073769055545824e-05, + "loss": 1.6558, + "step": 6044 + }, + { + "epoch": 1.5885965776595397, + "grad_norm": 0.5412904620170593, + "learning_rate": 4.705624671456107e-05, + "loss": 1.6763, + "step": 6046 + }, + { + "epoch": 1.5891220809932012, + "grad_norm": 0.5351582765579224, + "learning_rate": 4.703872437357631e-05, + "loss": 1.6891, + "step": 6048 + }, + { + "epoch": 1.5896475843268632, + "grad_norm": 0.5699527859687805, + "learning_rate": 4.702120203259155e-05, + "loss": 1.6841, + "step": 6050 + }, + { + "epoch": 1.590173087660525, + "grad_norm": 0.5373657941818237, + "learning_rate": 4.70036796916068e-05, + "loss": 1.6589, + "step": 6052 + }, + { + "epoch": 1.5906985909941866, + "grad_norm": 0.5621985197067261, + "learning_rate": 4.698615735062205e-05, + "loss": 1.6845, + "step": 6054 + }, + { + "epoch": 1.5912240943278484, + "grad_norm": 0.5501397848129272, + "learning_rate": 4.696863500963729e-05, + "loss": 1.6752, + "step": 6056 + }, + { + "epoch": 1.5917495976615101, + "grad_norm": 0.6588435173034668, + "learning_rate": 4.6951112668652536e-05, + "loss": 1.6727, + "step": 6058 + }, + { + "epoch": 1.5922751009951719, + "grad_norm": 0.5301753282546997, + "learning_rate": 4.6933590327667784e-05, + "loss": 1.6811, + "step": 6060 + }, + { + "epoch": 1.5928006043288336, + "grad_norm": 0.5546782612800598, + "learning_rate": 4.691606798668302e-05, + "loss": 1.6298, + "step": 6062 + }, + { + "epoch": 1.5933261076624956, + "grad_norm": 0.6335771679878235, + "learning_rate": 4.6898545645698265e-05, + "loss": 1.6606, + "step": 6064 + }, + { + "epoch": 1.5938516109961571, + "grad_norm": 0.5431744456291199, + "learning_rate": 4.688102330471351e-05, + "loss": 1.6444, + "step": 6066 + }, + { + "epoch": 1.594377114329819, + "grad_norm": 0.6626846790313721, + "learning_rate": 4.6863500963728754e-05, + "loss": 1.6857, + "step": 6068 + }, + { + "epoch": 1.5949026176634808, + "grad_norm": 0.5748207569122314, + "learning_rate": 4.6845978622744e-05, + "loss": 1.6811, + "step": 6070 + }, + { + "epoch": 1.5954281209971426, + "grad_norm": 0.6388061046600342, + "learning_rate": 4.682845628175925e-05, + "loss": 1.6696, + "step": 6072 + }, + { + "epoch": 1.5959536243308043, + "grad_norm": 0.5904421210289001, + "learning_rate": 4.681093394077449e-05, + "loss": 1.6872, + "step": 6074 + }, + { + "epoch": 1.596479127664466, + "grad_norm": 0.5612810254096985, + "learning_rate": 4.679341159978973e-05, + "loss": 1.7049, + "step": 6076 + }, + { + "epoch": 1.597004630998128, + "grad_norm": 0.5887944102287292, + "learning_rate": 4.677588925880498e-05, + "loss": 1.6556, + "step": 6078 + }, + { + "epoch": 1.5975301343317896, + "grad_norm": 0.5331344604492188, + "learning_rate": 4.6758366917820226e-05, + "loss": 1.6706, + "step": 6080 + }, + { + "epoch": 1.5980556376654516, + "grad_norm": 0.6311489939689636, + "learning_rate": 4.6740844576835466e-05, + "loss": 1.6893, + "step": 6082 + }, + { + "epoch": 1.598581140999113, + "grad_norm": 0.5310975313186646, + "learning_rate": 4.6723322235850714e-05, + "loss": 1.6057, + "step": 6084 + }, + { + "epoch": 1.599106644332775, + "grad_norm": 0.5268464684486389, + "learning_rate": 4.6705799894865955e-05, + "loss": 1.6649, + "step": 6086 + }, + { + "epoch": 1.5996321476664368, + "grad_norm": 0.5862019658088684, + "learning_rate": 4.66882775538812e-05, + "loss": 1.6631, + "step": 6088 + }, + { + "epoch": 1.6001576510000985, + "grad_norm": 0.5458804368972778, + "learning_rate": 4.667075521289644e-05, + "loss": 1.671, + "step": 6090 + }, + { + "epoch": 1.6006831543337603, + "grad_norm": 0.5199334025382996, + "learning_rate": 4.665323287191169e-05, + "loss": 1.6498, + "step": 6092 + }, + { + "epoch": 1.601208657667422, + "grad_norm": 0.5879623293876648, + "learning_rate": 4.663571053092694e-05, + "loss": 1.667, + "step": 6094 + }, + { + "epoch": 1.601734161001084, + "grad_norm": 0.6247827410697937, + "learning_rate": 4.661818818994218e-05, + "loss": 1.7004, + "step": 6096 + }, + { + "epoch": 1.6022596643347455, + "grad_norm": 0.6340193152427673, + "learning_rate": 4.6600665848957427e-05, + "loss": 1.6622, + "step": 6098 + }, + { + "epoch": 1.6027851676684075, + "grad_norm": 0.6243396401405334, + "learning_rate": 4.658314350797267e-05, + "loss": 1.6983, + "step": 6100 + }, + { + "epoch": 1.603310671002069, + "grad_norm": 0.584221601486206, + "learning_rate": 4.656562116698791e-05, + "loss": 1.6548, + "step": 6102 + }, + { + "epoch": 1.603836174335731, + "grad_norm": 0.5587270259857178, + "learning_rate": 4.6548098826003156e-05, + "loss": 1.6482, + "step": 6104 + }, + { + "epoch": 1.6043616776693927, + "grad_norm": 0.8525761365890503, + "learning_rate": 4.65305764850184e-05, + "loss": 1.6601, + "step": 6106 + }, + { + "epoch": 1.6048871810030545, + "grad_norm": 0.5316641330718994, + "learning_rate": 4.6513054144033644e-05, + "loss": 1.6809, + "step": 6108 + }, + { + "epoch": 1.6054126843367162, + "grad_norm": 0.5327313542366028, + "learning_rate": 4.649553180304889e-05, + "loss": 1.6303, + "step": 6110 + }, + { + "epoch": 1.605938187670378, + "grad_norm": 0.5327521562576294, + "learning_rate": 4.647800946206413e-05, + "loss": 1.6875, + "step": 6112 + }, + { + "epoch": 1.60646369100404, + "grad_norm": 0.7889205813407898, + "learning_rate": 4.646048712107938e-05, + "loss": 1.7043, + "step": 6114 + }, + { + "epoch": 1.6069891943377015, + "grad_norm": 0.6641364097595215, + "learning_rate": 4.644296478009462e-05, + "loss": 1.7062, + "step": 6116 + }, + { + "epoch": 1.6075146976713635, + "grad_norm": 0.675348162651062, + "learning_rate": 4.642544243910987e-05, + "loss": 1.6911, + "step": 6118 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.541476845741272, + "learning_rate": 4.6407920098125116e-05, + "loss": 1.6667, + "step": 6120 + }, + { + "epoch": 1.608565704338687, + "grad_norm": 0.5840083360671997, + "learning_rate": 4.639039775714036e-05, + "loss": 1.652, + "step": 6122 + }, + { + "epoch": 1.6090912076723487, + "grad_norm": 0.5409373641014099, + "learning_rate": 4.6372875416155604e-05, + "loss": 1.7031, + "step": 6124 + }, + { + "epoch": 1.6096167110060104, + "grad_norm": 0.5161097049713135, + "learning_rate": 4.6355353075170845e-05, + "loss": 1.6873, + "step": 6126 + }, + { + "epoch": 1.6101422143396722, + "grad_norm": 0.6245579719543457, + "learning_rate": 4.6337830734186086e-05, + "loss": 1.6652, + "step": 6128 + }, + { + "epoch": 1.610667717673334, + "grad_norm": 0.60563725233078, + "learning_rate": 4.6320308393201333e-05, + "loss": 1.6783, + "step": 6130 + }, + { + "epoch": 1.611193221006996, + "grad_norm": 0.667809009552002, + "learning_rate": 4.630278605221658e-05, + "loss": 1.6492, + "step": 6132 + }, + { + "epoch": 1.6117187243406574, + "grad_norm": 0.5152156352996826, + "learning_rate": 4.628526371123182e-05, + "loss": 1.6757, + "step": 6134 + }, + { + "epoch": 1.6122442276743194, + "grad_norm": 0.5245918035507202, + "learning_rate": 4.626774137024707e-05, + "loss": 1.6368, + "step": 6136 + }, + { + "epoch": 1.612769731007981, + "grad_norm": 0.5406076908111572, + "learning_rate": 4.625021902926231e-05, + "loss": 1.6817, + "step": 6138 + }, + { + "epoch": 1.613295234341643, + "grad_norm": 0.6298016905784607, + "learning_rate": 4.623269668827756e-05, + "loss": 1.6542, + "step": 6140 + }, + { + "epoch": 1.6138207376753047, + "grad_norm": 0.5466469526290894, + "learning_rate": 4.62151743472928e-05, + "loss": 1.6367, + "step": 6142 + }, + { + "epoch": 1.6143462410089664, + "grad_norm": 0.7066838145256042, + "learning_rate": 4.6197652006308046e-05, + "loss": 1.6461, + "step": 6144 + }, + { + "epoch": 1.6148717443426281, + "grad_norm": 0.5102798938751221, + "learning_rate": 4.6180129665323294e-05, + "loss": 1.6628, + "step": 6146 + }, + { + "epoch": 1.61539724767629, + "grad_norm": 0.72835773229599, + "learning_rate": 4.6162607324338534e-05, + "loss": 1.7141, + "step": 6148 + }, + { + "epoch": 1.6159227510099519, + "grad_norm": 0.5149089097976685, + "learning_rate": 4.6145084983353775e-05, + "loss": 1.6539, + "step": 6150 + }, + { + "epoch": 1.6164482543436134, + "grad_norm": 0.5791059732437134, + "learning_rate": 4.612756264236902e-05, + "loss": 1.6791, + "step": 6152 + }, + { + "epoch": 1.6169737576772754, + "grad_norm": 0.5340200662612915, + "learning_rate": 4.6110040301384263e-05, + "loss": 1.6807, + "step": 6154 + }, + { + "epoch": 1.6174992610109369, + "grad_norm": 0.5943130254745483, + "learning_rate": 4.609251796039951e-05, + "loss": 1.662, + "step": 6156 + }, + { + "epoch": 1.6180247643445989, + "grad_norm": 0.5829207897186279, + "learning_rate": 4.607499561941476e-05, + "loss": 1.67, + "step": 6158 + }, + { + "epoch": 1.6185502676782606, + "grad_norm": 0.5549664497375488, + "learning_rate": 4.605747327843e-05, + "loss": 1.6686, + "step": 6160 + }, + { + "epoch": 1.6190757710119223, + "grad_norm": 0.6359058618545532, + "learning_rate": 4.603995093744525e-05, + "loss": 1.6694, + "step": 6162 + }, + { + "epoch": 1.619601274345584, + "grad_norm": 0.5652121305465698, + "learning_rate": 4.602242859646049e-05, + "loss": 1.6678, + "step": 6164 + }, + { + "epoch": 1.6201267776792458, + "grad_norm": 0.5643856525421143, + "learning_rate": 4.6004906255475735e-05, + "loss": 1.6915, + "step": 6166 + }, + { + "epoch": 1.6206522810129078, + "grad_norm": 0.5165106058120728, + "learning_rate": 4.5987383914490976e-05, + "loss": 1.6643, + "step": 6168 + }, + { + "epoch": 1.6211777843465693, + "grad_norm": 0.6091976761817932, + "learning_rate": 4.5969861573506224e-05, + "loss": 1.6878, + "step": 6170 + }, + { + "epoch": 1.6217032876802313, + "grad_norm": 0.5565605759620667, + "learning_rate": 4.595233923252147e-05, + "loss": 1.6843, + "step": 6172 + }, + { + "epoch": 1.6222287910138928, + "grad_norm": 0.635827362537384, + "learning_rate": 4.593481689153671e-05, + "loss": 1.6554, + "step": 6174 + }, + { + "epoch": 1.6227542943475548, + "grad_norm": 0.6421988010406494, + "learning_rate": 4.591729455055195e-05, + "loss": 1.6298, + "step": 6176 + }, + { + "epoch": 1.6232797976812166, + "grad_norm": 0.7460888624191284, + "learning_rate": 4.58997722095672e-05, + "loss": 1.7074, + "step": 6178 + }, + { + "epoch": 1.6238053010148783, + "grad_norm": 0.5302199721336365, + "learning_rate": 4.588224986858244e-05, + "loss": 1.6546, + "step": 6180 + }, + { + "epoch": 1.62433080434854, + "grad_norm": 0.5736710429191589, + "learning_rate": 4.586472752759769e-05, + "loss": 1.6816, + "step": 6182 + }, + { + "epoch": 1.6248563076822018, + "grad_norm": 0.6789550185203552, + "learning_rate": 4.5847205186612936e-05, + "loss": 1.6643, + "step": 6184 + }, + { + "epoch": 1.6253818110158638, + "grad_norm": 0.5486941337585449, + "learning_rate": 4.582968284562818e-05, + "loss": 1.68, + "step": 6186 + }, + { + "epoch": 1.6259073143495253, + "grad_norm": 0.8008251786231995, + "learning_rate": 4.581216050464342e-05, + "loss": 1.6386, + "step": 6188 + }, + { + "epoch": 1.6264328176831873, + "grad_norm": 0.6284978985786438, + "learning_rate": 4.5794638163658665e-05, + "loss": 1.6497, + "step": 6190 + }, + { + "epoch": 1.6269583210168488, + "grad_norm": 0.5751000642776489, + "learning_rate": 4.577711582267391e-05, + "loss": 1.6897, + "step": 6192 + }, + { + "epoch": 1.6274838243505108, + "grad_norm": 0.5420372486114502, + "learning_rate": 4.5759593481689154e-05, + "loss": 1.6936, + "step": 6194 + }, + { + "epoch": 1.6280093276841725, + "grad_norm": 0.5770832896232605, + "learning_rate": 4.57420711407044e-05, + "loss": 1.6682, + "step": 6196 + }, + { + "epoch": 1.6285348310178343, + "grad_norm": 0.5784463286399841, + "learning_rate": 4.572454879971965e-05, + "loss": 1.6884, + "step": 6198 + }, + { + "epoch": 1.629060334351496, + "grad_norm": 0.5765862464904785, + "learning_rate": 4.570702645873489e-05, + "loss": 1.6228, + "step": 6200 + }, + { + "epoch": 1.6295858376851577, + "grad_norm": 0.6257441639900208, + "learning_rate": 4.568950411775013e-05, + "loss": 1.6559, + "step": 6202 + }, + { + "epoch": 1.6301113410188197, + "grad_norm": 0.5480515360832214, + "learning_rate": 4.567198177676538e-05, + "loss": 1.6524, + "step": 6204 + }, + { + "epoch": 1.6306368443524812, + "grad_norm": 0.5308948755264282, + "learning_rate": 4.565445943578062e-05, + "loss": 1.6954, + "step": 6206 + }, + { + "epoch": 1.6311623476861432, + "grad_norm": 0.640443742275238, + "learning_rate": 4.5636937094795866e-05, + "loss": 1.6434, + "step": 6208 + }, + { + "epoch": 1.631687851019805, + "grad_norm": 0.5399090647697449, + "learning_rate": 4.5619414753811114e-05, + "loss": 1.6317, + "step": 6210 + }, + { + "epoch": 1.6322133543534667, + "grad_norm": 0.5155448913574219, + "learning_rate": 4.5601892412826355e-05, + "loss": 1.6258, + "step": 6212 + }, + { + "epoch": 1.6327388576871285, + "grad_norm": 0.540034830570221, + "learning_rate": 4.5584370071841596e-05, + "loss": 1.6592, + "step": 6214 + }, + { + "epoch": 1.6332643610207902, + "grad_norm": 0.6191601157188416, + "learning_rate": 4.556684773085684e-05, + "loss": 1.6188, + "step": 6216 + }, + { + "epoch": 1.633789864354452, + "grad_norm": 0.5206524133682251, + "learning_rate": 4.554932538987209e-05, + "loss": 1.667, + "step": 6218 + }, + { + "epoch": 1.6343153676881137, + "grad_norm": 0.5997523069381714, + "learning_rate": 4.553180304888733e-05, + "loss": 1.7165, + "step": 6220 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.6088078618049622, + "learning_rate": 4.551428070790258e-05, + "loss": 1.65, + "step": 6222 + }, + { + "epoch": 1.6353663743554372, + "grad_norm": 0.5121122598648071, + "learning_rate": 4.5496758366917827e-05, + "loss": 1.6661, + "step": 6224 + }, + { + "epoch": 1.6358918776890992, + "grad_norm": 0.5405161380767822, + "learning_rate": 4.547923602593307e-05, + "loss": 1.6812, + "step": 6226 + }, + { + "epoch": 1.636417381022761, + "grad_norm": 0.5181125402450562, + "learning_rate": 4.546171368494831e-05, + "loss": 1.6888, + "step": 6228 + }, + { + "epoch": 1.6369428843564227, + "grad_norm": 0.7036988735198975, + "learning_rate": 4.5444191343963556e-05, + "loss": 1.7048, + "step": 6230 + }, + { + "epoch": 1.6374683876900844, + "grad_norm": 0.559990644454956, + "learning_rate": 4.54266690029788e-05, + "loss": 1.6745, + "step": 6232 + }, + { + "epoch": 1.6379938910237462, + "grad_norm": 0.5458931922912598, + "learning_rate": 4.5409146661994044e-05, + "loss": 1.6668, + "step": 6234 + }, + { + "epoch": 1.6385193943574081, + "grad_norm": 0.6637448072433472, + "learning_rate": 4.539162432100929e-05, + "loss": 1.7194, + "step": 6236 + }, + { + "epoch": 1.6390448976910696, + "grad_norm": 0.5551019310951233, + "learning_rate": 4.537410198002454e-05, + "loss": 1.6846, + "step": 6238 + }, + { + "epoch": 1.6395704010247316, + "grad_norm": 0.6413047909736633, + "learning_rate": 4.535657963903977e-05, + "loss": 1.6505, + "step": 6240 + }, + { + "epoch": 1.6400959043583931, + "grad_norm": 0.5083743929862976, + "learning_rate": 4.533905729805502e-05, + "loss": 1.6929, + "step": 6242 + }, + { + "epoch": 1.6406214076920551, + "grad_norm": 0.608717679977417, + "learning_rate": 4.532153495707027e-05, + "loss": 1.6767, + "step": 6244 + }, + { + "epoch": 1.6411469110257169, + "grad_norm": 0.5961493253707886, + "learning_rate": 4.530401261608551e-05, + "loss": 1.6434, + "step": 6246 + }, + { + "epoch": 1.6416724143593786, + "grad_norm": 0.5599090456962585, + "learning_rate": 4.528649027510076e-05, + "loss": 1.6774, + "step": 6248 + }, + { + "epoch": 1.6421979176930404, + "grad_norm": 0.5643408298492432, + "learning_rate": 4.5268967934116004e-05, + "loss": 1.6631, + "step": 6250 + }, + { + "epoch": 1.642723421026702, + "grad_norm": 0.5354277491569519, + "learning_rate": 4.5251445593131245e-05, + "loss": 1.6685, + "step": 6252 + }, + { + "epoch": 1.643248924360364, + "grad_norm": 0.6432374119758606, + "learning_rate": 4.5233923252146486e-05, + "loss": 1.6746, + "step": 6254 + }, + { + "epoch": 1.6437744276940256, + "grad_norm": 0.5566967725753784, + "learning_rate": 4.521640091116173e-05, + "loss": 1.6512, + "step": 6256 + }, + { + "epoch": 1.6442999310276876, + "grad_norm": 0.517951488494873, + "learning_rate": 4.519887857017698e-05, + "loss": 1.6485, + "step": 6258 + }, + { + "epoch": 1.644825434361349, + "grad_norm": 0.5770967602729797, + "learning_rate": 4.518135622919222e-05, + "loss": 1.6786, + "step": 6260 + }, + { + "epoch": 1.645350937695011, + "grad_norm": 0.5681661367416382, + "learning_rate": 4.516383388820747e-05, + "loss": 1.6464, + "step": 6262 + }, + { + "epoch": 1.6458764410286728, + "grad_norm": 0.6962856650352478, + "learning_rate": 4.514631154722272e-05, + "loss": 1.6435, + "step": 6264 + }, + { + "epoch": 1.6464019443623346, + "grad_norm": 0.5683899521827698, + "learning_rate": 4.512878920623795e-05, + "loss": 1.6862, + "step": 6266 + }, + { + "epoch": 1.6469274476959963, + "grad_norm": 0.6042145490646362, + "learning_rate": 4.51112668652532e-05, + "loss": 1.6574, + "step": 6268 + }, + { + "epoch": 1.647452951029658, + "grad_norm": 0.5737462639808655, + "learning_rate": 4.5093744524268446e-05, + "loss": 1.6785, + "step": 6270 + }, + { + "epoch": 1.64797845436332, + "grad_norm": 0.6028643846511841, + "learning_rate": 4.507622218328369e-05, + "loss": 1.6568, + "step": 6272 + }, + { + "epoch": 1.6485039576969815, + "grad_norm": 0.6065294742584229, + "learning_rate": 4.5058699842298934e-05, + "loss": 1.6882, + "step": 6274 + }, + { + "epoch": 1.6490294610306435, + "grad_norm": 0.5672723054885864, + "learning_rate": 4.504117750131418e-05, + "loss": 1.619, + "step": 6276 + }, + { + "epoch": 1.649554964364305, + "grad_norm": 0.727794885635376, + "learning_rate": 4.502365516032942e-05, + "loss": 1.6583, + "step": 6278 + }, + { + "epoch": 1.650080467697967, + "grad_norm": 0.6044321060180664, + "learning_rate": 4.5006132819344663e-05, + "loss": 1.6795, + "step": 6280 + }, + { + "epoch": 1.6506059710316288, + "grad_norm": 0.535193145275116, + "learning_rate": 4.498861047835991e-05, + "loss": 1.6772, + "step": 6282 + }, + { + "epoch": 1.6511314743652905, + "grad_norm": 0.651655375957489, + "learning_rate": 4.497108813737516e-05, + "loss": 1.6993, + "step": 6284 + }, + { + "epoch": 1.6516569776989523, + "grad_norm": 0.6530981659889221, + "learning_rate": 4.49535657963904e-05, + "loss": 1.6768, + "step": 6286 + }, + { + "epoch": 1.652182481032614, + "grad_norm": 0.6580101251602173, + "learning_rate": 4.493604345540565e-05, + "loss": 1.695, + "step": 6288 + }, + { + "epoch": 1.652707984366276, + "grad_norm": 0.6548781991004944, + "learning_rate": 4.491852111442089e-05, + "loss": 1.682, + "step": 6290 + }, + { + "epoch": 1.6532334876999375, + "grad_norm": 0.5895609259605408, + "learning_rate": 4.490099877343613e-05, + "loss": 1.703, + "step": 6292 + }, + { + "epoch": 1.6537589910335995, + "grad_norm": 0.5485368371009827, + "learning_rate": 4.4883476432451376e-05, + "loss": 1.686, + "step": 6294 + }, + { + "epoch": 1.654284494367261, + "grad_norm": 0.6156832575798035, + "learning_rate": 4.4865954091466624e-05, + "loss": 1.68, + "step": 6296 + }, + { + "epoch": 1.654809997700923, + "grad_norm": 0.5617910027503967, + "learning_rate": 4.4848431750481864e-05, + "loss": 1.6528, + "step": 6298 + }, + { + "epoch": 1.6553355010345847, + "grad_norm": 0.568001925945282, + "learning_rate": 4.483090940949711e-05, + "loss": 1.6656, + "step": 6300 + }, + { + "epoch": 1.6558610043682465, + "grad_norm": 0.5492426156997681, + "learning_rate": 4.481338706851236e-05, + "loss": 1.669, + "step": 6302 + }, + { + "epoch": 1.6563865077019082, + "grad_norm": 0.5638085007667542, + "learning_rate": 4.47958647275276e-05, + "loss": 1.7048, + "step": 6304 + }, + { + "epoch": 1.65691201103557, + "grad_norm": 0.5807180404663086, + "learning_rate": 4.477834238654284e-05, + "loss": 1.6495, + "step": 6306 + }, + { + "epoch": 1.657437514369232, + "grad_norm": 0.6146125197410583, + "learning_rate": 4.476082004555809e-05, + "loss": 1.7158, + "step": 6308 + }, + { + "epoch": 1.6579630177028934, + "grad_norm": 0.5622976422309875, + "learning_rate": 4.4743297704573336e-05, + "loss": 1.6907, + "step": 6310 + }, + { + "epoch": 1.6584885210365554, + "grad_norm": 0.49633076786994934, + "learning_rate": 4.472577536358858e-05, + "loss": 1.6695, + "step": 6312 + }, + { + "epoch": 1.659014024370217, + "grad_norm": 0.5975070595741272, + "learning_rate": 4.4708253022603825e-05, + "loss": 1.7011, + "step": 6314 + }, + { + "epoch": 1.659539527703879, + "grad_norm": 0.6197476983070374, + "learning_rate": 4.4690730681619065e-05, + "loss": 1.6522, + "step": 6316 + }, + { + "epoch": 1.6600650310375407, + "grad_norm": 0.6124839782714844, + "learning_rate": 4.4673208340634306e-05, + "loss": 1.7181, + "step": 6318 + }, + { + "epoch": 1.6605905343712024, + "grad_norm": 0.5166561007499695, + "learning_rate": 4.4655685999649554e-05, + "loss": 1.6536, + "step": 6320 + }, + { + "epoch": 1.6611160377048642, + "grad_norm": 0.5568446516990662, + "learning_rate": 4.46381636586648e-05, + "loss": 1.682, + "step": 6322 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.5822721719741821, + "learning_rate": 4.462064131768004e-05, + "loss": 1.6657, + "step": 6324 + }, + { + "epoch": 1.6621670443721879, + "grad_norm": 0.6438559293746948, + "learning_rate": 4.460311897669529e-05, + "loss": 1.6795, + "step": 6326 + }, + { + "epoch": 1.6626925477058494, + "grad_norm": 0.7065990567207336, + "learning_rate": 4.458559663571053e-05, + "loss": 1.6618, + "step": 6328 + }, + { + "epoch": 1.6632180510395114, + "grad_norm": 0.512935996055603, + "learning_rate": 4.456807429472578e-05, + "loss": 1.6742, + "step": 6330 + }, + { + "epoch": 1.663743554373173, + "grad_norm": 0.7639873027801514, + "learning_rate": 4.455055195374102e-05, + "loss": 1.6593, + "step": 6332 + }, + { + "epoch": 1.6642690577068349, + "grad_norm": 0.5977439880371094, + "learning_rate": 4.4533029612756266e-05, + "loss": 1.6691, + "step": 6334 + }, + { + "epoch": 1.6647945610404966, + "grad_norm": 0.5824545621871948, + "learning_rate": 4.4515507271771514e-05, + "loss": 1.6571, + "step": 6336 + }, + { + "epoch": 1.6653200643741584, + "grad_norm": 0.5170328617095947, + "learning_rate": 4.4497984930786755e-05, + "loss": 1.676, + "step": 6338 + }, + { + "epoch": 1.66584556770782, + "grad_norm": 0.6465065479278564, + "learning_rate": 4.4480462589802e-05, + "loss": 1.6516, + "step": 6340 + }, + { + "epoch": 1.6663710710414819, + "grad_norm": 0.639741837978363, + "learning_rate": 4.446294024881724e-05, + "loss": 1.6696, + "step": 6342 + }, + { + "epoch": 1.6668965743751438, + "grad_norm": 0.5170881152153015, + "learning_rate": 4.444541790783249e-05, + "loss": 1.6353, + "step": 6344 + }, + { + "epoch": 1.6674220777088054, + "grad_norm": 0.5460655689239502, + "learning_rate": 4.442789556684773e-05, + "loss": 1.6869, + "step": 6346 + }, + { + "epoch": 1.6679475810424673, + "grad_norm": 0.5527986288070679, + "learning_rate": 4.441037322586298e-05, + "loss": 1.6261, + "step": 6348 + }, + { + "epoch": 1.6684730843761288, + "grad_norm": 0.5400204062461853, + "learning_rate": 4.4392850884878227e-05, + "loss": 1.6559, + "step": 6350 + }, + { + "epoch": 1.6689985877097908, + "grad_norm": 0.5666835904121399, + "learning_rate": 4.437532854389347e-05, + "loss": 1.7025, + "step": 6352 + }, + { + "epoch": 1.6695240910434526, + "grad_norm": 0.5993382930755615, + "learning_rate": 4.435780620290871e-05, + "loss": 1.676, + "step": 6354 + }, + { + "epoch": 1.6700495943771143, + "grad_norm": 0.5372394919395447, + "learning_rate": 4.4340283861923956e-05, + "loss": 1.6549, + "step": 6356 + }, + { + "epoch": 1.670575097710776, + "grad_norm": 0.6082696914672852, + "learning_rate": 4.4322761520939196e-05, + "loss": 1.6643, + "step": 6358 + }, + { + "epoch": 1.6711006010444378, + "grad_norm": 0.6554064154624939, + "learning_rate": 4.4305239179954444e-05, + "loss": 1.692, + "step": 6360 + }, + { + "epoch": 1.6716261043780998, + "grad_norm": 0.4961806833744049, + "learning_rate": 4.428771683896969e-05, + "loss": 1.6921, + "step": 6362 + }, + { + "epoch": 1.6721516077117613, + "grad_norm": 0.5501666069030762, + "learning_rate": 4.427019449798493e-05, + "loss": 1.678, + "step": 6364 + }, + { + "epoch": 1.6726771110454233, + "grad_norm": 0.5967716574668884, + "learning_rate": 4.425267215700018e-05, + "loss": 1.6548, + "step": 6366 + }, + { + "epoch": 1.673202614379085, + "grad_norm": 0.654115617275238, + "learning_rate": 4.423514981601542e-05, + "loss": 1.6859, + "step": 6368 + }, + { + "epoch": 1.6737281177127468, + "grad_norm": 0.5566443204879761, + "learning_rate": 4.421762747503067e-05, + "loss": 1.653, + "step": 6370 + }, + { + "epoch": 1.6742536210464085, + "grad_norm": 0.6134412884712219, + "learning_rate": 4.420010513404591e-05, + "loss": 1.6876, + "step": 6372 + }, + { + "epoch": 1.6747791243800703, + "grad_norm": 0.7075713276863098, + "learning_rate": 4.418258279306116e-05, + "loss": 1.6738, + "step": 6374 + }, + { + "epoch": 1.675304627713732, + "grad_norm": 0.5378595590591431, + "learning_rate": 4.4165060452076404e-05, + "loss": 1.6696, + "step": 6376 + }, + { + "epoch": 1.6758301310473938, + "grad_norm": 0.5113415718078613, + "learning_rate": 4.4147538111091645e-05, + "loss": 1.6662, + "step": 6378 + }, + { + "epoch": 1.6763556343810557, + "grad_norm": 0.6252776384353638, + "learning_rate": 4.4130015770106886e-05, + "loss": 1.6517, + "step": 6380 + }, + { + "epoch": 1.6768811377147173, + "grad_norm": 0.5247318744659424, + "learning_rate": 4.411249342912213e-05, + "loss": 1.6769, + "step": 6382 + }, + { + "epoch": 1.6774066410483792, + "grad_norm": 0.5468015670776367, + "learning_rate": 4.4094971088137374e-05, + "loss": 1.6827, + "step": 6384 + }, + { + "epoch": 1.677932144382041, + "grad_norm": 0.5248594880104065, + "learning_rate": 4.407744874715262e-05, + "loss": 1.691, + "step": 6386 + }, + { + "epoch": 1.6784576477157027, + "grad_norm": 0.5515632629394531, + "learning_rate": 4.405992640616787e-05, + "loss": 1.664, + "step": 6388 + }, + { + "epoch": 1.6789831510493645, + "grad_norm": 0.565548300743103, + "learning_rate": 4.404240406518311e-05, + "loss": 1.6677, + "step": 6390 + }, + { + "epoch": 1.6795086543830262, + "grad_norm": 0.5626314878463745, + "learning_rate": 4.402488172419835e-05, + "loss": 1.6778, + "step": 6392 + }, + { + "epoch": 1.6800341577166882, + "grad_norm": 0.5197250247001648, + "learning_rate": 4.40073593832136e-05, + "loss": 1.6824, + "step": 6394 + }, + { + "epoch": 1.6805596610503497, + "grad_norm": 0.5577792525291443, + "learning_rate": 4.3989837042228846e-05, + "loss": 1.6798, + "step": 6396 + }, + { + "epoch": 1.6810851643840117, + "grad_norm": 0.5743740200996399, + "learning_rate": 4.397231470124409e-05, + "loss": 1.6328, + "step": 6398 + }, + { + "epoch": 1.6816106677176732, + "grad_norm": 0.4996255934238434, + "learning_rate": 4.3954792360259334e-05, + "loss": 1.6781, + "step": 6400 + }, + { + "epoch": 1.6816106677176732, + "eval_loss": 1.6763501167297363, + "eval_runtime": 487.3304, + "eval_samples_per_second": 249.911, + "eval_steps_per_second": 31.24, + "step": 6400 + }, + { + "epoch": 1.6821361710513352, + "grad_norm": 0.6712773442268372, + "learning_rate": 4.393727001927458e-05, + "loss": 1.679, + "step": 6402 + }, + { + "epoch": 1.682661674384997, + "grad_norm": 0.5399149656295776, + "learning_rate": 4.391974767828982e-05, + "loss": 1.7156, + "step": 6404 + }, + { + "epoch": 1.6831871777186587, + "grad_norm": 0.5838908553123474, + "learning_rate": 4.3902225337305063e-05, + "loss": 1.6559, + "step": 6406 + }, + { + "epoch": 1.6837126810523204, + "grad_norm": 0.6989631056785583, + "learning_rate": 4.388470299632031e-05, + "loss": 1.6536, + "step": 6408 + }, + { + "epoch": 1.6842381843859822, + "grad_norm": 0.6087121963500977, + "learning_rate": 4.386718065533555e-05, + "loss": 1.68, + "step": 6410 + }, + { + "epoch": 1.6847636877196441, + "grad_norm": 0.595737099647522, + "learning_rate": 4.38496583143508e-05, + "loss": 1.6431, + "step": 6412 + }, + { + "epoch": 1.6852891910533057, + "grad_norm": 0.5878545045852661, + "learning_rate": 4.383213597336605e-05, + "loss": 1.6524, + "step": 6414 + }, + { + "epoch": 1.6858146943869676, + "grad_norm": 0.5520877242088318, + "learning_rate": 4.381461363238129e-05, + "loss": 1.6234, + "step": 6416 + }, + { + "epoch": 1.6863401977206292, + "grad_norm": 0.530946671962738, + "learning_rate": 4.379709129139653e-05, + "loss": 1.6478, + "step": 6418 + }, + { + "epoch": 1.6868657010542911, + "grad_norm": 0.5743231177330017, + "learning_rate": 4.3779568950411776e-05, + "loss": 1.6882, + "step": 6420 + }, + { + "epoch": 1.6873912043879529, + "grad_norm": 0.5853824019432068, + "learning_rate": 4.3762046609427024e-05, + "loss": 1.6798, + "step": 6422 + }, + { + "epoch": 1.6879167077216146, + "grad_norm": 0.5864454507827759, + "learning_rate": 4.3744524268442264e-05, + "loss": 1.656, + "step": 6424 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.5097535848617554, + "learning_rate": 4.372700192745751e-05, + "loss": 1.6534, + "step": 6426 + }, + { + "epoch": 1.6889677143889381, + "grad_norm": 0.5435791015625, + "learning_rate": 4.370947958647276e-05, + "loss": 1.6782, + "step": 6428 + }, + { + "epoch": 1.6894932177226, + "grad_norm": 0.6465846300125122, + "learning_rate": 4.3691957245487994e-05, + "loss": 1.6816, + "step": 6430 + }, + { + "epoch": 1.6900187210562616, + "grad_norm": 0.561132550239563, + "learning_rate": 4.367443490450324e-05, + "loss": 1.6838, + "step": 6432 + }, + { + "epoch": 1.6905442243899236, + "grad_norm": 0.530135452747345, + "learning_rate": 4.365691256351849e-05, + "loss": 1.6541, + "step": 6434 + }, + { + "epoch": 1.691069727723585, + "grad_norm": 0.6319286823272705, + "learning_rate": 4.363939022253373e-05, + "loss": 1.651, + "step": 6436 + }, + { + "epoch": 1.691595231057247, + "grad_norm": 0.4940394461154938, + "learning_rate": 4.362186788154898e-05, + "loss": 1.6613, + "step": 6438 + }, + { + "epoch": 1.6921207343909088, + "grad_norm": 0.4995363652706146, + "learning_rate": 4.3604345540564225e-05, + "loss": 1.6572, + "step": 6440 + }, + { + "epoch": 1.6926462377245706, + "grad_norm": 0.5799241662025452, + "learning_rate": 4.3586823199579465e-05, + "loss": 1.6753, + "step": 6442 + }, + { + "epoch": 1.6931717410582323, + "grad_norm": 0.5875564813613892, + "learning_rate": 4.3569300858594706e-05, + "loss": 1.6322, + "step": 6444 + }, + { + "epoch": 1.693697244391894, + "grad_norm": 0.5701809525489807, + "learning_rate": 4.3551778517609954e-05, + "loss": 1.7094, + "step": 6446 + }, + { + "epoch": 1.694222747725556, + "grad_norm": 0.576756477355957, + "learning_rate": 4.35342561766252e-05, + "loss": 1.667, + "step": 6448 + }, + { + "epoch": 1.6947482510592176, + "grad_norm": 0.5512332320213318, + "learning_rate": 4.351673383564044e-05, + "loss": 1.6592, + "step": 6450 + }, + { + "epoch": 1.6952737543928795, + "grad_norm": 0.5287933349609375, + "learning_rate": 4.349921149465569e-05, + "loss": 1.6677, + "step": 6452 + }, + { + "epoch": 1.695799257726541, + "grad_norm": 0.623125433921814, + "learning_rate": 4.348168915367094e-05, + "loss": 1.6633, + "step": 6454 + }, + { + "epoch": 1.696324761060203, + "grad_norm": 0.613586962223053, + "learning_rate": 4.346416681268617e-05, + "loss": 1.66, + "step": 6456 + }, + { + "epoch": 1.6968502643938648, + "grad_norm": 0.5631827116012573, + "learning_rate": 4.344664447170142e-05, + "loss": 1.6701, + "step": 6458 + }, + { + "epoch": 1.6973757677275265, + "grad_norm": 0.4900628328323364, + "learning_rate": 4.3429122130716666e-05, + "loss": 1.6603, + "step": 6460 + }, + { + "epoch": 1.6979012710611883, + "grad_norm": 0.5031628608703613, + "learning_rate": 4.341159978973191e-05, + "loss": 1.6554, + "step": 6462 + }, + { + "epoch": 1.69842677439485, + "grad_norm": 0.5051286816596985, + "learning_rate": 4.3394077448747155e-05, + "loss": 1.6324, + "step": 6464 + }, + { + "epoch": 1.698952277728512, + "grad_norm": 0.5495415925979614, + "learning_rate": 4.33765551077624e-05, + "loss": 1.679, + "step": 6466 + }, + { + "epoch": 1.6994777810621735, + "grad_norm": 0.7252418398857117, + "learning_rate": 4.335903276677764e-05, + "loss": 1.6898, + "step": 6468 + }, + { + "epoch": 1.7000032843958355, + "grad_norm": 0.5228211879730225, + "learning_rate": 4.3341510425792884e-05, + "loss": 1.644, + "step": 6470 + }, + { + "epoch": 1.700528787729497, + "grad_norm": 0.5848027467727661, + "learning_rate": 4.332398808480813e-05, + "loss": 1.656, + "step": 6472 + }, + { + "epoch": 1.701054291063159, + "grad_norm": 0.6442865133285522, + "learning_rate": 4.330646574382338e-05, + "loss": 1.6261, + "step": 6474 + }, + { + "epoch": 1.7015797943968207, + "grad_norm": 0.5885564684867859, + "learning_rate": 4.328894340283862e-05, + "loss": 1.6406, + "step": 6476 + }, + { + "epoch": 1.7021052977304825, + "grad_norm": 0.5726144909858704, + "learning_rate": 4.327142106185387e-05, + "loss": 1.6843, + "step": 6478 + }, + { + "epoch": 1.7026308010641442, + "grad_norm": 0.6747820377349854, + "learning_rate": 4.3253898720869115e-05, + "loss": 1.6851, + "step": 6480 + }, + { + "epoch": 1.703156304397806, + "grad_norm": 0.5923687815666199, + "learning_rate": 4.3236376379884356e-05, + "loss": 1.6318, + "step": 6482 + }, + { + "epoch": 1.703681807731468, + "grad_norm": 0.5728587508201599, + "learning_rate": 4.3218854038899596e-05, + "loss": 1.682, + "step": 6484 + }, + { + "epoch": 1.7042073110651295, + "grad_norm": 0.5209431648254395, + "learning_rate": 4.3201331697914844e-05, + "loss": 1.662, + "step": 6486 + }, + { + "epoch": 1.7047328143987914, + "grad_norm": 0.5561770796775818, + "learning_rate": 4.318380935693009e-05, + "loss": 1.7016, + "step": 6488 + }, + { + "epoch": 1.705258317732453, + "grad_norm": 0.5122435688972473, + "learning_rate": 4.316628701594533e-05, + "loss": 1.6682, + "step": 6490 + }, + { + "epoch": 1.705783821066115, + "grad_norm": 0.6389492750167847, + "learning_rate": 4.314876467496058e-05, + "loss": 1.6379, + "step": 6492 + }, + { + "epoch": 1.7063093243997767, + "grad_norm": 0.5349413752555847, + "learning_rate": 4.313124233397582e-05, + "loss": 1.6343, + "step": 6494 + }, + { + "epoch": 1.7068348277334384, + "grad_norm": 0.5826820731163025, + "learning_rate": 4.311371999299106e-05, + "loss": 1.6775, + "step": 6496 + }, + { + "epoch": 1.7073603310671002, + "grad_norm": 0.5609052181243896, + "learning_rate": 4.309619765200631e-05, + "loss": 1.7076, + "step": 6498 + }, + { + "epoch": 1.707885834400762, + "grad_norm": 0.5599504113197327, + "learning_rate": 4.307867531102156e-05, + "loss": 1.6997, + "step": 6500 + }, + { + "epoch": 1.7084113377344239, + "grad_norm": 0.5563739538192749, + "learning_rate": 4.30611529700368e-05, + "loss": 1.6595, + "step": 6502 + }, + { + "epoch": 1.7089368410680854, + "grad_norm": 0.5203465819358826, + "learning_rate": 4.3043630629052045e-05, + "loss": 1.6452, + "step": 6504 + }, + { + "epoch": 1.7094623444017474, + "grad_norm": 0.59616619348526, + "learning_rate": 4.302610828806729e-05, + "loss": 1.6495, + "step": 6506 + }, + { + "epoch": 1.709987847735409, + "grad_norm": 0.7246098518371582, + "learning_rate": 4.300858594708253e-05, + "loss": 1.6682, + "step": 6508 + }, + { + "epoch": 1.7105133510690709, + "grad_norm": 0.5792231559753418, + "learning_rate": 4.2991063606097774e-05, + "loss": 1.6474, + "step": 6510 + }, + { + "epoch": 1.7110388544027326, + "grad_norm": 0.5333255529403687, + "learning_rate": 4.297354126511302e-05, + "loss": 1.6795, + "step": 6512 + }, + { + "epoch": 1.7115643577363944, + "grad_norm": 0.674374520778656, + "learning_rate": 4.295601892412827e-05, + "loss": 1.6514, + "step": 6514 + }, + { + "epoch": 1.7120898610700561, + "grad_norm": 0.5450239181518555, + "learning_rate": 4.293849658314351e-05, + "loss": 1.6606, + "step": 6516 + }, + { + "epoch": 1.7126153644037179, + "grad_norm": 0.6590548157691956, + "learning_rate": 4.292097424215876e-05, + "loss": 1.6337, + "step": 6518 + }, + { + "epoch": 1.7131408677373798, + "grad_norm": 0.5265931487083435, + "learning_rate": 4.2903451901174e-05, + "loss": 1.6842, + "step": 6520 + }, + { + "epoch": 1.7136663710710414, + "grad_norm": 0.6718656420707703, + "learning_rate": 4.288592956018924e-05, + "loss": 1.696, + "step": 6522 + }, + { + "epoch": 1.7141918744047033, + "grad_norm": 0.6330803632736206, + "learning_rate": 4.286840721920449e-05, + "loss": 1.671, + "step": 6524 + }, + { + "epoch": 1.714717377738365, + "grad_norm": 0.4948212206363678, + "learning_rate": 4.2850884878219734e-05, + "loss": 1.6464, + "step": 6526 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.5330238342285156, + "learning_rate": 4.2833362537234975e-05, + "loss": 1.6661, + "step": 6528 + }, + { + "epoch": 1.7157683844056886, + "grad_norm": 0.5928429961204529, + "learning_rate": 4.281584019625022e-05, + "loss": 1.645, + "step": 6530 + }, + { + "epoch": 1.7162938877393503, + "grad_norm": 0.535369336605072, + "learning_rate": 4.2798317855265463e-05, + "loss": 1.6554, + "step": 6532 + }, + { + "epoch": 1.716819391073012, + "grad_norm": 0.5079066157341003, + "learning_rate": 4.278079551428071e-05, + "loss": 1.6498, + "step": 6534 + }, + { + "epoch": 1.7173448944066738, + "grad_norm": 0.5394106507301331, + "learning_rate": 4.276327317329595e-05, + "loss": 1.6538, + "step": 6536 + }, + { + "epoch": 1.7178703977403358, + "grad_norm": 0.5476702451705933, + "learning_rate": 4.27457508323112e-05, + "loss": 1.6568, + "step": 6538 + }, + { + "epoch": 1.7183959010739973, + "grad_norm": 0.6342707276344299, + "learning_rate": 4.272822849132645e-05, + "loss": 1.6829, + "step": 6540 + }, + { + "epoch": 1.7189214044076593, + "grad_norm": 0.6379374265670776, + "learning_rate": 4.271070615034169e-05, + "loss": 1.6742, + "step": 6542 + }, + { + "epoch": 1.719446907741321, + "grad_norm": 0.6817846894264221, + "learning_rate": 4.2693183809356935e-05, + "loss": 1.6354, + "step": 6544 + }, + { + "epoch": 1.7199724110749828, + "grad_norm": 0.6458591222763062, + "learning_rate": 4.2675661468372176e-05, + "loss": 1.6599, + "step": 6546 + }, + { + "epoch": 1.7204979144086445, + "grad_norm": 0.5006933808326721, + "learning_rate": 4.265813912738742e-05, + "loss": 1.6438, + "step": 6548 + }, + { + "epoch": 1.7210234177423063, + "grad_norm": 0.4796706736087799, + "learning_rate": 4.2640616786402664e-05, + "loss": 1.6738, + "step": 6550 + }, + { + "epoch": 1.7215489210759682, + "grad_norm": 0.5186893939971924, + "learning_rate": 4.262309444541791e-05, + "loss": 1.691, + "step": 6552 + }, + { + "epoch": 1.7220744244096298, + "grad_norm": 0.5311626195907593, + "learning_rate": 4.260557210443315e-05, + "loss": 1.6739, + "step": 6554 + }, + { + "epoch": 1.7225999277432917, + "grad_norm": 0.5143429040908813, + "learning_rate": 4.25880497634484e-05, + "loss": 1.6533, + "step": 6556 + }, + { + "epoch": 1.7231254310769533, + "grad_norm": 0.5430511236190796, + "learning_rate": 4.257052742246364e-05, + "loss": 1.6758, + "step": 6558 + }, + { + "epoch": 1.7236509344106152, + "grad_norm": 0.6149283647537231, + "learning_rate": 4.255300508147889e-05, + "loss": 1.6581, + "step": 6560 + }, + { + "epoch": 1.724176437744277, + "grad_norm": 0.5292539000511169, + "learning_rate": 4.253548274049413e-05, + "loss": 1.6494, + "step": 6562 + }, + { + "epoch": 1.7247019410779387, + "grad_norm": 0.615308403968811, + "learning_rate": 4.251796039950938e-05, + "loss": 1.6732, + "step": 6564 + }, + { + "epoch": 1.7252274444116005, + "grad_norm": 0.562659740447998, + "learning_rate": 4.2500438058524625e-05, + "loss": 1.6881, + "step": 6566 + }, + { + "epoch": 1.7257529477452622, + "grad_norm": 0.6096563339233398, + "learning_rate": 4.2482915717539865e-05, + "loss": 1.6344, + "step": 6568 + }, + { + "epoch": 1.7262784510789242, + "grad_norm": 0.568242073059082, + "learning_rate": 4.2465393376555106e-05, + "loss": 1.6506, + "step": 6570 + }, + { + "epoch": 1.7268039544125857, + "grad_norm": 0.6250702738761902, + "learning_rate": 4.2447871035570354e-05, + "loss": 1.6647, + "step": 6572 + }, + { + "epoch": 1.7273294577462477, + "grad_norm": 0.6344661116600037, + "learning_rate": 4.2430348694585594e-05, + "loss": 1.6733, + "step": 6574 + }, + { + "epoch": 1.7278549610799092, + "grad_norm": 0.5727905631065369, + "learning_rate": 4.241282635360084e-05, + "loss": 1.6757, + "step": 6576 + }, + { + "epoch": 1.7283804644135712, + "grad_norm": 0.5363614559173584, + "learning_rate": 4.239530401261609e-05, + "loss": 1.6418, + "step": 6578 + }, + { + "epoch": 1.728905967747233, + "grad_norm": 0.5695384740829468, + "learning_rate": 4.237778167163133e-05, + "loss": 1.6424, + "step": 6580 + }, + { + "epoch": 1.7294314710808947, + "grad_norm": 0.6201584935188293, + "learning_rate": 4.236025933064658e-05, + "loss": 1.6648, + "step": 6582 + }, + { + "epoch": 1.7299569744145564, + "grad_norm": 0.4974352717399597, + "learning_rate": 4.234273698966182e-05, + "loss": 1.6666, + "step": 6584 + }, + { + "epoch": 1.7304824777482182, + "grad_norm": 0.5813178420066833, + "learning_rate": 4.2325214648677066e-05, + "loss": 1.6307, + "step": 6586 + }, + { + "epoch": 1.7310079810818801, + "grad_norm": 0.5724592804908752, + "learning_rate": 4.230769230769231e-05, + "loss": 1.6467, + "step": 6588 + }, + { + "epoch": 1.7315334844155417, + "grad_norm": 0.5860669612884521, + "learning_rate": 4.2290169966707555e-05, + "loss": 1.6765, + "step": 6590 + }, + { + "epoch": 1.7320589877492036, + "grad_norm": 0.5303966999053955, + "learning_rate": 4.22726476257228e-05, + "loss": 1.6575, + "step": 6592 + }, + { + "epoch": 1.7325844910828652, + "grad_norm": 0.5227830410003662, + "learning_rate": 4.225512528473804e-05, + "loss": 1.6884, + "step": 6594 + }, + { + "epoch": 1.7331099944165271, + "grad_norm": 0.5740933418273926, + "learning_rate": 4.2237602943753284e-05, + "loss": 1.6472, + "step": 6596 + }, + { + "epoch": 1.7336354977501889, + "grad_norm": 0.5894073843955994, + "learning_rate": 4.222008060276853e-05, + "loss": 1.6755, + "step": 6598 + }, + { + "epoch": 1.7341610010838506, + "grad_norm": 0.5687074065208435, + "learning_rate": 4.220255826178378e-05, + "loss": 1.675, + "step": 6600 + }, + { + "epoch": 1.7346865044175124, + "grad_norm": 0.6006156206130981, + "learning_rate": 4.218503592079902e-05, + "loss": 1.6651, + "step": 6602 + }, + { + "epoch": 1.7352120077511741, + "grad_norm": 0.564089834690094, + "learning_rate": 4.216751357981427e-05, + "loss": 1.6789, + "step": 6604 + }, + { + "epoch": 1.735737511084836, + "grad_norm": 0.7533421516418457, + "learning_rate": 4.2149991238829515e-05, + "loss": 1.6557, + "step": 6606 + }, + { + "epoch": 1.7362630144184976, + "grad_norm": 0.5872588753700256, + "learning_rate": 4.2132468897844756e-05, + "loss": 1.6713, + "step": 6608 + }, + { + "epoch": 1.7367885177521596, + "grad_norm": 0.608405590057373, + "learning_rate": 4.2114946556859996e-05, + "loss": 1.6648, + "step": 6610 + }, + { + "epoch": 1.7373140210858211, + "grad_norm": 0.5417534112930298, + "learning_rate": 4.2097424215875244e-05, + "loss": 1.6667, + "step": 6612 + }, + { + "epoch": 1.737839524419483, + "grad_norm": 0.5255427956581116, + "learning_rate": 4.2079901874890485e-05, + "loss": 1.6847, + "step": 6614 + }, + { + "epoch": 1.7383650277531448, + "grad_norm": 0.49976515769958496, + "learning_rate": 4.206237953390573e-05, + "loss": 1.6426, + "step": 6616 + }, + { + "epoch": 1.7388905310868066, + "grad_norm": 0.5245642066001892, + "learning_rate": 4.204485719292098e-05, + "loss": 1.6557, + "step": 6618 + }, + { + "epoch": 1.7394160344204683, + "grad_norm": 0.5113621354103088, + "learning_rate": 4.202733485193622e-05, + "loss": 1.6725, + "step": 6620 + }, + { + "epoch": 1.73994153775413, + "grad_norm": 0.6118736863136292, + "learning_rate": 4.200981251095146e-05, + "loss": 1.6524, + "step": 6622 + }, + { + "epoch": 1.740467041087792, + "grad_norm": 0.6329546570777893, + "learning_rate": 4.199229016996671e-05, + "loss": 1.7146, + "step": 6624 + }, + { + "epoch": 1.7409925444214536, + "grad_norm": 0.5709455013275146, + "learning_rate": 4.1974767828981957e-05, + "loss": 1.6481, + "step": 6626 + }, + { + "epoch": 1.7415180477551155, + "grad_norm": 0.5557751655578613, + "learning_rate": 4.19572454879972e-05, + "loss": 1.6298, + "step": 6628 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.5406216979026794, + "learning_rate": 4.1939723147012445e-05, + "loss": 1.7127, + "step": 6630 + }, + { + "epoch": 1.742569054422439, + "grad_norm": 0.6411069631576538, + "learning_rate": 4.192220080602769e-05, + "loss": 1.674, + "step": 6632 + }, + { + "epoch": 1.7430945577561008, + "grad_norm": 0.5912994742393494, + "learning_rate": 4.1904678465042927e-05, + "loss": 1.7002, + "step": 6634 + }, + { + "epoch": 1.7436200610897625, + "grad_norm": 0.5379955172538757, + "learning_rate": 4.1887156124058174e-05, + "loss": 1.6702, + "step": 6636 + }, + { + "epoch": 1.7441455644234243, + "grad_norm": 0.5569443702697754, + "learning_rate": 4.186963378307342e-05, + "loss": 1.6848, + "step": 6638 + }, + { + "epoch": 1.744671067757086, + "grad_norm": 0.5895394086837769, + "learning_rate": 4.185211144208866e-05, + "loss": 1.6775, + "step": 6640 + }, + { + "epoch": 1.745196571090748, + "grad_norm": 0.6112500429153442, + "learning_rate": 4.183458910110391e-05, + "loss": 1.6713, + "step": 6642 + }, + { + "epoch": 1.7457220744244095, + "grad_norm": 0.762654185295105, + "learning_rate": 4.181706676011916e-05, + "loss": 1.6848, + "step": 6644 + }, + { + "epoch": 1.7462475777580715, + "grad_norm": 0.6181445717811584, + "learning_rate": 4.17995444191344e-05, + "loss": 1.6735, + "step": 6646 + }, + { + "epoch": 1.746773081091733, + "grad_norm": 0.5101475119590759, + "learning_rate": 4.178202207814964e-05, + "loss": 1.6639, + "step": 6648 + }, + { + "epoch": 1.747298584425395, + "grad_norm": 0.5235376358032227, + "learning_rate": 4.176449973716489e-05, + "loss": 1.6485, + "step": 6650 + }, + { + "epoch": 1.7478240877590567, + "grad_norm": 0.5604961514472961, + "learning_rate": 4.1746977396180134e-05, + "loss": 1.6501, + "step": 6652 + }, + { + "epoch": 1.7483495910927185, + "grad_norm": 0.759231448173523, + "learning_rate": 4.1729455055195375e-05, + "loss": 1.6752, + "step": 6654 + }, + { + "epoch": 1.7488750944263802, + "grad_norm": 0.5132787823677063, + "learning_rate": 4.171193271421062e-05, + "loss": 1.672, + "step": 6656 + }, + { + "epoch": 1.749400597760042, + "grad_norm": 0.5898250341415405, + "learning_rate": 4.169441037322587e-05, + "loss": 1.6543, + "step": 6658 + }, + { + "epoch": 1.749926101093704, + "grad_norm": 0.5514124631881714, + "learning_rate": 4.1676888032241104e-05, + "loss": 1.6879, + "step": 6660 + }, + { + "epoch": 1.7504516044273655, + "grad_norm": 0.7601991295814514, + "learning_rate": 4.165936569125635e-05, + "loss": 1.7259, + "step": 6662 + }, + { + "epoch": 1.7509771077610274, + "grad_norm": 0.6521760821342468, + "learning_rate": 4.16418433502716e-05, + "loss": 1.6488, + "step": 6664 + }, + { + "epoch": 1.751502611094689, + "grad_norm": 0.6008849143981934, + "learning_rate": 4.162432100928684e-05, + "loss": 1.6577, + "step": 6666 + }, + { + "epoch": 1.752028114428351, + "grad_norm": 0.6038839221000671, + "learning_rate": 4.160679866830209e-05, + "loss": 1.6503, + "step": 6668 + }, + { + "epoch": 1.7525536177620127, + "grad_norm": 0.6234827041625977, + "learning_rate": 4.1589276327317335e-05, + "loss": 1.6779, + "step": 6670 + }, + { + "epoch": 1.7530791210956744, + "grad_norm": 0.5239622592926025, + "learning_rate": 4.1571753986332576e-05, + "loss": 1.6431, + "step": 6672 + }, + { + "epoch": 1.7536046244293362, + "grad_norm": 0.6171594262123108, + "learning_rate": 4.155423164534782e-05, + "loss": 1.6436, + "step": 6674 + }, + { + "epoch": 1.754130127762998, + "grad_norm": 0.651139497756958, + "learning_rate": 4.1536709304363064e-05, + "loss": 1.6492, + "step": 6676 + }, + { + "epoch": 1.75465563109666, + "grad_norm": 0.5682376027107239, + "learning_rate": 4.151918696337831e-05, + "loss": 1.6417, + "step": 6678 + }, + { + "epoch": 1.7551811344303214, + "grad_norm": 0.6295192837715149, + "learning_rate": 4.150166462239355e-05, + "loss": 1.6839, + "step": 6680 + }, + { + "epoch": 1.7557066377639834, + "grad_norm": 0.6096534132957458, + "learning_rate": 4.14841422814088e-05, + "loss": 1.7125, + "step": 6682 + }, + { + "epoch": 1.7562321410976451, + "grad_norm": 0.5466519594192505, + "learning_rate": 4.146661994042405e-05, + "loss": 1.6566, + "step": 6684 + }, + { + "epoch": 1.7567576444313069, + "grad_norm": 0.5740132331848145, + "learning_rate": 4.144909759943928e-05, + "loss": 1.6564, + "step": 6686 + }, + { + "epoch": 1.7572831477649686, + "grad_norm": 0.6351927518844604, + "learning_rate": 4.143157525845453e-05, + "loss": 1.6752, + "step": 6688 + }, + { + "epoch": 1.7578086510986304, + "grad_norm": 0.5739028453826904, + "learning_rate": 4.141405291746978e-05, + "loss": 1.6834, + "step": 6690 + }, + { + "epoch": 1.7583341544322921, + "grad_norm": 0.5718164443969727, + "learning_rate": 4.139653057648502e-05, + "loss": 1.6433, + "step": 6692 + }, + { + "epoch": 1.7588596577659539, + "grad_norm": 0.738676905632019, + "learning_rate": 4.1379008235500265e-05, + "loss": 1.6528, + "step": 6694 + }, + { + "epoch": 1.7593851610996158, + "grad_norm": 0.596734881401062, + "learning_rate": 4.136148589451551e-05, + "loss": 1.6431, + "step": 6696 + }, + { + "epoch": 1.7599106644332774, + "grad_norm": 0.5336854457855225, + "learning_rate": 4.1343963553530754e-05, + "loss": 1.6692, + "step": 6698 + }, + { + "epoch": 1.7604361677669393, + "grad_norm": 0.4876728653907776, + "learning_rate": 4.1326441212545994e-05, + "loss": 1.6595, + "step": 6700 + }, + { + "epoch": 1.760961671100601, + "grad_norm": 0.5300989747047424, + "learning_rate": 4.130891887156124e-05, + "loss": 1.6767, + "step": 6702 + }, + { + "epoch": 1.7614871744342628, + "grad_norm": 0.54608154296875, + "learning_rate": 4.129139653057649e-05, + "loss": 1.6279, + "step": 6704 + }, + { + "epoch": 1.7620126777679246, + "grad_norm": 0.5571487545967102, + "learning_rate": 4.127387418959173e-05, + "loss": 1.6963, + "step": 6706 + }, + { + "epoch": 1.7625381811015863, + "grad_norm": 0.5999481081962585, + "learning_rate": 4.125635184860698e-05, + "loss": 1.6807, + "step": 6708 + }, + { + "epoch": 1.7630636844352483, + "grad_norm": 0.5582924485206604, + "learning_rate": 4.123882950762222e-05, + "loss": 1.6503, + "step": 6710 + }, + { + "epoch": 1.7635891877689098, + "grad_norm": 0.5172569751739502, + "learning_rate": 4.122130716663746e-05, + "loss": 1.6502, + "step": 6712 + }, + { + "epoch": 1.7641146911025718, + "grad_norm": 0.5434536933898926, + "learning_rate": 4.120378482565271e-05, + "loss": 1.624, + "step": 6714 + }, + { + "epoch": 1.7646401944362333, + "grad_norm": 0.5931615233421326, + "learning_rate": 4.1186262484667955e-05, + "loss": 1.6881, + "step": 6716 + }, + { + "epoch": 1.7651656977698953, + "grad_norm": 0.5632887482643127, + "learning_rate": 4.1168740143683195e-05, + "loss": 1.66, + "step": 6718 + }, + { + "epoch": 1.765691201103557, + "grad_norm": 0.6316903233528137, + "learning_rate": 4.115121780269844e-05, + "loss": 1.6821, + "step": 6720 + }, + { + "epoch": 1.7662167044372188, + "grad_norm": 0.5220393538475037, + "learning_rate": 4.113369546171369e-05, + "loss": 1.6585, + "step": 6722 + }, + { + "epoch": 1.7667422077708805, + "grad_norm": 0.5338044166564941, + "learning_rate": 4.111617312072893e-05, + "loss": 1.6359, + "step": 6724 + }, + { + "epoch": 1.7672677111045423, + "grad_norm": 0.5751186013221741, + "learning_rate": 4.109865077974417e-05, + "loss": 1.6678, + "step": 6726 + }, + { + "epoch": 1.7677932144382043, + "grad_norm": 0.5516241788864136, + "learning_rate": 4.108112843875942e-05, + "loss": 1.6589, + "step": 6728 + }, + { + "epoch": 1.7683187177718658, + "grad_norm": 0.5440977811813354, + "learning_rate": 4.106360609777467e-05, + "loss": 1.6496, + "step": 6730 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.5160251259803772, + "learning_rate": 4.104608375678991e-05, + "loss": 1.6586, + "step": 6732 + }, + { + "epoch": 1.7693697244391893, + "grad_norm": 0.6195341348648071, + "learning_rate": 4.1028561415805156e-05, + "loss": 1.6621, + "step": 6734 + }, + { + "epoch": 1.7698952277728512, + "grad_norm": 0.5011487007141113, + "learning_rate": 4.1011039074820396e-05, + "loss": 1.7067, + "step": 6736 + }, + { + "epoch": 1.770420731106513, + "grad_norm": 0.5898102521896362, + "learning_rate": 4.0993516733835644e-05, + "loss": 1.6775, + "step": 6738 + }, + { + "epoch": 1.7709462344401747, + "grad_norm": 0.6446313261985779, + "learning_rate": 4.0975994392850885e-05, + "loss": 1.6751, + "step": 6740 + }, + { + "epoch": 1.7714717377738365, + "grad_norm": 0.5387564301490784, + "learning_rate": 4.095847205186613e-05, + "loss": 1.6861, + "step": 6742 + }, + { + "epoch": 1.7719972411074982, + "grad_norm": 0.6098289489746094, + "learning_rate": 4.094094971088138e-05, + "loss": 1.6653, + "step": 6744 + }, + { + "epoch": 1.7725227444411602, + "grad_norm": 0.5589563846588135, + "learning_rate": 4.092342736989662e-05, + "loss": 1.6878, + "step": 6746 + }, + { + "epoch": 1.7730482477748217, + "grad_norm": 0.6051377058029175, + "learning_rate": 4.090590502891187e-05, + "loss": 1.6779, + "step": 6748 + }, + { + "epoch": 1.7735737511084837, + "grad_norm": 0.5657187104225159, + "learning_rate": 4.088838268792711e-05, + "loss": 1.6585, + "step": 6750 + }, + { + "epoch": 1.7740992544421452, + "grad_norm": 0.9947826862335205, + "learning_rate": 4.087086034694235e-05, + "loss": 1.6364, + "step": 6752 + }, + { + "epoch": 1.7746247577758072, + "grad_norm": 0.5528366565704346, + "learning_rate": 4.08533380059576e-05, + "loss": 1.6796, + "step": 6754 + }, + { + "epoch": 1.775150261109469, + "grad_norm": 0.7105492949485779, + "learning_rate": 4.0835815664972845e-05, + "loss": 1.6359, + "step": 6756 + }, + { + "epoch": 1.7756757644431307, + "grad_norm": 0.5398980975151062, + "learning_rate": 4.0818293323988086e-05, + "loss": 1.6358, + "step": 6758 + }, + { + "epoch": 1.7762012677767924, + "grad_norm": 0.518286943435669, + "learning_rate": 4.080077098300333e-05, + "loss": 1.6691, + "step": 6760 + }, + { + "epoch": 1.7767267711104542, + "grad_norm": 0.5343197584152222, + "learning_rate": 4.0783248642018574e-05, + "loss": 1.6604, + "step": 6762 + }, + { + "epoch": 1.7772522744441162, + "grad_norm": 0.6191185116767883, + "learning_rate": 4.076572630103382e-05, + "loss": 1.6693, + "step": 6764 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.6115537285804749, + "learning_rate": 4.074820396004906e-05, + "loss": 1.6533, + "step": 6766 + }, + { + "epoch": 1.7783032811114396, + "grad_norm": 0.6350643634796143, + "learning_rate": 4.073068161906431e-05, + "loss": 1.6717, + "step": 6768 + }, + { + "epoch": 1.7788287844451012, + "grad_norm": 0.5143194198608398, + "learning_rate": 4.071315927807956e-05, + "loss": 1.6684, + "step": 6770 + }, + { + "epoch": 1.7793542877787631, + "grad_norm": 0.6066332459449768, + "learning_rate": 4.06956369370948e-05, + "loss": 1.6722, + "step": 6772 + }, + { + "epoch": 1.779879791112425, + "grad_norm": 0.5524691939353943, + "learning_rate": 4.067811459611004e-05, + "loss": 1.6423, + "step": 6774 + }, + { + "epoch": 1.7804052944460866, + "grad_norm": 0.6012272834777832, + "learning_rate": 4.066059225512529e-05, + "loss": 1.6931, + "step": 6776 + }, + { + "epoch": 1.7809307977797484, + "grad_norm": 0.58221834897995, + "learning_rate": 4.064306991414053e-05, + "loss": 1.6591, + "step": 6778 + }, + { + "epoch": 1.7814563011134101, + "grad_norm": 0.5327997803688049, + "learning_rate": 4.0625547573155775e-05, + "loss": 1.6903, + "step": 6780 + }, + { + "epoch": 1.781981804447072, + "grad_norm": 0.5887238383293152, + "learning_rate": 4.060802523217102e-05, + "loss": 1.6571, + "step": 6782 + }, + { + "epoch": 1.7825073077807336, + "grad_norm": 0.5583620071411133, + "learning_rate": 4.059050289118626e-05, + "loss": 1.6552, + "step": 6784 + }, + { + "epoch": 1.7830328111143956, + "grad_norm": 0.6321818232536316, + "learning_rate": 4.057298055020151e-05, + "loss": 1.6565, + "step": 6786 + }, + { + "epoch": 1.7835583144480571, + "grad_norm": 0.496971070766449, + "learning_rate": 4.055545820921675e-05, + "loss": 1.6311, + "step": 6788 + }, + { + "epoch": 1.784083817781719, + "grad_norm": 0.5267062783241272, + "learning_rate": 4.0537935868232e-05, + "loss": 1.6598, + "step": 6790 + }, + { + "epoch": 1.7846093211153808, + "grad_norm": 0.502678394317627, + "learning_rate": 4.052041352724724e-05, + "loss": 1.6493, + "step": 6792 + }, + { + "epoch": 1.7851348244490426, + "grad_norm": 0.6034113764762878, + "learning_rate": 4.050289118626249e-05, + "loss": 1.6914, + "step": 6794 + }, + { + "epoch": 1.7856603277827043, + "grad_norm": 0.7164289355278015, + "learning_rate": 4.0485368845277735e-05, + "loss": 1.7111, + "step": 6796 + }, + { + "epoch": 1.786185831116366, + "grad_norm": 0.5487950444221497, + "learning_rate": 4.0467846504292976e-05, + "loss": 1.6346, + "step": 6798 + }, + { + "epoch": 1.786711334450028, + "grad_norm": 0.6355817914009094, + "learning_rate": 4.045032416330822e-05, + "loss": 1.6228, + "step": 6800 + }, + { + "epoch": 1.786711334450028, + "eval_loss": 1.6711539030075073, + "eval_runtime": 487.2898, + "eval_samples_per_second": 249.931, + "eval_steps_per_second": 31.242, + "step": 6800 + }, + { + "epoch": 1.7872368377836896, + "grad_norm": 0.5209816694259644, + "learning_rate": 4.0432801822323464e-05, + "loss": 1.6396, + "step": 6802 + }, + { + "epoch": 1.7877623411173515, + "grad_norm": 0.5979394912719727, + "learning_rate": 4.0415279481338705e-05, + "loss": 1.6686, + "step": 6804 + }, + { + "epoch": 1.788287844451013, + "grad_norm": 0.5141789317131042, + "learning_rate": 4.039775714035395e-05, + "loss": 1.6538, + "step": 6806 + }, + { + "epoch": 1.788813347784675, + "grad_norm": 0.5531857013702393, + "learning_rate": 4.03802347993692e-05, + "loss": 1.6612, + "step": 6808 + }, + { + "epoch": 1.7893388511183368, + "grad_norm": 0.5284379720687866, + "learning_rate": 4.036271245838444e-05, + "loss": 1.6403, + "step": 6810 + }, + { + "epoch": 1.7898643544519985, + "grad_norm": 0.5298596620559692, + "learning_rate": 4.034519011739968e-05, + "loss": 1.6494, + "step": 6812 + }, + { + "epoch": 1.7903898577856603, + "grad_norm": 0.5482889413833618, + "learning_rate": 4.032766777641493e-05, + "loss": 1.6516, + "step": 6814 + }, + { + "epoch": 1.790915361119322, + "grad_norm": 0.5631160736083984, + "learning_rate": 4.031014543543018e-05, + "loss": 1.6419, + "step": 6816 + }, + { + "epoch": 1.791440864452984, + "grad_norm": 0.5150030851364136, + "learning_rate": 4.029262309444542e-05, + "loss": 1.6502, + "step": 6818 + }, + { + "epoch": 1.7919663677866455, + "grad_norm": 0.5491872429847717, + "learning_rate": 4.0275100753460665e-05, + "loss": 1.6489, + "step": 6820 + }, + { + "epoch": 1.7924918711203075, + "grad_norm": 0.5404025912284851, + "learning_rate": 4.025757841247591e-05, + "loss": 1.6455, + "step": 6822 + }, + { + "epoch": 1.793017374453969, + "grad_norm": 0.5373459458351135, + "learning_rate": 4.0240056071491154e-05, + "loss": 1.6487, + "step": 6824 + }, + { + "epoch": 1.793542877787631, + "grad_norm": 0.6558994054794312, + "learning_rate": 4.0222533730506394e-05, + "loss": 1.6603, + "step": 6826 + }, + { + "epoch": 1.7940683811212927, + "grad_norm": 0.5680163502693176, + "learning_rate": 4.020501138952164e-05, + "loss": 1.6607, + "step": 6828 + }, + { + "epoch": 1.7945938844549545, + "grad_norm": 0.5356695652008057, + "learning_rate": 4.018748904853688e-05, + "loss": 1.6526, + "step": 6830 + }, + { + "epoch": 1.7951193877886162, + "grad_norm": 0.5755831003189087, + "learning_rate": 4.016996670755213e-05, + "loss": 1.6748, + "step": 6832 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.6050164103507996, + "learning_rate": 4.015244436656738e-05, + "loss": 1.6366, + "step": 6834 + }, + { + "epoch": 1.79617039445594, + "grad_norm": 0.5978443026542664, + "learning_rate": 4.013492202558262e-05, + "loss": 1.6549, + "step": 6836 + }, + { + "epoch": 1.7966958977896015, + "grad_norm": 0.806139349937439, + "learning_rate": 4.011739968459786e-05, + "loss": 1.6486, + "step": 6838 + }, + { + "epoch": 1.7972214011232635, + "grad_norm": 0.665317714214325, + "learning_rate": 4.009987734361311e-05, + "loss": 1.6468, + "step": 6840 + }, + { + "epoch": 1.7977469044569252, + "grad_norm": 0.5707154870033264, + "learning_rate": 4.0082355002628355e-05, + "loss": 1.6592, + "step": 6842 + }, + { + "epoch": 1.798272407790587, + "grad_norm": 0.5100306868553162, + "learning_rate": 4.0064832661643595e-05, + "loss": 1.6732, + "step": 6844 + }, + { + "epoch": 1.7987979111242487, + "grad_norm": 0.4903377294540405, + "learning_rate": 4.004731032065884e-05, + "loss": 1.6383, + "step": 6846 + }, + { + "epoch": 1.7993234144579104, + "grad_norm": 0.5019045472145081, + "learning_rate": 4.002978797967409e-05, + "loss": 1.6668, + "step": 6848 + }, + { + "epoch": 1.7998489177915722, + "grad_norm": 0.5553399324417114, + "learning_rate": 4.001226563868933e-05, + "loss": 1.6666, + "step": 6850 + }, + { + "epoch": 1.800374421125234, + "grad_norm": 0.5196052193641663, + "learning_rate": 3.999474329770457e-05, + "loss": 1.6928, + "step": 6852 + }, + { + "epoch": 1.800899924458896, + "grad_norm": 0.5712267756462097, + "learning_rate": 3.997722095671982e-05, + "loss": 1.6228, + "step": 6854 + }, + { + "epoch": 1.8014254277925574, + "grad_norm": 0.6430991291999817, + "learning_rate": 3.995969861573507e-05, + "loss": 1.6583, + "step": 6856 + }, + { + "epoch": 1.8019509311262194, + "grad_norm": 0.5435091257095337, + "learning_rate": 3.994217627475031e-05, + "loss": 1.6458, + "step": 6858 + }, + { + "epoch": 1.8024764344598811, + "grad_norm": 0.5334445238113403, + "learning_rate": 3.9924653933765556e-05, + "loss": 1.6534, + "step": 6860 + }, + { + "epoch": 1.803001937793543, + "grad_norm": 0.6035925149917603, + "learning_rate": 3.99071315927808e-05, + "loss": 1.6449, + "step": 6862 + }, + { + "epoch": 1.8035274411272046, + "grad_norm": 0.6789858341217041, + "learning_rate": 3.988960925179604e-05, + "loss": 1.6599, + "step": 6864 + }, + { + "epoch": 1.8040529444608664, + "grad_norm": 0.6741647720336914, + "learning_rate": 3.9872086910811285e-05, + "loss": 1.6474, + "step": 6866 + }, + { + "epoch": 1.8045784477945284, + "grad_norm": 0.601939857006073, + "learning_rate": 3.985456456982653e-05, + "loss": 1.657, + "step": 6868 + }, + { + "epoch": 1.8051039511281899, + "grad_norm": 0.5084041357040405, + "learning_rate": 3.983704222884177e-05, + "loss": 1.6707, + "step": 6870 + }, + { + "epoch": 1.8056294544618519, + "grad_norm": 0.6498923897743225, + "learning_rate": 3.981951988785702e-05, + "loss": 1.6738, + "step": 6872 + }, + { + "epoch": 1.8061549577955134, + "grad_norm": 0.6679303050041199, + "learning_rate": 3.980199754687227e-05, + "loss": 1.6565, + "step": 6874 + }, + { + "epoch": 1.8066804611291754, + "grad_norm": 0.642929196357727, + "learning_rate": 3.978447520588751e-05, + "loss": 1.6437, + "step": 6876 + }, + { + "epoch": 1.807205964462837, + "grad_norm": 0.5809937119483948, + "learning_rate": 3.976695286490275e-05, + "loss": 1.6754, + "step": 6878 + }, + { + "epoch": 1.8077314677964988, + "grad_norm": 0.5564219355583191, + "learning_rate": 3.9749430523918e-05, + "loss": 1.6823, + "step": 6880 + }, + { + "epoch": 1.8082569711301606, + "grad_norm": 0.5706532001495361, + "learning_rate": 3.9731908182933245e-05, + "loss": 1.6705, + "step": 6882 + }, + { + "epoch": 1.8087824744638223, + "grad_norm": 0.5469827651977539, + "learning_rate": 3.9714385841948486e-05, + "loss": 1.6463, + "step": 6884 + }, + { + "epoch": 1.8093079777974843, + "grad_norm": 0.5436684489250183, + "learning_rate": 3.969686350096373e-05, + "loss": 1.6658, + "step": 6886 + }, + { + "epoch": 1.8098334811311458, + "grad_norm": 0.5233422517776489, + "learning_rate": 3.967934115997898e-05, + "loss": 1.6741, + "step": 6888 + }, + { + "epoch": 1.8103589844648078, + "grad_norm": 0.5667338967323303, + "learning_rate": 3.9661818818994215e-05, + "loss": 1.6323, + "step": 6890 + }, + { + "epoch": 1.8108844877984693, + "grad_norm": 0.5260610580444336, + "learning_rate": 3.964429647800946e-05, + "loss": 1.6492, + "step": 6892 + }, + { + "epoch": 1.8114099911321313, + "grad_norm": 0.554559051990509, + "learning_rate": 3.962677413702471e-05, + "loss": 1.6686, + "step": 6894 + }, + { + "epoch": 1.811935494465793, + "grad_norm": 0.6630009412765503, + "learning_rate": 3.960925179603995e-05, + "loss": 1.6507, + "step": 6896 + }, + { + "epoch": 1.8124609977994548, + "grad_norm": 0.5093562006950378, + "learning_rate": 3.95917294550552e-05, + "loss": 1.6631, + "step": 6898 + }, + { + "epoch": 1.8129865011331165, + "grad_norm": 0.5125998854637146, + "learning_rate": 3.9574207114070446e-05, + "loss": 1.6141, + "step": 6900 + }, + { + "epoch": 1.8135120044667783, + "grad_norm": 0.5436182022094727, + "learning_rate": 3.955668477308569e-05, + "loss": 1.6298, + "step": 6902 + }, + { + "epoch": 1.8140375078004403, + "grad_norm": 0.5174747705459595, + "learning_rate": 3.953916243210093e-05, + "loss": 1.6381, + "step": 6904 + }, + { + "epoch": 1.8145630111341018, + "grad_norm": 0.48531410098075867, + "learning_rate": 3.9521640091116175e-05, + "loss": 1.6465, + "step": 6906 + }, + { + "epoch": 1.8150885144677638, + "grad_norm": 0.5112138390541077, + "learning_rate": 3.950411775013142e-05, + "loss": 1.6469, + "step": 6908 + }, + { + "epoch": 1.8156140178014253, + "grad_norm": 0.578628420829773, + "learning_rate": 3.948659540914666e-05, + "loss": 1.6874, + "step": 6910 + }, + { + "epoch": 1.8161395211350873, + "grad_norm": 0.5760912895202637, + "learning_rate": 3.946907306816191e-05, + "loss": 1.6661, + "step": 6912 + }, + { + "epoch": 1.816665024468749, + "grad_norm": 0.5140530467033386, + "learning_rate": 3.945155072717715e-05, + "loss": 1.6359, + "step": 6914 + }, + { + "epoch": 1.8171905278024107, + "grad_norm": 0.5136293172836304, + "learning_rate": 3.943402838619239e-05, + "loss": 1.6839, + "step": 6916 + }, + { + "epoch": 1.8177160311360725, + "grad_norm": 0.6095285415649414, + "learning_rate": 3.941650604520764e-05, + "loss": 1.6552, + "step": 6918 + }, + { + "epoch": 1.8182415344697342, + "grad_norm": 0.5882896780967712, + "learning_rate": 3.939898370422289e-05, + "loss": 1.6532, + "step": 6920 + }, + { + "epoch": 1.8187670378033962, + "grad_norm": 0.5088122487068176, + "learning_rate": 3.938146136323813e-05, + "loss": 1.6719, + "step": 6922 + }, + { + "epoch": 1.8192925411370577, + "grad_norm": 0.5035478472709656, + "learning_rate": 3.9363939022253376e-05, + "loss": 1.641, + "step": 6924 + }, + { + "epoch": 1.8198180444707197, + "grad_norm": 0.559293270111084, + "learning_rate": 3.9346416681268624e-05, + "loss": 1.6971, + "step": 6926 + }, + { + "epoch": 1.8203435478043812, + "grad_norm": 0.5681344866752625, + "learning_rate": 3.9328894340283864e-05, + "loss": 1.6569, + "step": 6928 + }, + { + "epoch": 1.8208690511380432, + "grad_norm": 0.6714912056922913, + "learning_rate": 3.9311371999299105e-05, + "loss": 1.7089, + "step": 6930 + }, + { + "epoch": 1.821394554471705, + "grad_norm": 0.5596345067024231, + "learning_rate": 3.929384965831435e-05, + "loss": 1.6888, + "step": 6932 + }, + { + "epoch": 1.8219200578053667, + "grad_norm": 0.5444710850715637, + "learning_rate": 3.92763273173296e-05, + "loss": 1.6772, + "step": 6934 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.5050802230834961, + "learning_rate": 3.925880497634484e-05, + "loss": 1.6613, + "step": 6936 + }, + { + "epoch": 1.8229710644726902, + "grad_norm": 0.5749676823616028, + "learning_rate": 3.924128263536009e-05, + "loss": 1.6448, + "step": 6938 + }, + { + "epoch": 1.8234965678063522, + "grad_norm": 0.577195405960083, + "learning_rate": 3.922376029437533e-05, + "loss": 1.6614, + "step": 6940 + }, + { + "epoch": 1.8240220711400137, + "grad_norm": 0.5361042022705078, + "learning_rate": 3.920623795339057e-05, + "loss": 1.6764, + "step": 6942 + }, + { + "epoch": 1.8245475744736757, + "grad_norm": 0.5485758781433105, + "learning_rate": 3.918871561240582e-05, + "loss": 1.6486, + "step": 6944 + }, + { + "epoch": 1.8250730778073372, + "grad_norm": 0.7427318096160889, + "learning_rate": 3.9171193271421065e-05, + "loss": 1.6755, + "step": 6946 + }, + { + "epoch": 1.8255985811409992, + "grad_norm": 0.5816475749015808, + "learning_rate": 3.9153670930436306e-05, + "loss": 1.6518, + "step": 6948 + }, + { + "epoch": 1.826124084474661, + "grad_norm": 0.48851844668388367, + "learning_rate": 3.9136148589451554e-05, + "loss": 1.6655, + "step": 6950 + }, + { + "epoch": 1.8266495878083227, + "grad_norm": 0.5446299910545349, + "learning_rate": 3.91186262484668e-05, + "loss": 1.6374, + "step": 6952 + }, + { + "epoch": 1.8271750911419844, + "grad_norm": 0.514245331287384, + "learning_rate": 3.910110390748204e-05, + "loss": 1.6463, + "step": 6954 + }, + { + "epoch": 1.8277005944756461, + "grad_norm": 0.5527070760726929, + "learning_rate": 3.908358156649728e-05, + "loss": 1.6874, + "step": 6956 + }, + { + "epoch": 1.8282260978093081, + "grad_norm": 0.5745643377304077, + "learning_rate": 3.906605922551253e-05, + "loss": 1.6334, + "step": 6958 + }, + { + "epoch": 1.8287516011429696, + "grad_norm": 0.5475890636444092, + "learning_rate": 3.904853688452778e-05, + "loss": 1.6336, + "step": 6960 + }, + { + "epoch": 1.8292771044766316, + "grad_norm": 0.5883870124816895, + "learning_rate": 3.903101454354302e-05, + "loss": 1.6226, + "step": 6962 + }, + { + "epoch": 1.8298026078102931, + "grad_norm": 0.5167810916900635, + "learning_rate": 3.9013492202558266e-05, + "loss": 1.6612, + "step": 6964 + }, + { + "epoch": 1.830328111143955, + "grad_norm": 0.5100778937339783, + "learning_rate": 3.899596986157351e-05, + "loss": 1.6666, + "step": 6966 + }, + { + "epoch": 1.8308536144776169, + "grad_norm": 0.5465711355209351, + "learning_rate": 3.897844752058875e-05, + "loss": 1.6586, + "step": 6968 + }, + { + "epoch": 1.8313791178112786, + "grad_norm": 0.5632458329200745, + "learning_rate": 3.8960925179603995e-05, + "loss": 1.6729, + "step": 6970 + }, + { + "epoch": 1.8319046211449403, + "grad_norm": 0.7153643369674683, + "learning_rate": 3.894340283861924e-05, + "loss": 1.6767, + "step": 6972 + }, + { + "epoch": 1.832430124478602, + "grad_norm": 0.5567420721054077, + "learning_rate": 3.8925880497634484e-05, + "loss": 1.6836, + "step": 6974 + }, + { + "epoch": 1.832955627812264, + "grad_norm": 0.6071416735649109, + "learning_rate": 3.890835815664973e-05, + "loss": 1.6403, + "step": 6976 + }, + { + "epoch": 1.8334811311459256, + "grad_norm": 0.5392048954963684, + "learning_rate": 3.889083581566497e-05, + "loss": 1.6667, + "step": 6978 + }, + { + "epoch": 1.8340066344795876, + "grad_norm": 0.7555952668190002, + "learning_rate": 3.887331347468022e-05, + "loss": 1.6623, + "step": 6980 + }, + { + "epoch": 1.834532137813249, + "grad_norm": 0.6968433856964111, + "learning_rate": 3.885579113369546e-05, + "loss": 1.6556, + "step": 6982 + }, + { + "epoch": 1.835057641146911, + "grad_norm": 0.6319105625152588, + "learning_rate": 3.883826879271071e-05, + "loss": 1.7025, + "step": 6984 + }, + { + "epoch": 1.8355831444805728, + "grad_norm": 0.5621939301490784, + "learning_rate": 3.8820746451725956e-05, + "loss": 1.639, + "step": 6986 + }, + { + "epoch": 1.8361086478142346, + "grad_norm": 0.5498519539833069, + "learning_rate": 3.8803224110741196e-05, + "loss": 1.6726, + "step": 6988 + }, + { + "epoch": 1.8366341511478963, + "grad_norm": 0.6178312301635742, + "learning_rate": 3.8785701769756444e-05, + "loss": 1.6525, + "step": 6990 + }, + { + "epoch": 1.837159654481558, + "grad_norm": 0.4725324511528015, + "learning_rate": 3.8768179428771685e-05, + "loss": 1.6423, + "step": 6992 + }, + { + "epoch": 1.83768515781522, + "grad_norm": 0.5918049812316895, + "learning_rate": 3.875065708778693e-05, + "loss": 1.6728, + "step": 6994 + }, + { + "epoch": 1.8382106611488815, + "grad_norm": 0.6157652735710144, + "learning_rate": 3.873313474680217e-05, + "loss": 1.6549, + "step": 6996 + }, + { + "epoch": 1.8387361644825435, + "grad_norm": 0.596299409866333, + "learning_rate": 3.871561240581742e-05, + "loss": 1.6488, + "step": 6998 + }, + { + "epoch": 1.8392616678162053, + "grad_norm": 0.4981100559234619, + "learning_rate": 3.869809006483267e-05, + "loss": 1.6163, + "step": 7000 + }, + { + "epoch": 1.839787171149867, + "grad_norm": 0.5667177438735962, + "learning_rate": 3.868056772384791e-05, + "loss": 1.7114, + "step": 7002 + }, + { + "epoch": 1.8403126744835288, + "grad_norm": 0.5904120206832886, + "learning_rate": 3.866304538286315e-05, + "loss": 1.644, + "step": 7004 + }, + { + "epoch": 1.8408381778171905, + "grad_norm": 0.5185456871986389, + "learning_rate": 3.86455230418784e-05, + "loss": 1.6537, + "step": 7006 + }, + { + "epoch": 1.8413636811508522, + "grad_norm": 0.5449099540710449, + "learning_rate": 3.862800070089364e-05, + "loss": 1.6807, + "step": 7008 + }, + { + "epoch": 1.841889184484514, + "grad_norm": 0.7209144234657288, + "learning_rate": 3.8610478359908886e-05, + "loss": 1.6648, + "step": 7010 + }, + { + "epoch": 1.842414687818176, + "grad_norm": 0.6890124082565308, + "learning_rate": 3.859295601892413e-05, + "loss": 1.647, + "step": 7012 + }, + { + "epoch": 1.8429401911518375, + "grad_norm": 0.5385224223136902, + "learning_rate": 3.8575433677939374e-05, + "loss": 1.6475, + "step": 7014 + }, + { + "epoch": 1.8434656944854995, + "grad_norm": 0.5653911232948303, + "learning_rate": 3.8557911336954615e-05, + "loss": 1.6752, + "step": 7016 + }, + { + "epoch": 1.8439911978191612, + "grad_norm": 0.5241896510124207, + "learning_rate": 3.854038899596986e-05, + "loss": 1.6747, + "step": 7018 + }, + { + "epoch": 1.844516701152823, + "grad_norm": 0.7143029570579529, + "learning_rate": 3.852286665498511e-05, + "loss": 1.6538, + "step": 7020 + }, + { + "epoch": 1.8450422044864847, + "grad_norm": 0.5827885270118713, + "learning_rate": 3.850534431400035e-05, + "loss": 1.6987, + "step": 7022 + }, + { + "epoch": 1.8455677078201465, + "grad_norm": 0.6297259330749512, + "learning_rate": 3.84878219730156e-05, + "loss": 1.6628, + "step": 7024 + }, + { + "epoch": 1.8460932111538084, + "grad_norm": 0.5605854392051697, + "learning_rate": 3.8470299632030846e-05, + "loss": 1.6216, + "step": 7026 + }, + { + "epoch": 1.84661871448747, + "grad_norm": 0.6488915085792542, + "learning_rate": 3.845277729104609e-05, + "loss": 1.7036, + "step": 7028 + }, + { + "epoch": 1.847144217821132, + "grad_norm": 0.6576531529426575, + "learning_rate": 3.843525495006133e-05, + "loss": 1.6846, + "step": 7030 + }, + { + "epoch": 1.8476697211547934, + "grad_norm": 0.49783241748809814, + "learning_rate": 3.8417732609076575e-05, + "loss": 1.6566, + "step": 7032 + }, + { + "epoch": 1.8481952244884554, + "grad_norm": 0.5974870324134827, + "learning_rate": 3.8400210268091816e-05, + "loss": 1.6787, + "step": 7034 + }, + { + "epoch": 1.8487207278221172, + "grad_norm": 0.5442907810211182, + "learning_rate": 3.838268792710706e-05, + "loss": 1.6414, + "step": 7036 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.4982667565345764, + "learning_rate": 3.836516558612231e-05, + "loss": 1.664, + "step": 7038 + }, + { + "epoch": 1.8497717344894407, + "grad_norm": 0.5198219418525696, + "learning_rate": 3.834764324513755e-05, + "loss": 1.6708, + "step": 7040 + }, + { + "epoch": 1.8502972378231024, + "grad_norm": 0.6068596839904785, + "learning_rate": 3.833012090415279e-05, + "loss": 1.6467, + "step": 7042 + }, + { + "epoch": 1.8508227411567644, + "grad_norm": 0.5471599102020264, + "learning_rate": 3.831259856316804e-05, + "loss": 1.6371, + "step": 7044 + }, + { + "epoch": 1.851348244490426, + "grad_norm": 0.7527357339859009, + "learning_rate": 3.829507622218329e-05, + "loss": 1.6562, + "step": 7046 + }, + { + "epoch": 1.8518737478240879, + "grad_norm": 0.5454807877540588, + "learning_rate": 3.827755388119853e-05, + "loss": 1.6923, + "step": 7048 + }, + { + "epoch": 1.8523992511577494, + "grad_norm": 0.5361452102661133, + "learning_rate": 3.8260031540213776e-05, + "loss": 1.6479, + "step": 7050 + }, + { + "epoch": 1.8529247544914114, + "grad_norm": 0.5446274876594543, + "learning_rate": 3.8242509199229024e-05, + "loss": 1.6576, + "step": 7052 + }, + { + "epoch": 1.853450257825073, + "grad_norm": 0.6107861995697021, + "learning_rate": 3.822498685824426e-05, + "loss": 1.6777, + "step": 7054 + }, + { + "epoch": 1.8539757611587349, + "grad_norm": 0.526027262210846, + "learning_rate": 3.8207464517259505e-05, + "loss": 1.6423, + "step": 7056 + }, + { + "epoch": 1.8545012644923966, + "grad_norm": 0.5729789137840271, + "learning_rate": 3.818994217627475e-05, + "loss": 1.6364, + "step": 7058 + }, + { + "epoch": 1.8550267678260584, + "grad_norm": 0.5303583145141602, + "learning_rate": 3.8172419835289993e-05, + "loss": 1.6984, + "step": 7060 + }, + { + "epoch": 1.8555522711597203, + "grad_norm": 0.6077755093574524, + "learning_rate": 3.815489749430524e-05, + "loss": 1.6934, + "step": 7062 + }, + { + "epoch": 1.8560777744933818, + "grad_norm": 0.5593380331993103, + "learning_rate": 3.813737515332049e-05, + "loss": 1.6546, + "step": 7064 + }, + { + "epoch": 1.8566032778270438, + "grad_norm": 0.5528572797775269, + "learning_rate": 3.811985281233573e-05, + "loss": 1.6722, + "step": 7066 + }, + { + "epoch": 1.8571287811607053, + "grad_norm": 0.5037624835968018, + "learning_rate": 3.810233047135097e-05, + "loss": 1.6701, + "step": 7068 + }, + { + "epoch": 1.8576542844943673, + "grad_norm": 0.5588459968566895, + "learning_rate": 3.808480813036622e-05, + "loss": 1.6871, + "step": 7070 + }, + { + "epoch": 1.858179787828029, + "grad_norm": 0.5192500948905945, + "learning_rate": 3.8067285789381465e-05, + "loss": 1.6539, + "step": 7072 + }, + { + "epoch": 1.8587052911616908, + "grad_norm": 0.5383935570716858, + "learning_rate": 3.8049763448396706e-05, + "loss": 1.672, + "step": 7074 + }, + { + "epoch": 1.8592307944953526, + "grad_norm": 0.6632468104362488, + "learning_rate": 3.8032241107411954e-05, + "loss": 1.6735, + "step": 7076 + }, + { + "epoch": 1.8597562978290143, + "grad_norm": 0.5410235524177551, + "learning_rate": 3.80147187664272e-05, + "loss": 1.6374, + "step": 7078 + }, + { + "epoch": 1.8602818011626763, + "grad_norm": 0.7134387493133545, + "learning_rate": 3.7997196425442435e-05, + "loss": 1.6661, + "step": 7080 + }, + { + "epoch": 1.8608073044963378, + "grad_norm": 0.54608154296875, + "learning_rate": 3.797967408445768e-05, + "loss": 1.6297, + "step": 7082 + }, + { + "epoch": 1.8613328078299998, + "grad_norm": 0.6386792659759521, + "learning_rate": 3.796215174347293e-05, + "loss": 1.6489, + "step": 7084 + }, + { + "epoch": 1.8618583111636613, + "grad_norm": 0.5409175157546997, + "learning_rate": 3.794462940248817e-05, + "loss": 1.6609, + "step": 7086 + }, + { + "epoch": 1.8623838144973233, + "grad_norm": 0.611273467540741, + "learning_rate": 3.792710706150342e-05, + "loss": 1.6685, + "step": 7088 + }, + { + "epoch": 1.862909317830985, + "grad_norm": 0.6014897227287292, + "learning_rate": 3.7909584720518666e-05, + "loss": 1.6068, + "step": 7090 + }, + { + "epoch": 1.8634348211646468, + "grad_norm": 0.5631469488143921, + "learning_rate": 3.789206237953391e-05, + "loss": 1.6478, + "step": 7092 + }, + { + "epoch": 1.8639603244983085, + "grad_norm": 0.566448450088501, + "learning_rate": 3.787454003854915e-05, + "loss": 1.6861, + "step": 7094 + }, + { + "epoch": 1.8644858278319703, + "grad_norm": 0.5702322721481323, + "learning_rate": 3.7857017697564395e-05, + "loss": 1.6507, + "step": 7096 + }, + { + "epoch": 1.8650113311656322, + "grad_norm": 0.5720840096473694, + "learning_rate": 3.783949535657964e-05, + "loss": 1.6615, + "step": 7098 + }, + { + "epoch": 1.8655368344992938, + "grad_norm": 0.586739718914032, + "learning_rate": 3.7821973015594884e-05, + "loss": 1.6837, + "step": 7100 + }, + { + "epoch": 1.8660623378329557, + "grad_norm": 0.5152260065078735, + "learning_rate": 3.780445067461013e-05, + "loss": 1.6565, + "step": 7102 + }, + { + "epoch": 1.8665878411666172, + "grad_norm": 0.5608677864074707, + "learning_rate": 3.778692833362538e-05, + "loss": 1.6448, + "step": 7104 + }, + { + "epoch": 1.8671133445002792, + "grad_norm": 0.5977829694747925, + "learning_rate": 3.776940599264062e-05, + "loss": 1.6415, + "step": 7106 + }, + { + "epoch": 1.867638847833941, + "grad_norm": 0.6419270038604736, + "learning_rate": 3.775188365165586e-05, + "loss": 1.6032, + "step": 7108 + }, + { + "epoch": 1.8681643511676027, + "grad_norm": 0.6474549770355225, + "learning_rate": 3.773436131067111e-05, + "loss": 1.6717, + "step": 7110 + }, + { + "epoch": 1.8686898545012645, + "grad_norm": 0.6148583889007568, + "learning_rate": 3.7716838969686356e-05, + "loss": 1.6476, + "step": 7112 + }, + { + "epoch": 1.8692153578349262, + "grad_norm": 0.5739107728004456, + "learning_rate": 3.7699316628701596e-05, + "loss": 1.652, + "step": 7114 + }, + { + "epoch": 1.8697408611685882, + "grad_norm": 0.5666532516479492, + "learning_rate": 3.7681794287716844e-05, + "loss": 1.6611, + "step": 7116 + }, + { + "epoch": 1.8702663645022497, + "grad_norm": 0.6231557726860046, + "learning_rate": 3.7664271946732085e-05, + "loss": 1.6285, + "step": 7118 + }, + { + "epoch": 1.8707918678359117, + "grad_norm": 0.5692910552024841, + "learning_rate": 3.7646749605747325e-05, + "loss": 1.6195, + "step": 7120 + }, + { + "epoch": 1.8713173711695732, + "grad_norm": 0.5744662880897522, + "learning_rate": 3.762922726476257e-05, + "loss": 1.6603, + "step": 7122 + }, + { + "epoch": 1.8718428745032352, + "grad_norm": 0.5582786798477173, + "learning_rate": 3.761170492377782e-05, + "loss": 1.6719, + "step": 7124 + }, + { + "epoch": 1.872368377836897, + "grad_norm": 0.5148811340332031, + "learning_rate": 3.759418258279306e-05, + "loss": 1.6329, + "step": 7126 + }, + { + "epoch": 1.8728938811705587, + "grad_norm": 0.5253287553787231, + "learning_rate": 3.757666024180831e-05, + "loss": 1.6576, + "step": 7128 + }, + { + "epoch": 1.8734193845042204, + "grad_norm": 0.5456867814064026, + "learning_rate": 3.7559137900823557e-05, + "loss": 1.6682, + "step": 7130 + }, + { + "epoch": 1.8739448878378822, + "grad_norm": 0.5340244770050049, + "learning_rate": 3.75416155598388e-05, + "loss": 1.6758, + "step": 7132 + }, + { + "epoch": 1.8744703911715441, + "grad_norm": 0.5905424356460571, + "learning_rate": 3.752409321885404e-05, + "loss": 1.6707, + "step": 7134 + }, + { + "epoch": 1.8749958945052057, + "grad_norm": 0.5440637469291687, + "learning_rate": 3.7506570877869286e-05, + "loss": 1.6507, + "step": 7136 + }, + { + "epoch": 1.8755213978388676, + "grad_norm": 0.751801073551178, + "learning_rate": 3.748904853688453e-05, + "loss": 1.6656, + "step": 7138 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.5282323956489563, + "learning_rate": 3.7471526195899774e-05, + "loss": 1.6808, + "step": 7140 + }, + { + "epoch": 1.8765724045061911, + "grad_norm": 0.5995266437530518, + "learning_rate": 3.745400385491502e-05, + "loss": 1.6374, + "step": 7142 + }, + { + "epoch": 1.8770979078398529, + "grad_norm": 0.5131629705429077, + "learning_rate": 3.743648151393026e-05, + "loss": 1.631, + "step": 7144 + }, + { + "epoch": 1.8776234111735146, + "grad_norm": 0.5343523025512695, + "learning_rate": 3.74189591729455e-05, + "loss": 1.6253, + "step": 7146 + }, + { + "epoch": 1.8781489145071764, + "grad_norm": 0.6353945136070251, + "learning_rate": 3.740143683196075e-05, + "loss": 1.6843, + "step": 7148 + }, + { + "epoch": 1.878674417840838, + "grad_norm": 0.5876971483230591, + "learning_rate": 3.7383914490976e-05, + "loss": 1.6692, + "step": 7150 + }, + { + "epoch": 1.8791999211745, + "grad_norm": 0.5077223181724548, + "learning_rate": 3.736639214999124e-05, + "loss": 1.6095, + "step": 7152 + }, + { + "epoch": 1.8797254245081616, + "grad_norm": 0.5225921273231506, + "learning_rate": 3.734886980900649e-05, + "loss": 1.6475, + "step": 7154 + }, + { + "epoch": 1.8802509278418236, + "grad_norm": 0.549243688583374, + "learning_rate": 3.733134746802173e-05, + "loss": 1.6529, + "step": 7156 + }, + { + "epoch": 1.8807764311754853, + "grad_norm": 0.5567914843559265, + "learning_rate": 3.7313825127036975e-05, + "loss": 1.6621, + "step": 7158 + }, + { + "epoch": 1.881301934509147, + "grad_norm": 0.5527283549308777, + "learning_rate": 3.7296302786052216e-05, + "loss": 1.6705, + "step": 7160 + }, + { + "epoch": 1.8818274378428088, + "grad_norm": 0.5174548625946045, + "learning_rate": 3.727878044506746e-05, + "loss": 1.6606, + "step": 7162 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.527554988861084, + "learning_rate": 3.726125810408271e-05, + "loss": 1.6543, + "step": 7164 + }, + { + "epoch": 1.8828784445101323, + "grad_norm": 0.6432197093963623, + "learning_rate": 3.724373576309795e-05, + "loss": 1.6551, + "step": 7166 + }, + { + "epoch": 1.883403947843794, + "grad_norm": 0.6546508073806763, + "learning_rate": 3.72262134221132e-05, + "loss": 1.644, + "step": 7168 + }, + { + "epoch": 1.883929451177456, + "grad_norm": 0.5889551043510437, + "learning_rate": 3.720869108112844e-05, + "loss": 1.6767, + "step": 7170 + }, + { + "epoch": 1.8844549545111176, + "grad_norm": 0.5616403222084045, + "learning_rate": 3.719116874014368e-05, + "loss": 1.661, + "step": 7172 + }, + { + "epoch": 1.8849804578447795, + "grad_norm": 0.7210134267807007, + "learning_rate": 3.717364639915893e-05, + "loss": 1.6527, + "step": 7174 + }, + { + "epoch": 1.8855059611784413, + "grad_norm": 0.5026001930236816, + "learning_rate": 3.7156124058174176e-05, + "loss": 1.6589, + "step": 7176 + }, + { + "epoch": 1.886031464512103, + "grad_norm": 0.5873162746429443, + "learning_rate": 3.713860171718942e-05, + "loss": 1.622, + "step": 7178 + }, + { + "epoch": 1.8865569678457648, + "grad_norm": 0.729069709777832, + "learning_rate": 3.7121079376204664e-05, + "loss": 1.6229, + "step": 7180 + }, + { + "epoch": 1.8870824711794265, + "grad_norm": 0.5042127966880798, + "learning_rate": 3.7103557035219905e-05, + "loss": 1.6329, + "step": 7182 + }, + { + "epoch": 1.8876079745130885, + "grad_norm": 0.5304409861564636, + "learning_rate": 3.708603469423515e-05, + "loss": 1.6427, + "step": 7184 + }, + { + "epoch": 1.88813347784675, + "grad_norm": 0.6537191271781921, + "learning_rate": 3.7068512353250393e-05, + "loss": 1.662, + "step": 7186 + }, + { + "epoch": 1.888658981180412, + "grad_norm": 0.5206599235534668, + "learning_rate": 3.705099001226564e-05, + "loss": 1.6375, + "step": 7188 + }, + { + "epoch": 1.8891844845140735, + "grad_norm": 0.6516169905662537, + "learning_rate": 3.703346767128089e-05, + "loss": 1.6364, + "step": 7190 + }, + { + "epoch": 1.8897099878477355, + "grad_norm": 0.5258002877235413, + "learning_rate": 3.701594533029613e-05, + "loss": 1.6826, + "step": 7192 + }, + { + "epoch": 1.8902354911813972, + "grad_norm": 0.5022348761558533, + "learning_rate": 3.699842298931138e-05, + "loss": 1.64, + "step": 7194 + }, + { + "epoch": 1.890760994515059, + "grad_norm": 0.585818350315094, + "learning_rate": 3.698090064832662e-05, + "loss": 1.6968, + "step": 7196 + }, + { + "epoch": 1.8912864978487207, + "grad_norm": 0.5346968770027161, + "learning_rate": 3.696337830734186e-05, + "loss": 1.6514, + "step": 7198 + }, + { + "epoch": 1.8918120011823825, + "grad_norm": 0.6125622391700745, + "learning_rate": 3.6945855966357106e-05, + "loss": 1.6467, + "step": 7200 + }, + { + "epoch": 1.8918120011823825, + "eval_loss": 1.66790771484375, + "eval_runtime": 487.3046, + "eval_samples_per_second": 249.924, + "eval_steps_per_second": 31.241, + "step": 7200 + }, + { + "epoch": 1.8923375045160444, + "grad_norm": 0.5719324350357056, + "learning_rate": 3.6928333625372354e-05, + "loss": 1.6776, + "step": 7202 + }, + { + "epoch": 1.892863007849706, + "grad_norm": 0.5245485901832581, + "learning_rate": 3.6910811284387594e-05, + "loss": 1.6324, + "step": 7204 + }, + { + "epoch": 1.893388511183368, + "grad_norm": 0.5099211931228638, + "learning_rate": 3.689328894340284e-05, + "loss": 1.6764, + "step": 7206 + }, + { + "epoch": 1.8939140145170295, + "grad_norm": 0.7997536659240723, + "learning_rate": 3.687576660241808e-05, + "loss": 1.6619, + "step": 7208 + }, + { + "epoch": 1.8944395178506914, + "grad_norm": 0.5983949899673462, + "learning_rate": 3.685824426143333e-05, + "loss": 1.6557, + "step": 7210 + }, + { + "epoch": 1.8949650211843532, + "grad_norm": 0.6213306784629822, + "learning_rate": 3.684072192044857e-05, + "loss": 1.6559, + "step": 7212 + }, + { + "epoch": 1.895490524518015, + "grad_norm": 0.5697503685951233, + "learning_rate": 3.682319957946382e-05, + "loss": 1.6754, + "step": 7214 + }, + { + "epoch": 1.8960160278516767, + "grad_norm": 0.507168710231781, + "learning_rate": 3.6805677238479066e-05, + "loss": 1.6618, + "step": 7216 + }, + { + "epoch": 1.8965415311853384, + "grad_norm": 0.6169989109039307, + "learning_rate": 3.678815489749431e-05, + "loss": 1.6329, + "step": 7218 + }, + { + "epoch": 1.8970670345190004, + "grad_norm": 0.516326904296875, + "learning_rate": 3.677063255650955e-05, + "loss": 1.65, + "step": 7220 + }, + { + "epoch": 1.897592537852662, + "grad_norm": 0.5588873624801636, + "learning_rate": 3.6753110215524795e-05, + "loss": 1.6707, + "step": 7222 + }, + { + "epoch": 1.8981180411863239, + "grad_norm": 0.7022035717964172, + "learning_rate": 3.6735587874540036e-05, + "loss": 1.6935, + "step": 7224 + }, + { + "epoch": 1.8986435445199854, + "grad_norm": 0.5556198954582214, + "learning_rate": 3.6718065533555284e-05, + "loss": 1.6955, + "step": 7226 + }, + { + "epoch": 1.8991690478536474, + "grad_norm": 0.6451961398124695, + "learning_rate": 3.670054319257053e-05, + "loss": 1.6607, + "step": 7228 + }, + { + "epoch": 1.8996945511873091, + "grad_norm": 0.603360116481781, + "learning_rate": 3.668302085158577e-05, + "loss": 1.626, + "step": 7230 + }, + { + "epoch": 1.9002200545209709, + "grad_norm": 0.7140105962753296, + "learning_rate": 3.666549851060102e-05, + "loss": 1.6587, + "step": 7232 + }, + { + "epoch": 1.9007455578546326, + "grad_norm": 0.5646045804023743, + "learning_rate": 3.664797616961626e-05, + "loss": 1.6681, + "step": 7234 + }, + { + "epoch": 1.9012710611882944, + "grad_norm": 0.5183248519897461, + "learning_rate": 3.663045382863151e-05, + "loss": 1.6912, + "step": 7236 + }, + { + "epoch": 1.9017965645219563, + "grad_norm": 0.6256281137466431, + "learning_rate": 3.661293148764675e-05, + "loss": 1.6699, + "step": 7238 + }, + { + "epoch": 1.9023220678556179, + "grad_norm": 0.5462591052055359, + "learning_rate": 3.6595409146661996e-05, + "loss": 1.6628, + "step": 7240 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.715267539024353, + "learning_rate": 3.6577886805677244e-05, + "loss": 1.6907, + "step": 7242 + }, + { + "epoch": 1.9033730745229414, + "grad_norm": 0.8691731095314026, + "learning_rate": 3.6560364464692485e-05, + "loss": 1.6982, + "step": 7244 + }, + { + "epoch": 1.9038985778566033, + "grad_norm": 0.55055171251297, + "learning_rate": 3.6542842123707725e-05, + "loss": 1.6757, + "step": 7246 + }, + { + "epoch": 1.904424081190265, + "grad_norm": 0.5819158554077148, + "learning_rate": 3.652531978272297e-05, + "loss": 1.6571, + "step": 7248 + }, + { + "epoch": 1.9049495845239268, + "grad_norm": 0.6203599572181702, + "learning_rate": 3.650779744173822e-05, + "loss": 1.7292, + "step": 7250 + }, + { + "epoch": 1.9054750878575886, + "grad_norm": 0.5759249925613403, + "learning_rate": 3.649027510075346e-05, + "loss": 1.6227, + "step": 7252 + }, + { + "epoch": 1.9060005911912503, + "grad_norm": 0.581551730632782, + "learning_rate": 3.647275275976871e-05, + "loss": 1.6741, + "step": 7254 + }, + { + "epoch": 1.9065260945249123, + "grad_norm": 0.6072301864624023, + "learning_rate": 3.6455230418783956e-05, + "loss": 1.6663, + "step": 7256 + }, + { + "epoch": 1.9070515978585738, + "grad_norm": 0.5655650496482849, + "learning_rate": 3.643770807779919e-05, + "loss": 1.6729, + "step": 7258 + }, + { + "epoch": 1.9075771011922358, + "grad_norm": 0.5449069738388062, + "learning_rate": 3.642018573681444e-05, + "loss": 1.6514, + "step": 7260 + }, + { + "epoch": 1.9081026045258973, + "grad_norm": 0.573408842086792, + "learning_rate": 3.6402663395829686e-05, + "loss": 1.6718, + "step": 7262 + }, + { + "epoch": 1.9086281078595593, + "grad_norm": 0.7997104525566101, + "learning_rate": 3.6385141054844926e-05, + "loss": 1.702, + "step": 7264 + }, + { + "epoch": 1.909153611193221, + "grad_norm": 0.565268337726593, + "learning_rate": 3.6367618713860174e-05, + "loss": 1.6244, + "step": 7266 + }, + { + "epoch": 1.9096791145268828, + "grad_norm": 0.5995902419090271, + "learning_rate": 3.635009637287542e-05, + "loss": 1.6694, + "step": 7268 + }, + { + "epoch": 1.9102046178605445, + "grad_norm": 0.6907638907432556, + "learning_rate": 3.633257403189066e-05, + "loss": 1.6425, + "step": 7270 + }, + { + "epoch": 1.9107301211942063, + "grad_norm": 0.49709975719451904, + "learning_rate": 3.63150516909059e-05, + "loss": 1.6564, + "step": 7272 + }, + { + "epoch": 1.9112556245278682, + "grad_norm": 0.5460817813873291, + "learning_rate": 3.629752934992115e-05, + "loss": 1.6822, + "step": 7274 + }, + { + "epoch": 1.9117811278615298, + "grad_norm": 0.6267193555831909, + "learning_rate": 3.62800070089364e-05, + "loss": 1.6325, + "step": 7276 + }, + { + "epoch": 1.9123066311951917, + "grad_norm": 0.5849470496177673, + "learning_rate": 3.626248466795164e-05, + "loss": 1.6596, + "step": 7278 + }, + { + "epoch": 1.9128321345288533, + "grad_norm": 0.5416064262390137, + "learning_rate": 3.6244962326966887e-05, + "loss": 1.6786, + "step": 7280 + }, + { + "epoch": 1.9133576378625152, + "grad_norm": 0.6678909063339233, + "learning_rate": 3.6227439985982134e-05, + "loss": 1.6392, + "step": 7282 + }, + { + "epoch": 1.913883141196177, + "grad_norm": 0.5513505935668945, + "learning_rate": 3.620991764499737e-05, + "loss": 1.6942, + "step": 7284 + }, + { + "epoch": 1.9144086445298387, + "grad_norm": 0.7723992466926575, + "learning_rate": 3.6192395304012616e-05, + "loss": 1.6853, + "step": 7286 + }, + { + "epoch": 1.9149341478635005, + "grad_norm": 0.610593318939209, + "learning_rate": 3.617487296302786e-05, + "loss": 1.6927, + "step": 7288 + }, + { + "epoch": 1.9154596511971622, + "grad_norm": 0.6118401288986206, + "learning_rate": 3.6157350622043104e-05, + "loss": 1.6397, + "step": 7290 + }, + { + "epoch": 1.9159851545308242, + "grad_norm": 0.5796778798103333, + "learning_rate": 3.613982828105835e-05, + "loss": 1.6625, + "step": 7292 + }, + { + "epoch": 1.9165106578644857, + "grad_norm": 0.520003080368042, + "learning_rate": 3.61223059400736e-05, + "loss": 1.6902, + "step": 7294 + }, + { + "epoch": 1.9170361611981477, + "grad_norm": 0.6326977610588074, + "learning_rate": 3.610478359908884e-05, + "loss": 1.6767, + "step": 7296 + }, + { + "epoch": 1.9175616645318094, + "grad_norm": 0.4994673728942871, + "learning_rate": 3.608726125810408e-05, + "loss": 1.625, + "step": 7298 + }, + { + "epoch": 1.9180871678654712, + "grad_norm": 0.4929233193397522, + "learning_rate": 3.606973891711933e-05, + "loss": 1.7034, + "step": 7300 + }, + { + "epoch": 1.918612671199133, + "grad_norm": 0.5794808268547058, + "learning_rate": 3.6052216576134576e-05, + "loss": 1.6522, + "step": 7302 + }, + { + "epoch": 1.9191381745327947, + "grad_norm": 0.5688311457633972, + "learning_rate": 3.603469423514982e-05, + "loss": 1.7031, + "step": 7304 + }, + { + "epoch": 1.9196636778664564, + "grad_norm": 0.5700064897537231, + "learning_rate": 3.6017171894165064e-05, + "loss": 1.6469, + "step": 7306 + }, + { + "epoch": 1.9201891812001182, + "grad_norm": 0.4965689480304718, + "learning_rate": 3.599964955318031e-05, + "loss": 1.6539, + "step": 7308 + }, + { + "epoch": 1.9207146845337801, + "grad_norm": 0.6212711334228516, + "learning_rate": 3.5982127212195546e-05, + "loss": 1.6558, + "step": 7310 + }, + { + "epoch": 1.9212401878674417, + "grad_norm": 0.5985603928565979, + "learning_rate": 3.5964604871210793e-05, + "loss": 1.6462, + "step": 7312 + }, + { + "epoch": 1.9217656912011036, + "grad_norm": 0.6384910345077515, + "learning_rate": 3.594708253022604e-05, + "loss": 1.6279, + "step": 7314 + }, + { + "epoch": 1.9222911945347654, + "grad_norm": 0.5548694133758545, + "learning_rate": 3.592956018924128e-05, + "loss": 1.6488, + "step": 7316 + }, + { + "epoch": 1.9228166978684271, + "grad_norm": 0.5115952491760254, + "learning_rate": 3.591203784825653e-05, + "loss": 1.6557, + "step": 7318 + }, + { + "epoch": 1.9233422012020889, + "grad_norm": 0.564540684223175, + "learning_rate": 3.589451550727178e-05, + "loss": 1.6506, + "step": 7320 + }, + { + "epoch": 1.9238677045357506, + "grad_norm": 0.5424622297286987, + "learning_rate": 3.587699316628702e-05, + "loss": 1.6837, + "step": 7322 + }, + { + "epoch": 1.9243932078694124, + "grad_norm": 0.6576436758041382, + "learning_rate": 3.585947082530226e-05, + "loss": 1.6663, + "step": 7324 + }, + { + "epoch": 1.9249187112030741, + "grad_norm": 0.6056269407272339, + "learning_rate": 3.5841948484317506e-05, + "loss": 1.6427, + "step": 7326 + }, + { + "epoch": 1.925444214536736, + "grad_norm": 0.5156649947166443, + "learning_rate": 3.5824426143332754e-05, + "loss": 1.649, + "step": 7328 + }, + { + "epoch": 1.9259697178703976, + "grad_norm": 0.5065081119537354, + "learning_rate": 3.5806903802347994e-05, + "loss": 1.6422, + "step": 7330 + }, + { + "epoch": 1.9264952212040596, + "grad_norm": 0.5199773907661438, + "learning_rate": 3.578938146136324e-05, + "loss": 1.634, + "step": 7332 + }, + { + "epoch": 1.9270207245377213, + "grad_norm": 0.5321483016014099, + "learning_rate": 3.577185912037849e-05, + "loss": 1.6501, + "step": 7334 + }, + { + "epoch": 1.927546227871383, + "grad_norm": 0.5092171430587769, + "learning_rate": 3.5754336779393724e-05, + "loss": 1.6337, + "step": 7336 + }, + { + "epoch": 1.9280717312050448, + "grad_norm": 0.4993257224559784, + "learning_rate": 3.573681443840897e-05, + "loss": 1.647, + "step": 7338 + }, + { + "epoch": 1.9285972345387066, + "grad_norm": 0.7736045718193054, + "learning_rate": 3.571929209742422e-05, + "loss": 1.645, + "step": 7340 + }, + { + "epoch": 1.9291227378723685, + "grad_norm": 0.5889518857002258, + "learning_rate": 3.570176975643946e-05, + "loss": 1.6412, + "step": 7342 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5440062880516052, + "learning_rate": 3.568424741545471e-05, + "loss": 1.6537, + "step": 7344 + }, + { + "epoch": 1.930173744539692, + "grad_norm": 0.5857016444206238, + "learning_rate": 3.5666725074469955e-05, + "loss": 1.6627, + "step": 7346 + }, + { + "epoch": 1.9306992478733536, + "grad_norm": 0.5349956750869751, + "learning_rate": 3.5649202733485195e-05, + "loss": 1.6621, + "step": 7348 + }, + { + "epoch": 1.9312247512070155, + "grad_norm": 0.6116125583648682, + "learning_rate": 3.5631680392500436e-05, + "loss": 1.6555, + "step": 7350 + }, + { + "epoch": 1.9317502545406773, + "grad_norm": 0.5522750020027161, + "learning_rate": 3.5614158051515684e-05, + "loss": 1.6551, + "step": 7352 + }, + { + "epoch": 1.932275757874339, + "grad_norm": 0.503121554851532, + "learning_rate": 3.559663571053093e-05, + "loss": 1.6105, + "step": 7354 + }, + { + "epoch": 1.9328012612080008, + "grad_norm": 0.7201147079467773, + "learning_rate": 3.557911336954617e-05, + "loss": 1.6763, + "step": 7356 + }, + { + "epoch": 1.9333267645416625, + "grad_norm": 0.507443904876709, + "learning_rate": 3.556159102856142e-05, + "loss": 1.6476, + "step": 7358 + }, + { + "epoch": 1.9338522678753245, + "grad_norm": 0.5974195003509521, + "learning_rate": 3.554406868757666e-05, + "loss": 1.6446, + "step": 7360 + }, + { + "epoch": 1.934377771208986, + "grad_norm": 0.5196303725242615, + "learning_rate": 3.552654634659191e-05, + "loss": 1.638, + "step": 7362 + }, + { + "epoch": 1.934903274542648, + "grad_norm": 0.5367706418037415, + "learning_rate": 3.550902400560715e-05, + "loss": 1.6712, + "step": 7364 + }, + { + "epoch": 1.9354287778763095, + "grad_norm": 0.5725764632225037, + "learning_rate": 3.5491501664622396e-05, + "loss": 1.6764, + "step": 7366 + }, + { + "epoch": 1.9359542812099715, + "grad_norm": 0.6095370054244995, + "learning_rate": 3.5473979323637644e-05, + "loss": 1.6385, + "step": 7368 + }, + { + "epoch": 1.9364797845436332, + "grad_norm": 0.6413910388946533, + "learning_rate": 3.5456456982652885e-05, + "loss": 1.6665, + "step": 7370 + }, + { + "epoch": 1.937005287877295, + "grad_norm": 0.5054334998130798, + "learning_rate": 3.543893464166813e-05, + "loss": 1.6639, + "step": 7372 + }, + { + "epoch": 1.9375307912109567, + "grad_norm": 0.5909721255302429, + "learning_rate": 3.542141230068337e-05, + "loss": 1.6658, + "step": 7374 + }, + { + "epoch": 1.9380562945446185, + "grad_norm": 0.5415735244750977, + "learning_rate": 3.5403889959698614e-05, + "loss": 1.6872, + "step": 7376 + }, + { + "epoch": 1.9385817978782804, + "grad_norm": 0.5185040831565857, + "learning_rate": 3.538636761871386e-05, + "loss": 1.6371, + "step": 7378 + }, + { + "epoch": 1.939107301211942, + "grad_norm": 0.4956663250923157, + "learning_rate": 3.536884527772911e-05, + "loss": 1.6036, + "step": 7380 + }, + { + "epoch": 1.939632804545604, + "grad_norm": 0.5410330891609192, + "learning_rate": 3.535132293674435e-05, + "loss": 1.6482, + "step": 7382 + }, + { + "epoch": 1.9401583078792655, + "grad_norm": 0.6475924253463745, + "learning_rate": 3.53338005957596e-05, + "loss": 1.6297, + "step": 7384 + }, + { + "epoch": 1.9406838112129274, + "grad_norm": 0.5302926301956177, + "learning_rate": 3.531627825477484e-05, + "loss": 1.6652, + "step": 7386 + }, + { + "epoch": 1.9412093145465892, + "grad_norm": 0.6099222898483276, + "learning_rate": 3.5298755913790086e-05, + "loss": 1.6463, + "step": 7388 + }, + { + "epoch": 1.941734817880251, + "grad_norm": 0.6674348711967468, + "learning_rate": 3.5281233572805326e-05, + "loss": 1.6421, + "step": 7390 + }, + { + "epoch": 1.9422603212139127, + "grad_norm": 0.5395660400390625, + "learning_rate": 3.5263711231820574e-05, + "loss": 1.6465, + "step": 7392 + }, + { + "epoch": 1.9427858245475744, + "grad_norm": 0.629122257232666, + "learning_rate": 3.524618889083582e-05, + "loss": 1.6914, + "step": 7394 + }, + { + "epoch": 1.9433113278812364, + "grad_norm": 0.5263876914978027, + "learning_rate": 3.522866654985106e-05, + "loss": 1.7023, + "step": 7396 + }, + { + "epoch": 1.943836831214898, + "grad_norm": 0.5672745108604431, + "learning_rate": 3.52111442088663e-05, + "loss": 1.652, + "step": 7398 + }, + { + "epoch": 1.94436233454856, + "grad_norm": 0.551514744758606, + "learning_rate": 3.519362186788155e-05, + "loss": 1.6294, + "step": 7400 + }, + { + "epoch": 1.9448878378822214, + "grad_norm": 0.593605101108551, + "learning_rate": 3.517609952689679e-05, + "loss": 1.6561, + "step": 7402 + }, + { + "epoch": 1.9454133412158834, + "grad_norm": 0.5020228028297424, + "learning_rate": 3.515857718591204e-05, + "loss": 1.6724, + "step": 7404 + }, + { + "epoch": 1.9459388445495451, + "grad_norm": 0.5694274306297302, + "learning_rate": 3.5141054844927287e-05, + "loss": 1.6771, + "step": 7406 + }, + { + "epoch": 1.9464643478832069, + "grad_norm": 0.6046936511993408, + "learning_rate": 3.512353250394253e-05, + "loss": 1.6627, + "step": 7408 + }, + { + "epoch": 1.9469898512168686, + "grad_norm": 0.5441648364067078, + "learning_rate": 3.5106010162957775e-05, + "loss": 1.638, + "step": 7410 + }, + { + "epoch": 1.9475153545505304, + "grad_norm": 0.5168318748474121, + "learning_rate": 3.5088487821973016e-05, + "loss": 1.6782, + "step": 7412 + }, + { + "epoch": 1.9480408578841923, + "grad_norm": 0.5330861210823059, + "learning_rate": 3.507096548098826e-05, + "loss": 1.6427, + "step": 7414 + }, + { + "epoch": 1.9485663612178539, + "grad_norm": 0.6046448945999146, + "learning_rate": 3.5053443140003504e-05, + "loss": 1.644, + "step": 7416 + }, + { + "epoch": 1.9490918645515158, + "grad_norm": 0.5944949388504028, + "learning_rate": 3.503592079901875e-05, + "loss": 1.6684, + "step": 7418 + }, + { + "epoch": 1.9496173678851774, + "grad_norm": 0.5597866773605347, + "learning_rate": 3.5018398458034e-05, + "loss": 1.646, + "step": 7420 + }, + { + "epoch": 1.9501428712188393, + "grad_norm": 0.6573653817176819, + "learning_rate": 3.500087611704924e-05, + "loss": 1.6601, + "step": 7422 + }, + { + "epoch": 1.950668374552501, + "grad_norm": 0.5286207795143127, + "learning_rate": 3.498335377606448e-05, + "loss": 1.6259, + "step": 7424 + }, + { + "epoch": 1.9511938778861628, + "grad_norm": 0.5400940179824829, + "learning_rate": 3.496583143507973e-05, + "loss": 1.6581, + "step": 7426 + }, + { + "epoch": 1.9517193812198246, + "grad_norm": 0.6813338994979858, + "learning_rate": 3.494830909409497e-05, + "loss": 1.6552, + "step": 7428 + }, + { + "epoch": 1.9522448845534863, + "grad_norm": 0.5815770030021667, + "learning_rate": 3.493078675311022e-05, + "loss": 1.6845, + "step": 7430 + }, + { + "epoch": 1.9527703878871483, + "grad_norm": 0.5522500872612, + "learning_rate": 3.4913264412125464e-05, + "loss": 1.6749, + "step": 7432 + }, + { + "epoch": 1.9532958912208098, + "grad_norm": 0.5736134052276611, + "learning_rate": 3.4895742071140705e-05, + "loss": 1.6413, + "step": 7434 + }, + { + "epoch": 1.9538213945544718, + "grad_norm": 0.4932587742805481, + "learning_rate": 3.487821973015595e-05, + "loss": 1.6416, + "step": 7436 + }, + { + "epoch": 1.9543468978881333, + "grad_norm": 0.5147337913513184, + "learning_rate": 3.486069738917119e-05, + "loss": 1.6626, + "step": 7438 + }, + { + "epoch": 1.9548724012217953, + "grad_norm": 0.7022672891616821, + "learning_rate": 3.484317504818644e-05, + "loss": 1.6723, + "step": 7440 + }, + { + "epoch": 1.955397904555457, + "grad_norm": 0.6867913007736206, + "learning_rate": 3.482565270720168e-05, + "loss": 1.6431, + "step": 7442 + }, + { + "epoch": 1.9559234078891188, + "grad_norm": 0.5425695180892944, + "learning_rate": 3.480813036621693e-05, + "loss": 1.6566, + "step": 7444 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.6468666195869446, + "learning_rate": 3.479060802523218e-05, + "loss": 1.6638, + "step": 7446 + }, + { + "epoch": 1.9569744145564423, + "grad_norm": 0.5925945043563843, + "learning_rate": 3.477308568424742e-05, + "loss": 1.6443, + "step": 7448 + }, + { + "epoch": 1.9574999178901042, + "grad_norm": 0.6654757857322693, + "learning_rate": 3.475556334326266e-05, + "loss": 1.6064, + "step": 7450 + }, + { + "epoch": 1.9580254212237658, + "grad_norm": 0.5577012300491333, + "learning_rate": 3.4738041002277906e-05, + "loss": 1.6556, + "step": 7452 + }, + { + "epoch": 1.9585509245574277, + "grad_norm": 0.7134981751441956, + "learning_rate": 3.472051866129315e-05, + "loss": 1.6739, + "step": 7454 + }, + { + "epoch": 1.9590764278910895, + "grad_norm": 0.5233617424964905, + "learning_rate": 3.4702996320308394e-05, + "loss": 1.6466, + "step": 7456 + }, + { + "epoch": 1.9596019312247512, + "grad_norm": 0.5530445575714111, + "learning_rate": 3.468547397932364e-05, + "loss": 1.6994, + "step": 7458 + }, + { + "epoch": 1.960127434558413, + "grad_norm": 0.5197651982307434, + "learning_rate": 3.466795163833888e-05, + "loss": 1.6614, + "step": 7460 + }, + { + "epoch": 1.9606529378920747, + "grad_norm": 0.48130765557289124, + "learning_rate": 3.4650429297354123e-05, + "loss": 1.6449, + "step": 7462 + }, + { + "epoch": 1.9611784412257365, + "grad_norm": 0.49137425422668457, + "learning_rate": 3.463290695636937e-05, + "loss": 1.6527, + "step": 7464 + }, + { + "epoch": 1.9617039445593982, + "grad_norm": 0.5547472238540649, + "learning_rate": 3.461538461538462e-05, + "loss": 1.6491, + "step": 7466 + }, + { + "epoch": 1.9622294478930602, + "grad_norm": 0.5760181546211243, + "learning_rate": 3.459786227439986e-05, + "loss": 1.6794, + "step": 7468 + }, + { + "epoch": 1.9627549512267217, + "grad_norm": 0.5289864540100098, + "learning_rate": 3.458033993341511e-05, + "loss": 1.6548, + "step": 7470 + }, + { + "epoch": 1.9632804545603837, + "grad_norm": 0.5183879733085632, + "learning_rate": 3.4562817592430355e-05, + "loss": 1.6629, + "step": 7472 + }, + { + "epoch": 1.9638059578940454, + "grad_norm": 0.5636990666389465, + "learning_rate": 3.4545295251445595e-05, + "loss": 1.6591, + "step": 7474 + }, + { + "epoch": 1.9643314612277072, + "grad_norm": 0.5583269596099854, + "learning_rate": 3.4527772910460836e-05, + "loss": 1.6576, + "step": 7476 + }, + { + "epoch": 1.964856964561369, + "grad_norm": 0.5348683595657349, + "learning_rate": 3.4510250569476084e-05, + "loss": 1.6148, + "step": 7478 + }, + { + "epoch": 1.9653824678950307, + "grad_norm": 0.5336459875106812, + "learning_rate": 3.4492728228491324e-05, + "loss": 1.6704, + "step": 7480 + }, + { + "epoch": 1.9659079712286924, + "grad_norm": 0.5364224314689636, + "learning_rate": 3.447520588750657e-05, + "loss": 1.6391, + "step": 7482 + }, + { + "epoch": 1.9664334745623542, + "grad_norm": 0.5437600612640381, + "learning_rate": 3.445768354652182e-05, + "loss": 1.6817, + "step": 7484 + }, + { + "epoch": 1.9669589778960161, + "grad_norm": 0.6408882737159729, + "learning_rate": 3.444016120553706e-05, + "loss": 1.6505, + "step": 7486 + }, + { + "epoch": 1.9674844812296777, + "grad_norm": 0.5990426540374756, + "learning_rate": 3.44226388645523e-05, + "loss": 1.6634, + "step": 7488 + }, + { + "epoch": 1.9680099845633396, + "grad_norm": 0.5437130928039551, + "learning_rate": 3.440511652356755e-05, + "loss": 1.6716, + "step": 7490 + }, + { + "epoch": 1.9685354878970014, + "grad_norm": 0.6260076761245728, + "learning_rate": 3.4387594182582796e-05, + "loss": 1.6569, + "step": 7492 + }, + { + "epoch": 1.9690609912306631, + "grad_norm": 0.5225280523300171, + "learning_rate": 3.437007184159804e-05, + "loss": 1.6649, + "step": 7494 + }, + { + "epoch": 1.9695864945643249, + "grad_norm": 0.5389422178268433, + "learning_rate": 3.4352549500613285e-05, + "loss": 1.6723, + "step": 7496 + }, + { + "epoch": 1.9701119978979866, + "grad_norm": 0.6174726486206055, + "learning_rate": 3.433502715962853e-05, + "loss": 1.669, + "step": 7498 + }, + { + "epoch": 1.9706375012316486, + "grad_norm": 0.5065149664878845, + "learning_rate": 3.431750481864377e-05, + "loss": 1.6638, + "step": 7500 + }, + { + "epoch": 1.9711630045653101, + "grad_norm": 0.6950640678405762, + "learning_rate": 3.4299982477659014e-05, + "loss": 1.6713, + "step": 7502 + }, + { + "epoch": 1.971688507898972, + "grad_norm": 0.507963240146637, + "learning_rate": 3.428246013667426e-05, + "loss": 1.6332, + "step": 7504 + }, + { + "epoch": 1.9722140112326336, + "grad_norm": 0.6077314019203186, + "learning_rate": 3.426493779568951e-05, + "loss": 1.6641, + "step": 7506 + }, + { + "epoch": 1.9727395145662956, + "grad_norm": 0.5328965783119202, + "learning_rate": 3.424741545470475e-05, + "loss": 1.6593, + "step": 7508 + }, + { + "epoch": 1.9732650178999573, + "grad_norm": 0.6714265942573547, + "learning_rate": 3.422989311372e-05, + "loss": 1.6444, + "step": 7510 + }, + { + "epoch": 1.973790521233619, + "grad_norm": 0.5329544544219971, + "learning_rate": 3.4212370772735245e-05, + "loss": 1.6337, + "step": 7512 + }, + { + "epoch": 1.9743160245672808, + "grad_norm": 0.5360748767852783, + "learning_rate": 3.419484843175048e-05, + "loss": 1.6535, + "step": 7514 + }, + { + "epoch": 1.9748415279009426, + "grad_norm": 0.5412916541099548, + "learning_rate": 3.4177326090765726e-05, + "loss": 1.6504, + "step": 7516 + }, + { + "epoch": 1.9753670312346046, + "grad_norm": 0.6592147946357727, + "learning_rate": 3.4159803749780974e-05, + "loss": 1.6766, + "step": 7518 + }, + { + "epoch": 1.975892534568266, + "grad_norm": 0.5632504820823669, + "learning_rate": 3.4142281408796215e-05, + "loss": 1.6587, + "step": 7520 + }, + { + "epoch": 1.976418037901928, + "grad_norm": 0.5235944390296936, + "learning_rate": 3.412475906781146e-05, + "loss": 1.7091, + "step": 7522 + }, + { + "epoch": 1.9769435412355896, + "grad_norm": 0.7552381753921509, + "learning_rate": 3.410723672682671e-05, + "loss": 1.6765, + "step": 7524 + }, + { + "epoch": 1.9774690445692515, + "grad_norm": 0.5463106036186218, + "learning_rate": 3.408971438584195e-05, + "loss": 1.7028, + "step": 7526 + }, + { + "epoch": 1.9779945479029133, + "grad_norm": 0.5811282396316528, + "learning_rate": 3.407219204485719e-05, + "loss": 1.626, + "step": 7528 + }, + { + "epoch": 1.978520051236575, + "grad_norm": 0.5381944179534912, + "learning_rate": 3.405466970387244e-05, + "loss": 1.6557, + "step": 7530 + }, + { + "epoch": 1.9790455545702368, + "grad_norm": 0.5382105112075806, + "learning_rate": 3.4037147362887687e-05, + "loss": 1.6717, + "step": 7532 + }, + { + "epoch": 1.9795710579038985, + "grad_norm": 0.5555400252342224, + "learning_rate": 3.401962502190293e-05, + "loss": 1.6546, + "step": 7534 + }, + { + "epoch": 1.9800965612375605, + "grad_norm": 0.5068405270576477, + "learning_rate": 3.4002102680918175e-05, + "loss": 1.6378, + "step": 7536 + }, + { + "epoch": 1.980622064571222, + "grad_norm": 0.5872784852981567, + "learning_rate": 3.398458033993342e-05, + "loss": 1.6615, + "step": 7538 + }, + { + "epoch": 1.981147567904884, + "grad_norm": 0.5751393437385559, + "learning_rate": 3.3967057998948656e-05, + "loss": 1.6712, + "step": 7540 + }, + { + "epoch": 1.9816730712385455, + "grad_norm": 0.5746015310287476, + "learning_rate": 3.3949535657963904e-05, + "loss": 1.667, + "step": 7542 + }, + { + "epoch": 1.9821985745722075, + "grad_norm": 0.6387732028961182, + "learning_rate": 3.393201331697915e-05, + "loss": 1.6253, + "step": 7544 + }, + { + "epoch": 1.9827240779058692, + "grad_norm": 0.5271580815315247, + "learning_rate": 3.391449097599439e-05, + "loss": 1.6442, + "step": 7546 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.5445979237556458, + "learning_rate": 3.389696863500964e-05, + "loss": 1.6363, + "step": 7548 + }, + { + "epoch": 1.9837750845731927, + "grad_norm": 0.6095319986343384, + "learning_rate": 3.387944629402489e-05, + "loss": 1.6486, + "step": 7550 + }, + { + "epoch": 1.9843005879068545, + "grad_norm": 0.6398605704307556, + "learning_rate": 3.386192395304013e-05, + "loss": 1.6427, + "step": 7552 + }, + { + "epoch": 1.9848260912405165, + "grad_norm": 0.6042175889015198, + "learning_rate": 3.384440161205537e-05, + "loss": 1.6522, + "step": 7554 + }, + { + "epoch": 1.985351594574178, + "grad_norm": 0.550961971282959, + "learning_rate": 3.382687927107062e-05, + "loss": 1.7006, + "step": 7556 + }, + { + "epoch": 1.98587709790784, + "grad_norm": 0.5935929417610168, + "learning_rate": 3.3809356930085864e-05, + "loss": 1.6537, + "step": 7558 + }, + { + "epoch": 1.9864026012415015, + "grad_norm": 0.6039239764213562, + "learning_rate": 3.3791834589101105e-05, + "loss": 1.6623, + "step": 7560 + }, + { + "epoch": 1.9869281045751634, + "grad_norm": 0.5280980467796326, + "learning_rate": 3.377431224811635e-05, + "loss": 1.6695, + "step": 7562 + }, + { + "epoch": 1.9874536079088252, + "grad_norm": 0.5845438838005066, + "learning_rate": 3.375678990713159e-05, + "loss": 1.6387, + "step": 7564 + }, + { + "epoch": 1.987979111242487, + "grad_norm": 0.5099235773086548, + "learning_rate": 3.3739267566146834e-05, + "loss": 1.644, + "step": 7566 + }, + { + "epoch": 1.9885046145761487, + "grad_norm": 0.6559625864028931, + "learning_rate": 3.372174522516208e-05, + "loss": 1.6683, + "step": 7568 + }, + { + "epoch": 1.9890301179098104, + "grad_norm": 0.5141841173171997, + "learning_rate": 3.370422288417733e-05, + "loss": 1.6487, + "step": 7570 + }, + { + "epoch": 1.9895556212434724, + "grad_norm": 0.5415523052215576, + "learning_rate": 3.368670054319257e-05, + "loss": 1.6654, + "step": 7572 + }, + { + "epoch": 1.990081124577134, + "grad_norm": 0.5395537614822388, + "learning_rate": 3.366917820220782e-05, + "loss": 1.6397, + "step": 7574 + }, + { + "epoch": 1.990606627910796, + "grad_norm": 0.7312270402908325, + "learning_rate": 3.3651655861223065e-05, + "loss": 1.628, + "step": 7576 + }, + { + "epoch": 1.9911321312444574, + "grad_norm": 0.7088197469711304, + "learning_rate": 3.3634133520238306e-05, + "loss": 1.6725, + "step": 7578 + }, + { + "epoch": 1.9916576345781194, + "grad_norm": 0.5116075277328491, + "learning_rate": 3.361661117925355e-05, + "loss": 1.6307, + "step": 7580 + }, + { + "epoch": 1.9921831379117811, + "grad_norm": 0.5469351410865784, + "learning_rate": 3.3599088838268794e-05, + "loss": 1.6465, + "step": 7582 + }, + { + "epoch": 1.992708641245443, + "grad_norm": 0.5921772718429565, + "learning_rate": 3.358156649728404e-05, + "loss": 1.6809, + "step": 7584 + }, + { + "epoch": 1.9932341445791046, + "grad_norm": 0.5668914318084717, + "learning_rate": 3.356404415629928e-05, + "loss": 1.639, + "step": 7586 + }, + { + "epoch": 1.9937596479127664, + "grad_norm": 0.5449174642562866, + "learning_rate": 3.354652181531453e-05, + "loss": 1.6501, + "step": 7588 + }, + { + "epoch": 1.9942851512464284, + "grad_norm": 0.6408663988113403, + "learning_rate": 3.352899947432977e-05, + "loss": 1.6949, + "step": 7590 + }, + { + "epoch": 1.9948106545800899, + "grad_norm": 0.6358173489570618, + "learning_rate": 3.351147713334501e-05, + "loss": 1.6808, + "step": 7592 + }, + { + "epoch": 1.9953361579137519, + "grad_norm": 0.5143889784812927, + "learning_rate": 3.349395479236026e-05, + "loss": 1.6659, + "step": 7594 + }, + { + "epoch": 1.9958616612474134, + "grad_norm": 0.5499483346939087, + "learning_rate": 3.347643245137551e-05, + "loss": 1.6649, + "step": 7596 + }, + { + "epoch": 1.9963871645810753, + "grad_norm": 0.5296257138252258, + "learning_rate": 3.345891011039075e-05, + "loss": 1.6784, + "step": 7598 + }, + { + "epoch": 1.996912667914737, + "grad_norm": 0.6414445638656616, + "learning_rate": 3.3441387769405995e-05, + "loss": 1.6672, + "step": 7600 + }, + { + "epoch": 1.996912667914737, + "eval_loss": 1.6618915796279907, + "eval_runtime": 487.2163, + "eval_samples_per_second": 249.969, + "eval_steps_per_second": 31.247, + "step": 7600 + }, + { + "epoch": 1.9974381712483988, + "grad_norm": 0.5835072994232178, + "learning_rate": 3.3423865428421236e-05, + "loss": 1.6681, + "step": 7602 + }, + { + "epoch": 1.9979636745820606, + "grad_norm": 0.5567203164100647, + "learning_rate": 3.3406343087436484e-05, + "loss": 1.6592, + "step": 7604 + }, + { + "epoch": 1.9984891779157223, + "grad_norm": 0.6631194949150085, + "learning_rate": 3.3388820746451724e-05, + "loss": 1.6551, + "step": 7606 + }, + { + "epoch": 1.9990146812493843, + "grad_norm": 0.6589709520339966, + "learning_rate": 3.337129840546697e-05, + "loss": 1.6358, + "step": 7608 + }, + { + "epoch": 1.9995401845830458, + "grad_norm": 0.5504697561264038, + "learning_rate": 3.335377606448222e-05, + "loss": 1.6477, + "step": 7610 + }, + { + "epoch": 2.000065687916708, + "grad_norm": 0.4931085705757141, + "learning_rate": 3.333625372349746e-05, + "loss": 1.6487, + "step": 7612 + }, + { + "epoch": 2.0005911912503693, + "grad_norm": 0.6642125844955444, + "learning_rate": 3.331873138251271e-05, + "loss": 1.6306, + "step": 7614 + }, + { + "epoch": 2.0011166945840313, + "grad_norm": 0.5301389098167419, + "learning_rate": 3.330120904152795e-05, + "loss": 1.574, + "step": 7616 + }, + { + "epoch": 2.001642197917693, + "grad_norm": 0.5445417761802673, + "learning_rate": 3.3283686700543196e-05, + "loss": 1.6205, + "step": 7618 + }, + { + "epoch": 2.002167701251355, + "grad_norm": 0.5667280554771423, + "learning_rate": 3.326616435955844e-05, + "loss": 1.6149, + "step": 7620 + }, + { + "epoch": 2.0026932045850168, + "grad_norm": 0.5716310739517212, + "learning_rate": 3.3248642018573685e-05, + "loss": 1.6333, + "step": 7622 + }, + { + "epoch": 2.0032187079186783, + "grad_norm": 0.7009231448173523, + "learning_rate": 3.323111967758893e-05, + "loss": 1.6281, + "step": 7624 + }, + { + "epoch": 2.0037442112523403, + "grad_norm": 0.5482553839683533, + "learning_rate": 3.321359733660417e-05, + "loss": 1.6054, + "step": 7626 + }, + { + "epoch": 2.004269714586002, + "grad_norm": 0.48852571845054626, + "learning_rate": 3.3196074995619414e-05, + "loss": 1.636, + "step": 7628 + }, + { + "epoch": 2.0047952179196638, + "grad_norm": 0.5163108110427856, + "learning_rate": 3.317855265463466e-05, + "loss": 1.6036, + "step": 7630 + }, + { + "epoch": 2.0053207212533253, + "grad_norm": 0.5240321755409241, + "learning_rate": 3.31610303136499e-05, + "loss": 1.5722, + "step": 7632 + }, + { + "epoch": 2.0058462245869872, + "grad_norm": 0.5390346646308899, + "learning_rate": 3.314350797266515e-05, + "loss": 1.5939, + "step": 7634 + }, + { + "epoch": 2.006371727920649, + "grad_norm": 0.5812144875526428, + "learning_rate": 3.31259856316804e-05, + "loss": 1.6445, + "step": 7636 + }, + { + "epoch": 2.0068972312543107, + "grad_norm": 0.6792988777160645, + "learning_rate": 3.310846329069564e-05, + "loss": 1.6003, + "step": 7638 + }, + { + "epoch": 2.0074227345879727, + "grad_norm": 0.6539336442947388, + "learning_rate": 3.309094094971088e-05, + "loss": 1.632, + "step": 7640 + }, + { + "epoch": 2.0079482379216342, + "grad_norm": 0.6297439336776733, + "learning_rate": 3.3073418608726126e-05, + "loss": 1.6148, + "step": 7642 + }, + { + "epoch": 2.008473741255296, + "grad_norm": 0.6377638578414917, + "learning_rate": 3.3055896267741374e-05, + "loss": 1.6097, + "step": 7644 + }, + { + "epoch": 2.0089992445889577, + "grad_norm": 0.5924256443977356, + "learning_rate": 3.3038373926756615e-05, + "loss": 1.6239, + "step": 7646 + }, + { + "epoch": 2.0095247479226197, + "grad_norm": 0.541309118270874, + "learning_rate": 3.302085158577186e-05, + "loss": 1.6145, + "step": 7648 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.708676278591156, + "learning_rate": 3.300332924478711e-05, + "loss": 1.6037, + "step": 7650 + }, + { + "epoch": 2.010575754589943, + "grad_norm": 0.6031850576400757, + "learning_rate": 3.298580690380235e-05, + "loss": 1.6071, + "step": 7652 + }, + { + "epoch": 2.011101257923605, + "grad_norm": 0.6296379566192627, + "learning_rate": 3.296828456281759e-05, + "loss": 1.6348, + "step": 7654 + }, + { + "epoch": 2.0116267612572667, + "grad_norm": 0.5863556861877441, + "learning_rate": 3.295076222183284e-05, + "loss": 1.6446, + "step": 7656 + }, + { + "epoch": 2.0121522645909287, + "grad_norm": 0.5582921504974365, + "learning_rate": 3.293323988084808e-05, + "loss": 1.62, + "step": 7658 + }, + { + "epoch": 2.01267776792459, + "grad_norm": 0.5858394503593445, + "learning_rate": 3.291571753986333e-05, + "loss": 1.5987, + "step": 7660 + }, + { + "epoch": 2.013203271258252, + "grad_norm": 0.6063563227653503, + "learning_rate": 3.2898195198878575e-05, + "loss": 1.6469, + "step": 7662 + }, + { + "epoch": 2.0137287745919137, + "grad_norm": 0.536972165107727, + "learning_rate": 3.2880672857893816e-05, + "loss": 1.6042, + "step": 7664 + }, + { + "epoch": 2.0142542779255757, + "grad_norm": 0.6241976618766785, + "learning_rate": 3.2863150516909056e-05, + "loss": 1.5904, + "step": 7666 + }, + { + "epoch": 2.014779781259237, + "grad_norm": 0.5789350271224976, + "learning_rate": 3.2845628175924304e-05, + "loss": 1.5959, + "step": 7668 + }, + { + "epoch": 2.015305284592899, + "grad_norm": 0.6051028370857239, + "learning_rate": 3.282810583493955e-05, + "loss": 1.628, + "step": 7670 + }, + { + "epoch": 2.015830787926561, + "grad_norm": 0.658258855342865, + "learning_rate": 3.281058349395479e-05, + "loss": 1.6064, + "step": 7672 + }, + { + "epoch": 2.0163562912602226, + "grad_norm": 0.5936440825462341, + "learning_rate": 3.279306115297004e-05, + "loss": 1.6102, + "step": 7674 + }, + { + "epoch": 2.0168817945938846, + "grad_norm": 0.5593782663345337, + "learning_rate": 3.277553881198529e-05, + "loss": 1.6103, + "step": 7676 + }, + { + "epoch": 2.017407297927546, + "grad_norm": 0.5200657248497009, + "learning_rate": 3.275801647100053e-05, + "loss": 1.5972, + "step": 7678 + }, + { + "epoch": 2.017932801261208, + "grad_norm": 0.6176472902297974, + "learning_rate": 3.274049413001577e-05, + "loss": 1.5623, + "step": 7680 + }, + { + "epoch": 2.0184583045948696, + "grad_norm": 0.7342426776885986, + "learning_rate": 3.272297178903102e-05, + "loss": 1.6424, + "step": 7682 + }, + { + "epoch": 2.0189838079285316, + "grad_norm": 0.5766749978065491, + "learning_rate": 3.270544944804626e-05, + "loss": 1.6093, + "step": 7684 + }, + { + "epoch": 2.019509311262193, + "grad_norm": 0.5769773125648499, + "learning_rate": 3.2687927107061505e-05, + "loss": 1.6372, + "step": 7686 + }, + { + "epoch": 2.020034814595855, + "grad_norm": 0.6358572244644165, + "learning_rate": 3.267040476607675e-05, + "loss": 1.6274, + "step": 7688 + }, + { + "epoch": 2.020560317929517, + "grad_norm": 0.5427133440971375, + "learning_rate": 3.265288242509199e-05, + "loss": 1.594, + "step": 7690 + }, + { + "epoch": 2.0210858212631786, + "grad_norm": 0.5329941511154175, + "learning_rate": 3.2635360084107234e-05, + "loss": 1.606, + "step": 7692 + }, + { + "epoch": 2.0216113245968406, + "grad_norm": 0.5538175106048584, + "learning_rate": 3.261783774312248e-05, + "loss": 1.5938, + "step": 7694 + }, + { + "epoch": 2.022136827930502, + "grad_norm": 0.5610730051994324, + "learning_rate": 3.260031540213773e-05, + "loss": 1.604, + "step": 7696 + }, + { + "epoch": 2.022662331264164, + "grad_norm": 0.6616785526275635, + "learning_rate": 3.258279306115297e-05, + "loss": 1.6107, + "step": 7698 + }, + { + "epoch": 2.0231878345978256, + "grad_norm": 0.5210354924201965, + "learning_rate": 3.256527072016822e-05, + "loss": 1.6338, + "step": 7700 + }, + { + "epoch": 2.0237133379314876, + "grad_norm": 0.7099502682685852, + "learning_rate": 3.2547748379183465e-05, + "loss": 1.632, + "step": 7702 + }, + { + "epoch": 2.024238841265149, + "grad_norm": 0.5911880731582642, + "learning_rate": 3.25302260381987e-05, + "loss": 1.5943, + "step": 7704 + }, + { + "epoch": 2.024764344598811, + "grad_norm": 0.5507931709289551, + "learning_rate": 3.251270369721395e-05, + "loss": 1.605, + "step": 7706 + }, + { + "epoch": 2.025289847932473, + "grad_norm": 0.5524379014968872, + "learning_rate": 3.2495181356229194e-05, + "loss": 1.5664, + "step": 7708 + }, + { + "epoch": 2.0258153512661345, + "grad_norm": 0.5344357490539551, + "learning_rate": 3.2477659015244435e-05, + "loss": 1.6136, + "step": 7710 + }, + { + "epoch": 2.0263408545997965, + "grad_norm": 0.6390090584754944, + "learning_rate": 3.246013667425968e-05, + "loss": 1.6033, + "step": 7712 + }, + { + "epoch": 2.026866357933458, + "grad_norm": 0.5854797959327698, + "learning_rate": 3.244261433327493e-05, + "loss": 1.5812, + "step": 7714 + }, + { + "epoch": 2.02739186126712, + "grad_norm": 0.6743679642677307, + "learning_rate": 3.242509199229017e-05, + "loss": 1.577, + "step": 7716 + }, + { + "epoch": 2.0279173646007815, + "grad_norm": 0.6346087455749512, + "learning_rate": 3.240756965130541e-05, + "loss": 1.6094, + "step": 7718 + }, + { + "epoch": 2.0284428679344435, + "grad_norm": 0.6077213883399963, + "learning_rate": 3.239004731032066e-05, + "loss": 1.6264, + "step": 7720 + }, + { + "epoch": 2.028968371268105, + "grad_norm": 0.6114009618759155, + "learning_rate": 3.237252496933591e-05, + "loss": 1.6323, + "step": 7722 + }, + { + "epoch": 2.029493874601767, + "grad_norm": 0.5229764580726624, + "learning_rate": 3.235500262835115e-05, + "loss": 1.6128, + "step": 7724 + }, + { + "epoch": 2.030019377935429, + "grad_norm": 0.7623997330665588, + "learning_rate": 3.2337480287366395e-05, + "loss": 1.5912, + "step": 7726 + }, + { + "epoch": 2.0305448812690905, + "grad_norm": 0.7716650366783142, + "learning_rate": 3.231995794638164e-05, + "loss": 1.6001, + "step": 7728 + }, + { + "epoch": 2.0310703846027525, + "grad_norm": 0.5649428963661194, + "learning_rate": 3.230243560539688e-05, + "loss": 1.6171, + "step": 7730 + }, + { + "epoch": 2.031595887936414, + "grad_norm": 0.689694344997406, + "learning_rate": 3.2284913264412124e-05, + "loss": 1.6245, + "step": 7732 + }, + { + "epoch": 2.032121391270076, + "grad_norm": 0.6371480226516724, + "learning_rate": 3.226739092342737e-05, + "loss": 1.6262, + "step": 7734 + }, + { + "epoch": 2.0326468946037375, + "grad_norm": 0.5378861427307129, + "learning_rate": 3.224986858244261e-05, + "loss": 1.6057, + "step": 7736 + }, + { + "epoch": 2.0331723979373995, + "grad_norm": 0.633033037185669, + "learning_rate": 3.223234624145786e-05, + "loss": 1.6442, + "step": 7738 + }, + { + "epoch": 2.033697901271061, + "grad_norm": 0.6772140860557556, + "learning_rate": 3.221482390047311e-05, + "loss": 1.5639, + "step": 7740 + }, + { + "epoch": 2.034223404604723, + "grad_norm": 0.5355550050735474, + "learning_rate": 3.219730155948835e-05, + "loss": 1.5942, + "step": 7742 + }, + { + "epoch": 2.034748907938385, + "grad_norm": 0.5276470184326172, + "learning_rate": 3.217977921850359e-05, + "loss": 1.6259, + "step": 7744 + }, + { + "epoch": 2.0352744112720464, + "grad_norm": 0.5952677726745605, + "learning_rate": 3.216225687751884e-05, + "loss": 1.6073, + "step": 7746 + }, + { + "epoch": 2.0357999146057084, + "grad_norm": 0.5393653512001038, + "learning_rate": 3.2144734536534085e-05, + "loss": 1.612, + "step": 7748 + }, + { + "epoch": 2.03632541793937, + "grad_norm": 0.5733909010887146, + "learning_rate": 3.2127212195549325e-05, + "loss": 1.6513, + "step": 7750 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.6015291810035706, + "learning_rate": 3.210968985456457e-05, + "loss": 1.613, + "step": 7752 + }, + { + "epoch": 2.0373764246066934, + "grad_norm": 0.6741379499435425, + "learning_rate": 3.209216751357982e-05, + "loss": 1.6277, + "step": 7754 + }, + { + "epoch": 2.0379019279403554, + "grad_norm": 0.6242867112159729, + "learning_rate": 3.207464517259506e-05, + "loss": 1.6299, + "step": 7756 + }, + { + "epoch": 2.038427431274017, + "grad_norm": 0.6203280091285706, + "learning_rate": 3.20571228316103e-05, + "loss": 1.624, + "step": 7758 + }, + { + "epoch": 2.038952934607679, + "grad_norm": 0.5922704339027405, + "learning_rate": 3.203960049062555e-05, + "loss": 1.6176, + "step": 7760 + }, + { + "epoch": 2.039478437941341, + "grad_norm": 0.5987135767936707, + "learning_rate": 3.20220781496408e-05, + "loss": 1.631, + "step": 7762 + }, + { + "epoch": 2.0400039412750024, + "grad_norm": 0.658090353012085, + "learning_rate": 3.200455580865604e-05, + "loss": 1.6455, + "step": 7764 + }, + { + "epoch": 2.0405294446086644, + "grad_norm": 0.5447620749473572, + "learning_rate": 3.1987033467671286e-05, + "loss": 1.6257, + "step": 7766 + }, + { + "epoch": 2.041054947942326, + "grad_norm": 0.5644233226776123, + "learning_rate": 3.1969511126686526e-05, + "loss": 1.6187, + "step": 7768 + }, + { + "epoch": 2.041580451275988, + "grad_norm": 0.6040834188461304, + "learning_rate": 3.195198878570177e-05, + "loss": 1.651, + "step": 7770 + }, + { + "epoch": 2.0421059546096494, + "grad_norm": 0.5864139199256897, + "learning_rate": 3.1934466444717015e-05, + "loss": 1.6151, + "step": 7772 + }, + { + "epoch": 2.0426314579433114, + "grad_norm": 0.5407016277313232, + "learning_rate": 3.191694410373226e-05, + "loss": 1.6075, + "step": 7774 + }, + { + "epoch": 2.043156961276973, + "grad_norm": 0.6198828816413879, + "learning_rate": 3.18994217627475e-05, + "loss": 1.6075, + "step": 7776 + }, + { + "epoch": 2.043682464610635, + "grad_norm": 0.5452790856361389, + "learning_rate": 3.188189942176275e-05, + "loss": 1.5942, + "step": 7778 + }, + { + "epoch": 2.044207967944297, + "grad_norm": 0.5661458969116211, + "learning_rate": 3.1864377080778e-05, + "loss": 1.64, + "step": 7780 + }, + { + "epoch": 2.0447334712779583, + "grad_norm": 0.5775606036186218, + "learning_rate": 3.184685473979324e-05, + "loss": 1.6221, + "step": 7782 + }, + { + "epoch": 2.0452589746116203, + "grad_norm": 0.6352986693382263, + "learning_rate": 3.182933239880848e-05, + "loss": 1.5736, + "step": 7784 + }, + { + "epoch": 2.045784477945282, + "grad_norm": 0.6562779545783997, + "learning_rate": 3.181181005782373e-05, + "loss": 1.659, + "step": 7786 + }, + { + "epoch": 2.046309981278944, + "grad_norm": 0.6051666140556335, + "learning_rate": 3.1794287716838975e-05, + "loss": 1.6418, + "step": 7788 + }, + { + "epoch": 2.0468354846126053, + "grad_norm": 0.5075708031654358, + "learning_rate": 3.1776765375854216e-05, + "loss": 1.6209, + "step": 7790 + }, + { + "epoch": 2.0473609879462673, + "grad_norm": 0.646185576915741, + "learning_rate": 3.175924303486946e-05, + "loss": 1.6081, + "step": 7792 + }, + { + "epoch": 2.047886491279929, + "grad_norm": 0.5304019451141357, + "learning_rate": 3.1741720693884704e-05, + "loss": 1.6065, + "step": 7794 + }, + { + "epoch": 2.048411994613591, + "grad_norm": 0.6288559436798096, + "learning_rate": 3.1724198352899945e-05, + "loss": 1.63, + "step": 7796 + }, + { + "epoch": 2.0489374979472528, + "grad_norm": 0.5334764122962952, + "learning_rate": 3.170667601191519e-05, + "loss": 1.6161, + "step": 7798 + }, + { + "epoch": 2.0494630012809143, + "grad_norm": 0.5528808236122131, + "learning_rate": 3.168915367093044e-05, + "loss": 1.5925, + "step": 7800 + }, + { + "epoch": 2.0499885046145763, + "grad_norm": 0.5791876912117004, + "learning_rate": 3.167163132994568e-05, + "loss": 1.6322, + "step": 7802 + }, + { + "epoch": 2.050514007948238, + "grad_norm": 0.6923157572746277, + "learning_rate": 3.165410898896093e-05, + "loss": 1.6301, + "step": 7804 + }, + { + "epoch": 2.0510395112818998, + "grad_norm": 0.5527494549751282, + "learning_rate": 3.163658664797617e-05, + "loss": 1.6363, + "step": 7806 + }, + { + "epoch": 2.0515650146155613, + "grad_norm": 0.554539144039154, + "learning_rate": 3.1619064306991417e-05, + "loss": 1.6324, + "step": 7808 + }, + { + "epoch": 2.0520905179492233, + "grad_norm": 0.6409008502960205, + "learning_rate": 3.160154196600666e-05, + "loss": 1.6378, + "step": 7810 + }, + { + "epoch": 2.0526160212828852, + "grad_norm": 0.6073037385940552, + "learning_rate": 3.1584019625021905e-05, + "loss": 1.5801, + "step": 7812 + }, + { + "epoch": 2.0531415246165468, + "grad_norm": 0.6152986288070679, + "learning_rate": 3.156649728403715e-05, + "loss": 1.6128, + "step": 7814 + }, + { + "epoch": 2.0536670279502087, + "grad_norm": 0.5760658383369446, + "learning_rate": 3.154897494305239e-05, + "loss": 1.5979, + "step": 7816 + }, + { + "epoch": 2.0541925312838702, + "grad_norm": 0.5909692645072937, + "learning_rate": 3.153145260206764e-05, + "loss": 1.5931, + "step": 7818 + }, + { + "epoch": 2.054718034617532, + "grad_norm": 0.5456188917160034, + "learning_rate": 3.151393026108288e-05, + "loss": 1.6557, + "step": 7820 + }, + { + "epoch": 2.0552435379511937, + "grad_norm": 0.6995749473571777, + "learning_rate": 3.149640792009812e-05, + "loss": 1.5974, + "step": 7822 + }, + { + "epoch": 2.0557690412848557, + "grad_norm": 0.7031815648078918, + "learning_rate": 3.147888557911337e-05, + "loss": 1.6331, + "step": 7824 + }, + { + "epoch": 2.0562945446185172, + "grad_norm": 0.5628200769424438, + "learning_rate": 3.146136323812862e-05, + "loss": 1.6148, + "step": 7826 + }, + { + "epoch": 2.056820047952179, + "grad_norm": 0.5670493841171265, + "learning_rate": 3.144384089714386e-05, + "loss": 1.6436, + "step": 7828 + }, + { + "epoch": 2.057345551285841, + "grad_norm": 0.5815064311027527, + "learning_rate": 3.1426318556159106e-05, + "loss": 1.6107, + "step": 7830 + }, + { + "epoch": 2.0578710546195027, + "grad_norm": 0.4898243844509125, + "learning_rate": 3.140879621517435e-05, + "loss": 1.5852, + "step": 7832 + }, + { + "epoch": 2.0583965579531647, + "grad_norm": 0.71734219789505, + "learning_rate": 3.1391273874189594e-05, + "loss": 1.6009, + "step": 7834 + }, + { + "epoch": 2.058922061286826, + "grad_norm": 0.5565480589866638, + "learning_rate": 3.1373751533204835e-05, + "loss": 1.611, + "step": 7836 + }, + { + "epoch": 2.059447564620488, + "grad_norm": 0.5884687900543213, + "learning_rate": 3.135622919222008e-05, + "loss": 1.5922, + "step": 7838 + }, + { + "epoch": 2.0599730679541497, + "grad_norm": 0.5969564318656921, + "learning_rate": 3.133870685123533e-05, + "loss": 1.6237, + "step": 7840 + }, + { + "epoch": 2.0604985712878117, + "grad_norm": 0.6498041152954102, + "learning_rate": 3.132118451025057e-05, + "loss": 1.6127, + "step": 7842 + }, + { + "epoch": 2.061024074621473, + "grad_norm": 0.6092948913574219, + "learning_rate": 3.130366216926581e-05, + "loss": 1.6377, + "step": 7844 + }, + { + "epoch": 2.061549577955135, + "grad_norm": 0.5632656812667847, + "learning_rate": 3.128613982828106e-05, + "loss": 1.5976, + "step": 7846 + }, + { + "epoch": 2.062075081288797, + "grad_norm": 0.6170439720153809, + "learning_rate": 3.12686174872963e-05, + "loss": 1.5962, + "step": 7848 + }, + { + "epoch": 2.0626005846224587, + "grad_norm": 0.5882241725921631, + "learning_rate": 3.125109514631155e-05, + "loss": 1.5918, + "step": 7850 + }, + { + "epoch": 2.0631260879561206, + "grad_norm": 0.5418891906738281, + "learning_rate": 3.1233572805326795e-05, + "loss": 1.6289, + "step": 7852 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.6917753219604492, + "learning_rate": 3.1216050464342036e-05, + "loss": 1.6565, + "step": 7854 + }, + { + "epoch": 2.064177094623444, + "grad_norm": 0.596343457698822, + "learning_rate": 3.1198528123357284e-05, + "loss": 1.6087, + "step": 7856 + }, + { + "epoch": 2.0647025979571056, + "grad_norm": 0.5705437660217285, + "learning_rate": 3.1181005782372524e-05, + "loss": 1.607, + "step": 7858 + }, + { + "epoch": 2.0652281012907676, + "grad_norm": 0.5608698129653931, + "learning_rate": 3.116348344138777e-05, + "loss": 1.5935, + "step": 7860 + }, + { + "epoch": 2.065753604624429, + "grad_norm": 0.5956361293792725, + "learning_rate": 3.114596110040301e-05, + "loss": 1.5784, + "step": 7862 + }, + { + "epoch": 2.066279107958091, + "grad_norm": 0.5428637862205505, + "learning_rate": 3.112843875941826e-05, + "loss": 1.6384, + "step": 7864 + }, + { + "epoch": 2.066804611291753, + "grad_norm": 0.5758678317070007, + "learning_rate": 3.111091641843351e-05, + "loss": 1.6397, + "step": 7866 + }, + { + "epoch": 2.0673301146254146, + "grad_norm": 0.605904221534729, + "learning_rate": 3.109339407744875e-05, + "loss": 1.6312, + "step": 7868 + }, + { + "epoch": 2.0678556179590766, + "grad_norm": 0.5100085735321045, + "learning_rate": 3.107587173646399e-05, + "loss": 1.583, + "step": 7870 + }, + { + "epoch": 2.068381121292738, + "grad_norm": 0.5583624839782715, + "learning_rate": 3.105834939547924e-05, + "loss": 1.6319, + "step": 7872 + }, + { + "epoch": 2.0689066246264, + "grad_norm": 0.7279824018478394, + "learning_rate": 3.1040827054494485e-05, + "loss": 1.6142, + "step": 7874 + }, + { + "epoch": 2.0694321279600616, + "grad_norm": 0.5635095238685608, + "learning_rate": 3.1023304713509725e-05, + "loss": 1.6363, + "step": 7876 + }, + { + "epoch": 2.0699576312937236, + "grad_norm": 0.6370921730995178, + "learning_rate": 3.100578237252497e-05, + "loss": 1.6132, + "step": 7878 + }, + { + "epoch": 2.070483134627385, + "grad_norm": 0.6772926449775696, + "learning_rate": 3.098826003154022e-05, + "loss": 1.6033, + "step": 7880 + }, + { + "epoch": 2.071008637961047, + "grad_norm": 0.7656962275505066, + "learning_rate": 3.0970737690555454e-05, + "loss": 1.5948, + "step": 7882 + }, + { + "epoch": 2.071534141294709, + "grad_norm": 0.6695175766944885, + "learning_rate": 3.09532153495707e-05, + "loss": 1.6419, + "step": 7884 + }, + { + "epoch": 2.0720596446283706, + "grad_norm": 0.7357918620109558, + "learning_rate": 3.093569300858595e-05, + "loss": 1.5987, + "step": 7886 + }, + { + "epoch": 2.0725851479620325, + "grad_norm": 0.5544195175170898, + "learning_rate": 3.091817066760119e-05, + "loss": 1.6389, + "step": 7888 + }, + { + "epoch": 2.073110651295694, + "grad_norm": 0.674032986164093, + "learning_rate": 3.090064832661644e-05, + "loss": 1.6046, + "step": 7890 + }, + { + "epoch": 2.073636154629356, + "grad_norm": 0.5453786253929138, + "learning_rate": 3.0883125985631686e-05, + "loss": 1.6194, + "step": 7892 + }, + { + "epoch": 2.0741616579630175, + "grad_norm": 0.6969361901283264, + "learning_rate": 3.0865603644646926e-05, + "loss": 1.5962, + "step": 7894 + }, + { + "epoch": 2.0746871612966795, + "grad_norm": 0.8166598677635193, + "learning_rate": 3.084808130366217e-05, + "loss": 1.5563, + "step": 7896 + }, + { + "epoch": 2.075212664630341, + "grad_norm": 0.6729124784469604, + "learning_rate": 3.0830558962677415e-05, + "loss": 1.5873, + "step": 7898 + }, + { + "epoch": 2.075738167964003, + "grad_norm": 0.5767272114753723, + "learning_rate": 3.081303662169266e-05, + "loss": 1.6117, + "step": 7900 + }, + { + "epoch": 2.076263671297665, + "grad_norm": 0.5525824427604675, + "learning_rate": 3.07955142807079e-05, + "loss": 1.6047, + "step": 7902 + }, + { + "epoch": 2.0767891746313265, + "grad_norm": 0.6294286251068115, + "learning_rate": 3.077799193972315e-05, + "loss": 1.6076, + "step": 7904 + }, + { + "epoch": 2.0773146779649885, + "grad_norm": 0.5846551060676575, + "learning_rate": 3.07604695987384e-05, + "loss": 1.6207, + "step": 7906 + }, + { + "epoch": 2.07784018129865, + "grad_norm": 0.5915541052818298, + "learning_rate": 3.074294725775363e-05, + "loss": 1.6222, + "step": 7908 + }, + { + "epoch": 2.078365684632312, + "grad_norm": 0.5178691148757935, + "learning_rate": 3.072542491676888e-05, + "loss": 1.6086, + "step": 7910 + }, + { + "epoch": 2.0788911879659735, + "grad_norm": 0.5767921805381775, + "learning_rate": 3.070790257578413e-05, + "loss": 1.6102, + "step": 7912 + }, + { + "epoch": 2.0794166912996355, + "grad_norm": 0.7136160135269165, + "learning_rate": 3.069038023479937e-05, + "loss": 1.6103, + "step": 7914 + }, + { + "epoch": 2.079942194633297, + "grad_norm": 0.5835669040679932, + "learning_rate": 3.0672857893814616e-05, + "loss": 1.6199, + "step": 7916 + }, + { + "epoch": 2.080467697966959, + "grad_norm": 0.6543889045715332, + "learning_rate": 3.065533555282986e-05, + "loss": 1.6452, + "step": 7918 + }, + { + "epoch": 2.080993201300621, + "grad_norm": 0.5414628386497498, + "learning_rate": 3.0637813211845104e-05, + "loss": 1.6277, + "step": 7920 + }, + { + "epoch": 2.0815187046342825, + "grad_norm": 0.5655497908592224, + "learning_rate": 3.0620290870860345e-05, + "loss": 1.6294, + "step": 7922 + }, + { + "epoch": 2.0820442079679444, + "grad_norm": 0.5398779511451721, + "learning_rate": 3.060276852987559e-05, + "loss": 1.5906, + "step": 7924 + }, + { + "epoch": 2.082569711301606, + "grad_norm": 0.6147701740264893, + "learning_rate": 3.058524618889084e-05, + "loss": 1.6274, + "step": 7926 + }, + { + "epoch": 2.083095214635268, + "grad_norm": 0.5175689458847046, + "learning_rate": 3.056772384790608e-05, + "loss": 1.6206, + "step": 7928 + }, + { + "epoch": 2.0836207179689294, + "grad_norm": 0.5908101797103882, + "learning_rate": 3.055020150692133e-05, + "loss": 1.5894, + "step": 7930 + }, + { + "epoch": 2.0841462213025914, + "grad_norm": 0.7519561648368835, + "learning_rate": 3.0532679165936576e-05, + "loss": 1.5862, + "step": 7932 + }, + { + "epoch": 2.0846717246362534, + "grad_norm": 0.6898811459541321, + "learning_rate": 3.0515156824951813e-05, + "loss": 1.5917, + "step": 7934 + }, + { + "epoch": 2.085197227969915, + "grad_norm": 0.6252642869949341, + "learning_rate": 3.0497634483967057e-05, + "loss": 1.616, + "step": 7936 + }, + { + "epoch": 2.085722731303577, + "grad_norm": 0.6489799618721008, + "learning_rate": 3.0480112142982305e-05, + "loss": 1.5802, + "step": 7938 + }, + { + "epoch": 2.0862482346372384, + "grad_norm": 0.544003427028656, + "learning_rate": 3.046258980199755e-05, + "loss": 1.6274, + "step": 7940 + }, + { + "epoch": 2.0867737379709004, + "grad_norm": 0.6666077971458435, + "learning_rate": 3.0445067461012793e-05, + "loss": 1.5999, + "step": 7942 + }, + { + "epoch": 2.087299241304562, + "grad_norm": 0.6988151669502258, + "learning_rate": 3.042754512002804e-05, + "loss": 1.6314, + "step": 7944 + }, + { + "epoch": 2.087824744638224, + "grad_norm": 0.5597438812255859, + "learning_rate": 3.0410022779043278e-05, + "loss": 1.6348, + "step": 7946 + }, + { + "epoch": 2.0883502479718854, + "grad_norm": 0.5571982264518738, + "learning_rate": 3.0392500438058526e-05, + "loss": 1.6356, + "step": 7948 + }, + { + "epoch": 2.0888757513055474, + "grad_norm": 0.5218850374221802, + "learning_rate": 3.037497809707377e-05, + "loss": 1.588, + "step": 7950 + }, + { + "epoch": 2.089401254639209, + "grad_norm": 0.627077579498291, + "learning_rate": 3.0357455756089014e-05, + "loss": 1.5931, + "step": 7952 + }, + { + "epoch": 2.089926757972871, + "grad_norm": 0.6151776313781738, + "learning_rate": 3.0339933415104262e-05, + "loss": 1.6073, + "step": 7954 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.5469711422920227, + "learning_rate": 3.0322411074119506e-05, + "loss": 1.5982, + "step": 7956 + }, + { + "epoch": 2.0909777646401944, + "grad_norm": 0.6806207299232483, + "learning_rate": 3.030488873313475e-05, + "loss": 1.6376, + "step": 7958 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.6746312379837036, + "learning_rate": 3.028736639214999e-05, + "loss": 1.6113, + "step": 7960 + }, + { + "epoch": 2.092028771307518, + "grad_norm": 0.6440801024436951, + "learning_rate": 3.0269844051165235e-05, + "loss": 1.6223, + "step": 7962 + }, + { + "epoch": 2.09255427464118, + "grad_norm": 0.5747029185295105, + "learning_rate": 3.0252321710180483e-05, + "loss": 1.5959, + "step": 7964 + }, + { + "epoch": 2.0930797779748413, + "grad_norm": 0.5977646708488464, + "learning_rate": 3.0234799369195727e-05, + "loss": 1.6403, + "step": 7966 + }, + { + "epoch": 2.0936052813085033, + "grad_norm": 0.5634188055992126, + "learning_rate": 3.021727702821097e-05, + "loss": 1.6062, + "step": 7968 + }, + { + "epoch": 2.0941307846421653, + "grad_norm": 0.661744236946106, + "learning_rate": 3.019975468722622e-05, + "loss": 1.6394, + "step": 7970 + }, + { + "epoch": 2.094656287975827, + "grad_norm": 0.5878733396530151, + "learning_rate": 3.0182232346241456e-05, + "loss": 1.5781, + "step": 7972 + }, + { + "epoch": 2.095181791309489, + "grad_norm": 0.5121323466300964, + "learning_rate": 3.0164710005256703e-05, + "loss": 1.6238, + "step": 7974 + }, + { + "epoch": 2.0957072946431503, + "grad_norm": 0.6021749377250671, + "learning_rate": 3.0147187664271948e-05, + "loss": 1.6077, + "step": 7976 + }, + { + "epoch": 2.0962327979768123, + "grad_norm": 0.5374654531478882, + "learning_rate": 3.0129665323287192e-05, + "loss": 1.6252, + "step": 7978 + }, + { + "epoch": 2.096758301310474, + "grad_norm": 0.5462518930435181, + "learning_rate": 3.011214298230244e-05, + "loss": 1.619, + "step": 7980 + }, + { + "epoch": 2.0972838046441358, + "grad_norm": 0.5597232580184937, + "learning_rate": 3.0094620641317684e-05, + "loss": 1.6151, + "step": 7982 + }, + { + "epoch": 2.0978093079777973, + "grad_norm": 0.6084597110748291, + "learning_rate": 3.0077098300332924e-05, + "loss": 1.6398, + "step": 7984 + }, + { + "epoch": 2.0983348113114593, + "grad_norm": 0.5504382252693176, + "learning_rate": 3.005957595934817e-05, + "loss": 1.5902, + "step": 7986 + }, + { + "epoch": 2.0988603146451212, + "grad_norm": 0.5516804456710815, + "learning_rate": 3.0042053618363413e-05, + "loss": 1.625, + "step": 7988 + }, + { + "epoch": 2.0993858179787828, + "grad_norm": 0.5523831248283386, + "learning_rate": 3.002453127737866e-05, + "loss": 1.6291, + "step": 7990 + }, + { + "epoch": 2.0999113213124447, + "grad_norm": 0.6193360686302185, + "learning_rate": 3.0007008936393904e-05, + "loss": 1.5764, + "step": 7992 + }, + { + "epoch": 2.1004368246461063, + "grad_norm": 0.620893657207489, + "learning_rate": 2.998948659540915e-05, + "loss": 1.6295, + "step": 7994 + }, + { + "epoch": 2.1009623279797682, + "grad_norm": 0.5495589971542358, + "learning_rate": 2.9971964254424396e-05, + "loss": 1.6159, + "step": 7996 + }, + { + "epoch": 2.1014878313134298, + "grad_norm": 0.5830954909324646, + "learning_rate": 2.9954441913439634e-05, + "loss": 1.5844, + "step": 7998 + }, + { + "epoch": 2.1020133346470917, + "grad_norm": 0.6515240669250488, + "learning_rate": 2.993691957245488e-05, + "loss": 1.6092, + "step": 8000 + }, + { + "epoch": 2.1020133346470917, + "eval_loss": 1.6652451753616333, + "eval_runtime": 487.2001, + "eval_samples_per_second": 249.977, + "eval_steps_per_second": 31.248, + "step": 8000 + }, + { + "epoch": 2.1025388379807533, + "grad_norm": 0.5562619566917419, + "learning_rate": 2.9919397231470125e-05, + "loss": 1.6272, + "step": 8002 + }, + { + "epoch": 2.103064341314415, + "grad_norm": 0.8904392123222351, + "learning_rate": 2.990187489048537e-05, + "loss": 1.6286, + "step": 8004 + }, + { + "epoch": 2.103589844648077, + "grad_norm": 0.6131122708320618, + "learning_rate": 2.9884352549500617e-05, + "loss": 1.6225, + "step": 8006 + }, + { + "epoch": 2.1041153479817387, + "grad_norm": 0.5321511030197144, + "learning_rate": 2.986683020851586e-05, + "loss": 1.5881, + "step": 8008 + }, + { + "epoch": 2.1046408513154007, + "grad_norm": 0.5121209621429443, + "learning_rate": 2.9849307867531102e-05, + "loss": 1.5884, + "step": 8010 + }, + { + "epoch": 2.105166354649062, + "grad_norm": 0.5307869911193848, + "learning_rate": 2.9831785526546346e-05, + "loss": 1.5926, + "step": 8012 + }, + { + "epoch": 2.105691857982724, + "grad_norm": 0.5231491327285767, + "learning_rate": 2.981426318556159e-05, + "loss": 1.6319, + "step": 8014 + }, + { + "epoch": 2.1062173613163857, + "grad_norm": 0.5306436419487, + "learning_rate": 2.9796740844576838e-05, + "loss": 1.6526, + "step": 8016 + }, + { + "epoch": 2.1067428646500477, + "grad_norm": 0.6463943123817444, + "learning_rate": 2.9779218503592082e-05, + "loss": 1.6074, + "step": 8018 + }, + { + "epoch": 2.107268367983709, + "grad_norm": 0.5345489382743835, + "learning_rate": 2.9761696162607326e-05, + "loss": 1.6413, + "step": 8020 + }, + { + "epoch": 2.107793871317371, + "grad_norm": 0.6284579634666443, + "learning_rate": 2.9744173821622574e-05, + "loss": 1.63, + "step": 8022 + }, + { + "epoch": 2.108319374651033, + "grad_norm": 0.5651626586914062, + "learning_rate": 2.972665148063781e-05, + "loss": 1.5837, + "step": 8024 + }, + { + "epoch": 2.1088448779846947, + "grad_norm": 0.692808210849762, + "learning_rate": 2.970912913965306e-05, + "loss": 1.6361, + "step": 8026 + }, + { + "epoch": 2.1093703813183566, + "grad_norm": 0.5368518829345703, + "learning_rate": 2.9691606798668303e-05, + "loss": 1.6577, + "step": 8028 + }, + { + "epoch": 2.109895884652018, + "grad_norm": 0.561772346496582, + "learning_rate": 2.9674084457683547e-05, + "loss": 1.5959, + "step": 8030 + }, + { + "epoch": 2.11042138798568, + "grad_norm": 0.6566132307052612, + "learning_rate": 2.9656562116698795e-05, + "loss": 1.622, + "step": 8032 + }, + { + "epoch": 2.1109468913193417, + "grad_norm": 0.7207258939743042, + "learning_rate": 2.963903977571404e-05, + "loss": 1.6318, + "step": 8034 + }, + { + "epoch": 2.1114723946530036, + "grad_norm": 0.5623325705528259, + "learning_rate": 2.962151743472928e-05, + "loss": 1.5964, + "step": 8036 + }, + { + "epoch": 2.111997897986665, + "grad_norm": 0.6407482624053955, + "learning_rate": 2.9603995093744524e-05, + "loss": 1.5759, + "step": 8038 + }, + { + "epoch": 2.112523401320327, + "grad_norm": 0.5547378659248352, + "learning_rate": 2.9586472752759768e-05, + "loss": 1.6112, + "step": 8040 + }, + { + "epoch": 2.113048904653989, + "grad_norm": 0.5998631715774536, + "learning_rate": 2.9568950411775016e-05, + "loss": 1.6102, + "step": 8042 + }, + { + "epoch": 2.1135744079876506, + "grad_norm": 0.5907427072525024, + "learning_rate": 2.955142807079026e-05, + "loss": 1.6056, + "step": 8044 + }, + { + "epoch": 2.1140999113213126, + "grad_norm": 0.5377500653266907, + "learning_rate": 2.9533905729805504e-05, + "loss": 1.626, + "step": 8046 + }, + { + "epoch": 2.114625414654974, + "grad_norm": 0.5881298780441284, + "learning_rate": 2.9516383388820745e-05, + "loss": 1.6245, + "step": 8048 + }, + { + "epoch": 2.115150917988636, + "grad_norm": 0.6338145136833191, + "learning_rate": 2.949886104783599e-05, + "loss": 1.6133, + "step": 8050 + }, + { + "epoch": 2.1156764213222976, + "grad_norm": 0.7909775972366333, + "learning_rate": 2.9481338706851236e-05, + "loss": 1.625, + "step": 8052 + }, + { + "epoch": 2.1162019246559596, + "grad_norm": 0.5986071228981018, + "learning_rate": 2.946381636586648e-05, + "loss": 1.6209, + "step": 8054 + }, + { + "epoch": 2.116727427989621, + "grad_norm": 0.6373353600502014, + "learning_rate": 2.9446294024881725e-05, + "loss": 1.5849, + "step": 8056 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.7317929267883301, + "learning_rate": 2.9428771683896972e-05, + "loss": 1.6032, + "step": 8058 + }, + { + "epoch": 2.117778434656945, + "grad_norm": 0.5454069375991821, + "learning_rate": 2.9411249342912217e-05, + "loss": 1.6171, + "step": 8060 + }, + { + "epoch": 2.1183039379906066, + "grad_norm": 0.5561736226081848, + "learning_rate": 2.9393727001927457e-05, + "loss": 1.6434, + "step": 8062 + }, + { + "epoch": 2.1188294413242685, + "grad_norm": 0.685099720954895, + "learning_rate": 2.93762046609427e-05, + "loss": 1.5534, + "step": 8064 + }, + { + "epoch": 2.11935494465793, + "grad_norm": 0.5449316501617432, + "learning_rate": 2.935868231995795e-05, + "loss": 1.5926, + "step": 8066 + }, + { + "epoch": 2.119880447991592, + "grad_norm": 0.5778328776359558, + "learning_rate": 2.9341159978973193e-05, + "loss": 1.6166, + "step": 8068 + }, + { + "epoch": 2.1204059513252536, + "grad_norm": 0.5505948662757874, + "learning_rate": 2.9323637637988437e-05, + "loss": 1.5876, + "step": 8070 + }, + { + "epoch": 2.1209314546589155, + "grad_norm": 0.5759081244468689, + "learning_rate": 2.9306115297003685e-05, + "loss": 1.6145, + "step": 8072 + }, + { + "epoch": 2.121456957992577, + "grad_norm": 0.5547658205032349, + "learning_rate": 2.9288592956018922e-05, + "loss": 1.564, + "step": 8074 + }, + { + "epoch": 2.121982461326239, + "grad_norm": 0.6756736040115356, + "learning_rate": 2.927107061503417e-05, + "loss": 1.6301, + "step": 8076 + }, + { + "epoch": 2.122507964659901, + "grad_norm": 0.8322302103042603, + "learning_rate": 2.9253548274049414e-05, + "loss": 1.6044, + "step": 8078 + }, + { + "epoch": 2.1230334679935625, + "grad_norm": 0.6076900362968445, + "learning_rate": 2.923602593306466e-05, + "loss": 1.5982, + "step": 8080 + }, + { + "epoch": 2.1235589713272245, + "grad_norm": 0.5850244760513306, + "learning_rate": 2.9218503592079906e-05, + "loss": 1.6248, + "step": 8082 + }, + { + "epoch": 2.124084474660886, + "grad_norm": 0.5558748841285706, + "learning_rate": 2.920098125109515e-05, + "loss": 1.6224, + "step": 8084 + }, + { + "epoch": 2.124609977994548, + "grad_norm": 0.5563886165618896, + "learning_rate": 2.918345891011039e-05, + "loss": 1.5805, + "step": 8086 + }, + { + "epoch": 2.1251354813282095, + "grad_norm": 0.6558648943901062, + "learning_rate": 2.9165936569125635e-05, + "loss": 1.6255, + "step": 8088 + }, + { + "epoch": 2.1256609846618715, + "grad_norm": 0.6515272259712219, + "learning_rate": 2.914841422814088e-05, + "loss": 1.6127, + "step": 8090 + }, + { + "epoch": 2.1261864879955334, + "grad_norm": 0.5626866221427917, + "learning_rate": 2.9130891887156127e-05, + "loss": 1.6125, + "step": 8092 + }, + { + "epoch": 2.126711991329195, + "grad_norm": 0.6942625045776367, + "learning_rate": 2.911336954617137e-05, + "loss": 1.5824, + "step": 8094 + }, + { + "epoch": 2.127237494662857, + "grad_norm": 0.5993350148200989, + "learning_rate": 2.9095847205186615e-05, + "loss": 1.6309, + "step": 8096 + }, + { + "epoch": 2.1277629979965185, + "grad_norm": 0.5620863437652588, + "learning_rate": 2.9078324864201863e-05, + "loss": 1.6287, + "step": 8098 + }, + { + "epoch": 2.1282885013301804, + "grad_norm": 0.5420330762863159, + "learning_rate": 2.90608025232171e-05, + "loss": 1.6263, + "step": 8100 + }, + { + "epoch": 2.128814004663842, + "grad_norm": 0.5831283330917358, + "learning_rate": 2.9043280182232348e-05, + "loss": 1.6221, + "step": 8102 + }, + { + "epoch": 2.129339507997504, + "grad_norm": 0.6942195296287537, + "learning_rate": 2.9025757841247592e-05, + "loss": 1.6411, + "step": 8104 + }, + { + "epoch": 2.1298650113311655, + "grad_norm": 0.5689584016799927, + "learning_rate": 2.9008235500262836e-05, + "loss": 1.6443, + "step": 8106 + }, + { + "epoch": 2.1303905146648274, + "grad_norm": 0.6463392376899719, + "learning_rate": 2.8990713159278084e-05, + "loss": 1.5987, + "step": 8108 + }, + { + "epoch": 2.130916017998489, + "grad_norm": 0.591022253036499, + "learning_rate": 2.8973190818293328e-05, + "loss": 1.6283, + "step": 8110 + }, + { + "epoch": 2.131441521332151, + "grad_norm": 0.6164734363555908, + "learning_rate": 2.895566847730857e-05, + "loss": 1.6188, + "step": 8112 + }, + { + "epoch": 2.131967024665813, + "grad_norm": 0.5488820672035217, + "learning_rate": 2.8938146136323813e-05, + "loss": 1.6562, + "step": 8114 + }, + { + "epoch": 2.1324925279994744, + "grad_norm": 0.7101133465766907, + "learning_rate": 2.8920623795339057e-05, + "loss": 1.6206, + "step": 8116 + }, + { + "epoch": 2.1330180313331364, + "grad_norm": 0.5032434463500977, + "learning_rate": 2.8903101454354304e-05, + "loss": 1.6042, + "step": 8118 + }, + { + "epoch": 2.133543534666798, + "grad_norm": 0.5224511623382568, + "learning_rate": 2.888557911336955e-05, + "loss": 1.5865, + "step": 8120 + }, + { + "epoch": 2.13406903800046, + "grad_norm": 0.5269969701766968, + "learning_rate": 2.8868056772384793e-05, + "loss": 1.6296, + "step": 8122 + }, + { + "epoch": 2.1345945413341214, + "grad_norm": 0.5620083212852478, + "learning_rate": 2.885053443140004e-05, + "loss": 1.6425, + "step": 8124 + }, + { + "epoch": 2.1351200446677834, + "grad_norm": 0.6436342597007751, + "learning_rate": 2.8833012090415278e-05, + "loss": 1.6053, + "step": 8126 + }, + { + "epoch": 2.1356455480014453, + "grad_norm": 0.6406291127204895, + "learning_rate": 2.8815489749430525e-05, + "loss": 1.6254, + "step": 8128 + }, + { + "epoch": 2.136171051335107, + "grad_norm": 0.551295280456543, + "learning_rate": 2.879796740844577e-05, + "loss": 1.621, + "step": 8130 + }, + { + "epoch": 2.136696554668769, + "grad_norm": 0.6591944098472595, + "learning_rate": 2.8780445067461014e-05, + "loss": 1.6578, + "step": 8132 + }, + { + "epoch": 2.1372220580024304, + "grad_norm": 0.7538178563117981, + "learning_rate": 2.876292272647626e-05, + "loss": 1.6057, + "step": 8134 + }, + { + "epoch": 2.1377475613360923, + "grad_norm": 0.5904437303543091, + "learning_rate": 2.8745400385491505e-05, + "loss": 1.5984, + "step": 8136 + }, + { + "epoch": 2.138273064669754, + "grad_norm": 0.5584096312522888, + "learning_rate": 2.8727878044506746e-05, + "loss": 1.5751, + "step": 8138 + }, + { + "epoch": 2.138798568003416, + "grad_norm": 0.6350988745689392, + "learning_rate": 2.871035570352199e-05, + "loss": 1.6102, + "step": 8140 + }, + { + "epoch": 2.1393240713370774, + "grad_norm": 0.8004497289657593, + "learning_rate": 2.8692833362537235e-05, + "loss": 1.6205, + "step": 8142 + }, + { + "epoch": 2.1398495746707393, + "grad_norm": 0.6031829118728638, + "learning_rate": 2.8675311021552482e-05, + "loss": 1.615, + "step": 8144 + }, + { + "epoch": 2.140375078004401, + "grad_norm": 0.5796357989311218, + "learning_rate": 2.8657788680567726e-05, + "loss": 1.6007, + "step": 8146 + }, + { + "epoch": 2.140900581338063, + "grad_norm": 0.5166471004486084, + "learning_rate": 2.864026633958297e-05, + "loss": 1.592, + "step": 8148 + }, + { + "epoch": 2.141426084671725, + "grad_norm": 0.6542901396751404, + "learning_rate": 2.862274399859821e-05, + "loss": 1.648, + "step": 8150 + }, + { + "epoch": 2.1419515880053863, + "grad_norm": 0.5643502473831177, + "learning_rate": 2.8605221657613455e-05, + "loss": 1.6187, + "step": 8152 + }, + { + "epoch": 2.1424770913390483, + "grad_norm": 0.5144004821777344, + "learning_rate": 2.8587699316628703e-05, + "loss": 1.5961, + "step": 8154 + }, + { + "epoch": 2.14300259467271, + "grad_norm": 0.5907447934150696, + "learning_rate": 2.8570176975643947e-05, + "loss": 1.6091, + "step": 8156 + }, + { + "epoch": 2.143528098006372, + "grad_norm": 0.6062257885932922, + "learning_rate": 2.855265463465919e-05, + "loss": 1.596, + "step": 8158 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.5850198268890381, + "learning_rate": 2.853513229367444e-05, + "loss": 1.6202, + "step": 8160 + }, + { + "epoch": 2.1445791046736953, + "grad_norm": 0.6049871444702148, + "learning_rate": 2.8517609952689683e-05, + "loss": 1.6151, + "step": 8162 + }, + { + "epoch": 2.1451046080073572, + "grad_norm": 0.6060041785240173, + "learning_rate": 2.8500087611704924e-05, + "loss": 1.566, + "step": 8164 + }, + { + "epoch": 2.1456301113410188, + "grad_norm": 0.5683371424674988, + "learning_rate": 2.8482565270720168e-05, + "loss": 1.6301, + "step": 8166 + }, + { + "epoch": 2.1461556146746807, + "grad_norm": 0.5292571783065796, + "learning_rate": 2.8465042929735412e-05, + "loss": 1.6151, + "step": 8168 + }, + { + "epoch": 2.1466811180083423, + "grad_norm": 0.6342765688896179, + "learning_rate": 2.844752058875066e-05, + "loss": 1.5728, + "step": 8170 + }, + { + "epoch": 2.1472066213420042, + "grad_norm": 0.551582932472229, + "learning_rate": 2.8429998247765904e-05, + "loss": 1.6054, + "step": 8172 + }, + { + "epoch": 2.1477321246756658, + "grad_norm": 0.548427939414978, + "learning_rate": 2.8412475906781148e-05, + "loss": 1.5991, + "step": 8174 + }, + { + "epoch": 2.1482576280093277, + "grad_norm": 0.6584466695785522, + "learning_rate": 2.839495356579639e-05, + "loss": 1.6213, + "step": 8176 + }, + { + "epoch": 2.1487831313429893, + "grad_norm": 0.6075412631034851, + "learning_rate": 2.8377431224811633e-05, + "loss": 1.6279, + "step": 8178 + }, + { + "epoch": 2.1493086346766512, + "grad_norm": 0.569524884223938, + "learning_rate": 2.835990888382688e-05, + "loss": 1.6053, + "step": 8180 + }, + { + "epoch": 2.149834138010313, + "grad_norm": 0.5383790731430054, + "learning_rate": 2.8342386542842125e-05, + "loss": 1.5938, + "step": 8182 + }, + { + "epoch": 2.1503596413439747, + "grad_norm": 0.5958060026168823, + "learning_rate": 2.832486420185737e-05, + "loss": 1.6272, + "step": 8184 + }, + { + "epoch": 2.1508851446776367, + "grad_norm": 0.5532635450363159, + "learning_rate": 2.8307341860872617e-05, + "loss": 1.6431, + "step": 8186 + }, + { + "epoch": 2.151410648011298, + "grad_norm": 0.5432900190353394, + "learning_rate": 2.8289819519887857e-05, + "loss": 1.6579, + "step": 8188 + }, + { + "epoch": 2.15193615134496, + "grad_norm": 0.6363177299499512, + "learning_rate": 2.82722971789031e-05, + "loss": 1.6349, + "step": 8190 + }, + { + "epoch": 2.1524616546786217, + "grad_norm": 0.6205646395683289, + "learning_rate": 2.8254774837918346e-05, + "loss": 1.6014, + "step": 8192 + }, + { + "epoch": 2.1529871580122837, + "grad_norm": 0.5696967244148254, + "learning_rate": 2.8237252496933593e-05, + "loss": 1.6086, + "step": 8194 + }, + { + "epoch": 2.153512661345945, + "grad_norm": 0.5662211775779724, + "learning_rate": 2.8219730155948837e-05, + "loss": 1.6246, + "step": 8196 + }, + { + "epoch": 2.154038164679607, + "grad_norm": 0.5794999599456787, + "learning_rate": 2.820220781496408e-05, + "loss": 1.6127, + "step": 8198 + }, + { + "epoch": 2.154563668013269, + "grad_norm": 0.6433011889457703, + "learning_rate": 2.818468547397933e-05, + "loss": 1.5973, + "step": 8200 + }, + { + "epoch": 2.1550891713469307, + "grad_norm": 0.5784979462623596, + "learning_rate": 2.8167163132994567e-05, + "loss": 1.6314, + "step": 8202 + }, + { + "epoch": 2.1556146746805926, + "grad_norm": 0.5415730476379395, + "learning_rate": 2.8149640792009814e-05, + "loss": 1.635, + "step": 8204 + }, + { + "epoch": 2.156140178014254, + "grad_norm": 0.5347517132759094, + "learning_rate": 2.8132118451025058e-05, + "loss": 1.5721, + "step": 8206 + }, + { + "epoch": 2.156665681347916, + "grad_norm": 0.5696678757667542, + "learning_rate": 2.8114596110040302e-05, + "loss": 1.6163, + "step": 8208 + }, + { + "epoch": 2.1571911846815777, + "grad_norm": 0.5461645126342773, + "learning_rate": 2.809707376905555e-05, + "loss": 1.6097, + "step": 8210 + }, + { + "epoch": 2.1577166880152396, + "grad_norm": 0.5799645781517029, + "learning_rate": 2.8079551428070794e-05, + "loss": 1.6526, + "step": 8212 + }, + { + "epoch": 2.158242191348901, + "grad_norm": 0.5340853333473206, + "learning_rate": 2.8062029087086035e-05, + "loss": 1.5865, + "step": 8214 + }, + { + "epoch": 2.158767694682563, + "grad_norm": 0.5770725607872009, + "learning_rate": 2.804450674610128e-05, + "loss": 1.6281, + "step": 8216 + }, + { + "epoch": 2.159293198016225, + "grad_norm": 0.5207443237304688, + "learning_rate": 2.8026984405116523e-05, + "loss": 1.6303, + "step": 8218 + }, + { + "epoch": 2.1598187013498866, + "grad_norm": 0.6137516498565674, + "learning_rate": 2.800946206413177e-05, + "loss": 1.639, + "step": 8220 + }, + { + "epoch": 2.1603442046835486, + "grad_norm": 0.5726540088653564, + "learning_rate": 2.7991939723147015e-05, + "loss": 1.6108, + "step": 8222 + }, + { + "epoch": 2.16086970801721, + "grad_norm": 0.5607831478118896, + "learning_rate": 2.797441738216226e-05, + "loss": 1.611, + "step": 8224 + }, + { + "epoch": 2.161395211350872, + "grad_norm": 0.6390299201011658, + "learning_rate": 2.79568950411775e-05, + "loss": 1.6325, + "step": 8226 + }, + { + "epoch": 2.1619207146845336, + "grad_norm": 0.548121452331543, + "learning_rate": 2.7939372700192744e-05, + "loss": 1.6311, + "step": 8228 + }, + { + "epoch": 2.1624462180181956, + "grad_norm": 0.6899086833000183, + "learning_rate": 2.7921850359207992e-05, + "loss": 1.5686, + "step": 8230 + }, + { + "epoch": 2.162971721351857, + "grad_norm": 0.6069139838218689, + "learning_rate": 2.7904328018223236e-05, + "loss": 1.603, + "step": 8232 + }, + { + "epoch": 2.163497224685519, + "grad_norm": 0.5628888010978699, + "learning_rate": 2.788680567723848e-05, + "loss": 1.616, + "step": 8234 + }, + { + "epoch": 2.164022728019181, + "grad_norm": 0.5229991674423218, + "learning_rate": 2.7869283336253728e-05, + "loss": 1.6172, + "step": 8236 + }, + { + "epoch": 2.1645482313528426, + "grad_norm": 0.5317997932434082, + "learning_rate": 2.7851760995268972e-05, + "loss": 1.5771, + "step": 8238 + }, + { + "epoch": 2.1650737346865045, + "grad_norm": 0.5842552781105042, + "learning_rate": 2.7834238654284213e-05, + "loss": 1.6199, + "step": 8240 + }, + { + "epoch": 2.165599238020166, + "grad_norm": 0.6741251349449158, + "learning_rate": 2.7816716313299457e-05, + "loss": 1.6156, + "step": 8242 + }, + { + "epoch": 2.166124741353828, + "grad_norm": 0.5447443723678589, + "learning_rate": 2.77991939723147e-05, + "loss": 1.5832, + "step": 8244 + }, + { + "epoch": 2.1666502446874896, + "grad_norm": 0.49737128615379333, + "learning_rate": 2.778167163132995e-05, + "loss": 1.6369, + "step": 8246 + }, + { + "epoch": 2.1671757480211515, + "grad_norm": 0.6779609322547913, + "learning_rate": 2.7764149290345193e-05, + "loss": 1.631, + "step": 8248 + }, + { + "epoch": 2.1677012513548135, + "grad_norm": 0.5730245113372803, + "learning_rate": 2.7746626949360437e-05, + "loss": 1.6004, + "step": 8250 + }, + { + "epoch": 2.168226754688475, + "grad_norm": 0.5977212190628052, + "learning_rate": 2.7729104608375678e-05, + "loss": 1.6173, + "step": 8252 + }, + { + "epoch": 2.168752258022137, + "grad_norm": 0.5202796459197998, + "learning_rate": 2.7711582267390922e-05, + "loss": 1.6027, + "step": 8254 + }, + { + "epoch": 2.1692777613557985, + "grad_norm": 0.7288246154785156, + "learning_rate": 2.769405992640617e-05, + "loss": 1.6007, + "step": 8256 + }, + { + "epoch": 2.1698032646894605, + "grad_norm": 0.5423540472984314, + "learning_rate": 2.7676537585421414e-05, + "loss": 1.6032, + "step": 8258 + }, + { + "epoch": 2.170328768023122, + "grad_norm": 0.6103919744491577, + "learning_rate": 2.7659015244436658e-05, + "loss": 1.6147, + "step": 8260 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.5658676624298096, + "learning_rate": 2.7641492903451905e-05, + "loss": 1.6187, + "step": 8262 + }, + { + "epoch": 2.1713797746904455, + "grad_norm": 0.5302108526229858, + "learning_rate": 2.762397056246715e-05, + "loss": 1.5885, + "step": 8264 + }, + { + "epoch": 2.1719052780241075, + "grad_norm": 0.5685304999351501, + "learning_rate": 2.760644822148239e-05, + "loss": 1.5809, + "step": 8266 + }, + { + "epoch": 2.172430781357769, + "grad_norm": 0.6211028695106506, + "learning_rate": 2.7588925880497635e-05, + "loss": 1.5982, + "step": 8268 + }, + { + "epoch": 2.172956284691431, + "grad_norm": 0.5506071448326111, + "learning_rate": 2.757140353951288e-05, + "loss": 1.6059, + "step": 8270 + }, + { + "epoch": 2.173481788025093, + "grad_norm": 0.5578741431236267, + "learning_rate": 2.7553881198528126e-05, + "loss": 1.6065, + "step": 8272 + }, + { + "epoch": 2.1740072913587545, + "grad_norm": 0.6002737283706665, + "learning_rate": 2.753635885754337e-05, + "loss": 1.6622, + "step": 8274 + }, + { + "epoch": 2.1745327946924164, + "grad_norm": 0.5421825647354126, + "learning_rate": 2.7518836516558615e-05, + "loss": 1.594, + "step": 8276 + }, + { + "epoch": 2.175058298026078, + "grad_norm": 0.5473990440368652, + "learning_rate": 2.7501314175573855e-05, + "loss": 1.6006, + "step": 8278 + }, + { + "epoch": 2.17558380135974, + "grad_norm": 0.6513854265213013, + "learning_rate": 2.74837918345891e-05, + "loss": 1.6222, + "step": 8280 + }, + { + "epoch": 2.1761093046934015, + "grad_norm": 0.5265668034553528, + "learning_rate": 2.7466269493604347e-05, + "loss": 1.5859, + "step": 8282 + }, + { + "epoch": 2.1766348080270634, + "grad_norm": 0.5653666853904724, + "learning_rate": 2.744874715261959e-05, + "loss": 1.6145, + "step": 8284 + }, + { + "epoch": 2.1771603113607254, + "grad_norm": 0.6216524839401245, + "learning_rate": 2.7431224811634835e-05, + "loss": 1.5975, + "step": 8286 + }, + { + "epoch": 2.177685814694387, + "grad_norm": 0.5801669955253601, + "learning_rate": 2.7413702470650083e-05, + "loss": 1.6318, + "step": 8288 + }, + { + "epoch": 2.178211318028049, + "grad_norm": 0.6386235356330872, + "learning_rate": 2.739618012966532e-05, + "loss": 1.6193, + "step": 8290 + }, + { + "epoch": 2.1787368213617104, + "grad_norm": 0.5952389240264893, + "learning_rate": 2.7378657788680568e-05, + "loss": 1.5982, + "step": 8292 + }, + { + "epoch": 2.1792623246953724, + "grad_norm": 0.5205252170562744, + "learning_rate": 2.7361135447695812e-05, + "loss": 1.6088, + "step": 8294 + }, + { + "epoch": 2.179787828029034, + "grad_norm": 0.5996859669685364, + "learning_rate": 2.7343613106711056e-05, + "loss": 1.5929, + "step": 8296 + }, + { + "epoch": 2.180313331362696, + "grad_norm": 0.5440699458122253, + "learning_rate": 2.7326090765726304e-05, + "loss": 1.6266, + "step": 8298 + }, + { + "epoch": 2.1808388346963574, + "grad_norm": 0.5174278616905212, + "learning_rate": 2.7308568424741548e-05, + "loss": 1.6134, + "step": 8300 + }, + { + "epoch": 2.1813643380300194, + "grad_norm": 0.5336670875549316, + "learning_rate": 2.7291046083756792e-05, + "loss": 1.6026, + "step": 8302 + }, + { + "epoch": 2.181889841363681, + "grad_norm": 0.5199602246284485, + "learning_rate": 2.7273523742772033e-05, + "loss": 1.6117, + "step": 8304 + }, + { + "epoch": 2.182415344697343, + "grad_norm": 0.5980170369148254, + "learning_rate": 2.7256001401787277e-05, + "loss": 1.6027, + "step": 8306 + }, + { + "epoch": 2.182940848031005, + "grad_norm": 0.6026611924171448, + "learning_rate": 2.7238479060802525e-05, + "loss": 1.6326, + "step": 8308 + }, + { + "epoch": 2.1834663513646664, + "grad_norm": 0.54527747631073, + "learning_rate": 2.722095671981777e-05, + "loss": 1.587, + "step": 8310 + }, + { + "epoch": 2.1839918546983283, + "grad_norm": 0.6268514394760132, + "learning_rate": 2.7203434378833013e-05, + "loss": 1.6362, + "step": 8312 + }, + { + "epoch": 2.18451735803199, + "grad_norm": 0.6219541430473328, + "learning_rate": 2.718591203784826e-05, + "loss": 1.5783, + "step": 8314 + }, + { + "epoch": 2.185042861365652, + "grad_norm": 0.5530925393104553, + "learning_rate": 2.71683896968635e-05, + "loss": 1.6121, + "step": 8316 + }, + { + "epoch": 2.1855683646993134, + "grad_norm": 0.6584067940711975, + "learning_rate": 2.7150867355878746e-05, + "loss": 1.6114, + "step": 8318 + }, + { + "epoch": 2.1860938680329753, + "grad_norm": 0.5550515055656433, + "learning_rate": 2.713334501489399e-05, + "loss": 1.6087, + "step": 8320 + }, + { + "epoch": 2.1866193713666373, + "grad_norm": 0.595063328742981, + "learning_rate": 2.7115822673909237e-05, + "loss": 1.606, + "step": 8322 + }, + { + "epoch": 2.187144874700299, + "grad_norm": 0.6574273705482483, + "learning_rate": 2.709830033292448e-05, + "loss": 1.6054, + "step": 8324 + }, + { + "epoch": 2.187670378033961, + "grad_norm": 0.5191264748573303, + "learning_rate": 2.7080777991939726e-05, + "loss": 1.5956, + "step": 8326 + }, + { + "epoch": 2.1881958813676223, + "grad_norm": 0.6227658987045288, + "learning_rate": 2.7063255650954967e-05, + "loss": 1.6053, + "step": 8328 + }, + { + "epoch": 2.1887213847012843, + "grad_norm": 0.6377692222595215, + "learning_rate": 2.704573330997021e-05, + "loss": 1.6253, + "step": 8330 + }, + { + "epoch": 2.189246888034946, + "grad_norm": 0.5611161589622498, + "learning_rate": 2.7028210968985458e-05, + "loss": 1.5996, + "step": 8332 + }, + { + "epoch": 2.189772391368608, + "grad_norm": 0.628909170627594, + "learning_rate": 2.7010688628000702e-05, + "loss": 1.6267, + "step": 8334 + }, + { + "epoch": 2.1902978947022693, + "grad_norm": 0.5963556170463562, + "learning_rate": 2.6993166287015947e-05, + "loss": 1.5845, + "step": 8336 + }, + { + "epoch": 2.1908233980359313, + "grad_norm": 0.5764312744140625, + "learning_rate": 2.6975643946031194e-05, + "loss": 1.593, + "step": 8338 + }, + { + "epoch": 2.1913489013695933, + "grad_norm": 0.7820791006088257, + "learning_rate": 2.695812160504644e-05, + "loss": 1.5945, + "step": 8340 + }, + { + "epoch": 2.191874404703255, + "grad_norm": 0.542460560798645, + "learning_rate": 2.694059926406168e-05, + "loss": 1.6183, + "step": 8342 + }, + { + "epoch": 2.1923999080369168, + "grad_norm": 0.6432899832725525, + "learning_rate": 2.6923076923076923e-05, + "loss": 1.6584, + "step": 8344 + }, + { + "epoch": 2.1929254113705783, + "grad_norm": 0.5868924856185913, + "learning_rate": 2.6905554582092167e-05, + "loss": 1.6196, + "step": 8346 + }, + { + "epoch": 2.1934509147042403, + "grad_norm": 0.5434276461601257, + "learning_rate": 2.6888032241107415e-05, + "loss": 1.581, + "step": 8348 + }, + { + "epoch": 2.1939764180379018, + "grad_norm": 0.5824474692344666, + "learning_rate": 2.687050990012266e-05, + "loss": 1.5816, + "step": 8350 + }, + { + "epoch": 2.1945019213715637, + "grad_norm": 0.5336950421333313, + "learning_rate": 2.6852987559137903e-05, + "loss": 1.574, + "step": 8352 + }, + { + "epoch": 2.1950274247052253, + "grad_norm": 0.5753375291824341, + "learning_rate": 2.6835465218153144e-05, + "loss": 1.604, + "step": 8354 + }, + { + "epoch": 2.1955529280388872, + "grad_norm": 0.762776792049408, + "learning_rate": 2.681794287716839e-05, + "loss": 1.6355, + "step": 8356 + }, + { + "epoch": 2.196078431372549, + "grad_norm": 0.5597438812255859, + "learning_rate": 2.6800420536183636e-05, + "loss": 1.5729, + "step": 8358 + }, + { + "epoch": 2.1966039347062107, + "grad_norm": 0.5700204968452454, + "learning_rate": 2.678289819519888e-05, + "loss": 1.6107, + "step": 8360 + }, + { + "epoch": 2.1971294380398727, + "grad_norm": 0.5418670773506165, + "learning_rate": 2.6765375854214124e-05, + "loss": 1.6243, + "step": 8362 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.5794771909713745, + "learning_rate": 2.6747853513229372e-05, + "loss": 1.6099, + "step": 8364 + }, + { + "epoch": 2.198180444707196, + "grad_norm": 0.6689446568489075, + "learning_rate": 2.6730331172244616e-05, + "loss": 1.5979, + "step": 8366 + }, + { + "epoch": 2.1987059480408577, + "grad_norm": 0.6027274131774902, + "learning_rate": 2.6712808831259857e-05, + "loss": 1.6143, + "step": 8368 + }, + { + "epoch": 2.1992314513745197, + "grad_norm": 0.5987696051597595, + "learning_rate": 2.66952864902751e-05, + "loss": 1.5895, + "step": 8370 + }, + { + "epoch": 2.1997569547081812, + "grad_norm": 0.566920816898346, + "learning_rate": 2.6677764149290345e-05, + "loss": 1.6106, + "step": 8372 + }, + { + "epoch": 2.200282458041843, + "grad_norm": 0.611470639705658, + "learning_rate": 2.6660241808305593e-05, + "loss": 1.5572, + "step": 8374 + }, + { + "epoch": 2.200807961375505, + "grad_norm": 0.666123628616333, + "learning_rate": 2.6642719467320837e-05, + "loss": 1.6263, + "step": 8376 + }, + { + "epoch": 2.2013334647091667, + "grad_norm": 0.522304117679596, + "learning_rate": 2.662519712633608e-05, + "loss": 1.6031, + "step": 8378 + }, + { + "epoch": 2.2018589680428287, + "grad_norm": 0.518319845199585, + "learning_rate": 2.6607674785351322e-05, + "loss": 1.5743, + "step": 8380 + }, + { + "epoch": 2.20238447137649, + "grad_norm": 0.547029435634613, + "learning_rate": 2.6590152444366566e-05, + "loss": 1.6111, + "step": 8382 + }, + { + "epoch": 2.202909974710152, + "grad_norm": 0.5654579401016235, + "learning_rate": 2.6572630103381814e-05, + "loss": 1.6057, + "step": 8384 + }, + { + "epoch": 2.2034354780438137, + "grad_norm": 0.6351662874221802, + "learning_rate": 2.6555107762397058e-05, + "loss": 1.6298, + "step": 8386 + }, + { + "epoch": 2.2039609813774756, + "grad_norm": 0.5234793424606323, + "learning_rate": 2.6537585421412302e-05, + "loss": 1.5897, + "step": 8388 + }, + { + "epoch": 2.204486484711137, + "grad_norm": 0.6238772869110107, + "learning_rate": 2.652006308042755e-05, + "loss": 1.6321, + "step": 8390 + }, + { + "epoch": 2.205011988044799, + "grad_norm": 0.5562398433685303, + "learning_rate": 2.6502540739442787e-05, + "loss": 1.575, + "step": 8392 + }, + { + "epoch": 2.205537491378461, + "grad_norm": 0.5776354670524597, + "learning_rate": 2.6485018398458034e-05, + "loss": 1.5714, + "step": 8394 + }, + { + "epoch": 2.2060629947121226, + "grad_norm": 0.5097812414169312, + "learning_rate": 2.646749605747328e-05, + "loss": 1.6081, + "step": 8396 + }, + { + "epoch": 2.2065884980457846, + "grad_norm": 0.5820745825767517, + "learning_rate": 2.6449973716488523e-05, + "loss": 1.646, + "step": 8398 + }, + { + "epoch": 2.207114001379446, + "grad_norm": 0.5649163722991943, + "learning_rate": 2.643245137550377e-05, + "loss": 1.6181, + "step": 8400 + }, + { + "epoch": 2.207114001379446, + "eval_loss": 1.6614795923233032, + "eval_runtime": 487.2799, + "eval_samples_per_second": 249.936, + "eval_steps_per_second": 31.243, + "step": 8400 + }, + { + "epoch": 2.207639504713108, + "grad_norm": 0.5877487063407898, + "learning_rate": 2.6414929034519015e-05, + "loss": 1.6239, + "step": 8402 + }, + { + "epoch": 2.2081650080467696, + "grad_norm": 0.5796919465065002, + "learning_rate": 2.639740669353426e-05, + "loss": 1.6233, + "step": 8404 + }, + { + "epoch": 2.2086905113804316, + "grad_norm": 0.58669114112854, + "learning_rate": 2.63798843525495e-05, + "loss": 1.6164, + "step": 8406 + }, + { + "epoch": 2.2092160147140936, + "grad_norm": 0.6143302917480469, + "learning_rate": 2.6362362011564744e-05, + "loss": 1.6609, + "step": 8408 + }, + { + "epoch": 2.209741518047755, + "grad_norm": 0.6326837539672852, + "learning_rate": 2.634483967057999e-05, + "loss": 1.5887, + "step": 8410 + }, + { + "epoch": 2.210267021381417, + "grad_norm": 0.5528976917266846, + "learning_rate": 2.6327317329595235e-05, + "loss": 1.5859, + "step": 8412 + }, + { + "epoch": 2.2107925247150786, + "grad_norm": 0.654974102973938, + "learning_rate": 2.630979498861048e-05, + "loss": 1.5918, + "step": 8414 + }, + { + "epoch": 2.2113180280487406, + "grad_norm": 0.5469368100166321, + "learning_rate": 2.6292272647625727e-05, + "loss": 1.6064, + "step": 8416 + }, + { + "epoch": 2.211843531382402, + "grad_norm": 0.5513100028038025, + "learning_rate": 2.6274750306640965e-05, + "loss": 1.5964, + "step": 8418 + }, + { + "epoch": 2.212369034716064, + "grad_norm": 0.5723447799682617, + "learning_rate": 2.6257227965656212e-05, + "loss": 1.5847, + "step": 8420 + }, + { + "epoch": 2.2128945380497256, + "grad_norm": 0.5903245806694031, + "learning_rate": 2.6239705624671456e-05, + "loss": 1.6176, + "step": 8422 + }, + { + "epoch": 2.2134200413833875, + "grad_norm": 0.7093879580497742, + "learning_rate": 2.62221832836867e-05, + "loss": 1.627, + "step": 8424 + }, + { + "epoch": 2.213945544717049, + "grad_norm": 0.6263571977615356, + "learning_rate": 2.6204660942701948e-05, + "loss": 1.606, + "step": 8426 + }, + { + "epoch": 2.214471048050711, + "grad_norm": 0.5385801196098328, + "learning_rate": 2.6187138601717192e-05, + "loss": 1.6006, + "step": 8428 + }, + { + "epoch": 2.214996551384373, + "grad_norm": 0.5380145311355591, + "learning_rate": 2.6169616260732433e-05, + "loss": 1.6179, + "step": 8430 + }, + { + "epoch": 2.2155220547180345, + "grad_norm": 0.6820893287658691, + "learning_rate": 2.6152093919747677e-05, + "loss": 1.5878, + "step": 8432 + }, + { + "epoch": 2.2160475580516965, + "grad_norm": 0.6620394587516785, + "learning_rate": 2.613457157876292e-05, + "loss": 1.6155, + "step": 8434 + }, + { + "epoch": 2.216573061385358, + "grad_norm": 0.5877991914749146, + "learning_rate": 2.611704923777817e-05, + "loss": 1.6346, + "step": 8436 + }, + { + "epoch": 2.21709856471902, + "grad_norm": 0.7124659419059753, + "learning_rate": 2.6099526896793413e-05, + "loss": 1.624, + "step": 8438 + }, + { + "epoch": 2.2176240680526815, + "grad_norm": 0.6077954769134521, + "learning_rate": 2.608200455580866e-05, + "loss": 1.606, + "step": 8440 + }, + { + "epoch": 2.2181495713863435, + "grad_norm": 0.7110419869422913, + "learning_rate": 2.6064482214823905e-05, + "loss": 1.5749, + "step": 8442 + }, + { + "epoch": 2.2186750747200055, + "grad_norm": 0.5183662176132202, + "learning_rate": 2.6046959873839146e-05, + "loss": 1.6047, + "step": 8444 + }, + { + "epoch": 2.219200578053667, + "grad_norm": 0.6769959926605225, + "learning_rate": 2.602943753285439e-05, + "loss": 1.6635, + "step": 8446 + }, + { + "epoch": 2.219726081387329, + "grad_norm": 0.610445499420166, + "learning_rate": 2.6011915191869634e-05, + "loss": 1.6053, + "step": 8448 + }, + { + "epoch": 2.2202515847209905, + "grad_norm": 0.537892758846283, + "learning_rate": 2.599439285088488e-05, + "loss": 1.6046, + "step": 8450 + }, + { + "epoch": 2.2207770880546525, + "grad_norm": 0.5192505121231079, + "learning_rate": 2.5976870509900126e-05, + "loss": 1.5956, + "step": 8452 + }, + { + "epoch": 2.221302591388314, + "grad_norm": 0.5772652626037598, + "learning_rate": 2.595934816891537e-05, + "loss": 1.6013, + "step": 8454 + }, + { + "epoch": 2.221828094721976, + "grad_norm": 0.5051235556602478, + "learning_rate": 2.594182582793061e-05, + "loss": 1.5965, + "step": 8456 + }, + { + "epoch": 2.2223535980556375, + "grad_norm": 0.5779315233230591, + "learning_rate": 2.5924303486945855e-05, + "loss": 1.5888, + "step": 8458 + }, + { + "epoch": 2.2228791013892994, + "grad_norm": 0.5708422660827637, + "learning_rate": 2.5906781145961102e-05, + "loss": 1.638, + "step": 8460 + }, + { + "epoch": 2.223404604722961, + "grad_norm": 0.6042600274085999, + "learning_rate": 2.5889258804976347e-05, + "loss": 1.5939, + "step": 8462 + }, + { + "epoch": 2.223930108056623, + "grad_norm": 0.5264221429824829, + "learning_rate": 2.587173646399159e-05, + "loss": 1.6029, + "step": 8464 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.5832411050796509, + "learning_rate": 2.585421412300684e-05, + "loss": 1.6332, + "step": 8466 + }, + { + "epoch": 2.2249811147239464, + "grad_norm": 0.5503547191619873, + "learning_rate": 2.5836691782022076e-05, + "loss": 1.6139, + "step": 8468 + }, + { + "epoch": 2.2255066180576084, + "grad_norm": 0.6002954244613647, + "learning_rate": 2.5819169441037323e-05, + "loss": 1.6085, + "step": 8470 + }, + { + "epoch": 2.22603212139127, + "grad_norm": 0.5220708250999451, + "learning_rate": 2.5801647100052567e-05, + "loss": 1.5779, + "step": 8472 + }, + { + "epoch": 2.226557624724932, + "grad_norm": 0.5423254370689392, + "learning_rate": 2.578412475906781e-05, + "loss": 1.5818, + "step": 8474 + }, + { + "epoch": 2.2270831280585934, + "grad_norm": 0.5738494396209717, + "learning_rate": 2.576660241808306e-05, + "loss": 1.6046, + "step": 8476 + }, + { + "epoch": 2.2276086313922554, + "grad_norm": 0.5414014458656311, + "learning_rate": 2.5749080077098303e-05, + "loss": 1.6238, + "step": 8478 + }, + { + "epoch": 2.2281341347259174, + "grad_norm": 0.5559380650520325, + "learning_rate": 2.5731557736113548e-05, + "loss": 1.6421, + "step": 8480 + }, + { + "epoch": 2.228659638059579, + "grad_norm": 0.5835655331611633, + "learning_rate": 2.571403539512879e-05, + "loss": 1.6208, + "step": 8482 + }, + { + "epoch": 2.229185141393241, + "grad_norm": 0.5489543676376343, + "learning_rate": 2.5696513054144033e-05, + "loss": 1.5749, + "step": 8484 + }, + { + "epoch": 2.2297106447269024, + "grad_norm": 0.5658118724822998, + "learning_rate": 2.567899071315928e-05, + "loss": 1.6382, + "step": 8486 + }, + { + "epoch": 2.2302361480605644, + "grad_norm": 0.5926935076713562, + "learning_rate": 2.5661468372174524e-05, + "loss": 1.6327, + "step": 8488 + }, + { + "epoch": 2.230761651394226, + "grad_norm": 0.5220993757247925, + "learning_rate": 2.564394603118977e-05, + "loss": 1.6266, + "step": 8490 + }, + { + "epoch": 2.231287154727888, + "grad_norm": 0.5491675138473511, + "learning_rate": 2.5626423690205016e-05, + "loss": 1.615, + "step": 8492 + }, + { + "epoch": 2.2318126580615494, + "grad_norm": 0.5789914131164551, + "learning_rate": 2.5608901349220253e-05, + "loss": 1.6568, + "step": 8494 + }, + { + "epoch": 2.2323381613952114, + "grad_norm": 0.5792714357376099, + "learning_rate": 2.55913790082355e-05, + "loss": 1.6344, + "step": 8496 + }, + { + "epoch": 2.2328636647288733, + "grad_norm": 0.4977031648159027, + "learning_rate": 2.5573856667250745e-05, + "loss": 1.6286, + "step": 8498 + }, + { + "epoch": 2.233389168062535, + "grad_norm": 0.6100000739097595, + "learning_rate": 2.555633432626599e-05, + "loss": 1.6234, + "step": 8500 + }, + { + "epoch": 2.233914671396197, + "grad_norm": 0.6163233518600464, + "learning_rate": 2.5538811985281237e-05, + "loss": 1.6096, + "step": 8502 + }, + { + "epoch": 2.2344401747298583, + "grad_norm": 0.5867898464202881, + "learning_rate": 2.552128964429648e-05, + "loss": 1.604, + "step": 8504 + }, + { + "epoch": 2.2349656780635203, + "grad_norm": 0.8214870095252991, + "learning_rate": 2.5503767303311725e-05, + "loss": 1.6476, + "step": 8506 + }, + { + "epoch": 2.235491181397182, + "grad_norm": 0.7553618550300598, + "learning_rate": 2.5486244962326966e-05, + "loss": 1.588, + "step": 8508 + }, + { + "epoch": 2.236016684730844, + "grad_norm": 0.5797671675682068, + "learning_rate": 2.546872262134221e-05, + "loss": 1.6125, + "step": 8510 + }, + { + "epoch": 2.2365421880645053, + "grad_norm": 0.6248254179954529, + "learning_rate": 2.5451200280357458e-05, + "loss": 1.6339, + "step": 8512 + }, + { + "epoch": 2.2370676913981673, + "grad_norm": 0.5321136116981506, + "learning_rate": 2.5433677939372702e-05, + "loss": 1.6058, + "step": 8514 + }, + { + "epoch": 2.2375931947318293, + "grad_norm": 0.685704231262207, + "learning_rate": 2.5416155598387946e-05, + "loss": 1.6305, + "step": 8516 + }, + { + "epoch": 2.238118698065491, + "grad_norm": 0.7109000086784363, + "learning_rate": 2.5398633257403194e-05, + "loss": 1.6176, + "step": 8518 + }, + { + "epoch": 2.2386442013991528, + "grad_norm": 0.5430434346199036, + "learning_rate": 2.538111091641843e-05, + "loss": 1.5943, + "step": 8520 + }, + { + "epoch": 2.2391697047328143, + "grad_norm": 0.5879561305046082, + "learning_rate": 2.536358857543368e-05, + "loss": 1.6418, + "step": 8522 + }, + { + "epoch": 2.2396952080664763, + "grad_norm": 0.5790303349494934, + "learning_rate": 2.5346066234448923e-05, + "loss": 1.6413, + "step": 8524 + }, + { + "epoch": 2.240220711400138, + "grad_norm": 0.5287625193595886, + "learning_rate": 2.5328543893464167e-05, + "loss": 1.5898, + "step": 8526 + }, + { + "epoch": 2.2407462147337998, + "grad_norm": 0.6279188394546509, + "learning_rate": 2.5311021552479415e-05, + "loss": 1.6066, + "step": 8528 + }, + { + "epoch": 2.2412717180674613, + "grad_norm": 0.5477473139762878, + "learning_rate": 2.529349921149466e-05, + "loss": 1.6004, + "step": 8530 + }, + { + "epoch": 2.2417972214011233, + "grad_norm": 0.5529081225395203, + "learning_rate": 2.52759768705099e-05, + "loss": 1.6265, + "step": 8532 + }, + { + "epoch": 2.242322724734785, + "grad_norm": 0.6740888357162476, + "learning_rate": 2.5258454529525144e-05, + "loss": 1.6015, + "step": 8534 + }, + { + "epoch": 2.2428482280684467, + "grad_norm": 0.5619795322418213, + "learning_rate": 2.5240932188540388e-05, + "loss": 1.6388, + "step": 8536 + }, + { + "epoch": 2.2433737314021087, + "grad_norm": 0.5655804872512817, + "learning_rate": 2.5223409847555635e-05, + "loss": 1.6052, + "step": 8538 + }, + { + "epoch": 2.2438992347357702, + "grad_norm": 0.5193714499473572, + "learning_rate": 2.520588750657088e-05, + "loss": 1.6223, + "step": 8540 + }, + { + "epoch": 2.244424738069432, + "grad_norm": 0.5750133991241455, + "learning_rate": 2.5188365165586124e-05, + "loss": 1.6025, + "step": 8542 + }, + { + "epoch": 2.2449502414030937, + "grad_norm": 0.673194169998169, + "learning_rate": 2.517084282460137e-05, + "loss": 1.6275, + "step": 8544 + }, + { + "epoch": 2.2454757447367557, + "grad_norm": 0.5762125849723816, + "learning_rate": 2.515332048361661e-05, + "loss": 1.6061, + "step": 8546 + }, + { + "epoch": 2.2460012480704172, + "grad_norm": 0.5318456292152405, + "learning_rate": 2.5135798142631856e-05, + "loss": 1.6048, + "step": 8548 + }, + { + "epoch": 2.246526751404079, + "grad_norm": 0.5658667683601379, + "learning_rate": 2.51182758016471e-05, + "loss": 1.5761, + "step": 8550 + }, + { + "epoch": 2.247052254737741, + "grad_norm": 0.5093829035758972, + "learning_rate": 2.5100753460662345e-05, + "loss": 1.5823, + "step": 8552 + }, + { + "epoch": 2.2475777580714027, + "grad_norm": 0.6146730780601501, + "learning_rate": 2.5083231119677592e-05, + "loss": 1.6037, + "step": 8554 + }, + { + "epoch": 2.2481032614050647, + "grad_norm": 0.5599241256713867, + "learning_rate": 2.5065708778692836e-05, + "loss": 1.6559, + "step": 8556 + }, + { + "epoch": 2.248628764738726, + "grad_norm": 0.6648929119110107, + "learning_rate": 2.5048186437708077e-05, + "loss": 1.6054, + "step": 8558 + }, + { + "epoch": 2.249154268072388, + "grad_norm": 0.6260311603546143, + "learning_rate": 2.503066409672332e-05, + "loss": 1.6113, + "step": 8560 + }, + { + "epoch": 2.2496797714060497, + "grad_norm": 0.6752457022666931, + "learning_rate": 2.5013141755738566e-05, + "loss": 1.6019, + "step": 8562 + }, + { + "epoch": 2.2502052747397117, + "grad_norm": 0.5815406441688538, + "learning_rate": 2.4995619414753813e-05, + "loss": 1.6158, + "step": 8564 + }, + { + "epoch": 2.2507307780733736, + "grad_norm": 0.7072321772575378, + "learning_rate": 2.4978097073769057e-05, + "loss": 1.6549, + "step": 8566 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5526368021965027, + "learning_rate": 2.49605747327843e-05, + "loss": 1.6122, + "step": 8568 + }, + { + "epoch": 2.251781784740697, + "grad_norm": 0.644477128982544, + "learning_rate": 2.4943052391799546e-05, + "loss": 1.5802, + "step": 8570 + }, + { + "epoch": 2.2523072880743586, + "grad_norm": 0.5599563717842102, + "learning_rate": 2.492553005081479e-05, + "loss": 1.6255, + "step": 8572 + }, + { + "epoch": 2.2528327914080206, + "grad_norm": 0.5871409177780151, + "learning_rate": 2.4908007709830034e-05, + "loss": 1.5929, + "step": 8574 + }, + { + "epoch": 2.253358294741682, + "grad_norm": 0.654251754283905, + "learning_rate": 2.4890485368845278e-05, + "loss": 1.6135, + "step": 8576 + }, + { + "epoch": 2.253883798075344, + "grad_norm": 0.5650330185890198, + "learning_rate": 2.4872963027860526e-05, + "loss": 1.5967, + "step": 8578 + }, + { + "epoch": 2.2544093014090056, + "grad_norm": 0.5818485021591187, + "learning_rate": 2.4855440686875766e-05, + "loss": 1.642, + "step": 8580 + }, + { + "epoch": 2.2549348047426676, + "grad_norm": 0.5410641431808472, + "learning_rate": 2.483791834589101e-05, + "loss": 1.6136, + "step": 8582 + }, + { + "epoch": 2.255460308076329, + "grad_norm": 0.5259566903114319, + "learning_rate": 2.4820396004906258e-05, + "loss": 1.6212, + "step": 8584 + }, + { + "epoch": 2.255985811409991, + "grad_norm": 0.5885828733444214, + "learning_rate": 2.48028736639215e-05, + "loss": 1.6015, + "step": 8586 + }, + { + "epoch": 2.256511314743653, + "grad_norm": 0.6429271697998047, + "learning_rate": 2.4785351322936747e-05, + "loss": 1.621, + "step": 8588 + }, + { + "epoch": 2.2570368180773146, + "grad_norm": 0.5706207156181335, + "learning_rate": 2.476782898195199e-05, + "loss": 1.5826, + "step": 8590 + }, + { + "epoch": 2.2575623214109766, + "grad_norm": 0.6020461320877075, + "learning_rate": 2.4750306640967235e-05, + "loss": 1.6296, + "step": 8592 + }, + { + "epoch": 2.258087824744638, + "grad_norm": 0.5262504816055298, + "learning_rate": 2.473278429998248e-05, + "loss": 1.6015, + "step": 8594 + }, + { + "epoch": 2.2586133280783, + "grad_norm": 0.6201303005218506, + "learning_rate": 2.4715261958997723e-05, + "loss": 1.5931, + "step": 8596 + }, + { + "epoch": 2.2591388314119616, + "grad_norm": 0.5339419841766357, + "learning_rate": 2.4697739618012967e-05, + "loss": 1.6369, + "step": 8598 + }, + { + "epoch": 2.2596643347456236, + "grad_norm": 0.6167606115341187, + "learning_rate": 2.468021727702821e-05, + "loss": 1.6056, + "step": 8600 + }, + { + "epoch": 2.2601898380792855, + "grad_norm": 0.6669736504554749, + "learning_rate": 2.4662694936043456e-05, + "loss": 1.6262, + "step": 8602 + }, + { + "epoch": 2.260715341412947, + "grad_norm": 0.6823409795761108, + "learning_rate": 2.4645172595058703e-05, + "loss": 1.5777, + "step": 8604 + }, + { + "epoch": 2.261240844746609, + "grad_norm": 0.5810272097587585, + "learning_rate": 2.4627650254073944e-05, + "loss": 1.6163, + "step": 8606 + }, + { + "epoch": 2.2617663480802706, + "grad_norm": 0.5693057179450989, + "learning_rate": 2.461012791308919e-05, + "loss": 1.6286, + "step": 8608 + }, + { + "epoch": 2.2622918514139325, + "grad_norm": 0.6601256132125854, + "learning_rate": 2.4592605572104436e-05, + "loss": 1.6179, + "step": 8610 + }, + { + "epoch": 2.262817354747594, + "grad_norm": 0.5781053304672241, + "learning_rate": 2.4575083231119677e-05, + "loss": 1.6364, + "step": 8612 + }, + { + "epoch": 2.263342858081256, + "grad_norm": 0.7442349195480347, + "learning_rate": 2.4557560890134924e-05, + "loss": 1.6105, + "step": 8614 + }, + { + "epoch": 2.2638683614149175, + "grad_norm": 0.6304720044136047, + "learning_rate": 2.454003854915017e-05, + "loss": 1.6368, + "step": 8616 + }, + { + "epoch": 2.2643938647485795, + "grad_norm": 0.8399868607521057, + "learning_rate": 2.452251620816541e-05, + "loss": 1.6196, + "step": 8618 + }, + { + "epoch": 2.264919368082241, + "grad_norm": 0.6608265042304993, + "learning_rate": 2.4504993867180657e-05, + "loss": 1.6327, + "step": 8620 + }, + { + "epoch": 2.265444871415903, + "grad_norm": 0.5849167108535767, + "learning_rate": 2.44874715261959e-05, + "loss": 1.6294, + "step": 8622 + }, + { + "epoch": 2.265970374749565, + "grad_norm": 0.5310716032981873, + "learning_rate": 2.4469949185211145e-05, + "loss": 1.5839, + "step": 8624 + }, + { + "epoch": 2.2664958780832265, + "grad_norm": 0.5808155536651611, + "learning_rate": 2.445242684422639e-05, + "loss": 1.6326, + "step": 8626 + }, + { + "epoch": 2.2670213814168885, + "grad_norm": 0.6455567479133606, + "learning_rate": 2.4434904503241633e-05, + "loss": 1.6199, + "step": 8628 + }, + { + "epoch": 2.26754688475055, + "grad_norm": 0.7273616790771484, + "learning_rate": 2.441738216225688e-05, + "loss": 1.6067, + "step": 8630 + }, + { + "epoch": 2.268072388084212, + "grad_norm": 0.7093321681022644, + "learning_rate": 2.4399859821272122e-05, + "loss": 1.6274, + "step": 8632 + }, + { + "epoch": 2.2685978914178735, + "grad_norm": 0.5976110696792603, + "learning_rate": 2.438233748028737e-05, + "loss": 1.6328, + "step": 8634 + }, + { + "epoch": 2.2691233947515355, + "grad_norm": 0.6373220682144165, + "learning_rate": 2.4364815139302614e-05, + "loss": 1.6248, + "step": 8636 + }, + { + "epoch": 2.2696488980851974, + "grad_norm": 0.5636451244354248, + "learning_rate": 2.4347292798317854e-05, + "loss": 1.6133, + "step": 8638 + }, + { + "epoch": 2.270174401418859, + "grad_norm": 0.6106368899345398, + "learning_rate": 2.4329770457333102e-05, + "loss": 1.6036, + "step": 8640 + }, + { + "epoch": 2.270699904752521, + "grad_norm": 0.526772677898407, + "learning_rate": 2.4312248116348346e-05, + "loss": 1.5967, + "step": 8642 + }, + { + "epoch": 2.2712254080861825, + "grad_norm": 0.5734396576881409, + "learning_rate": 2.429472577536359e-05, + "loss": 1.6056, + "step": 8644 + }, + { + "epoch": 2.2717509114198444, + "grad_norm": 0.5490990281105042, + "learning_rate": 2.4277203434378834e-05, + "loss": 1.6057, + "step": 8646 + }, + { + "epoch": 2.272276414753506, + "grad_norm": 0.5757094621658325, + "learning_rate": 2.425968109339408e-05, + "loss": 1.6127, + "step": 8648 + }, + { + "epoch": 2.272801918087168, + "grad_norm": 0.5849709510803223, + "learning_rate": 2.4242158752409323e-05, + "loss": 1.6182, + "step": 8650 + }, + { + "epoch": 2.27332742142083, + "grad_norm": 0.6258499026298523, + "learning_rate": 2.4224636411424567e-05, + "loss": 1.5819, + "step": 8652 + }, + { + "epoch": 2.2738529247544914, + "grad_norm": 0.5439596176147461, + "learning_rate": 2.420711407043981e-05, + "loss": 1.6197, + "step": 8654 + }, + { + "epoch": 2.274378428088153, + "grad_norm": 0.5642632246017456, + "learning_rate": 2.4189591729455055e-05, + "loss": 1.619, + "step": 8656 + }, + { + "epoch": 2.274903931421815, + "grad_norm": 0.5832663178443909, + "learning_rate": 2.41720693884703e-05, + "loss": 1.6291, + "step": 8658 + }, + { + "epoch": 2.275429434755477, + "grad_norm": 0.5677375197410583, + "learning_rate": 2.4154547047485547e-05, + "loss": 1.6087, + "step": 8660 + }, + { + "epoch": 2.2759549380891384, + "grad_norm": 0.5964575409889221, + "learning_rate": 2.413702470650079e-05, + "loss": 1.5785, + "step": 8662 + }, + { + "epoch": 2.2764804414228004, + "grad_norm": 0.6877849102020264, + "learning_rate": 2.4119502365516032e-05, + "loss": 1.6554, + "step": 8664 + }, + { + "epoch": 2.277005944756462, + "grad_norm": 0.6050941348075867, + "learning_rate": 2.410198002453128e-05, + "loss": 1.6324, + "step": 8666 + }, + { + "epoch": 2.277531448090124, + "grad_norm": 0.6498258113861084, + "learning_rate": 2.4084457683546524e-05, + "loss": 1.6018, + "step": 8668 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.7161571979522705, + "learning_rate": 2.4066935342561768e-05, + "loss": 1.6233, + "step": 8670 + }, + { + "epoch": 2.2785824547574474, + "grad_norm": 0.6215277910232544, + "learning_rate": 2.4049413001577012e-05, + "loss": 1.5664, + "step": 8672 + }, + { + "epoch": 2.2791079580911093, + "grad_norm": 0.6165092587471008, + "learning_rate": 2.4031890660592256e-05, + "loss": 1.6217, + "step": 8674 + }, + { + "epoch": 2.279633461424771, + "grad_norm": 0.5820355415344238, + "learning_rate": 2.40143683196075e-05, + "loss": 1.631, + "step": 8676 + }, + { + "epoch": 2.280158964758433, + "grad_norm": 0.5764251351356506, + "learning_rate": 2.3996845978622745e-05, + "loss": 1.6371, + "step": 8678 + }, + { + "epoch": 2.2806844680920944, + "grad_norm": 0.5712262988090515, + "learning_rate": 2.397932363763799e-05, + "loss": 1.6164, + "step": 8680 + }, + { + "epoch": 2.2812099714257563, + "grad_norm": 0.5973013639450073, + "learning_rate": 2.3961801296653233e-05, + "loss": 1.6543, + "step": 8682 + }, + { + "epoch": 2.281735474759418, + "grad_norm": 0.5426216721534729, + "learning_rate": 2.3944278955668477e-05, + "loss": 1.5761, + "step": 8684 + }, + { + "epoch": 2.28226097809308, + "grad_norm": 0.5732000470161438, + "learning_rate": 2.3926756614683725e-05, + "loss": 1.6052, + "step": 8686 + }, + { + "epoch": 2.282786481426742, + "grad_norm": 0.581996500492096, + "learning_rate": 2.3909234273698966e-05, + "loss": 1.6382, + "step": 8688 + }, + { + "epoch": 2.2833119847604033, + "grad_norm": 0.5811718702316284, + "learning_rate": 2.3891711932714213e-05, + "loss": 1.6127, + "step": 8690 + }, + { + "epoch": 2.2838374880940653, + "grad_norm": 0.59296053647995, + "learning_rate": 2.3874189591729457e-05, + "loss": 1.6458, + "step": 8692 + }, + { + "epoch": 2.284362991427727, + "grad_norm": 0.4986303746700287, + "learning_rate": 2.38566672507447e-05, + "loss": 1.6373, + "step": 8694 + }, + { + "epoch": 2.2848884947613888, + "grad_norm": 0.711003839969635, + "learning_rate": 2.3839144909759946e-05, + "loss": 1.6123, + "step": 8696 + }, + { + "epoch": 2.2854139980950503, + "grad_norm": 0.5871834754943848, + "learning_rate": 2.382162256877519e-05, + "loss": 1.6206, + "step": 8698 + }, + { + "epoch": 2.2859395014287123, + "grad_norm": 0.5751228928565979, + "learning_rate": 2.3804100227790434e-05, + "loss": 1.6307, + "step": 8700 + }, + { + "epoch": 2.286465004762374, + "grad_norm": 0.5439415574073792, + "learning_rate": 2.3786577886805678e-05, + "loss": 1.6081, + "step": 8702 + }, + { + "epoch": 2.2869905080960358, + "grad_norm": 0.6993831992149353, + "learning_rate": 2.3769055545820922e-05, + "loss": 1.6197, + "step": 8704 + }, + { + "epoch": 2.2875160114296973, + "grad_norm": 0.5735906958580017, + "learning_rate": 2.375153320483617e-05, + "loss": 1.6184, + "step": 8706 + }, + { + "epoch": 2.2880415147633593, + "grad_norm": 0.7263919115066528, + "learning_rate": 2.373401086385141e-05, + "loss": 1.626, + "step": 8708 + }, + { + "epoch": 2.2885670180970212, + "grad_norm": 0.6145097613334656, + "learning_rate": 2.3716488522866655e-05, + "loss": 1.5732, + "step": 8710 + }, + { + "epoch": 2.2890925214306828, + "grad_norm": 0.5744717717170715, + "learning_rate": 2.3698966181881902e-05, + "loss": 1.6301, + "step": 8712 + }, + { + "epoch": 2.2896180247643447, + "grad_norm": 0.5422975420951843, + "learning_rate": 2.3681443840897143e-05, + "loss": 1.6168, + "step": 8714 + }, + { + "epoch": 2.2901435280980063, + "grad_norm": 0.5488165020942688, + "learning_rate": 2.366392149991239e-05, + "loss": 1.6252, + "step": 8716 + }, + { + "epoch": 2.2906690314316682, + "grad_norm": 0.5232349634170532, + "learning_rate": 2.3646399158927635e-05, + "loss": 1.5893, + "step": 8718 + }, + { + "epoch": 2.2911945347653297, + "grad_norm": 0.5339794754981995, + "learning_rate": 2.3628876817942876e-05, + "loss": 1.6187, + "step": 8720 + }, + { + "epoch": 2.2917200380989917, + "grad_norm": 0.5548917651176453, + "learning_rate": 2.3611354476958123e-05, + "loss": 1.6204, + "step": 8722 + }, + { + "epoch": 2.2922455414326537, + "grad_norm": 0.5368824005126953, + "learning_rate": 2.3593832135973367e-05, + "loss": 1.6096, + "step": 8724 + }, + { + "epoch": 2.292771044766315, + "grad_norm": 0.581849217414856, + "learning_rate": 2.357630979498861e-05, + "loss": 1.6213, + "step": 8726 + }, + { + "epoch": 2.293296548099977, + "grad_norm": 0.5897862315177917, + "learning_rate": 2.3558787454003856e-05, + "loss": 1.6521, + "step": 8728 + }, + { + "epoch": 2.2938220514336387, + "grad_norm": 0.49220868945121765, + "learning_rate": 2.35412651130191e-05, + "loss": 1.5872, + "step": 8730 + }, + { + "epoch": 2.2943475547673007, + "grad_norm": 0.6717751622200012, + "learning_rate": 2.3523742772034348e-05, + "loss": 1.6153, + "step": 8732 + }, + { + "epoch": 2.294873058100962, + "grad_norm": 0.7588834166526794, + "learning_rate": 2.350622043104959e-05, + "loss": 1.6137, + "step": 8734 + }, + { + "epoch": 2.295398561434624, + "grad_norm": 0.6023603081703186, + "learning_rate": 2.3488698090064832e-05, + "loss": 1.5825, + "step": 8736 + }, + { + "epoch": 2.2959240647682857, + "grad_norm": 0.590947687625885, + "learning_rate": 2.347117574908008e-05, + "loss": 1.5805, + "step": 8738 + }, + { + "epoch": 2.2964495681019477, + "grad_norm": 0.5251069664955139, + "learning_rate": 2.345365340809532e-05, + "loss": 1.6111, + "step": 8740 + }, + { + "epoch": 2.296975071435609, + "grad_norm": 0.5737573504447937, + "learning_rate": 2.343613106711057e-05, + "loss": 1.5913, + "step": 8742 + }, + { + "epoch": 2.297500574769271, + "grad_norm": 0.5265782475471497, + "learning_rate": 2.3418608726125813e-05, + "loss": 1.6285, + "step": 8744 + }, + { + "epoch": 2.298026078102933, + "grad_norm": 0.6365170478820801, + "learning_rate": 2.3401086385141053e-05, + "loss": 1.6699, + "step": 8746 + }, + { + "epoch": 2.2985515814365947, + "grad_norm": 0.5703795552253723, + "learning_rate": 2.33835640441563e-05, + "loss": 1.6226, + "step": 8748 + }, + { + "epoch": 2.2990770847702566, + "grad_norm": 0.6636425852775574, + "learning_rate": 2.3366041703171545e-05, + "loss": 1.6502, + "step": 8750 + }, + { + "epoch": 2.299602588103918, + "grad_norm": 0.5792987942695618, + "learning_rate": 2.334851936218679e-05, + "loss": 1.6144, + "step": 8752 + }, + { + "epoch": 2.30012809143758, + "grad_norm": 0.5882440209388733, + "learning_rate": 2.3330997021202033e-05, + "loss": 1.6421, + "step": 8754 + }, + { + "epoch": 2.3006535947712417, + "grad_norm": 0.8438565135002136, + "learning_rate": 2.3313474680217278e-05, + "loss": 1.6205, + "step": 8756 + }, + { + "epoch": 2.3011790981049036, + "grad_norm": 0.5310785174369812, + "learning_rate": 2.3295952339232522e-05, + "loss": 1.6295, + "step": 8758 + }, + { + "epoch": 2.3017046014385656, + "grad_norm": 0.5153002738952637, + "learning_rate": 2.3278429998247766e-05, + "loss": 1.6023, + "step": 8760 + }, + { + "epoch": 2.302230104772227, + "grad_norm": 0.5312076807022095, + "learning_rate": 2.3260907657263014e-05, + "loss": 1.617, + "step": 8762 + }, + { + "epoch": 2.302755608105889, + "grad_norm": 0.554939329624176, + "learning_rate": 2.3243385316278258e-05, + "loss": 1.6376, + "step": 8764 + }, + { + "epoch": 2.3032811114395506, + "grad_norm": 0.5337182283401489, + "learning_rate": 2.32258629752935e-05, + "loss": 1.6258, + "step": 8766 + }, + { + "epoch": 2.3038066147732126, + "grad_norm": 0.5950394868850708, + "learning_rate": 2.3208340634308746e-05, + "loss": 1.5906, + "step": 8768 + }, + { + "epoch": 2.304332118106874, + "grad_norm": 0.5580046772956848, + "learning_rate": 2.319081829332399e-05, + "loss": 1.602, + "step": 8770 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.5954751968383789, + "learning_rate": 2.3173295952339234e-05, + "loss": 1.6433, + "step": 8772 + }, + { + "epoch": 2.3053831247741976, + "grad_norm": 0.5827406644821167, + "learning_rate": 2.315577361135448e-05, + "loss": 1.6155, + "step": 8774 + }, + { + "epoch": 2.3059086281078596, + "grad_norm": 0.5616747140884399, + "learning_rate": 2.3138251270369723e-05, + "loss": 1.6427, + "step": 8776 + }, + { + "epoch": 2.306434131441521, + "grad_norm": 0.5560279488563538, + "learning_rate": 2.3120728929384967e-05, + "loss": 1.5679, + "step": 8778 + }, + { + "epoch": 2.306959634775183, + "grad_norm": 0.5234860777854919, + "learning_rate": 2.310320658840021e-05, + "loss": 1.6176, + "step": 8780 + }, + { + "epoch": 2.307485138108845, + "grad_norm": 0.5881795287132263, + "learning_rate": 2.3085684247415455e-05, + "loss": 1.635, + "step": 8782 + }, + { + "epoch": 2.3080106414425066, + "grad_norm": 0.6444844603538513, + "learning_rate": 2.30681619064307e-05, + "loss": 1.6151, + "step": 8784 + }, + { + "epoch": 2.3085361447761685, + "grad_norm": 0.5576640367507935, + "learning_rate": 2.3050639565445944e-05, + "loss": 1.6469, + "step": 8786 + }, + { + "epoch": 2.30906164810983, + "grad_norm": 0.6228268146514893, + "learning_rate": 2.303311722446119e-05, + "loss": 1.6463, + "step": 8788 + }, + { + "epoch": 2.309587151443492, + "grad_norm": 0.589922308921814, + "learning_rate": 2.3015594883476432e-05, + "loss": 1.6203, + "step": 8790 + }, + { + "epoch": 2.3101126547771536, + "grad_norm": 0.6492023468017578, + "learning_rate": 2.2998072542491676e-05, + "loss": 1.6371, + "step": 8792 + }, + { + "epoch": 2.3106381581108155, + "grad_norm": 0.5567497611045837, + "learning_rate": 2.2980550201506924e-05, + "loss": 1.6225, + "step": 8794 + }, + { + "epoch": 2.3111636614444775, + "grad_norm": 0.568630576133728, + "learning_rate": 2.2963027860522165e-05, + "loss": 1.6063, + "step": 8796 + }, + { + "epoch": 2.311689164778139, + "grad_norm": 0.5894288420677185, + "learning_rate": 2.2945505519537412e-05, + "loss": 1.6124, + "step": 8798 + }, + { + "epoch": 2.312214668111801, + "grad_norm": 0.5140713453292847, + "learning_rate": 2.2927983178552656e-05, + "loss": 1.6183, + "step": 8800 + }, + { + "epoch": 2.312214668111801, + "eval_loss": 1.6566152572631836, + "eval_runtime": 486.4646, + "eval_samples_per_second": 250.355, + "eval_steps_per_second": 31.295, + "step": 8800 + }, + { + "epoch": 2.3127401714454625, + "grad_norm": 0.5767953395843506, + "learning_rate": 2.29104608375679e-05, + "loss": 1.588, + "step": 8802 + }, + { + "epoch": 2.3132656747791245, + "grad_norm": 0.5397815704345703, + "learning_rate": 2.2892938496583145e-05, + "loss": 1.6311, + "step": 8804 + }, + { + "epoch": 2.313791178112786, + "grad_norm": 0.59912109375, + "learning_rate": 2.287541615559839e-05, + "loss": 1.6282, + "step": 8806 + }, + { + "epoch": 2.314316681446448, + "grad_norm": 0.5919666886329651, + "learning_rate": 2.2857893814613633e-05, + "loss": 1.5869, + "step": 8808 + }, + { + "epoch": 2.31484218478011, + "grad_norm": 0.6073286533355713, + "learning_rate": 2.2840371473628877e-05, + "loss": 1.6117, + "step": 8810 + }, + { + "epoch": 2.3153676881137715, + "grad_norm": 0.5764369368553162, + "learning_rate": 2.282284913264412e-05, + "loss": 1.6216, + "step": 8812 + }, + { + "epoch": 2.315893191447433, + "grad_norm": 0.5165627598762512, + "learning_rate": 2.280532679165937e-05, + "loss": 1.5775, + "step": 8814 + }, + { + "epoch": 2.316418694781095, + "grad_norm": 0.6328101754188538, + "learning_rate": 2.278780445067461e-05, + "loss": 1.6164, + "step": 8816 + }, + { + "epoch": 2.316944198114757, + "grad_norm": 0.5554302930831909, + "learning_rate": 2.2770282109689857e-05, + "loss": 1.5912, + "step": 8818 + }, + { + "epoch": 2.3174697014484185, + "grad_norm": 0.5721133947372437, + "learning_rate": 2.27527597687051e-05, + "loss": 1.5831, + "step": 8820 + }, + { + "epoch": 2.3179952047820804, + "grad_norm": 0.5364890694618225, + "learning_rate": 2.2735237427720342e-05, + "loss": 1.6334, + "step": 8822 + }, + { + "epoch": 2.318520708115742, + "grad_norm": 0.5981882214546204, + "learning_rate": 2.271771508673559e-05, + "loss": 1.5971, + "step": 8824 + }, + { + "epoch": 2.319046211449404, + "grad_norm": 0.5527270436286926, + "learning_rate": 2.2700192745750834e-05, + "loss": 1.5844, + "step": 8826 + }, + { + "epoch": 2.3195717147830655, + "grad_norm": 0.5365408062934875, + "learning_rate": 2.2682670404766078e-05, + "loss": 1.6016, + "step": 8828 + }, + { + "epoch": 2.3200972181167274, + "grad_norm": 0.5421087741851807, + "learning_rate": 2.2665148063781322e-05, + "loss": 1.6095, + "step": 8830 + }, + { + "epoch": 2.3206227214503894, + "grad_norm": 0.6584184765815735, + "learning_rate": 2.2647625722796566e-05, + "loss": 1.6167, + "step": 8832 + }, + { + "epoch": 2.321148224784051, + "grad_norm": 0.6406242251396179, + "learning_rate": 2.2630103381811814e-05, + "loss": 1.6178, + "step": 8834 + }, + { + "epoch": 2.321673728117713, + "grad_norm": 0.5838053226470947, + "learning_rate": 2.2612581040827055e-05, + "loss": 1.596, + "step": 8836 + }, + { + "epoch": 2.3221992314513744, + "grad_norm": 0.5230047702789307, + "learning_rate": 2.25950586998423e-05, + "loss": 1.6143, + "step": 8838 + }, + { + "epoch": 2.3227247347850364, + "grad_norm": 0.698841392993927, + "learning_rate": 2.2577536358857547e-05, + "loss": 1.5636, + "step": 8840 + }, + { + "epoch": 2.323250238118698, + "grad_norm": 0.6338639855384827, + "learning_rate": 2.2560014017872787e-05, + "loss": 1.6137, + "step": 8842 + }, + { + "epoch": 2.32377574145236, + "grad_norm": 0.5631201863288879, + "learning_rate": 2.2542491676888035e-05, + "loss": 1.6118, + "step": 8844 + }, + { + "epoch": 2.324301244786022, + "grad_norm": 0.5797097086906433, + "learning_rate": 2.252496933590328e-05, + "loss": 1.6117, + "step": 8846 + }, + { + "epoch": 2.3248267481196834, + "grad_norm": 0.55458003282547, + "learning_rate": 2.250744699491852e-05, + "loss": 1.6074, + "step": 8848 + }, + { + "epoch": 2.3253522514533453, + "grad_norm": 0.6182234883308411, + "learning_rate": 2.2489924653933767e-05, + "loss": 1.5855, + "step": 8850 + }, + { + "epoch": 2.325877754787007, + "grad_norm": 0.7982610464096069, + "learning_rate": 2.247240231294901e-05, + "loss": 1.6479, + "step": 8852 + }, + { + "epoch": 2.326403258120669, + "grad_norm": 0.5972685813903809, + "learning_rate": 2.2454879971964256e-05, + "loss": 1.6508, + "step": 8854 + }, + { + "epoch": 2.3269287614543304, + "grad_norm": 0.5340309739112854, + "learning_rate": 2.24373576309795e-05, + "loss": 1.587, + "step": 8856 + }, + { + "epoch": 2.3274542647879923, + "grad_norm": 0.598004162311554, + "learning_rate": 2.2419835289994744e-05, + "loss": 1.5807, + "step": 8858 + }, + { + "epoch": 2.327979768121654, + "grad_norm": 0.5330363512039185, + "learning_rate": 2.2402312949009988e-05, + "loss": 1.5864, + "step": 8860 + }, + { + "epoch": 2.328505271455316, + "grad_norm": 0.5319264531135559, + "learning_rate": 2.2384790608025232e-05, + "loss": 1.5815, + "step": 8862 + }, + { + "epoch": 2.3290307747889774, + "grad_norm": 0.7008064389228821, + "learning_rate": 2.2367268267040477e-05, + "loss": 1.5968, + "step": 8864 + }, + { + "epoch": 2.3295562781226393, + "grad_norm": 0.5392124056816101, + "learning_rate": 2.2349745926055724e-05, + "loss": 1.5889, + "step": 8866 + }, + { + "epoch": 2.3300817814563013, + "grad_norm": 0.5684927701950073, + "learning_rate": 2.2332223585070965e-05, + "loss": 1.5789, + "step": 8868 + }, + { + "epoch": 2.330607284789963, + "grad_norm": 0.5385169386863708, + "learning_rate": 2.2314701244086213e-05, + "loss": 1.6039, + "step": 8870 + }, + { + "epoch": 2.331132788123625, + "grad_norm": 0.5375089645385742, + "learning_rate": 2.2297178903101457e-05, + "loss": 1.6012, + "step": 8872 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.5316143035888672, + "learning_rate": 2.2279656562116698e-05, + "loss": 1.6277, + "step": 8874 + }, + { + "epoch": 2.3321837947909483, + "grad_norm": 0.5862188339233398, + "learning_rate": 2.2262134221131945e-05, + "loss": 1.5808, + "step": 8876 + }, + { + "epoch": 2.33270929812461, + "grad_norm": 0.5072478652000427, + "learning_rate": 2.224461188014719e-05, + "loss": 1.6257, + "step": 8878 + }, + { + "epoch": 2.333234801458272, + "grad_norm": 0.7240272164344788, + "learning_rate": 2.2227089539162433e-05, + "loss": 1.6551, + "step": 8880 + }, + { + "epoch": 2.3337603047919337, + "grad_norm": 0.5968764424324036, + "learning_rate": 2.2209567198177678e-05, + "loss": 1.6277, + "step": 8882 + }, + { + "epoch": 2.3342858081255953, + "grad_norm": 0.576076328754425, + "learning_rate": 2.2192044857192922e-05, + "loss": 1.6187, + "step": 8884 + }, + { + "epoch": 2.3348113114592572, + "grad_norm": 0.7722670435905457, + "learning_rate": 2.2174522516208166e-05, + "loss": 1.6401, + "step": 8886 + }, + { + "epoch": 2.3353368147929188, + "grad_norm": 0.5726610422134399, + "learning_rate": 2.215700017522341e-05, + "loss": 1.6182, + "step": 8888 + }, + { + "epoch": 2.3358623181265807, + "grad_norm": 0.6139241456985474, + "learning_rate": 2.2139477834238658e-05, + "loss": 1.6314, + "step": 8890 + }, + { + "epoch": 2.3363878214602423, + "grad_norm": 0.6056875586509705, + "learning_rate": 2.21219554932539e-05, + "loss": 1.603, + "step": 8892 + }, + { + "epoch": 2.3369133247939042, + "grad_norm": 0.6434742212295532, + "learning_rate": 2.2104433152269143e-05, + "loss": 1.6344, + "step": 8894 + }, + { + "epoch": 2.3374388281275658, + "grad_norm": 0.5169597268104553, + "learning_rate": 2.208691081128439e-05, + "loss": 1.5879, + "step": 8896 + }, + { + "epoch": 2.3379643314612277, + "grad_norm": 0.5403237342834473, + "learning_rate": 2.206938847029963e-05, + "loss": 1.5932, + "step": 8898 + }, + { + "epoch": 2.3384898347948893, + "grad_norm": 0.6257784962654114, + "learning_rate": 2.205186612931488e-05, + "loss": 1.6506, + "step": 8900 + }, + { + "epoch": 2.3390153381285512, + "grad_norm": 0.5608735084533691, + "learning_rate": 2.2034343788330123e-05, + "loss": 1.587, + "step": 8902 + }, + { + "epoch": 2.339540841462213, + "grad_norm": 0.6551423668861389, + "learning_rate": 2.2016821447345367e-05, + "loss": 1.5972, + "step": 8904 + }, + { + "epoch": 2.3400663447958747, + "grad_norm": 0.5031147599220276, + "learning_rate": 2.199929910636061e-05, + "loss": 1.5885, + "step": 8906 + }, + { + "epoch": 2.3405918481295367, + "grad_norm": 0.6937734484672546, + "learning_rate": 2.1981776765375855e-05, + "loss": 1.6201, + "step": 8908 + }, + { + "epoch": 2.341117351463198, + "grad_norm": 0.6449792385101318, + "learning_rate": 2.19642544243911e-05, + "loss": 1.6335, + "step": 8910 + }, + { + "epoch": 2.34164285479686, + "grad_norm": 0.8069351315498352, + "learning_rate": 2.1946732083406344e-05, + "loss": 1.6147, + "step": 8912 + }, + { + "epoch": 2.3421683581305217, + "grad_norm": 0.5487890839576721, + "learning_rate": 2.1929209742421588e-05, + "loss": 1.6011, + "step": 8914 + }, + { + "epoch": 2.3426938614641837, + "grad_norm": 0.5937405228614807, + "learning_rate": 2.1911687401436835e-05, + "loss": 1.6196, + "step": 8916 + }, + { + "epoch": 2.3432193647978456, + "grad_norm": 0.5741852521896362, + "learning_rate": 2.1894165060452076e-05, + "loss": 1.5813, + "step": 8918 + }, + { + "epoch": 2.343744868131507, + "grad_norm": 0.5851845741271973, + "learning_rate": 2.187664271946732e-05, + "loss": 1.5938, + "step": 8920 + }, + { + "epoch": 2.344270371465169, + "grad_norm": 0.5910730361938477, + "learning_rate": 2.1859120378482568e-05, + "loss": 1.6274, + "step": 8922 + }, + { + "epoch": 2.3447958747988307, + "grad_norm": 0.5576980710029602, + "learning_rate": 2.184159803749781e-05, + "loss": 1.6288, + "step": 8924 + }, + { + "epoch": 2.3453213781324926, + "grad_norm": 0.5467607975006104, + "learning_rate": 2.1824075696513056e-05, + "loss": 1.5965, + "step": 8926 + }, + { + "epoch": 2.345846881466154, + "grad_norm": 0.5948976874351501, + "learning_rate": 2.18065533555283e-05, + "loss": 1.6238, + "step": 8928 + }, + { + "epoch": 2.346372384799816, + "grad_norm": 0.6754350066184998, + "learning_rate": 2.178903101454354e-05, + "loss": 1.6123, + "step": 8930 + }, + { + "epoch": 2.3468978881334777, + "grad_norm": 0.5642646551132202, + "learning_rate": 2.177150867355879e-05, + "loss": 1.6032, + "step": 8932 + }, + { + "epoch": 2.3474233914671396, + "grad_norm": 0.5213932991027832, + "learning_rate": 2.1753986332574033e-05, + "loss": 1.6275, + "step": 8934 + }, + { + "epoch": 2.347948894800801, + "grad_norm": 0.5514481663703918, + "learning_rate": 2.1736463991589277e-05, + "loss": 1.6281, + "step": 8936 + }, + { + "epoch": 2.348474398134463, + "grad_norm": 0.6020217537879944, + "learning_rate": 2.171894165060452e-05, + "loss": 1.5991, + "step": 8938 + }, + { + "epoch": 2.348999901468125, + "grad_norm": 0.6889709830284119, + "learning_rate": 2.1701419309619765e-05, + "loss": 1.5949, + "step": 8940 + }, + { + "epoch": 2.3495254048017866, + "grad_norm": 0.5877057313919067, + "learning_rate": 2.1683896968635013e-05, + "loss": 1.5978, + "step": 8942 + }, + { + "epoch": 2.3500509081354486, + "grad_norm": 0.7173901200294495, + "learning_rate": 2.1666374627650254e-05, + "loss": 1.594, + "step": 8944 + }, + { + "epoch": 2.35057641146911, + "grad_norm": 0.5696520805358887, + "learning_rate": 2.16488522866655e-05, + "loss": 1.6291, + "step": 8946 + }, + { + "epoch": 2.351101914802772, + "grad_norm": 0.5779030919075012, + "learning_rate": 2.1631329945680746e-05, + "loss": 1.6113, + "step": 8948 + }, + { + "epoch": 2.3516274181364336, + "grad_norm": 0.5482315421104431, + "learning_rate": 2.1613807604695986e-05, + "loss": 1.5829, + "step": 8950 + }, + { + "epoch": 2.3521529214700956, + "grad_norm": 0.6139046549797058, + "learning_rate": 2.1596285263711234e-05, + "loss": 1.6207, + "step": 8952 + }, + { + "epoch": 2.3526784248037576, + "grad_norm": 0.569508969783783, + "learning_rate": 2.1578762922726478e-05, + "loss": 1.6108, + "step": 8954 + }, + { + "epoch": 2.353203928137419, + "grad_norm": 0.5926774740219116, + "learning_rate": 2.1561240581741722e-05, + "loss": 1.6094, + "step": 8956 + }, + { + "epoch": 2.353729431471081, + "grad_norm": 0.527396559715271, + "learning_rate": 2.1543718240756966e-05, + "loss": 1.6116, + "step": 8958 + }, + { + "epoch": 2.3542549348047426, + "grad_norm": 0.589083731174469, + "learning_rate": 2.152619589977221e-05, + "loss": 1.6393, + "step": 8960 + }, + { + "epoch": 2.3547804381384045, + "grad_norm": 0.5615043044090271, + "learning_rate": 2.1508673558787455e-05, + "loss": 1.5967, + "step": 8962 + }, + { + "epoch": 2.355305941472066, + "grad_norm": 0.5468254685401917, + "learning_rate": 2.14911512178027e-05, + "loss": 1.6055, + "step": 8964 + }, + { + "epoch": 2.355831444805728, + "grad_norm": 0.5892741084098816, + "learning_rate": 2.1473628876817943e-05, + "loss": 1.6191, + "step": 8966 + }, + { + "epoch": 2.35635694813939, + "grad_norm": 0.5186066627502441, + "learning_rate": 2.1456106535833187e-05, + "loss": 1.6039, + "step": 8968 + }, + { + "epoch": 2.3568824514730515, + "grad_norm": 0.5194823741912842, + "learning_rate": 2.143858419484843e-05, + "loss": 1.6223, + "step": 8970 + }, + { + "epoch": 2.357407954806713, + "grad_norm": 0.590596616268158, + "learning_rate": 2.142106185386368e-05, + "loss": 1.6085, + "step": 8972 + }, + { + "epoch": 2.357933458140375, + "grad_norm": 0.5250555872917175, + "learning_rate": 2.1403539512878923e-05, + "loss": 1.5753, + "step": 8974 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.5621309280395508, + "learning_rate": 2.1386017171894164e-05, + "loss": 1.5993, + "step": 8976 + }, + { + "epoch": 2.3589844648076985, + "grad_norm": 0.5648888945579529, + "learning_rate": 2.136849483090941e-05, + "loss": 1.6124, + "step": 8978 + }, + { + "epoch": 2.3595099681413605, + "grad_norm": 0.6254675388336182, + "learning_rate": 2.1350972489924656e-05, + "loss": 1.6137, + "step": 8980 + }, + { + "epoch": 2.360035471475022, + "grad_norm": 0.5874773859977722, + "learning_rate": 2.13334501489399e-05, + "loss": 1.6192, + "step": 8982 + }, + { + "epoch": 2.360560974808684, + "grad_norm": 0.5968673229217529, + "learning_rate": 2.1315927807955144e-05, + "loss": 1.5798, + "step": 8984 + }, + { + "epoch": 2.3610864781423455, + "grad_norm": 0.5419182181358337, + "learning_rate": 2.1298405466970388e-05, + "loss": 1.5893, + "step": 8986 + }, + { + "epoch": 2.3616119814760075, + "grad_norm": 0.5870303511619568, + "learning_rate": 2.1280883125985632e-05, + "loss": 1.6178, + "step": 8988 + }, + { + "epoch": 2.3621374848096695, + "grad_norm": 0.5218934416770935, + "learning_rate": 2.1263360785000877e-05, + "loss": 1.5627, + "step": 8990 + }, + { + "epoch": 2.362662988143331, + "grad_norm": 0.7437348365783691, + "learning_rate": 2.124583844401612e-05, + "loss": 1.633, + "step": 8992 + }, + { + "epoch": 2.363188491476993, + "grad_norm": 0.6861118078231812, + "learning_rate": 2.1228316103031365e-05, + "loss": 1.6051, + "step": 8994 + }, + { + "epoch": 2.3637139948106545, + "grad_norm": 0.7474240660667419, + "learning_rate": 2.121079376204661e-05, + "loss": 1.6053, + "step": 8996 + }, + { + "epoch": 2.3642394981443164, + "grad_norm": 0.6993446946144104, + "learning_rate": 2.1193271421061857e-05, + "loss": 1.6337, + "step": 8998 + }, + { + "epoch": 2.364765001477978, + "grad_norm": 0.5786541104316711, + "learning_rate": 2.1175749080077097e-05, + "loss": 1.6258, + "step": 9000 + }, + { + "epoch": 2.36529050481164, + "grad_norm": 0.6487932205200195, + "learning_rate": 2.115822673909234e-05, + "loss": 1.5785, + "step": 9002 + }, + { + "epoch": 2.365816008145302, + "grad_norm": 0.6424734592437744, + "learning_rate": 2.114070439810759e-05, + "loss": 1.6047, + "step": 9004 + }, + { + "epoch": 2.3663415114789634, + "grad_norm": 0.5754727721214294, + "learning_rate": 2.1123182057122833e-05, + "loss": 1.6199, + "step": 9006 + }, + { + "epoch": 2.3668670148126254, + "grad_norm": 0.6415467858314514, + "learning_rate": 2.1105659716138078e-05, + "loss": 1.6112, + "step": 9008 + }, + { + "epoch": 2.367392518146287, + "grad_norm": 0.5382227897644043, + "learning_rate": 2.1088137375153322e-05, + "loss": 1.6199, + "step": 9010 + }, + { + "epoch": 2.367918021479949, + "grad_norm": 0.5709097385406494, + "learning_rate": 2.1070615034168566e-05, + "loss": 1.6274, + "step": 9012 + }, + { + "epoch": 2.3684435248136104, + "grad_norm": 0.584823489189148, + "learning_rate": 2.105309269318381e-05, + "loss": 1.6364, + "step": 9014 + }, + { + "epoch": 2.3689690281472724, + "grad_norm": 0.5769666433334351, + "learning_rate": 2.1035570352199054e-05, + "loss": 1.613, + "step": 9016 + }, + { + "epoch": 2.369494531480934, + "grad_norm": 0.6009288430213928, + "learning_rate": 2.1018048011214302e-05, + "loss": 1.6115, + "step": 9018 + }, + { + "epoch": 2.370020034814596, + "grad_norm": 0.5430034399032593, + "learning_rate": 2.1000525670229543e-05, + "loss": 1.5935, + "step": 9020 + }, + { + "epoch": 2.3705455381482574, + "grad_norm": 0.5354077816009521, + "learning_rate": 2.0983003329244787e-05, + "loss": 1.6303, + "step": 9022 + }, + { + "epoch": 2.3710710414819194, + "grad_norm": 0.5706160664558411, + "learning_rate": 2.0965480988260034e-05, + "loss": 1.5922, + "step": 9024 + }, + { + "epoch": 2.3715965448155814, + "grad_norm": 0.5503175854682922, + "learning_rate": 2.0947958647275275e-05, + "loss": 1.5995, + "step": 9026 + }, + { + "epoch": 2.372122048149243, + "grad_norm": 0.5860917568206787, + "learning_rate": 2.0930436306290523e-05, + "loss": 1.617, + "step": 9028 + }, + { + "epoch": 2.372647551482905, + "grad_norm": 0.5663593411445618, + "learning_rate": 2.0912913965305767e-05, + "loss": 1.6142, + "step": 9030 + }, + { + "epoch": 2.3731730548165664, + "grad_norm": 0.5283660888671875, + "learning_rate": 2.0895391624321008e-05, + "loss": 1.6111, + "step": 9032 + }, + { + "epoch": 2.3736985581502283, + "grad_norm": 0.6298090815544128, + "learning_rate": 2.0877869283336255e-05, + "loss": 1.6362, + "step": 9034 + }, + { + "epoch": 2.37422406148389, + "grad_norm": 0.6019343137741089, + "learning_rate": 2.08603469423515e-05, + "loss": 1.6493, + "step": 9036 + }, + { + "epoch": 2.374749564817552, + "grad_norm": 0.6397587656974792, + "learning_rate": 2.0842824601366744e-05, + "loss": 1.6063, + "step": 9038 + }, + { + "epoch": 2.375275068151214, + "grad_norm": 0.634833037853241, + "learning_rate": 2.0825302260381988e-05, + "loss": 1.6041, + "step": 9040 + }, + { + "epoch": 2.3758005714848753, + "grad_norm": 0.535504162311554, + "learning_rate": 2.0807779919397232e-05, + "loss": 1.6136, + "step": 9042 + }, + { + "epoch": 2.3763260748185373, + "grad_norm": 0.5898988246917725, + "learning_rate": 2.079025757841248e-05, + "loss": 1.5947, + "step": 9044 + }, + { + "epoch": 2.376851578152199, + "grad_norm": 0.6104636192321777, + "learning_rate": 2.077273523742772e-05, + "loss": 1.6149, + "step": 9046 + }, + { + "epoch": 2.377377081485861, + "grad_norm": 0.6539409756660461, + "learning_rate": 2.0755212896442964e-05, + "loss": 1.5934, + "step": 9048 + }, + { + "epoch": 2.3779025848195223, + "grad_norm": 0.6595098972320557, + "learning_rate": 2.0737690555458212e-05, + "loss": 1.588, + "step": 9050 + }, + { + "epoch": 2.3784280881531843, + "grad_norm": 0.5743740200996399, + "learning_rate": 2.0720168214473453e-05, + "loss": 1.6563, + "step": 9052 + }, + { + "epoch": 2.378953591486846, + "grad_norm": 0.7335343956947327, + "learning_rate": 2.07026458734887e-05, + "loss": 1.6172, + "step": 9054 + }, + { + "epoch": 2.379479094820508, + "grad_norm": 0.5948194861412048, + "learning_rate": 2.0685123532503945e-05, + "loss": 1.6038, + "step": 9056 + }, + { + "epoch": 2.3800045981541693, + "grad_norm": 0.6061370968818665, + "learning_rate": 2.0667601191519185e-05, + "loss": 1.6064, + "step": 9058 + }, + { + "epoch": 2.3805301014878313, + "grad_norm": 0.5088295936584473, + "learning_rate": 2.0650078850534433e-05, + "loss": 1.5974, + "step": 9060 + }, + { + "epoch": 2.3810556048214933, + "grad_norm": 0.5611675977706909, + "learning_rate": 2.0632556509549677e-05, + "loss": 1.6219, + "step": 9062 + }, + { + "epoch": 2.381581108155155, + "grad_norm": 0.7418251037597656, + "learning_rate": 2.061503416856492e-05, + "loss": 1.6091, + "step": 9064 + }, + { + "epoch": 2.3821066114888167, + "grad_norm": 0.729642927646637, + "learning_rate": 2.0597511827580165e-05, + "loss": 1.5959, + "step": 9066 + }, + { + "epoch": 2.3826321148224783, + "grad_norm": 0.564006507396698, + "learning_rate": 2.057998948659541e-05, + "loss": 1.6484, + "step": 9068 + }, + { + "epoch": 2.3831576181561402, + "grad_norm": 0.5700123906135559, + "learning_rate": 2.0562467145610654e-05, + "loss": 1.61, + "step": 9070 + }, + { + "epoch": 2.3836831214898018, + "grad_norm": 0.5242093801498413, + "learning_rate": 2.0544944804625898e-05, + "loss": 1.6011, + "step": 9072 + }, + { + "epoch": 2.3842086248234637, + "grad_norm": 0.5738998651504517, + "learning_rate": 2.0527422463641146e-05, + "loss": 1.6154, + "step": 9074 + }, + { + "epoch": 2.3847341281571257, + "grad_norm": 0.5821254253387451, + "learning_rate": 2.050990012265639e-05, + "loss": 1.6443, + "step": 9076 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.6593937873840332, + "learning_rate": 2.049237778167163e-05, + "loss": 1.6266, + "step": 9078 + }, + { + "epoch": 2.385785134824449, + "grad_norm": 0.5662251710891724, + "learning_rate": 2.0474855440686878e-05, + "loss": 1.6036, + "step": 9080 + }, + { + "epoch": 2.3863106381581107, + "grad_norm": 0.6064842939376831, + "learning_rate": 2.0457333099702122e-05, + "loss": 1.6067, + "step": 9082 + }, + { + "epoch": 2.3868361414917727, + "grad_norm": 0.5419683456420898, + "learning_rate": 2.0439810758717366e-05, + "loss": 1.6429, + "step": 9084 + }, + { + "epoch": 2.3873616448254342, + "grad_norm": 0.630354642868042, + "learning_rate": 2.042228841773261e-05, + "loss": 1.6354, + "step": 9086 + }, + { + "epoch": 2.387887148159096, + "grad_norm": 0.5797430276870728, + "learning_rate": 2.0404766076747855e-05, + "loss": 1.596, + "step": 9088 + }, + { + "epoch": 2.3884126514927577, + "grad_norm": 0.7524275183677673, + "learning_rate": 2.03872437357631e-05, + "loss": 1.6484, + "step": 9090 + }, + { + "epoch": 2.3889381548264197, + "grad_norm": 0.5289912819862366, + "learning_rate": 2.0369721394778343e-05, + "loss": 1.6089, + "step": 9092 + }, + { + "epoch": 2.389463658160081, + "grad_norm": 0.610967755317688, + "learning_rate": 2.0352199053793587e-05, + "loss": 1.5842, + "step": 9094 + }, + { + "epoch": 2.389989161493743, + "grad_norm": 0.5420004725456238, + "learning_rate": 2.033467671280883e-05, + "loss": 1.6497, + "step": 9096 + }, + { + "epoch": 2.390514664827405, + "grad_norm": 0.5605329275131226, + "learning_rate": 2.0317154371824076e-05, + "loss": 1.5927, + "step": 9098 + }, + { + "epoch": 2.3910401681610667, + "grad_norm": 0.7482327222824097, + "learning_rate": 2.0299632030839323e-05, + "loss": 1.6462, + "step": 9100 + }, + { + "epoch": 2.3915656714947287, + "grad_norm": 0.536505937576294, + "learning_rate": 2.0282109689854564e-05, + "loss": 1.5756, + "step": 9102 + }, + { + "epoch": 2.39209117482839, + "grad_norm": 0.6347717046737671, + "learning_rate": 2.0264587348869808e-05, + "loss": 1.5967, + "step": 9104 + }, + { + "epoch": 2.392616678162052, + "grad_norm": 0.8039228320121765, + "learning_rate": 2.0247065007885056e-05, + "loss": 1.589, + "step": 9106 + }, + { + "epoch": 2.3931421814957137, + "grad_norm": 0.5720674395561218, + "learning_rate": 2.02295426669003e-05, + "loss": 1.6207, + "step": 9108 + }, + { + "epoch": 2.3936676848293756, + "grad_norm": 0.538987398147583, + "learning_rate": 2.0212020325915544e-05, + "loss": 1.604, + "step": 9110 + }, + { + "epoch": 2.3941931881630376, + "grad_norm": 0.5741251111030579, + "learning_rate": 2.0194497984930788e-05, + "loss": 1.6345, + "step": 9112 + }, + { + "epoch": 2.394718691496699, + "grad_norm": 0.6911687254905701, + "learning_rate": 2.0176975643946032e-05, + "loss": 1.596, + "step": 9114 + }, + { + "epoch": 2.395244194830361, + "grad_norm": 0.5507877469062805, + "learning_rate": 2.0159453302961277e-05, + "loss": 1.5973, + "step": 9116 + }, + { + "epoch": 2.3957696981640226, + "grad_norm": 0.6160647869110107, + "learning_rate": 2.014193096197652e-05, + "loss": 1.6113, + "step": 9118 + }, + { + "epoch": 2.3962952014976846, + "grad_norm": 0.6313309669494629, + "learning_rate": 2.0124408620991765e-05, + "loss": 1.596, + "step": 9120 + }, + { + "epoch": 2.396820704831346, + "grad_norm": 0.6924790740013123, + "learning_rate": 2.010688628000701e-05, + "loss": 1.6417, + "step": 9122 + }, + { + "epoch": 2.397346208165008, + "grad_norm": 0.5447295904159546, + "learning_rate": 2.0089363939022253e-05, + "loss": 1.6247, + "step": 9124 + }, + { + "epoch": 2.39787171149867, + "grad_norm": 0.5692989230155945, + "learning_rate": 2.00718415980375e-05, + "loss": 1.5952, + "step": 9126 + }, + { + "epoch": 2.3983972148323316, + "grad_norm": 0.6437897682189941, + "learning_rate": 2.005431925705274e-05, + "loss": 1.6339, + "step": 9128 + }, + { + "epoch": 2.398922718165993, + "grad_norm": 0.6150215268135071, + "learning_rate": 2.0036796916067986e-05, + "loss": 1.6362, + "step": 9130 + }, + { + "epoch": 2.399448221499655, + "grad_norm": 0.6019598841667175, + "learning_rate": 2.0019274575083233e-05, + "loss": 1.6287, + "step": 9132 + }, + { + "epoch": 2.399973724833317, + "grad_norm": 0.578983724117279, + "learning_rate": 2.0001752234098474e-05, + "loss": 1.6199, + "step": 9134 + }, + { + "epoch": 2.4004992281669786, + "grad_norm": 0.5352420210838318, + "learning_rate": 1.9984229893113722e-05, + "loss": 1.6066, + "step": 9136 + }, + { + "epoch": 2.4010247315006406, + "grad_norm": 0.6357502341270447, + "learning_rate": 1.9966707552128966e-05, + "loss": 1.6451, + "step": 9138 + }, + { + "epoch": 2.401550234834302, + "grad_norm": 0.6802014708518982, + "learning_rate": 1.994918521114421e-05, + "loss": 1.6011, + "step": 9140 + }, + { + "epoch": 2.402075738167964, + "grad_norm": 0.6203175187110901, + "learning_rate": 1.9931662870159454e-05, + "loss": 1.6168, + "step": 9142 + }, + { + "epoch": 2.4026012415016256, + "grad_norm": 0.5582800507545471, + "learning_rate": 1.99141405291747e-05, + "loss": 1.6206, + "step": 9144 + }, + { + "epoch": 2.4031267448352875, + "grad_norm": 0.6481848955154419, + "learning_rate": 1.9896618188189946e-05, + "loss": 1.6198, + "step": 9146 + }, + { + "epoch": 2.4036522481689495, + "grad_norm": 0.5764234662055969, + "learning_rate": 1.9879095847205187e-05, + "loss": 1.6328, + "step": 9148 + }, + { + "epoch": 2.404177751502611, + "grad_norm": 0.6788586378097534, + "learning_rate": 1.986157350622043e-05, + "loss": 1.6166, + "step": 9150 + }, + { + "epoch": 2.404703254836273, + "grad_norm": 0.5868250727653503, + "learning_rate": 1.984405116523568e-05, + "loss": 1.5952, + "step": 9152 + }, + { + "epoch": 2.4052287581699345, + "grad_norm": 0.6610315442085266, + "learning_rate": 1.982652882425092e-05, + "loss": 1.597, + "step": 9154 + }, + { + "epoch": 2.4057542615035965, + "grad_norm": 0.7634231448173523, + "learning_rate": 1.9809006483266167e-05, + "loss": 1.6235, + "step": 9156 + }, + { + "epoch": 2.406279764837258, + "grad_norm": 0.5905351638793945, + "learning_rate": 1.979148414228141e-05, + "loss": 1.6214, + "step": 9158 + }, + { + "epoch": 2.40680526817092, + "grad_norm": 0.5715067982673645, + "learning_rate": 1.9773961801296652e-05, + "loss": 1.5922, + "step": 9160 + }, + { + "epoch": 2.407330771504582, + "grad_norm": 0.593390941619873, + "learning_rate": 1.97564394603119e-05, + "loss": 1.6207, + "step": 9162 + }, + { + "epoch": 2.4078562748382435, + "grad_norm": 0.5678249597549438, + "learning_rate": 1.9738917119327144e-05, + "loss": 1.6033, + "step": 9164 + }, + { + "epoch": 2.4083817781719055, + "grad_norm": 0.6406270861625671, + "learning_rate": 1.9721394778342388e-05, + "loss": 1.6363, + "step": 9166 + }, + { + "epoch": 2.408907281505567, + "grad_norm": 0.5460156202316284, + "learning_rate": 1.9703872437357632e-05, + "loss": 1.6043, + "step": 9168 + }, + { + "epoch": 2.409432784839229, + "grad_norm": 0.5826367735862732, + "learning_rate": 1.9686350096372876e-05, + "loss": 1.6166, + "step": 9170 + }, + { + "epoch": 2.4099582881728905, + "grad_norm": 0.5770626068115234, + "learning_rate": 1.966882775538812e-05, + "loss": 1.5933, + "step": 9172 + }, + { + "epoch": 2.4104837915065525, + "grad_norm": 0.5596659183502197, + "learning_rate": 1.9651305414403364e-05, + "loss": 1.6123, + "step": 9174 + }, + { + "epoch": 2.411009294840214, + "grad_norm": 0.6251547336578369, + "learning_rate": 1.963378307341861e-05, + "loss": 1.6105, + "step": 9176 + }, + { + "epoch": 2.411534798173876, + "grad_norm": 0.566064715385437, + "learning_rate": 1.9616260732433856e-05, + "loss": 1.6127, + "step": 9178 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5645999908447266, + "learning_rate": 1.9598738391449097e-05, + "loss": 1.6105, + "step": 9180 + }, + { + "epoch": 2.4125858048411994, + "grad_norm": 0.5924414396286011, + "learning_rate": 1.9581216050464345e-05, + "loss": 1.6258, + "step": 9182 + }, + { + "epoch": 2.4131113081748614, + "grad_norm": 0.5936945080757141, + "learning_rate": 1.956369370947959e-05, + "loss": 1.6088, + "step": 9184 + }, + { + "epoch": 2.413636811508523, + "grad_norm": 0.711117148399353, + "learning_rate": 1.954617136849483e-05, + "loss": 1.5842, + "step": 9186 + }, + { + "epoch": 2.414162314842185, + "grad_norm": 0.5747193694114685, + "learning_rate": 1.9528649027510077e-05, + "loss": 1.5934, + "step": 9188 + }, + { + "epoch": 2.4146878181758464, + "grad_norm": 0.6013287901878357, + "learning_rate": 1.951112668652532e-05, + "loss": 1.5854, + "step": 9190 + }, + { + "epoch": 2.4152133215095084, + "grad_norm": 0.680145263671875, + "learning_rate": 1.9493604345540565e-05, + "loss": 1.6323, + "step": 9192 + }, + { + "epoch": 2.41573882484317, + "grad_norm": 0.6533603072166443, + "learning_rate": 1.947608200455581e-05, + "loss": 1.5944, + "step": 9194 + }, + { + "epoch": 2.416264328176832, + "grad_norm": 0.549355149269104, + "learning_rate": 1.9458559663571054e-05, + "loss": 1.628, + "step": 9196 + }, + { + "epoch": 2.416789831510494, + "grad_norm": 0.5181589722633362, + "learning_rate": 1.9441037322586298e-05, + "loss": 1.6072, + "step": 9198 + }, + { + "epoch": 2.4173153348441554, + "grad_norm": 0.5603959560394287, + "learning_rate": 1.9423514981601542e-05, + "loss": 1.6101, + "step": 9200 + }, + { + "epoch": 2.4173153348441554, + "eval_loss": 1.6572836637496948, + "eval_runtime": 486.1392, + "eval_samples_per_second": 250.523, + "eval_steps_per_second": 31.316, + "step": 9200 + }, + { + "epoch": 2.4178408381778174, + "grad_norm": 0.5568393468856812, + "learning_rate": 1.940599264061679e-05, + "loss": 1.5855, + "step": 9202 + }, + { + "epoch": 2.418366341511479, + "grad_norm": 0.6060105562210083, + "learning_rate": 1.938847029963203e-05, + "loss": 1.6208, + "step": 9204 + }, + { + "epoch": 2.418891844845141, + "grad_norm": 0.5444419384002686, + "learning_rate": 1.9370947958647275e-05, + "loss": 1.6137, + "step": 9206 + }, + { + "epoch": 2.4194173481788024, + "grad_norm": 0.7899800539016724, + "learning_rate": 1.9353425617662522e-05, + "loss": 1.6181, + "step": 9208 + }, + { + "epoch": 2.4199428515124644, + "grad_norm": 0.5947116017341614, + "learning_rate": 1.9335903276677763e-05, + "loss": 1.5974, + "step": 9210 + }, + { + "epoch": 2.420468354846126, + "grad_norm": 0.6116278171539307, + "learning_rate": 1.931838093569301e-05, + "loss": 1.6519, + "step": 9212 + }, + { + "epoch": 2.420993858179788, + "grad_norm": 0.645462691783905, + "learning_rate": 1.9300858594708255e-05, + "loss": 1.586, + "step": 9214 + }, + { + "epoch": 2.4215193615134494, + "grad_norm": 0.5454298853874207, + "learning_rate": 1.92833362537235e-05, + "loss": 1.5734, + "step": 9216 + }, + { + "epoch": 2.4220448648471113, + "grad_norm": 0.595712423324585, + "learning_rate": 1.9265813912738743e-05, + "loss": 1.6554, + "step": 9218 + }, + { + "epoch": 2.4225703681807733, + "grad_norm": 0.5496743321418762, + "learning_rate": 1.9248291571753987e-05, + "loss": 1.6148, + "step": 9220 + }, + { + "epoch": 2.423095871514435, + "grad_norm": 0.5601218342781067, + "learning_rate": 1.923076923076923e-05, + "loss": 1.59, + "step": 9222 + }, + { + "epoch": 2.423621374848097, + "grad_norm": 0.5808682441711426, + "learning_rate": 1.9213246889784476e-05, + "loss": 1.6106, + "step": 9224 + }, + { + "epoch": 2.4241468781817583, + "grad_norm": 0.6002793908119202, + "learning_rate": 1.919572454879972e-05, + "loss": 1.6091, + "step": 9226 + }, + { + "epoch": 2.4246723815154203, + "grad_norm": 0.5775554776191711, + "learning_rate": 1.9178202207814967e-05, + "loss": 1.5891, + "step": 9228 + }, + { + "epoch": 2.425197884849082, + "grad_norm": 0.6747956871986389, + "learning_rate": 1.9160679866830208e-05, + "loss": 1.5877, + "step": 9230 + }, + { + "epoch": 2.425723388182744, + "grad_norm": 0.549020528793335, + "learning_rate": 1.9143157525845452e-05, + "loss": 1.5931, + "step": 9232 + }, + { + "epoch": 2.4262488915164058, + "grad_norm": 0.6531383395195007, + "learning_rate": 1.91256351848607e-05, + "loss": 1.6393, + "step": 9234 + }, + { + "epoch": 2.4267743948500673, + "grad_norm": 0.7158095240592957, + "learning_rate": 1.910811284387594e-05, + "loss": 1.5868, + "step": 9236 + }, + { + "epoch": 2.4272998981837293, + "grad_norm": 0.5585731267929077, + "learning_rate": 1.9090590502891188e-05, + "loss": 1.6519, + "step": 9238 + }, + { + "epoch": 2.427825401517391, + "grad_norm": 0.6200599670410156, + "learning_rate": 1.9073068161906432e-05, + "loss": 1.6401, + "step": 9240 + }, + { + "epoch": 2.4283509048510528, + "grad_norm": 0.5844702124595642, + "learning_rate": 1.9055545820921673e-05, + "loss": 1.5893, + "step": 9242 + }, + { + "epoch": 2.4288764081847143, + "grad_norm": 0.5763403177261353, + "learning_rate": 1.903802347993692e-05, + "loss": 1.6056, + "step": 9244 + }, + { + "epoch": 2.4294019115183763, + "grad_norm": 0.6376482844352722, + "learning_rate": 1.9020501138952165e-05, + "loss": 1.6083, + "step": 9246 + }, + { + "epoch": 2.429927414852038, + "grad_norm": 0.6373746991157532, + "learning_rate": 1.900297879796741e-05, + "loss": 1.5721, + "step": 9248 + }, + { + "epoch": 2.4304529181856998, + "grad_norm": 0.572599470615387, + "learning_rate": 1.8985456456982653e-05, + "loss": 1.6161, + "step": 9250 + }, + { + "epoch": 2.4309784215193613, + "grad_norm": 0.5486949682235718, + "learning_rate": 1.8967934115997897e-05, + "loss": 1.591, + "step": 9252 + }, + { + "epoch": 2.4315039248530232, + "grad_norm": 0.6981235146522522, + "learning_rate": 1.8950411775013145e-05, + "loss": 1.6399, + "step": 9254 + }, + { + "epoch": 2.432029428186685, + "grad_norm": 0.7503864169120789, + "learning_rate": 1.8932889434028386e-05, + "loss": 1.6222, + "step": 9256 + }, + { + "epoch": 2.4325549315203467, + "grad_norm": 0.5889126062393188, + "learning_rate": 1.891536709304363e-05, + "loss": 1.6176, + "step": 9258 + }, + { + "epoch": 2.4330804348540087, + "grad_norm": 0.5790210962295532, + "learning_rate": 1.8897844752058878e-05, + "loss": 1.6321, + "step": 9260 + }, + { + "epoch": 2.4336059381876702, + "grad_norm": 0.5340055823326111, + "learning_rate": 1.888032241107412e-05, + "loss": 1.6009, + "step": 9262 + }, + { + "epoch": 2.434131441521332, + "grad_norm": 0.5527550578117371, + "learning_rate": 1.8862800070089366e-05, + "loss": 1.6173, + "step": 9264 + }, + { + "epoch": 2.4346569448549937, + "grad_norm": 0.6926127672195435, + "learning_rate": 1.884527772910461e-05, + "loss": 1.5944, + "step": 9266 + }, + { + "epoch": 2.4351824481886557, + "grad_norm": 0.5821343064308167, + "learning_rate": 1.8827755388119854e-05, + "loss": 1.6235, + "step": 9268 + }, + { + "epoch": 2.4357079515223177, + "grad_norm": 0.5754803419113159, + "learning_rate": 1.88102330471351e-05, + "loss": 1.6216, + "step": 9270 + }, + { + "epoch": 2.436233454855979, + "grad_norm": 0.5613455176353455, + "learning_rate": 1.8792710706150343e-05, + "loss": 1.607, + "step": 9272 + }, + { + "epoch": 2.436758958189641, + "grad_norm": 0.5742526650428772, + "learning_rate": 1.8775188365165587e-05, + "loss": 1.5704, + "step": 9274 + }, + { + "epoch": 2.4372844615233027, + "grad_norm": 0.5530628561973572, + "learning_rate": 1.875766602418083e-05, + "loss": 1.5849, + "step": 9276 + }, + { + "epoch": 2.4378099648569647, + "grad_norm": 0.5322666168212891, + "learning_rate": 1.8740143683196075e-05, + "loss": 1.5777, + "step": 9278 + }, + { + "epoch": 2.438335468190626, + "grad_norm": 0.5801582932472229, + "learning_rate": 1.8722621342211323e-05, + "loss": 1.6277, + "step": 9280 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.6362050175666809, + "learning_rate": 1.8705099001226563e-05, + "loss": 1.6053, + "step": 9282 + }, + { + "epoch": 2.43938647485795, + "grad_norm": 0.6227912306785583, + "learning_rate": 1.868757666024181e-05, + "loss": 1.6558, + "step": 9284 + }, + { + "epoch": 2.4399119781916117, + "grad_norm": 0.5319087505340576, + "learning_rate": 1.8670054319257055e-05, + "loss": 1.6402, + "step": 9286 + }, + { + "epoch": 2.440437481525273, + "grad_norm": 0.5639674067497253, + "learning_rate": 1.8652531978272296e-05, + "loss": 1.6191, + "step": 9288 + }, + { + "epoch": 2.440962984858935, + "grad_norm": 0.5504940748214722, + "learning_rate": 1.8635009637287544e-05, + "loss": 1.6054, + "step": 9290 + }, + { + "epoch": 2.441488488192597, + "grad_norm": 0.535571277141571, + "learning_rate": 1.8617487296302788e-05, + "loss": 1.5875, + "step": 9292 + }, + { + "epoch": 2.4420139915262586, + "grad_norm": 0.5902593731880188, + "learning_rate": 1.8599964955318032e-05, + "loss": 1.5771, + "step": 9294 + }, + { + "epoch": 2.4425394948599206, + "grad_norm": 0.5818725824356079, + "learning_rate": 1.8582442614333276e-05, + "loss": 1.5895, + "step": 9296 + }, + { + "epoch": 2.443064998193582, + "grad_norm": 0.5598388314247131, + "learning_rate": 1.856492027334852e-05, + "loss": 1.619, + "step": 9298 + }, + { + "epoch": 2.443590501527244, + "grad_norm": 0.6018016934394836, + "learning_rate": 1.8547397932363764e-05, + "loss": 1.5975, + "step": 9300 + }, + { + "epoch": 2.4441160048609056, + "grad_norm": 0.5516964197158813, + "learning_rate": 1.852987559137901e-05, + "loss": 1.6073, + "step": 9302 + }, + { + "epoch": 2.4446415081945676, + "grad_norm": 0.5745545625686646, + "learning_rate": 1.8512353250394253e-05, + "loss": 1.6345, + "step": 9304 + }, + { + "epoch": 2.4451670115282296, + "grad_norm": 0.5729696750640869, + "learning_rate": 1.8494830909409497e-05, + "loss": 1.6356, + "step": 9306 + }, + { + "epoch": 2.445692514861891, + "grad_norm": 0.6375647783279419, + "learning_rate": 1.847730856842474e-05, + "loss": 1.6262, + "step": 9308 + }, + { + "epoch": 2.446218018195553, + "grad_norm": 0.599912166595459, + "learning_rate": 1.845978622743999e-05, + "loss": 1.5833, + "step": 9310 + }, + { + "epoch": 2.4467435215292146, + "grad_norm": 0.6265223026275635, + "learning_rate": 1.844226388645523e-05, + "loss": 1.5683, + "step": 9312 + }, + { + "epoch": 2.4472690248628766, + "grad_norm": 0.5674051642417908, + "learning_rate": 1.8424741545470474e-05, + "loss": 1.6312, + "step": 9314 + }, + { + "epoch": 2.447794528196538, + "grad_norm": 0.5264484286308289, + "learning_rate": 1.840721920448572e-05, + "loss": 1.6509, + "step": 9316 + }, + { + "epoch": 2.4483200315302, + "grad_norm": 0.5643804669380188, + "learning_rate": 1.8389696863500965e-05, + "loss": 1.6002, + "step": 9318 + }, + { + "epoch": 2.448845534863862, + "grad_norm": 0.5581643581390381, + "learning_rate": 1.837217452251621e-05, + "loss": 1.602, + "step": 9320 + }, + { + "epoch": 2.4493710381975236, + "grad_norm": 0.5524643063545227, + "learning_rate": 1.8354652181531454e-05, + "loss": 1.6092, + "step": 9322 + }, + { + "epoch": 2.4498965415311855, + "grad_norm": 0.7081454992294312, + "learning_rate": 1.8337129840546698e-05, + "loss": 1.5703, + "step": 9324 + }, + { + "epoch": 2.450422044864847, + "grad_norm": 0.6080058217048645, + "learning_rate": 1.8319607499561942e-05, + "loss": 1.6261, + "step": 9326 + }, + { + "epoch": 2.450947548198509, + "grad_norm": 0.5197411775588989, + "learning_rate": 1.8302085158577186e-05, + "loss": 1.6313, + "step": 9328 + }, + { + "epoch": 2.4514730515321705, + "grad_norm": 0.5948798060417175, + "learning_rate": 1.8284562817592434e-05, + "loss": 1.6415, + "step": 9330 + }, + { + "epoch": 2.4519985548658325, + "grad_norm": 0.4755839705467224, + "learning_rate": 1.8267040476607675e-05, + "loss": 1.5945, + "step": 9332 + }, + { + "epoch": 2.452524058199494, + "grad_norm": 0.577564537525177, + "learning_rate": 1.824951813562292e-05, + "loss": 1.6056, + "step": 9334 + }, + { + "epoch": 2.453049561533156, + "grad_norm": 0.5504733324050903, + "learning_rate": 1.8231995794638166e-05, + "loss": 1.5843, + "step": 9336 + }, + { + "epoch": 2.4535750648668175, + "grad_norm": 0.5584169626235962, + "learning_rate": 1.8214473453653407e-05, + "loss": 1.6318, + "step": 9338 + }, + { + "epoch": 2.4541005682004795, + "grad_norm": 0.5598637461662292, + "learning_rate": 1.8196951112668655e-05, + "loss": 1.5763, + "step": 9340 + }, + { + "epoch": 2.4546260715341415, + "grad_norm": 0.6208490133285522, + "learning_rate": 1.81794287716839e-05, + "loss": 1.6339, + "step": 9342 + }, + { + "epoch": 2.455151574867803, + "grad_norm": 0.5243476033210754, + "learning_rate": 1.816190643069914e-05, + "loss": 1.5918, + "step": 9344 + }, + { + "epoch": 2.455677078201465, + "grad_norm": 0.6066216826438904, + "learning_rate": 1.8144384089714387e-05, + "loss": 1.6419, + "step": 9346 + }, + { + "epoch": 2.4562025815351265, + "grad_norm": 0.5985385179519653, + "learning_rate": 1.812686174872963e-05, + "loss": 1.6074, + "step": 9348 + }, + { + "epoch": 2.4567280848687885, + "grad_norm": 0.6051018834114075, + "learning_rate": 1.8109339407744876e-05, + "loss": 1.6076, + "step": 9350 + }, + { + "epoch": 2.45725358820245, + "grad_norm": 0.5187597274780273, + "learning_rate": 1.809181706676012e-05, + "loss": 1.6128, + "step": 9352 + }, + { + "epoch": 2.457779091536112, + "grad_norm": 0.576802670955658, + "learning_rate": 1.8074294725775364e-05, + "loss": 1.6185, + "step": 9354 + }, + { + "epoch": 2.458304594869774, + "grad_norm": 0.6253780722618103, + "learning_rate": 1.805677238479061e-05, + "loss": 1.5805, + "step": 9356 + }, + { + "epoch": 2.4588300982034355, + "grad_norm": 0.5701897740364075, + "learning_rate": 1.8039250043805852e-05, + "loss": 1.622, + "step": 9358 + }, + { + "epoch": 2.4593556015370974, + "grad_norm": 0.5329028367996216, + "learning_rate": 1.8021727702821096e-05, + "loss": 1.616, + "step": 9360 + }, + { + "epoch": 2.459881104870759, + "grad_norm": 0.6378101706504822, + "learning_rate": 1.8004205361836344e-05, + "loss": 1.6283, + "step": 9362 + }, + { + "epoch": 2.460406608204421, + "grad_norm": 0.5629276633262634, + "learning_rate": 1.7986683020851585e-05, + "loss": 1.5953, + "step": 9364 + }, + { + "epoch": 2.4609321115380824, + "grad_norm": 0.55552738904953, + "learning_rate": 1.7969160679866832e-05, + "loss": 1.588, + "step": 9366 + }, + { + "epoch": 2.4614576148717444, + "grad_norm": 0.6844131350517273, + "learning_rate": 1.7951638338882077e-05, + "loss": 1.6477, + "step": 9368 + }, + { + "epoch": 2.461983118205406, + "grad_norm": 0.5406688451766968, + "learning_rate": 1.7934115997897317e-05, + "loss": 1.6109, + "step": 9370 + }, + { + "epoch": 2.462508621539068, + "grad_norm": 0.5940281748771667, + "learning_rate": 1.7916593656912565e-05, + "loss": 1.5788, + "step": 9372 + }, + { + "epoch": 2.4630341248727294, + "grad_norm": 0.5564101338386536, + "learning_rate": 1.789907131592781e-05, + "loss": 1.5828, + "step": 9374 + }, + { + "epoch": 2.4635596282063914, + "grad_norm": 0.8060266971588135, + "learning_rate": 1.7881548974943053e-05, + "loss": 1.5922, + "step": 9376 + }, + { + "epoch": 2.4640851315400534, + "grad_norm": 0.6775248646736145, + "learning_rate": 1.7864026633958297e-05, + "loss": 1.6292, + "step": 9378 + }, + { + "epoch": 2.464610634873715, + "grad_norm": 0.5210546851158142, + "learning_rate": 1.784650429297354e-05, + "loss": 1.5921, + "step": 9380 + }, + { + "epoch": 2.465136138207377, + "grad_norm": 0.5210246443748474, + "learning_rate": 1.7828981951988786e-05, + "loss": 1.5947, + "step": 9382 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.5449514985084534, + "learning_rate": 1.781145961100403e-05, + "loss": 1.5716, + "step": 9384 + }, + { + "epoch": 2.4661871448747004, + "grad_norm": 0.5264145135879517, + "learning_rate": 1.7793937270019274e-05, + "loss": 1.5807, + "step": 9386 + }, + { + "epoch": 2.466712648208362, + "grad_norm": 0.6389797329902649, + "learning_rate": 1.777641492903452e-05, + "loss": 1.5949, + "step": 9388 + }, + { + "epoch": 2.467238151542024, + "grad_norm": 0.5606738924980164, + "learning_rate": 1.7758892588049762e-05, + "loss": 1.5973, + "step": 9390 + }, + { + "epoch": 2.467763654875686, + "grad_norm": 0.634388267993927, + "learning_rate": 1.774137024706501e-05, + "loss": 1.6466, + "step": 9392 + }, + { + "epoch": 2.4682891582093474, + "grad_norm": 0.6049827337265015, + "learning_rate": 1.7723847906080254e-05, + "loss": 1.6121, + "step": 9394 + }, + { + "epoch": 2.4688146615430093, + "grad_norm": 0.6313089728355408, + "learning_rate": 1.77063255650955e-05, + "loss": 1.6176, + "step": 9396 + }, + { + "epoch": 2.469340164876671, + "grad_norm": 0.6693941950798035, + "learning_rate": 1.7688803224110743e-05, + "loss": 1.6162, + "step": 9398 + }, + { + "epoch": 2.469865668210333, + "grad_norm": 0.5389882326126099, + "learning_rate": 1.7671280883125987e-05, + "loss": 1.5935, + "step": 9400 + }, + { + "epoch": 2.4703911715439943, + "grad_norm": 0.6095534563064575, + "learning_rate": 1.765375854214123e-05, + "loss": 1.624, + "step": 9402 + }, + { + "epoch": 2.4709166748776563, + "grad_norm": 0.5612223744392395, + "learning_rate": 1.7636236201156475e-05, + "loss": 1.64, + "step": 9404 + }, + { + "epoch": 2.471442178211318, + "grad_norm": 0.58417147397995, + "learning_rate": 1.761871386017172e-05, + "loss": 1.5979, + "step": 9406 + }, + { + "epoch": 2.47196768154498, + "grad_norm": 0.5258997082710266, + "learning_rate": 1.7601191519186963e-05, + "loss": 1.5915, + "step": 9408 + }, + { + "epoch": 2.4724931848786413, + "grad_norm": 0.5420629382133484, + "learning_rate": 1.7583669178202208e-05, + "loss": 1.6292, + "step": 9410 + }, + { + "epoch": 2.4730186882123033, + "grad_norm": 0.5879486799240112, + "learning_rate": 1.7566146837217455e-05, + "loss": 1.5718, + "step": 9412 + }, + { + "epoch": 2.4735441915459653, + "grad_norm": 0.5970991253852844, + "learning_rate": 1.7548624496232696e-05, + "loss": 1.6217, + "step": 9414 + }, + { + "epoch": 2.474069694879627, + "grad_norm": 0.5610321760177612, + "learning_rate": 1.753110215524794e-05, + "loss": 1.6234, + "step": 9416 + }, + { + "epoch": 2.4745951982132888, + "grad_norm": 0.6712002158164978, + "learning_rate": 1.7513579814263188e-05, + "loss": 1.5815, + "step": 9418 + }, + { + "epoch": 2.4751207015469503, + "grad_norm": 0.5543602108955383, + "learning_rate": 1.7496057473278432e-05, + "loss": 1.616, + "step": 9420 + }, + { + "epoch": 2.4756462048806123, + "grad_norm": 0.6275373101234436, + "learning_rate": 1.7478535132293676e-05, + "loss": 1.61, + "step": 9422 + }, + { + "epoch": 2.476171708214274, + "grad_norm": 0.6385626792907715, + "learning_rate": 1.746101279130892e-05, + "loss": 1.5826, + "step": 9424 + }, + { + "epoch": 2.4766972115479358, + "grad_norm": 0.5762410759925842, + "learning_rate": 1.7443490450324164e-05, + "loss": 1.5961, + "step": 9426 + }, + { + "epoch": 2.4772227148815977, + "grad_norm": 0.5379599332809448, + "learning_rate": 1.742596810933941e-05, + "loss": 1.6381, + "step": 9428 + }, + { + "epoch": 2.4777482182152593, + "grad_norm": 0.5608949661254883, + "learning_rate": 1.7408445768354653e-05, + "loss": 1.6073, + "step": 9430 + }, + { + "epoch": 2.4782737215489212, + "grad_norm": 0.5401448607444763, + "learning_rate": 1.7390923427369897e-05, + "loss": 1.6178, + "step": 9432 + }, + { + "epoch": 2.4787992248825828, + "grad_norm": 0.648591935634613, + "learning_rate": 1.737340108638514e-05, + "loss": 1.5938, + "step": 9434 + }, + { + "epoch": 2.4793247282162447, + "grad_norm": 0.710050106048584, + "learning_rate": 1.7355878745400385e-05, + "loss": 1.6104, + "step": 9436 + }, + { + "epoch": 2.4798502315499062, + "grad_norm": 0.5923548340797424, + "learning_rate": 1.7338356404415633e-05, + "loss": 1.6235, + "step": 9438 + }, + { + "epoch": 2.480375734883568, + "grad_norm": 0.6905649900436401, + "learning_rate": 1.7320834063430874e-05, + "loss": 1.5891, + "step": 9440 + }, + { + "epoch": 2.48090123821723, + "grad_norm": 0.6398130059242249, + "learning_rate": 1.7303311722446118e-05, + "loss": 1.5641, + "step": 9442 + }, + { + "epoch": 2.4814267415508917, + "grad_norm": 0.6153111457824707, + "learning_rate": 1.7285789381461365e-05, + "loss": 1.6005, + "step": 9444 + }, + { + "epoch": 2.4819522448845532, + "grad_norm": 0.5550106763839722, + "learning_rate": 1.7268267040476606e-05, + "loss": 1.5992, + "step": 9446 + }, + { + "epoch": 2.482477748218215, + "grad_norm": 0.5491114258766174, + "learning_rate": 1.7250744699491854e-05, + "loss": 1.5753, + "step": 9448 + }, + { + "epoch": 2.483003251551877, + "grad_norm": 0.5267966985702515, + "learning_rate": 1.7233222358507098e-05, + "loss": 1.5962, + "step": 9450 + }, + { + "epoch": 2.4835287548855387, + "grad_norm": 0.6979205012321472, + "learning_rate": 1.7215700017522342e-05, + "loss": 1.5966, + "step": 9452 + }, + { + "epoch": 2.4840542582192007, + "grad_norm": 0.5285736918449402, + "learning_rate": 1.7198177676537586e-05, + "loss": 1.5854, + "step": 9454 + }, + { + "epoch": 2.484579761552862, + "grad_norm": 0.5110975503921509, + "learning_rate": 1.718065533555283e-05, + "loss": 1.6251, + "step": 9456 + }, + { + "epoch": 2.485105264886524, + "grad_norm": 0.5918470025062561, + "learning_rate": 1.7163132994568078e-05, + "loss": 1.5997, + "step": 9458 + }, + { + "epoch": 2.4856307682201857, + "grad_norm": 0.5723440051078796, + "learning_rate": 1.714561065358332e-05, + "loss": 1.5997, + "step": 9460 + }, + { + "epoch": 2.4861562715538477, + "grad_norm": 0.7100077271461487, + "learning_rate": 1.7128088312598563e-05, + "loss": 1.5781, + "step": 9462 + }, + { + "epoch": 2.4866817748875096, + "grad_norm": 0.6034718155860901, + "learning_rate": 1.711056597161381e-05, + "loss": 1.6122, + "step": 9464 + }, + { + "epoch": 2.487207278221171, + "grad_norm": 0.6209121346473694, + "learning_rate": 1.709304363062905e-05, + "loss": 1.6385, + "step": 9466 + }, + { + "epoch": 2.487732781554833, + "grad_norm": 0.6605438590049744, + "learning_rate": 1.70755212896443e-05, + "loss": 1.5764, + "step": 9468 + }, + { + "epoch": 2.4882582848884947, + "grad_norm": 0.5605977773666382, + "learning_rate": 1.7057998948659543e-05, + "loss": 1.5951, + "step": 9470 + }, + { + "epoch": 2.4887837882221566, + "grad_norm": 0.5933828949928284, + "learning_rate": 1.7040476607674784e-05, + "loss": 1.6321, + "step": 9472 + }, + { + "epoch": 2.489309291555818, + "grad_norm": 0.6970359683036804, + "learning_rate": 1.702295426669003e-05, + "loss": 1.6462, + "step": 9474 + }, + { + "epoch": 2.48983479488948, + "grad_norm": 0.5877122282981873, + "learning_rate": 1.7005431925705276e-05, + "loss": 1.6058, + "step": 9476 + }, + { + "epoch": 2.490360298223142, + "grad_norm": 0.8348531126976013, + "learning_rate": 1.698790958472052e-05, + "loss": 1.6066, + "step": 9478 + }, + { + "epoch": 2.4908858015568036, + "grad_norm": 0.5285534262657166, + "learning_rate": 1.6970387243735764e-05, + "loss": 1.6145, + "step": 9480 + }, + { + "epoch": 2.4914113048904656, + "grad_norm": 0.5586251020431519, + "learning_rate": 1.6952864902751008e-05, + "loss": 1.618, + "step": 9482 + }, + { + "epoch": 2.491936808224127, + "grad_norm": 0.6798741817474365, + "learning_rate": 1.6935342561766252e-05, + "loss": 1.5923, + "step": 9484 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.6360746622085571, + "learning_rate": 1.6917820220781496e-05, + "loss": 1.5958, + "step": 9486 + }, + { + "epoch": 2.4929878148914506, + "grad_norm": 0.6008504629135132, + "learning_rate": 1.690029787979674e-05, + "loss": 1.6299, + "step": 9488 + }, + { + "epoch": 2.4935133182251126, + "grad_norm": 0.6286047697067261, + "learning_rate": 1.6882775538811988e-05, + "loss": 1.6249, + "step": 9490 + }, + { + "epoch": 2.494038821558774, + "grad_norm": 0.5566932559013367, + "learning_rate": 1.686525319782723e-05, + "loss": 1.6175, + "step": 9492 + }, + { + "epoch": 2.494564324892436, + "grad_norm": 0.5241519212722778, + "learning_rate": 1.6847730856842477e-05, + "loss": 1.5978, + "step": 9494 + }, + { + "epoch": 2.4950898282260976, + "grad_norm": 0.5487810969352722, + "learning_rate": 1.683020851585772e-05, + "loss": 1.6114, + "step": 9496 + }, + { + "epoch": 2.4956153315597596, + "grad_norm": 0.518281877040863, + "learning_rate": 1.681268617487296e-05, + "loss": 1.5993, + "step": 9498 + }, + { + "epoch": 2.4961408348934215, + "grad_norm": 0.5262603163719177, + "learning_rate": 1.679516383388821e-05, + "loss": 1.5982, + "step": 9500 + }, + { + "epoch": 2.496666338227083, + "grad_norm": 0.560440182685852, + "learning_rate": 1.6777641492903453e-05, + "loss": 1.6281, + "step": 9502 + }, + { + "epoch": 2.497191841560745, + "grad_norm": 0.6901085376739502, + "learning_rate": 1.6760119151918697e-05, + "loss": 1.6069, + "step": 9504 + }, + { + "epoch": 2.4977173448944066, + "grad_norm": 0.6319709420204163, + "learning_rate": 1.674259681093394e-05, + "loss": 1.6053, + "step": 9506 + }, + { + "epoch": 2.4982428482280685, + "grad_norm": 0.6285496354103088, + "learning_rate": 1.6725074469949186e-05, + "loss": 1.6269, + "step": 9508 + }, + { + "epoch": 2.49876835156173, + "grad_norm": 0.5803909301757812, + "learning_rate": 1.670755212896443e-05, + "loss": 1.5917, + "step": 9510 + }, + { + "epoch": 2.499293854895392, + "grad_norm": 0.519927978515625, + "learning_rate": 1.6690029787979674e-05, + "loss": 1.6247, + "step": 9512 + }, + { + "epoch": 2.499819358229054, + "grad_norm": 0.5453500747680664, + "learning_rate": 1.6672507446994918e-05, + "loss": 1.582, + "step": 9514 + }, + { + "epoch": 2.5003448615627155, + "grad_norm": 0.7243382930755615, + "learning_rate": 1.6654985106010162e-05, + "loss": 1.5941, + "step": 9516 + }, + { + "epoch": 2.500870364896377, + "grad_norm": 0.6101195812225342, + "learning_rate": 1.6637462765025407e-05, + "loss": 1.5646, + "step": 9518 + }, + { + "epoch": 2.501395868230039, + "grad_norm": 0.5308881402015686, + "learning_rate": 1.6619940424040654e-05, + "loss": 1.5863, + "step": 9520 + }, + { + "epoch": 2.501921371563701, + "grad_norm": 0.624251663684845, + "learning_rate": 1.66024180830559e-05, + "loss": 1.5914, + "step": 9522 + }, + { + "epoch": 2.5024468748973625, + "grad_norm": 0.6056520938873291, + "learning_rate": 1.6584895742071143e-05, + "loss": 1.5891, + "step": 9524 + }, + { + "epoch": 2.5029723782310245, + "grad_norm": 0.6396739482879639, + "learning_rate": 1.6567373401086387e-05, + "loss": 1.6248, + "step": 9526 + }, + { + "epoch": 2.5034978815646864, + "grad_norm": 0.5661067962646484, + "learning_rate": 1.654985106010163e-05, + "loss": 1.5923, + "step": 9528 + }, + { + "epoch": 2.504023384898348, + "grad_norm": 0.601337194442749, + "learning_rate": 1.6532328719116875e-05, + "loss": 1.6074, + "step": 9530 + }, + { + "epoch": 2.5045488882320095, + "grad_norm": 0.603577733039856, + "learning_rate": 1.651480637813212e-05, + "loss": 1.6247, + "step": 9532 + }, + { + "epoch": 2.5050743915656715, + "grad_norm": 0.5598317980766296, + "learning_rate": 1.6497284037147363e-05, + "loss": 1.6353, + "step": 9534 + }, + { + "epoch": 2.5055998948993334, + "grad_norm": 0.6507828831672668, + "learning_rate": 1.6479761696162608e-05, + "loss": 1.6215, + "step": 9536 + }, + { + "epoch": 2.506125398232995, + "grad_norm": 0.5373896360397339, + "learning_rate": 1.6462239355177852e-05, + "loss": 1.5898, + "step": 9538 + }, + { + "epoch": 2.506650901566657, + "grad_norm": 0.6569491028785706, + "learning_rate": 1.64447170141931e-05, + "loss": 1.6061, + "step": 9540 + }, + { + "epoch": 2.5071764049003185, + "grad_norm": 0.6949899196624756, + "learning_rate": 1.642719467320834e-05, + "loss": 1.5774, + "step": 9542 + }, + { + "epoch": 2.5077019082339804, + "grad_norm": 0.6420979499816895, + "learning_rate": 1.6409672332223584e-05, + "loss": 1.5848, + "step": 9544 + }, + { + "epoch": 2.508227411567642, + "grad_norm": 0.5891046524047852, + "learning_rate": 1.6392149991238832e-05, + "loss": 1.6175, + "step": 9546 + }, + { + "epoch": 2.508752914901304, + "grad_norm": 0.5385305285453796, + "learning_rate": 1.6374627650254073e-05, + "loss": 1.6017, + "step": 9548 + }, + { + "epoch": 2.509278418234966, + "grad_norm": 0.5676510334014893, + "learning_rate": 1.635710530926932e-05, + "loss": 1.5919, + "step": 9550 + }, + { + "epoch": 2.5098039215686274, + "grad_norm": 0.6153396964073181, + "learning_rate": 1.6339582968284564e-05, + "loss": 1.6062, + "step": 9552 + }, + { + "epoch": 2.5103294249022894, + "grad_norm": 0.625535786151886, + "learning_rate": 1.6322060627299805e-05, + "loss": 1.6017, + "step": 9554 + }, + { + "epoch": 2.510854928235951, + "grad_norm": 0.5933648347854614, + "learning_rate": 1.6304538286315053e-05, + "loss": 1.6464, + "step": 9556 + }, + { + "epoch": 2.511380431569613, + "grad_norm": 0.5222533941268921, + "learning_rate": 1.6287015945330297e-05, + "loss": 1.6486, + "step": 9558 + }, + { + "epoch": 2.5119059349032744, + "grad_norm": 0.511766791343689, + "learning_rate": 1.626949360434554e-05, + "loss": 1.6111, + "step": 9560 + }, + { + "epoch": 2.5124314382369364, + "grad_norm": 0.6270935535430908, + "learning_rate": 1.6251971263360785e-05, + "loss": 1.6497, + "step": 9562 + }, + { + "epoch": 2.5129569415705983, + "grad_norm": 0.6762127876281738, + "learning_rate": 1.623444892237603e-05, + "loss": 1.5915, + "step": 9564 + }, + { + "epoch": 2.51348244490426, + "grad_norm": 0.7019566297531128, + "learning_rate": 1.6216926581391277e-05, + "loss": 1.5817, + "step": 9566 + }, + { + "epoch": 2.5140079482379214, + "grad_norm": 0.6118047833442688, + "learning_rate": 1.6199404240406518e-05, + "loss": 1.6241, + "step": 9568 + }, + { + "epoch": 2.5145334515715834, + "grad_norm": 0.5920790433883667, + "learning_rate": 1.6181881899421762e-05, + "loss": 1.607, + "step": 9570 + }, + { + "epoch": 2.5150589549052453, + "grad_norm": 0.5979636311531067, + "learning_rate": 1.616435955843701e-05, + "loss": 1.5874, + "step": 9572 + }, + { + "epoch": 2.515584458238907, + "grad_norm": 0.6232622265815735, + "learning_rate": 1.614683721745225e-05, + "loss": 1.5878, + "step": 9574 + }, + { + "epoch": 2.516109961572569, + "grad_norm": 0.5250893235206604, + "learning_rate": 1.6129314876467498e-05, + "loss": 1.5994, + "step": 9576 + }, + { + "epoch": 2.5166354649062304, + "grad_norm": 0.5512763857841492, + "learning_rate": 1.6111792535482742e-05, + "loss": 1.6041, + "step": 9578 + }, + { + "epoch": 2.5171609682398923, + "grad_norm": 0.5368601083755493, + "learning_rate": 1.6094270194497986e-05, + "loss": 1.6131, + "step": 9580 + }, + { + "epoch": 2.517686471573554, + "grad_norm": 0.6187223792076111, + "learning_rate": 1.607674785351323e-05, + "loss": 1.6238, + "step": 9582 + }, + { + "epoch": 2.518211974907216, + "grad_norm": 0.5821846127510071, + "learning_rate": 1.6059225512528475e-05, + "loss": 1.6262, + "step": 9584 + }, + { + "epoch": 2.518737478240878, + "grad_norm": 0.5627323985099792, + "learning_rate": 1.604170317154372e-05, + "loss": 1.6379, + "step": 9586 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5581722259521484, + "learning_rate": 1.6024180830558963e-05, + "loss": 1.5849, + "step": 9588 + }, + { + "epoch": 2.5197884849082013, + "grad_norm": 0.677148163318634, + "learning_rate": 1.6006658489574207e-05, + "loss": 1.6116, + "step": 9590 + }, + { + "epoch": 2.520313988241863, + "grad_norm": 0.5418302416801453, + "learning_rate": 1.5989136148589455e-05, + "loss": 1.5981, + "step": 9592 + }, + { + "epoch": 2.520839491575525, + "grad_norm": 0.5710042715072632, + "learning_rate": 1.5971613807604695e-05, + "loss": 1.5515, + "step": 9594 + }, + { + "epoch": 2.5213649949091863, + "grad_norm": 0.5451944470405579, + "learning_rate": 1.5954091466619943e-05, + "loss": 1.603, + "step": 9596 + }, + { + "epoch": 2.5218904982428483, + "grad_norm": 0.7278261780738831, + "learning_rate": 1.5936569125635187e-05, + "loss": 1.6079, + "step": 9598 + }, + { + "epoch": 2.5224160015765102, + "grad_norm": 0.5918084979057312, + "learning_rate": 1.5919046784650428e-05, + "loss": 1.6009, + "step": 9600 + }, + { + "epoch": 2.5224160015765102, + "eval_loss": 1.6515436172485352, + "eval_runtime": 487.0879, + "eval_samples_per_second": 250.035, + "eval_steps_per_second": 31.255, + "step": 9600 + }, + { + "epoch": 2.5229415049101718, + "grad_norm": 0.6685560345649719, + "learning_rate": 1.5901524443665676e-05, + "loss": 1.6337, + "step": 9602 + }, + { + "epoch": 2.5234670082438333, + "grad_norm": 0.579550564289093, + "learning_rate": 1.588400210268092e-05, + "loss": 1.5911, + "step": 9604 + }, + { + "epoch": 2.5239925115774953, + "grad_norm": 0.522769570350647, + "learning_rate": 1.5866479761696164e-05, + "loss": 1.612, + "step": 9606 + }, + { + "epoch": 2.5245180149111572, + "grad_norm": 0.6259097456932068, + "learning_rate": 1.5848957420711408e-05, + "loss": 1.6076, + "step": 9608 + }, + { + "epoch": 2.5250435182448188, + "grad_norm": 0.5262504816055298, + "learning_rate": 1.5831435079726652e-05, + "loss": 1.5794, + "step": 9610 + }, + { + "epoch": 2.5255690215784807, + "grad_norm": 0.6325481534004211, + "learning_rate": 1.5813912738741896e-05, + "loss": 1.5997, + "step": 9612 + }, + { + "epoch": 2.5260945249121423, + "grad_norm": 0.7307614088058472, + "learning_rate": 1.579639039775714e-05, + "loss": 1.624, + "step": 9614 + }, + { + "epoch": 2.5266200282458042, + "grad_norm": 0.5761781334877014, + "learning_rate": 1.5778868056772385e-05, + "loss": 1.6469, + "step": 9616 + }, + { + "epoch": 2.5271455315794658, + "grad_norm": 0.543783962726593, + "learning_rate": 1.576134571578763e-05, + "loss": 1.613, + "step": 9618 + }, + { + "epoch": 2.5276710349131277, + "grad_norm": 0.5635935068130493, + "learning_rate": 1.5743823374802873e-05, + "loss": 1.6249, + "step": 9620 + }, + { + "epoch": 2.5281965382467897, + "grad_norm": 0.5564760565757751, + "learning_rate": 1.572630103381812e-05, + "loss": 1.5941, + "step": 9622 + }, + { + "epoch": 2.528722041580451, + "grad_norm": 0.7148152589797974, + "learning_rate": 1.5708778692833365e-05, + "loss": 1.6357, + "step": 9624 + }, + { + "epoch": 2.529247544914113, + "grad_norm": 0.7149800062179565, + "learning_rate": 1.5691256351848606e-05, + "loss": 1.6298, + "step": 9626 + }, + { + "epoch": 2.5297730482477747, + "grad_norm": 0.6550901532173157, + "learning_rate": 1.5673734010863853e-05, + "loss": 1.6008, + "step": 9628 + }, + { + "epoch": 2.5302985515814367, + "grad_norm": 0.5404581427574158, + "learning_rate": 1.5656211669879097e-05, + "loss": 1.5608, + "step": 9630 + }, + { + "epoch": 2.530824054915098, + "grad_norm": 0.674629807472229, + "learning_rate": 1.563868932889434e-05, + "loss": 1.6215, + "step": 9632 + }, + { + "epoch": 2.53134955824876, + "grad_norm": 0.6244335770606995, + "learning_rate": 1.5621166987909586e-05, + "loss": 1.6011, + "step": 9634 + }, + { + "epoch": 2.531875061582422, + "grad_norm": 0.6089237332344055, + "learning_rate": 1.560364464692483e-05, + "loss": 1.5962, + "step": 9636 + }, + { + "epoch": 2.5324005649160837, + "grad_norm": 0.5669505000114441, + "learning_rate": 1.5586122305940074e-05, + "loss": 1.5876, + "step": 9638 + }, + { + "epoch": 2.532926068249745, + "grad_norm": 0.5484370589256287, + "learning_rate": 1.5568599964955318e-05, + "loss": 1.5994, + "step": 9640 + }, + { + "epoch": 2.533451571583407, + "grad_norm": 0.6164976954460144, + "learning_rate": 1.5551077623970562e-05, + "loss": 1.5965, + "step": 9642 + }, + { + "epoch": 2.533977074917069, + "grad_norm": 0.6072930693626404, + "learning_rate": 1.5533555282985807e-05, + "loss": 1.6167, + "step": 9644 + }, + { + "epoch": 2.5345025782507307, + "grad_norm": 0.6487789750099182, + "learning_rate": 1.551603294200105e-05, + "loss": 1.6376, + "step": 9646 + }, + { + "epoch": 2.5350280815843926, + "grad_norm": 0.641263484954834, + "learning_rate": 1.54985106010163e-05, + "loss": 1.6516, + "step": 9648 + }, + { + "epoch": 2.535553584918054, + "grad_norm": 0.6348699927330017, + "learning_rate": 1.548098826003154e-05, + "loss": 1.5979, + "step": 9650 + }, + { + "epoch": 2.536079088251716, + "grad_norm": 0.5915436744689941, + "learning_rate": 1.5463465919046787e-05, + "loss": 1.6025, + "step": 9652 + }, + { + "epoch": 2.5366045915853777, + "grad_norm": 0.6235045194625854, + "learning_rate": 1.544594357806203e-05, + "loss": 1.6093, + "step": 9654 + }, + { + "epoch": 2.5371300949190396, + "grad_norm": 0.5824310183525085, + "learning_rate": 1.542842123707727e-05, + "loss": 1.6317, + "step": 9656 + }, + { + "epoch": 2.5376555982527016, + "grad_norm": 0.7158166766166687, + "learning_rate": 1.541089889609252e-05, + "loss": 1.609, + "step": 9658 + }, + { + "epoch": 2.538181101586363, + "grad_norm": 0.6311430335044861, + "learning_rate": 1.5393376555107763e-05, + "loss": 1.6262, + "step": 9660 + }, + { + "epoch": 2.538706604920025, + "grad_norm": 0.6356306076049805, + "learning_rate": 1.5375854214123008e-05, + "loss": 1.5805, + "step": 9662 + }, + { + "epoch": 2.5392321082536866, + "grad_norm": 0.57065749168396, + "learning_rate": 1.5358331873138252e-05, + "loss": 1.6181, + "step": 9664 + }, + { + "epoch": 2.5397576115873486, + "grad_norm": 0.7448508143424988, + "learning_rate": 1.5340809532153496e-05, + "loss": 1.6235, + "step": 9666 + }, + { + "epoch": 2.54028311492101, + "grad_norm": 0.5627465844154358, + "learning_rate": 1.5323287191168743e-05, + "loss": 1.6273, + "step": 9668 + }, + { + "epoch": 2.540808618254672, + "grad_norm": 0.6133062839508057, + "learning_rate": 1.5305764850183984e-05, + "loss": 1.5949, + "step": 9670 + }, + { + "epoch": 2.541334121588334, + "grad_norm": 0.5348150134086609, + "learning_rate": 1.528824250919923e-05, + "loss": 1.6181, + "step": 9672 + }, + { + "epoch": 2.5418596249219956, + "grad_norm": 0.5559690594673157, + "learning_rate": 1.5270720168214476e-05, + "loss": 1.6224, + "step": 9674 + }, + { + "epoch": 2.542385128255657, + "grad_norm": 0.612876832485199, + "learning_rate": 1.5253197827229718e-05, + "loss": 1.6112, + "step": 9676 + }, + { + "epoch": 2.542910631589319, + "grad_norm": 0.697488009929657, + "learning_rate": 1.5235675486244963e-05, + "loss": 1.598, + "step": 9678 + }, + { + "epoch": 2.543436134922981, + "grad_norm": 0.6256018280982971, + "learning_rate": 1.5218153145260209e-05, + "loss": 1.6273, + "step": 9680 + }, + { + "epoch": 2.5439616382566426, + "grad_norm": 0.583524763584137, + "learning_rate": 1.5200630804275451e-05, + "loss": 1.648, + "step": 9682 + }, + { + "epoch": 2.5444871415903045, + "grad_norm": 0.682237446308136, + "learning_rate": 1.5183108463290697e-05, + "loss": 1.5871, + "step": 9684 + }, + { + "epoch": 2.5450126449239665, + "grad_norm": 0.5728941559791565, + "learning_rate": 1.5165586122305941e-05, + "loss": 1.5953, + "step": 9686 + }, + { + "epoch": 2.545538148257628, + "grad_norm": 0.5490486025810242, + "learning_rate": 1.5148063781321184e-05, + "loss": 1.6289, + "step": 9688 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.5297114849090576, + "learning_rate": 1.513054144033643e-05, + "loss": 1.6445, + "step": 9690 + }, + { + "epoch": 2.5465891549249515, + "grad_norm": 0.5523902177810669, + "learning_rate": 1.5113019099351675e-05, + "loss": 1.5715, + "step": 9692 + }, + { + "epoch": 2.5471146582586135, + "grad_norm": 0.5788924694061279, + "learning_rate": 1.509549675836692e-05, + "loss": 1.6178, + "step": 9694 + }, + { + "epoch": 2.547640161592275, + "grad_norm": 0.5816612839698792, + "learning_rate": 1.5077974417382162e-05, + "loss": 1.5855, + "step": 9696 + }, + { + "epoch": 2.548165664925937, + "grad_norm": 0.6361863017082214, + "learning_rate": 1.5060452076397408e-05, + "loss": 1.612, + "step": 9698 + }, + { + "epoch": 2.5486911682595985, + "grad_norm": 0.6106998324394226, + "learning_rate": 1.5042929735412654e-05, + "loss": 1.6482, + "step": 9700 + }, + { + "epoch": 2.5492166715932605, + "grad_norm": 0.623910665512085, + "learning_rate": 1.5025407394427896e-05, + "loss": 1.5743, + "step": 9702 + }, + { + "epoch": 2.549742174926922, + "grad_norm": 0.7251524925231934, + "learning_rate": 1.500788505344314e-05, + "loss": 1.6176, + "step": 9704 + }, + { + "epoch": 2.550267678260584, + "grad_norm": 0.5366904735565186, + "learning_rate": 1.4990362712458386e-05, + "loss": 1.5826, + "step": 9706 + }, + { + "epoch": 2.550793181594246, + "grad_norm": 0.5754824876785278, + "learning_rate": 1.4972840371473629e-05, + "loss": 1.6006, + "step": 9708 + }, + { + "epoch": 2.5513186849279075, + "grad_norm": 0.6944889426231384, + "learning_rate": 1.4955318030488875e-05, + "loss": 1.6318, + "step": 9710 + }, + { + "epoch": 2.5518441882615694, + "grad_norm": 0.6008621454238892, + "learning_rate": 1.4937795689504119e-05, + "loss": 1.6363, + "step": 9712 + }, + { + "epoch": 2.552369691595231, + "grad_norm": 0.6449068784713745, + "learning_rate": 1.4920273348519361e-05, + "loss": 1.6009, + "step": 9714 + }, + { + "epoch": 2.552895194928893, + "grad_norm": 0.687323272228241, + "learning_rate": 1.4902751007534607e-05, + "loss": 1.6247, + "step": 9716 + }, + { + "epoch": 2.5534206982625545, + "grad_norm": 0.6217597126960754, + "learning_rate": 1.4885228666549853e-05, + "loss": 1.6128, + "step": 9718 + }, + { + "epoch": 2.5539462015962164, + "grad_norm": 0.6320381760597229, + "learning_rate": 1.4867706325565095e-05, + "loss": 1.643, + "step": 9720 + }, + { + "epoch": 2.5544717049298784, + "grad_norm": 0.604827344417572, + "learning_rate": 1.485018398458034e-05, + "loss": 1.6073, + "step": 9722 + }, + { + "epoch": 2.55499720826354, + "grad_norm": 0.5245053768157959, + "learning_rate": 1.4832661643595585e-05, + "loss": 1.5963, + "step": 9724 + }, + { + "epoch": 2.5555227115972015, + "grad_norm": 0.6272046566009521, + "learning_rate": 1.4815139302610828e-05, + "loss": 1.5939, + "step": 9726 + }, + { + "epoch": 2.5560482149308634, + "grad_norm": 0.5605988502502441, + "learning_rate": 1.4797616961626074e-05, + "loss": 1.6283, + "step": 9728 + }, + { + "epoch": 2.5565737182645254, + "grad_norm": 0.5442174077033997, + "learning_rate": 1.4780094620641318e-05, + "loss": 1.5942, + "step": 9730 + }, + { + "epoch": 2.557099221598187, + "grad_norm": 0.806302011013031, + "learning_rate": 1.4762572279656564e-05, + "loss": 1.6132, + "step": 9732 + }, + { + "epoch": 2.557624724931849, + "grad_norm": 0.6235170960426331, + "learning_rate": 1.4745049938671806e-05, + "loss": 1.6062, + "step": 9734 + }, + { + "epoch": 2.5581502282655104, + "grad_norm": 0.6286174654960632, + "learning_rate": 1.4727527597687052e-05, + "loss": 1.6112, + "step": 9736 + }, + { + "epoch": 2.5586757315991724, + "grad_norm": 0.5533096194267273, + "learning_rate": 1.4710005256702296e-05, + "loss": 1.5906, + "step": 9738 + }, + { + "epoch": 2.559201234932834, + "grad_norm": 0.5867462754249573, + "learning_rate": 1.469248291571754e-05, + "loss": 1.5778, + "step": 9740 + }, + { + "epoch": 2.559726738266496, + "grad_norm": 0.5377023816108704, + "learning_rate": 1.4674960574732785e-05, + "loss": 1.5883, + "step": 9742 + }, + { + "epoch": 2.560252241600158, + "grad_norm": 0.5589543581008911, + "learning_rate": 1.465743823374803e-05, + "loss": 1.602, + "step": 9744 + }, + { + "epoch": 2.5607777449338194, + "grad_norm": 0.6195635795593262, + "learning_rate": 1.4639915892763273e-05, + "loss": 1.622, + "step": 9746 + }, + { + "epoch": 2.5613032482674813, + "grad_norm": 0.5796042680740356, + "learning_rate": 1.4622393551778519e-05, + "loss": 1.6333, + "step": 9748 + }, + { + "epoch": 2.561828751601143, + "grad_norm": 0.5899613499641418, + "learning_rate": 1.4604871210793763e-05, + "loss": 1.5945, + "step": 9750 + }, + { + "epoch": 2.562354254934805, + "grad_norm": 0.5779309272766113, + "learning_rate": 1.4587348869809006e-05, + "loss": 1.59, + "step": 9752 + }, + { + "epoch": 2.5628797582684664, + "grad_norm": 0.6350328326225281, + "learning_rate": 1.4569826528824251e-05, + "loss": 1.6087, + "step": 9754 + }, + { + "epoch": 2.5634052616021283, + "grad_norm": 0.5645972490310669, + "learning_rate": 1.4552304187839497e-05, + "loss": 1.6058, + "step": 9756 + }, + { + "epoch": 2.5639307649357903, + "grad_norm": 0.7305320501327515, + "learning_rate": 1.453478184685474e-05, + "loss": 1.5944, + "step": 9758 + }, + { + "epoch": 2.564456268269452, + "grad_norm": 0.600288987159729, + "learning_rate": 1.4517259505869984e-05, + "loss": 1.589, + "step": 9760 + }, + { + "epoch": 2.5649817716031134, + "grad_norm": 0.6395924091339111, + "learning_rate": 1.449973716488523e-05, + "loss": 1.5878, + "step": 9762 + }, + { + "epoch": 2.5655072749367753, + "grad_norm": 0.6453663110733032, + "learning_rate": 1.4482214823900476e-05, + "loss": 1.5795, + "step": 9764 + }, + { + "epoch": 2.5660327782704373, + "grad_norm": 0.6681911945343018, + "learning_rate": 1.4464692482915718e-05, + "loss": 1.5715, + "step": 9766 + }, + { + "epoch": 2.566558281604099, + "grad_norm": 0.8908346891403198, + "learning_rate": 1.4447170141930962e-05, + "loss": 1.604, + "step": 9768 + }, + { + "epoch": 2.567083784937761, + "grad_norm": 0.5479845404624939, + "learning_rate": 1.4429647800946208e-05, + "loss": 1.5835, + "step": 9770 + }, + { + "epoch": 2.5676092882714223, + "grad_norm": 0.5948062539100647, + "learning_rate": 1.441212545996145e-05, + "loss": 1.5969, + "step": 9772 + }, + { + "epoch": 2.5681347916050843, + "grad_norm": 0.5489128232002258, + "learning_rate": 1.4394603118976697e-05, + "loss": 1.6349, + "step": 9774 + }, + { + "epoch": 2.568660294938746, + "grad_norm": 0.6163507103919983, + "learning_rate": 1.437708077799194e-05, + "loss": 1.5721, + "step": 9776 + }, + { + "epoch": 2.569185798272408, + "grad_norm": 0.6882917881011963, + "learning_rate": 1.4359558437007183e-05, + "loss": 1.6179, + "step": 9778 + }, + { + "epoch": 2.5697113016060698, + "grad_norm": 0.5954879522323608, + "learning_rate": 1.434203609602243e-05, + "loss": 1.6136, + "step": 9780 + }, + { + "epoch": 2.5702368049397313, + "grad_norm": 0.6170343160629272, + "learning_rate": 1.4324513755037675e-05, + "loss": 1.585, + "step": 9782 + }, + { + "epoch": 2.5707623082733932, + "grad_norm": 0.5451639890670776, + "learning_rate": 1.4306991414052918e-05, + "loss": 1.5914, + "step": 9784 + }, + { + "epoch": 2.5712878116070548, + "grad_norm": 0.5690667629241943, + "learning_rate": 1.4289469073068162e-05, + "loss": 1.6272, + "step": 9786 + }, + { + "epoch": 2.5718133149407167, + "grad_norm": 0.7590607404708862, + "learning_rate": 1.4271946732083408e-05, + "loss": 1.6112, + "step": 9788 + }, + { + "epoch": 2.5723388182743783, + "grad_norm": 0.55152827501297, + "learning_rate": 1.425442439109865e-05, + "loss": 1.6291, + "step": 9790 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.5713075399398804, + "learning_rate": 1.4236902050113896e-05, + "loss": 1.6375, + "step": 9792 + }, + { + "epoch": 2.573389824941702, + "grad_norm": 0.5247568488121033, + "learning_rate": 1.421937970912914e-05, + "loss": 1.5643, + "step": 9794 + }, + { + "epoch": 2.5739153282753637, + "grad_norm": 0.7281646132469177, + "learning_rate": 1.4201857368144383e-05, + "loss": 1.5993, + "step": 9796 + }, + { + "epoch": 2.5744408316090253, + "grad_norm": 0.690531849861145, + "learning_rate": 1.4184335027159628e-05, + "loss": 1.6062, + "step": 9798 + }, + { + "epoch": 2.5749663349426872, + "grad_norm": 0.5356507897377014, + "learning_rate": 1.4166812686174874e-05, + "loss": 1.6073, + "step": 9800 + }, + { + "epoch": 2.575491838276349, + "grad_norm": 0.5589247941970825, + "learning_rate": 1.4149290345190118e-05, + "loss": 1.5874, + "step": 9802 + }, + { + "epoch": 2.5760173416100107, + "grad_norm": 0.570418119430542, + "learning_rate": 1.4131768004205363e-05, + "loss": 1.6107, + "step": 9804 + }, + { + "epoch": 2.5765428449436727, + "grad_norm": 0.6067774295806885, + "learning_rate": 1.4114245663220607e-05, + "loss": 1.6309, + "step": 9806 + }, + { + "epoch": 2.577068348277334, + "grad_norm": 0.5615005493164062, + "learning_rate": 1.4096723322235853e-05, + "loss": 1.6374, + "step": 9808 + }, + { + "epoch": 2.577593851610996, + "grad_norm": 0.5466208457946777, + "learning_rate": 1.4079200981251095e-05, + "loss": 1.6007, + "step": 9810 + }, + { + "epoch": 2.5781193549446577, + "grad_norm": 0.6845700740814209, + "learning_rate": 1.4061678640266341e-05, + "loss": 1.6121, + "step": 9812 + }, + { + "epoch": 2.5786448582783197, + "grad_norm": 0.6770169734954834, + "learning_rate": 1.4044156299281585e-05, + "loss": 1.6089, + "step": 9814 + }, + { + "epoch": 2.5791703616119817, + "grad_norm": 0.5839431881904602, + "learning_rate": 1.4026633958296828e-05, + "loss": 1.6216, + "step": 9816 + }, + { + "epoch": 2.579695864945643, + "grad_norm": 0.5246254205703735, + "learning_rate": 1.4009111617312074e-05, + "loss": 1.6482, + "step": 9818 + }, + { + "epoch": 2.580221368279305, + "grad_norm": 0.5945092439651489, + "learning_rate": 1.399158927632732e-05, + "loss": 1.6152, + "step": 9820 + }, + { + "epoch": 2.5807468716129667, + "grad_norm": 0.5551746487617493, + "learning_rate": 1.3974066935342562e-05, + "loss": 1.6123, + "step": 9822 + }, + { + "epoch": 2.5812723749466286, + "grad_norm": 0.5317911505699158, + "learning_rate": 1.3956544594357806e-05, + "loss": 1.6127, + "step": 9824 + }, + { + "epoch": 2.58179787828029, + "grad_norm": 0.5956730842590332, + "learning_rate": 1.3939022253373052e-05, + "loss": 1.6179, + "step": 9826 + }, + { + "epoch": 2.582323381613952, + "grad_norm": 0.5398740768432617, + "learning_rate": 1.3921499912388294e-05, + "loss": 1.6346, + "step": 9828 + }, + { + "epoch": 2.582848884947614, + "grad_norm": 0.5810662508010864, + "learning_rate": 1.390397757140354e-05, + "loss": 1.6218, + "step": 9830 + }, + { + "epoch": 2.5833743882812756, + "grad_norm": 0.5635197162628174, + "learning_rate": 1.3886455230418784e-05, + "loss": 1.5876, + "step": 9832 + }, + { + "epoch": 2.583899891614937, + "grad_norm": 0.626593291759491, + "learning_rate": 1.386893288943403e-05, + "loss": 1.596, + "step": 9834 + }, + { + "epoch": 2.584425394948599, + "grad_norm": 0.528767466545105, + "learning_rate": 1.3851410548449273e-05, + "loss": 1.6265, + "step": 9836 + }, + { + "epoch": 2.584950898282261, + "grad_norm": 0.5446393489837646, + "learning_rate": 1.3833888207464519e-05, + "loss": 1.6101, + "step": 9838 + }, + { + "epoch": 2.5854764016159226, + "grad_norm": 0.5221058130264282, + "learning_rate": 1.3816365866479763e-05, + "loss": 1.6393, + "step": 9840 + }, + { + "epoch": 2.5860019049495846, + "grad_norm": 0.721125602722168, + "learning_rate": 1.3798843525495005e-05, + "loss": 1.5934, + "step": 9842 + }, + { + "epoch": 2.5865274082832466, + "grad_norm": 0.6360064744949341, + "learning_rate": 1.3781321184510251e-05, + "loss": 1.613, + "step": 9844 + }, + { + "epoch": 2.587052911616908, + "grad_norm": 0.5796793699264526, + "learning_rate": 1.3763798843525497e-05, + "loss": 1.6069, + "step": 9846 + }, + { + "epoch": 2.5875784149505696, + "grad_norm": 0.6104748845100403, + "learning_rate": 1.374627650254074e-05, + "loss": 1.6083, + "step": 9848 + }, + { + "epoch": 2.5881039182842316, + "grad_norm": 0.5367170572280884, + "learning_rate": 1.3728754161555984e-05, + "loss": 1.6034, + "step": 9850 + }, + { + "epoch": 2.5886294216178936, + "grad_norm": 0.6482873558998108, + "learning_rate": 1.371123182057123e-05, + "loss": 1.5852, + "step": 9852 + }, + { + "epoch": 2.589154924951555, + "grad_norm": 0.5794985294342041, + "learning_rate": 1.3693709479586472e-05, + "loss": 1.5974, + "step": 9854 + }, + { + "epoch": 2.589680428285217, + "grad_norm": 0.6778134703636169, + "learning_rate": 1.3676187138601718e-05, + "loss": 1.5959, + "step": 9856 + }, + { + "epoch": 2.5902059316188786, + "grad_norm": 0.5274595618247986, + "learning_rate": 1.3658664797616962e-05, + "loss": 1.5888, + "step": 9858 + }, + { + "epoch": 2.5907314349525405, + "grad_norm": 0.648197591304779, + "learning_rate": 1.3641142456632205e-05, + "loss": 1.6369, + "step": 9860 + }, + { + "epoch": 2.591256938286202, + "grad_norm": 0.5107806324958801, + "learning_rate": 1.362362011564745e-05, + "loss": 1.5997, + "step": 9862 + }, + { + "epoch": 2.591782441619864, + "grad_norm": 0.5690180659294128, + "learning_rate": 1.3606097774662696e-05, + "loss": 1.6161, + "step": 9864 + }, + { + "epoch": 2.592307944953526, + "grad_norm": 0.5976451635360718, + "learning_rate": 1.358857543367794e-05, + "loss": 1.5956, + "step": 9866 + }, + { + "epoch": 2.5928334482871875, + "grad_norm": 0.5506587624549866, + "learning_rate": 1.3571053092693185e-05, + "loss": 1.5734, + "step": 9868 + }, + { + "epoch": 2.5933589516208495, + "grad_norm": 0.5882534384727478, + "learning_rate": 1.3553530751708429e-05, + "loss": 1.5834, + "step": 9870 + }, + { + "epoch": 2.593884454954511, + "grad_norm": 0.5685223340988159, + "learning_rate": 1.3536008410723675e-05, + "loss": 1.5997, + "step": 9872 + }, + { + "epoch": 2.594409958288173, + "grad_norm": 0.725277304649353, + "learning_rate": 1.3518486069738917e-05, + "loss": 1.573, + "step": 9874 + }, + { + "epoch": 2.5949354616218345, + "grad_norm": 0.563489556312561, + "learning_rate": 1.3500963728754163e-05, + "loss": 1.6113, + "step": 9876 + }, + { + "epoch": 2.5954609649554965, + "grad_norm": 0.5687708258628845, + "learning_rate": 1.3483441387769407e-05, + "loss": 1.6314, + "step": 9878 + }, + { + "epoch": 2.5959864682891585, + "grad_norm": 0.6753512620925903, + "learning_rate": 1.346591904678465e-05, + "loss": 1.5998, + "step": 9880 + }, + { + "epoch": 2.59651197162282, + "grad_norm": 0.7767539620399475, + "learning_rate": 1.3448396705799896e-05, + "loss": 1.5696, + "step": 9882 + }, + { + "epoch": 2.5970374749564815, + "grad_norm": 0.6438019275665283, + "learning_rate": 1.3430874364815142e-05, + "loss": 1.6511, + "step": 9884 + }, + { + "epoch": 2.5975629782901435, + "grad_norm": 0.673339307308197, + "learning_rate": 1.3413352023830384e-05, + "loss": 1.6375, + "step": 9886 + }, + { + "epoch": 2.5980884816238055, + "grad_norm": 0.6414830684661865, + "learning_rate": 1.3395829682845628e-05, + "loss": 1.6215, + "step": 9888 + }, + { + "epoch": 2.598613984957467, + "grad_norm": 0.5403038263320923, + "learning_rate": 1.3378307341860874e-05, + "loss": 1.5912, + "step": 9890 + }, + { + "epoch": 2.599139488291129, + "grad_norm": 0.6485872268676758, + "learning_rate": 1.3360785000876117e-05, + "loss": 1.6385, + "step": 9892 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.5818743109703064, + "learning_rate": 1.3343262659891362e-05, + "loss": 1.6295, + "step": 9894 + }, + { + "epoch": 2.6001904949584524, + "grad_norm": 0.6335439682006836, + "learning_rate": 1.3325740318906607e-05, + "loss": 1.5726, + "step": 9896 + }, + { + "epoch": 2.600715998292114, + "grad_norm": 0.5946658253669739, + "learning_rate": 1.3308217977921849e-05, + "loss": 1.6393, + "step": 9898 + }, + { + "epoch": 2.601241501625776, + "grad_norm": 0.5518163442611694, + "learning_rate": 1.3290695636937095e-05, + "loss": 1.5873, + "step": 9900 + }, + { + "epoch": 2.601767004959438, + "grad_norm": 0.5967461466789246, + "learning_rate": 1.327317329595234e-05, + "loss": 1.596, + "step": 9902 + }, + { + "epoch": 2.6022925082930994, + "grad_norm": 0.613856852054596, + "learning_rate": 1.3255650954967585e-05, + "loss": 1.6073, + "step": 9904 + }, + { + "epoch": 2.6028180116267614, + "grad_norm": 0.5403199791908264, + "learning_rate": 1.3238128613982827e-05, + "loss": 1.623, + "step": 9906 + }, + { + "epoch": 2.603343514960423, + "grad_norm": 0.6116963624954224, + "learning_rate": 1.3220606272998073e-05, + "loss": 1.6139, + "step": 9908 + }, + { + "epoch": 2.603869018294085, + "grad_norm": 0.6207565069198608, + "learning_rate": 1.320308393201332e-05, + "loss": 1.602, + "step": 9910 + }, + { + "epoch": 2.6043945216277464, + "grad_norm": 0.6587697863578796, + "learning_rate": 1.3185561591028562e-05, + "loss": 1.6115, + "step": 9912 + }, + { + "epoch": 2.6049200249614084, + "grad_norm": 0.5676011443138123, + "learning_rate": 1.3168039250043806e-05, + "loss": 1.6086, + "step": 9914 + }, + { + "epoch": 2.6054455282950704, + "grad_norm": 0.5473529100418091, + "learning_rate": 1.3150516909059052e-05, + "loss": 1.6345, + "step": 9916 + }, + { + "epoch": 2.605971031628732, + "grad_norm": 0.5622715950012207, + "learning_rate": 1.3132994568074294e-05, + "loss": 1.5891, + "step": 9918 + }, + { + "epoch": 2.6064965349623934, + "grad_norm": 0.6009332537651062, + "learning_rate": 1.311547222708954e-05, + "loss": 1.6095, + "step": 9920 + }, + { + "epoch": 2.6070220382960554, + "grad_norm": 0.746606707572937, + "learning_rate": 1.3097949886104784e-05, + "loss": 1.6328, + "step": 9922 + }, + { + "epoch": 2.6075475416297174, + "grad_norm": 0.6133710741996765, + "learning_rate": 1.3080427545120027e-05, + "loss": 1.6085, + "step": 9924 + }, + { + "epoch": 2.608073044963379, + "grad_norm": 0.6758058667182922, + "learning_rate": 1.3062905204135273e-05, + "loss": 1.62, + "step": 9926 + }, + { + "epoch": 2.608598548297041, + "grad_norm": 0.5908955335617065, + "learning_rate": 1.3045382863150518e-05, + "loss": 1.5991, + "step": 9928 + }, + { + "epoch": 2.6091240516307024, + "grad_norm": 0.5754519104957581, + "learning_rate": 1.3027860522165761e-05, + "loss": 1.6316, + "step": 9930 + }, + { + "epoch": 2.6096495549643643, + "grad_norm": 0.5225171446800232, + "learning_rate": 1.3010338181181007e-05, + "loss": 1.5902, + "step": 9932 + }, + { + "epoch": 2.610175058298026, + "grad_norm": 0.6654541492462158, + "learning_rate": 1.2992815840196251e-05, + "loss": 1.588, + "step": 9934 + }, + { + "epoch": 2.610700561631688, + "grad_norm": 0.592370331287384, + "learning_rate": 1.2975293499211497e-05, + "loss": 1.6396, + "step": 9936 + }, + { + "epoch": 2.61122606496535, + "grad_norm": 0.6326411366462708, + "learning_rate": 1.295777115822674e-05, + "loss": 1.6201, + "step": 9938 + }, + { + "epoch": 2.6117515682990113, + "grad_norm": 0.6815057396888733, + "learning_rate": 1.2940248817241985e-05, + "loss": 1.6337, + "step": 9940 + }, + { + "epoch": 2.6122770716326733, + "grad_norm": 0.6094640493392944, + "learning_rate": 1.292272647625723e-05, + "loss": 1.6009, + "step": 9942 + }, + { + "epoch": 2.612802574966335, + "grad_norm": 0.6490272283554077, + "learning_rate": 1.2905204135272472e-05, + "loss": 1.6249, + "step": 9944 + }, + { + "epoch": 2.613328078299997, + "grad_norm": 0.6855607032775879, + "learning_rate": 1.2887681794287718e-05, + "loss": 1.5982, + "step": 9946 + }, + { + "epoch": 2.6138535816336583, + "grad_norm": 0.6429476141929626, + "learning_rate": 1.2870159453302964e-05, + "loss": 1.6119, + "step": 9948 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.5955906510353088, + "learning_rate": 1.2852637112318206e-05, + "loss": 1.604, + "step": 9950 + }, + { + "epoch": 2.6149045883009823, + "grad_norm": 0.7396878600120544, + "learning_rate": 1.283511477133345e-05, + "loss": 1.6084, + "step": 9952 + }, + { + "epoch": 2.615430091634644, + "grad_norm": 0.631075382232666, + "learning_rate": 1.2817592430348696e-05, + "loss": 1.579, + "step": 9954 + }, + { + "epoch": 2.6159555949683053, + "grad_norm": 0.5280560255050659, + "learning_rate": 1.2800070089363939e-05, + "loss": 1.5952, + "step": 9956 + }, + { + "epoch": 2.6164810983019673, + "grad_norm": 0.5811305046081543, + "learning_rate": 1.2782547748379184e-05, + "loss": 1.5981, + "step": 9958 + }, + { + "epoch": 2.6170066016356293, + "grad_norm": 0.616111159324646, + "learning_rate": 1.2765025407394429e-05, + "loss": 1.6146, + "step": 9960 + }, + { + "epoch": 2.617532104969291, + "grad_norm": 0.6001870036125183, + "learning_rate": 1.2747503066409671e-05, + "loss": 1.612, + "step": 9962 + }, + { + "epoch": 2.6180576083029528, + "grad_norm": 0.5698304176330566, + "learning_rate": 1.2729980725424917e-05, + "loss": 1.5854, + "step": 9964 + }, + { + "epoch": 2.6185831116366143, + "grad_norm": 0.5722531676292419, + "learning_rate": 1.2712458384440163e-05, + "loss": 1.595, + "step": 9966 + }, + { + "epoch": 2.6191086149702762, + "grad_norm": 0.556814968585968, + "learning_rate": 1.2694936043455405e-05, + "loss": 1.6079, + "step": 9968 + }, + { + "epoch": 2.6196341183039378, + "grad_norm": 0.7477023601531982, + "learning_rate": 1.267741370247065e-05, + "loss": 1.6101, + "step": 9970 + }, + { + "epoch": 2.6201596216375997, + "grad_norm": 0.5011134743690491, + "learning_rate": 1.2659891361485895e-05, + "loss": 1.5684, + "step": 9972 + }, + { + "epoch": 2.6206851249712617, + "grad_norm": 0.5759318470954895, + "learning_rate": 1.2642369020501141e-05, + "loss": 1.5726, + "step": 9974 + }, + { + "epoch": 2.6212106283049232, + "grad_norm": 0.5837996006011963, + "learning_rate": 1.2624846679516384e-05, + "loss": 1.5975, + "step": 9976 + }, + { + "epoch": 2.621736131638585, + "grad_norm": 0.6139369606971741, + "learning_rate": 1.2607324338531628e-05, + "loss": 1.604, + "step": 9978 + }, + { + "epoch": 2.6222616349722467, + "grad_norm": 0.59381103515625, + "learning_rate": 1.2589801997546874e-05, + "loss": 1.631, + "step": 9980 + }, + { + "epoch": 2.6227871383059087, + "grad_norm": 0.59110027551651, + "learning_rate": 1.2572279656562116e-05, + "loss": 1.6342, + "step": 9982 + }, + { + "epoch": 2.6233126416395702, + "grad_norm": 0.6388522386550903, + "learning_rate": 1.2554757315577362e-05, + "loss": 1.602, + "step": 9984 + }, + { + "epoch": 2.623838144973232, + "grad_norm": 0.5485536456108093, + "learning_rate": 1.2537234974592606e-05, + "loss": 1.5904, + "step": 9986 + }, + { + "epoch": 2.624363648306894, + "grad_norm": 0.6147143840789795, + "learning_rate": 1.2519712633607849e-05, + "loss": 1.6193, + "step": 9988 + }, + { + "epoch": 2.6248891516405557, + "grad_norm": 0.6774529814720154, + "learning_rate": 1.2502190292623095e-05, + "loss": 1.5894, + "step": 9990 + }, + { + "epoch": 2.625414654974217, + "grad_norm": 0.5123162269592285, + "learning_rate": 1.2484667951638339e-05, + "loss": 1.5871, + "step": 9992 + }, + { + "epoch": 2.625940158307879, + "grad_norm": 0.5840069651603699, + "learning_rate": 1.2467145610653585e-05, + "loss": 1.6093, + "step": 9994 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.6273425817489624, + "learning_rate": 1.2449623269668829e-05, + "loss": 1.5992, + "step": 9996 + }, + { + "epoch": 2.6269911649752027, + "grad_norm": 0.5548253655433655, + "learning_rate": 1.2432100928684073e-05, + "loss": 1.5966, + "step": 9998 + }, + { + "epoch": 2.6275166683088647, + "grad_norm": 0.6084941625595093, + "learning_rate": 1.2414578587699317e-05, + "loss": 1.6002, + "step": 10000 + }, + { + "epoch": 2.6275166683088647, + "eval_loss": 1.6519938707351685, + "eval_runtime": 487.1623, + "eval_samples_per_second": 249.997, + "eval_steps_per_second": 31.25, + "step": 10000 + }, + { + "epoch": 2.6280421716425266, + "grad_norm": 0.5661596655845642, + "learning_rate": 1.2397056246714561e-05, + "loss": 1.6081, + "step": 10002 + }, + { + "epoch": 2.628567674976188, + "grad_norm": 0.6092149615287781, + "learning_rate": 1.2379533905729807e-05, + "loss": 1.5984, + "step": 10004 + }, + { + "epoch": 2.6290931783098497, + "grad_norm": 0.6022893786430359, + "learning_rate": 1.236201156474505e-05, + "loss": 1.594, + "step": 10006 + }, + { + "epoch": 2.6296186816435116, + "grad_norm": 0.6382527351379395, + "learning_rate": 1.2344489223760294e-05, + "loss": 1.6076, + "step": 10008 + }, + { + "epoch": 2.6301441849771736, + "grad_norm": 0.6449958682060242, + "learning_rate": 1.232696688277554e-05, + "loss": 1.6056, + "step": 10010 + }, + { + "epoch": 2.630669688310835, + "grad_norm": 0.6634608507156372, + "learning_rate": 1.2309444541790784e-05, + "loss": 1.5862, + "step": 10012 + }, + { + "epoch": 2.631195191644497, + "grad_norm": 0.5740031003952026, + "learning_rate": 1.2291922200806028e-05, + "loss": 1.5988, + "step": 10014 + }, + { + "epoch": 2.6317206949781586, + "grad_norm": 0.7303963899612427, + "learning_rate": 1.2274399859821272e-05, + "loss": 1.6342, + "step": 10016 + }, + { + "epoch": 2.6322461983118206, + "grad_norm": 0.5871624946594238, + "learning_rate": 1.2256877518836516e-05, + "loss": 1.5836, + "step": 10018 + }, + { + "epoch": 2.632771701645482, + "grad_norm": 0.5941208600997925, + "learning_rate": 1.2239355177851762e-05, + "loss": 1.5655, + "step": 10020 + }, + { + "epoch": 2.633297204979144, + "grad_norm": 0.5943103432655334, + "learning_rate": 1.2221832836867007e-05, + "loss": 1.6448, + "step": 10022 + }, + { + "epoch": 2.633822708312806, + "grad_norm": 0.6347323060035706, + "learning_rate": 1.220431049588225e-05, + "loss": 1.621, + "step": 10024 + }, + { + "epoch": 2.6343482116464676, + "grad_norm": Infinity, + "learning_rate": 1.2195549325389873e-05, + "loss": 1.6178, + "step": 10026 + }, + { + "epoch": 2.6348737149801296, + "grad_norm": 0.6692628264427185, + "learning_rate": 1.2178026984405117e-05, + "loss": 1.6063, + "step": 10028 + }, + { + "epoch": 2.635399218313791, + "grad_norm": 0.5815702080726624, + "learning_rate": 1.2160504643420361e-05, + "loss": 1.5884, + "step": 10030 + }, + { + "epoch": 2.635924721647453, + "grad_norm": 0.5981874465942383, + "learning_rate": 1.2142982302435605e-05, + "loss": 1.614, + "step": 10032 + }, + { + "epoch": 2.6364502249811146, + "grad_norm": 0.5921944975852966, + "learning_rate": 1.2125459961450851e-05, + "loss": 1.6172, + "step": 10034 + }, + { + "epoch": 2.6369757283147766, + "grad_norm": 0.5707213878631592, + "learning_rate": 1.2107937620466095e-05, + "loss": 1.6205, + "step": 10036 + }, + { + "epoch": 2.6375012316484385, + "grad_norm": 0.5834996700286865, + "learning_rate": 1.209041527948134e-05, + "loss": 1.598, + "step": 10038 + }, + { + "epoch": 2.6380267349821, + "grad_norm": 0.568755567073822, + "learning_rate": 1.2072892938496584e-05, + "loss": 1.5819, + "step": 10040 + }, + { + "epoch": 2.6385522383157616, + "grad_norm": 0.6087754368782043, + "learning_rate": 1.2055370597511828e-05, + "loss": 1.6223, + "step": 10042 + }, + { + "epoch": 2.6390777416494235, + "grad_norm": 0.5805062651634216, + "learning_rate": 1.2037848256527072e-05, + "loss": 1.6055, + "step": 10044 + }, + { + "epoch": 2.6396032449830855, + "grad_norm": 0.5679923892021179, + "learning_rate": 1.2020325915542318e-05, + "loss": 1.6388, + "step": 10046 + }, + { + "epoch": 2.640128748316747, + "grad_norm": 0.7020934820175171, + "learning_rate": 1.200280357455756e-05, + "loss": 1.6076, + "step": 10048 + }, + { + "epoch": 2.640654251650409, + "grad_norm": 0.5716947913169861, + "learning_rate": 1.1985281233572806e-05, + "loss": 1.6035, + "step": 10050 + }, + { + "epoch": 2.6411797549840705, + "grad_norm": 0.6460137963294983, + "learning_rate": 1.196775889258805e-05, + "loss": 1.5981, + "step": 10052 + }, + { + "epoch": 2.6417052583177325, + "grad_norm": 0.5903118848800659, + "learning_rate": 1.1950236551603295e-05, + "loss": 1.5844, + "step": 10054 + }, + { + "epoch": 2.642230761651394, + "grad_norm": 0.5728901028633118, + "learning_rate": 1.193271421061854e-05, + "loss": 1.5811, + "step": 10056 + }, + { + "epoch": 2.642756264985056, + "grad_norm": 0.6795145273208618, + "learning_rate": 1.1915191869633783e-05, + "loss": 1.6334, + "step": 10058 + }, + { + "epoch": 2.643281768318718, + "grad_norm": 0.552899956703186, + "learning_rate": 1.1897669528649027e-05, + "loss": 1.5956, + "step": 10060 + }, + { + "epoch": 2.6438072716523795, + "grad_norm": 0.5750575065612793, + "learning_rate": 1.1880147187664273e-05, + "loss": 1.5974, + "step": 10062 + }, + { + "epoch": 2.6443327749860415, + "grad_norm": 0.6233214735984802, + "learning_rate": 1.1862624846679517e-05, + "loss": 1.6177, + "step": 10064 + }, + { + "epoch": 2.644858278319703, + "grad_norm": 0.7260066270828247, + "learning_rate": 1.1845102505694761e-05, + "loss": 1.6024, + "step": 10066 + }, + { + "epoch": 2.645383781653365, + "grad_norm": 0.5926297903060913, + "learning_rate": 1.1827580164710006e-05, + "loss": 1.6003, + "step": 10068 + }, + { + "epoch": 2.6459092849870265, + "grad_norm": 0.565710723400116, + "learning_rate": 1.181005782372525e-05, + "loss": 1.5879, + "step": 10070 + }, + { + "epoch": 2.6464347883206885, + "grad_norm": 0.5911100506782532, + "learning_rate": 1.1792535482740496e-05, + "loss": 1.5862, + "step": 10072 + }, + { + "epoch": 2.6469602916543504, + "grad_norm": 0.6018568277359009, + "learning_rate": 1.177501314175574e-05, + "loss": 1.6274, + "step": 10074 + }, + { + "epoch": 2.647485794988012, + "grad_norm": 0.6663408875465393, + "learning_rate": 1.1757490800770982e-05, + "loss": 1.6331, + "step": 10076 + }, + { + "epoch": 2.6480112983216735, + "grad_norm": 0.6028339266777039, + "learning_rate": 1.1739968459786228e-05, + "loss": 1.6208, + "step": 10078 + }, + { + "epoch": 2.6485368016553354, + "grad_norm": 0.5986884236335754, + "learning_rate": 1.1722446118801472e-05, + "loss": 1.5974, + "step": 10080 + }, + { + "epoch": 2.6490623049889974, + "grad_norm": 0.7179824113845825, + "learning_rate": 1.1704923777816718e-05, + "loss": 1.5959, + "step": 10082 + }, + { + "epoch": 2.649587808322659, + "grad_norm": 0.6211957931518555, + "learning_rate": 1.168740143683196e-05, + "loss": 1.6052, + "step": 10084 + }, + { + "epoch": 2.650113311656321, + "grad_norm": 0.5563115477561951, + "learning_rate": 1.1669879095847205e-05, + "loss": 1.6047, + "step": 10086 + }, + { + "epoch": 2.6506388149899824, + "grad_norm": 0.6166182160377502, + "learning_rate": 1.165235675486245e-05, + "loss": 1.591, + "step": 10088 + }, + { + "epoch": 2.6511643183236444, + "grad_norm": 0.5224390625953674, + "learning_rate": 1.1634834413877695e-05, + "loss": 1.6147, + "step": 10090 + }, + { + "epoch": 2.651689821657306, + "grad_norm": 0.6834142208099365, + "learning_rate": 1.1617312072892939e-05, + "loss": 1.6194, + "step": 10092 + }, + { + "epoch": 2.652215324990968, + "grad_norm": 0.5866041779518127, + "learning_rate": 1.1599789731908183e-05, + "loss": 1.6286, + "step": 10094 + }, + { + "epoch": 2.65274082832463, + "grad_norm": 0.6454026699066162, + "learning_rate": 1.1582267390923427e-05, + "loss": 1.6181, + "step": 10096 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.5898521542549133, + "learning_rate": 1.1564745049938672e-05, + "loss": 1.6157, + "step": 10098 + }, + { + "epoch": 2.6537918349919534, + "grad_norm": 0.5379465222358704, + "learning_rate": 1.1547222708953917e-05, + "loss": 1.6015, + "step": 10100 + }, + { + "epoch": 2.654317338325615, + "grad_norm": 0.624577522277832, + "learning_rate": 1.1529700367969162e-05, + "loss": 1.6161, + "step": 10102 + }, + { + "epoch": 2.654842841659277, + "grad_norm": 0.5811554193496704, + "learning_rate": 1.1512178026984406e-05, + "loss": 1.598, + "step": 10104 + }, + { + "epoch": 2.6553683449929384, + "grad_norm": 0.5827350616455078, + "learning_rate": 1.149465568599965e-05, + "loss": 1.625, + "step": 10106 + }, + { + "epoch": 2.6558938483266004, + "grad_norm": 0.6221204996109009, + "learning_rate": 1.1477133345014894e-05, + "loss": 1.6011, + "step": 10108 + }, + { + "epoch": 2.6564193516602623, + "grad_norm": 0.6008239984512329, + "learning_rate": 1.145961100403014e-05, + "loss": 1.6048, + "step": 10110 + }, + { + "epoch": 2.656944854993924, + "grad_norm": 0.5547776222229004, + "learning_rate": 1.1442088663045382e-05, + "loss": 1.6149, + "step": 10112 + }, + { + "epoch": 2.6574703583275854, + "grad_norm": 0.5586524605751038, + "learning_rate": 1.1424566322060627e-05, + "loss": 1.576, + "step": 10114 + }, + { + "epoch": 2.6579958616612473, + "grad_norm": 0.6657798886299133, + "learning_rate": 1.1407043981075873e-05, + "loss": 1.5939, + "step": 10116 + }, + { + "epoch": 2.6585213649949093, + "grad_norm": 0.6012902855873108, + "learning_rate": 1.1389521640091117e-05, + "loss": 1.6089, + "step": 10118 + }, + { + "epoch": 2.659046868328571, + "grad_norm": 0.6174466013908386, + "learning_rate": 1.1371999299106363e-05, + "loss": 1.606, + "step": 10120 + }, + { + "epoch": 2.659572371662233, + "grad_norm": 0.5988878011703491, + "learning_rate": 1.1354476958121605e-05, + "loss": 1.6162, + "step": 10122 + }, + { + "epoch": 2.6600978749958943, + "grad_norm": 0.6152504086494446, + "learning_rate": 1.133695461713685e-05, + "loss": 1.6265, + "step": 10124 + }, + { + "epoch": 2.6606233783295563, + "grad_norm": 0.5866356492042542, + "learning_rate": 1.1319432276152095e-05, + "loss": 1.5945, + "step": 10126 + }, + { + "epoch": 2.661148881663218, + "grad_norm": 0.6397704482078552, + "learning_rate": 1.130190993516734e-05, + "loss": 1.6141, + "step": 10128 + }, + { + "epoch": 2.66167438499688, + "grad_norm": 0.5501188039779663, + "learning_rate": 1.1284387594182583e-05, + "loss": 1.6161, + "step": 10130 + }, + { + "epoch": 2.6621998883305418, + "grad_norm": 0.5947812795639038, + "learning_rate": 1.1266865253197828e-05, + "loss": 1.6168, + "step": 10132 + }, + { + "epoch": 2.6627253916642033, + "grad_norm": 0.571863055229187, + "learning_rate": 1.1249342912213072e-05, + "loss": 1.6012, + "step": 10134 + }, + { + "epoch": 2.6632508949978653, + "grad_norm": 0.5414539575576782, + "learning_rate": 1.1231820571228318e-05, + "loss": 1.5672, + "step": 10136 + }, + { + "epoch": 2.663776398331527, + "grad_norm": 0.5631955862045288, + "learning_rate": 1.1214298230243562e-05, + "loss": 1.6291, + "step": 10138 + }, + { + "epoch": 2.6643019016651888, + "grad_norm": 0.6036393642425537, + "learning_rate": 1.1196775889258804e-05, + "loss": 1.6131, + "step": 10140 + }, + { + "epoch": 2.6648274049988503, + "grad_norm": 0.5736912488937378, + "learning_rate": 1.117925354827405e-05, + "loss": 1.595, + "step": 10142 + }, + { + "epoch": 2.6653529083325123, + "grad_norm": 0.6014968156814575, + "learning_rate": 1.1161731207289294e-05, + "loss": 1.6042, + "step": 10144 + }, + { + "epoch": 2.6658784116661742, + "grad_norm": 0.564576268196106, + "learning_rate": 1.1144208866304539e-05, + "loss": 1.6199, + "step": 10146 + }, + { + "epoch": 2.6664039149998358, + "grad_norm": 0.5501058101654053, + "learning_rate": 1.1126686525319783e-05, + "loss": 1.6063, + "step": 10148 + }, + { + "epoch": 2.6669294183334973, + "grad_norm": 0.6601042151451111, + "learning_rate": 1.1109164184335027e-05, + "loss": 1.5838, + "step": 10150 + }, + { + "epoch": 2.6674549216671593, + "grad_norm": 0.6125220656394958, + "learning_rate": 1.1091641843350273e-05, + "loss": 1.6151, + "step": 10152 + }, + { + "epoch": 2.667980425000821, + "grad_norm": 0.6514031291007996, + "learning_rate": 1.1074119502365517e-05, + "loss": 1.5978, + "step": 10154 + }, + { + "epoch": 2.6685059283344827, + "grad_norm": 0.5755125284194946, + "learning_rate": 1.1056597161380761e-05, + "loss": 1.5971, + "step": 10156 + }, + { + "epoch": 2.6690314316681447, + "grad_norm": 0.6414878368377686, + "learning_rate": 1.1039074820396005e-05, + "loss": 1.6212, + "step": 10158 + }, + { + "epoch": 2.6695569350018067, + "grad_norm": 0.6092830300331116, + "learning_rate": 1.102155247941125e-05, + "loss": 1.6006, + "step": 10160 + }, + { + "epoch": 2.670082438335468, + "grad_norm": 0.6845732927322388, + "learning_rate": 1.1004030138426494e-05, + "loss": 1.6473, + "step": 10162 + }, + { + "epoch": 2.6706079416691297, + "grad_norm": 0.5648114681243896, + "learning_rate": 1.098650779744174e-05, + "loss": 1.5988, + "step": 10164 + }, + { + "epoch": 2.6711334450027917, + "grad_norm": 0.5797778367996216, + "learning_rate": 1.0968985456456984e-05, + "loss": 1.6057, + "step": 10166 + }, + { + "epoch": 2.6716589483364537, + "grad_norm": 0.6136969327926636, + "learning_rate": 1.0951463115472228e-05, + "loss": 1.5615, + "step": 10168 + }, + { + "epoch": 2.672184451670115, + "grad_norm": 0.5537448525428772, + "learning_rate": 1.0933940774487472e-05, + "loss": 1.5739, + "step": 10170 + }, + { + "epoch": 2.672709955003777, + "grad_norm": 0.6372264623641968, + "learning_rate": 1.0916418433502716e-05, + "loss": 1.5985, + "step": 10172 + }, + { + "epoch": 2.6732354583374387, + "grad_norm": 0.5945001840591431, + "learning_rate": 1.0898896092517962e-05, + "loss": 1.6051, + "step": 10174 + }, + { + "epoch": 2.6737609616711007, + "grad_norm": 0.5689924359321594, + "learning_rate": 1.0881373751533205e-05, + "loss": 1.6083, + "step": 10176 + }, + { + "epoch": 2.674286465004762, + "grad_norm": 0.596948504447937, + "learning_rate": 1.0863851410548449e-05, + "loss": 1.6086, + "step": 10178 + }, + { + "epoch": 2.674811968338424, + "grad_norm": 0.581723153591156, + "learning_rate": 1.0846329069563695e-05, + "loss": 1.6127, + "step": 10180 + }, + { + "epoch": 2.675337471672086, + "grad_norm": 0.5663895010948181, + "learning_rate": 1.0828806728578939e-05, + "loss": 1.571, + "step": 10182 + }, + { + "epoch": 2.6758629750057477, + "grad_norm": 0.5802646279335022, + "learning_rate": 1.0811284387594183e-05, + "loss": 1.6068, + "step": 10184 + }, + { + "epoch": 2.6763884783394096, + "grad_norm": 0.615695595741272, + "learning_rate": 1.0793762046609427e-05, + "loss": 1.6147, + "step": 10186 + }, + { + "epoch": 2.676913981673071, + "grad_norm": 0.6105237007141113, + "learning_rate": 1.0776239705624671e-05, + "loss": 1.607, + "step": 10188 + }, + { + "epoch": 2.677439485006733, + "grad_norm": 0.5385169982910156, + "learning_rate": 1.0758717364639917e-05, + "loss": 1.6079, + "step": 10190 + }, + { + "epoch": 2.6779649883403946, + "grad_norm": 0.7373268008232117, + "learning_rate": 1.0741195023655161e-05, + "loss": 1.6043, + "step": 10192 + }, + { + "epoch": 2.6784904916740566, + "grad_norm": 0.5916065573692322, + "learning_rate": 1.0723672682670406e-05, + "loss": 1.6349, + "step": 10194 + }, + { + "epoch": 2.6790159950077186, + "grad_norm": 0.7171288132667542, + "learning_rate": 1.070615034168565e-05, + "loss": 1.6027, + "step": 10196 + }, + { + "epoch": 2.67954149834138, + "grad_norm": 0.5862658619880676, + "learning_rate": 1.0688628000700894e-05, + "loss": 1.5782, + "step": 10198 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.5845574736595154, + "learning_rate": 1.0671105659716138e-05, + "loss": 1.6043, + "step": 10200 + }, + { + "epoch": 2.6805925050087036, + "grad_norm": 0.6461187601089478, + "learning_rate": 1.0653583318731384e-05, + "loss": 1.6035, + "step": 10202 + }, + { + "epoch": 2.6811180083423656, + "grad_norm": 0.5671385526657104, + "learning_rate": 1.0636060977746626e-05, + "loss": 1.5873, + "step": 10204 + }, + { + "epoch": 2.681643511676027, + "grad_norm": 0.614528238773346, + "learning_rate": 1.0618538636761872e-05, + "loss": 1.5885, + "step": 10206 + }, + { + "epoch": 2.682169015009689, + "grad_norm": 0.6290176510810852, + "learning_rate": 1.0601016295777116e-05, + "loss": 1.6293, + "step": 10208 + }, + { + "epoch": 2.6826945183433506, + "grad_norm": 0.7489297986030579, + "learning_rate": 1.058349395479236e-05, + "loss": 1.6248, + "step": 10210 + }, + { + "epoch": 2.6832200216770126, + "grad_norm": 0.5606198310852051, + "learning_rate": 1.0565971613807605e-05, + "loss": 1.5935, + "step": 10212 + }, + { + "epoch": 2.683745525010674, + "grad_norm": 0.5877952575683594, + "learning_rate": 1.0548449272822849e-05, + "loss": 1.5945, + "step": 10214 + }, + { + "epoch": 2.684271028344336, + "grad_norm": 0.6569766998291016, + "learning_rate": 1.0530926931838093e-05, + "loss": 1.5946, + "step": 10216 + }, + { + "epoch": 2.684796531677998, + "grad_norm": 0.5706822276115417, + "learning_rate": 1.0513404590853339e-05, + "loss": 1.6302, + "step": 10218 + }, + { + "epoch": 2.6853220350116596, + "grad_norm": 0.6443889141082764, + "learning_rate": 1.0495882249868583e-05, + "loss": 1.6055, + "step": 10220 + }, + { + "epoch": 2.6858475383453215, + "grad_norm": 0.6561224460601807, + "learning_rate": 1.0478359908883827e-05, + "loss": 1.5979, + "step": 10222 + }, + { + "epoch": 2.686373041678983, + "grad_norm": 0.6297589540481567, + "learning_rate": 1.0460837567899072e-05, + "loss": 1.621, + "step": 10224 + }, + { + "epoch": 2.686898545012645, + "grad_norm": 0.5593584775924683, + "learning_rate": 1.0443315226914316e-05, + "loss": 1.5798, + "step": 10226 + }, + { + "epoch": 2.6874240483463065, + "grad_norm": 0.5252684354782104, + "learning_rate": 1.0425792885929562e-05, + "loss": 1.5672, + "step": 10228 + }, + { + "epoch": 2.6879495516799685, + "grad_norm": 0.7257969379425049, + "learning_rate": 1.0408270544944806e-05, + "loss": 1.6178, + "step": 10230 + }, + { + "epoch": 2.6884750550136305, + "grad_norm": 0.5459820628166199, + "learning_rate": 1.0390748203960048e-05, + "loss": 1.589, + "step": 10232 + }, + { + "epoch": 2.689000558347292, + "grad_norm": 0.6055821180343628, + "learning_rate": 1.0373225862975294e-05, + "loss": 1.5899, + "step": 10234 + }, + { + "epoch": 2.6895260616809535, + "grad_norm": 0.7138866782188416, + "learning_rate": 1.0355703521990538e-05, + "loss": 1.6004, + "step": 10236 + }, + { + "epoch": 2.6900515650146155, + "grad_norm": 0.5606485605239868, + "learning_rate": 1.0338181181005784e-05, + "loss": 1.5703, + "step": 10238 + }, + { + "epoch": 2.6905770683482775, + "grad_norm": 0.6380048990249634, + "learning_rate": 1.0320658840021027e-05, + "loss": 1.6236, + "step": 10240 + }, + { + "epoch": 2.691102571681939, + "grad_norm": 0.59669029712677, + "learning_rate": 1.030313649903627e-05, + "loss": 1.5963, + "step": 10242 + }, + { + "epoch": 2.691628075015601, + "grad_norm": 0.6684574484825134, + "learning_rate": 1.0285614158051517e-05, + "loss": 1.6199, + "step": 10244 + }, + { + "epoch": 2.6921535783492625, + "grad_norm": 0.5515660047531128, + "learning_rate": 1.0268091817066761e-05, + "loss": 1.6143, + "step": 10246 + }, + { + "epoch": 2.6926790816829245, + "grad_norm": 0.6562058329582214, + "learning_rate": 1.0250569476082005e-05, + "loss": 1.6112, + "step": 10248 + }, + { + "epoch": 2.693204585016586, + "grad_norm": 0.5547401309013367, + "learning_rate": 1.023304713509725e-05, + "loss": 1.6124, + "step": 10250 + }, + { + "epoch": 2.693730088350248, + "grad_norm": 0.6307016611099243, + "learning_rate": 1.0215524794112493e-05, + "loss": 1.6256, + "step": 10252 + }, + { + "epoch": 2.69425559168391, + "grad_norm": 0.5722636580467224, + "learning_rate": 1.0198002453127738e-05, + "loss": 1.5713, + "step": 10254 + }, + { + "epoch": 2.6947810950175715, + "grad_norm": 0.587989330291748, + "learning_rate": 1.0180480112142983e-05, + "loss": 1.6191, + "step": 10256 + }, + { + "epoch": 2.6953065983512334, + "grad_norm": 0.7785957455635071, + "learning_rate": 1.0162957771158228e-05, + "loss": 1.6129, + "step": 10258 + }, + { + "epoch": 2.695832101684895, + "grad_norm": 0.5290049910545349, + "learning_rate": 1.0145435430173472e-05, + "loss": 1.5561, + "step": 10260 + }, + { + "epoch": 2.696357605018557, + "grad_norm": 0.6306317448616028, + "learning_rate": 1.0127913089188716e-05, + "loss": 1.5872, + "step": 10262 + }, + { + "epoch": 2.6968831083522184, + "grad_norm": 0.5847423672676086, + "learning_rate": 1.011039074820396e-05, + "loss": 1.5783, + "step": 10264 + }, + { + "epoch": 2.6974086116858804, + "grad_norm": 0.5815863609313965, + "learning_rate": 1.0092868407219206e-05, + "loss": 1.6138, + "step": 10266 + }, + { + "epoch": 2.6979341150195424, + "grad_norm": 0.6759339570999146, + "learning_rate": 1.0075346066234448e-05, + "loss": 1.6146, + "step": 10268 + }, + { + "epoch": 2.698459618353204, + "grad_norm": 0.7321836948394775, + "learning_rate": 1.0057823725249693e-05, + "loss": 1.6333, + "step": 10270 + }, + { + "epoch": 2.6989851216868654, + "grad_norm": 0.595303475856781, + "learning_rate": 1.0040301384264939e-05, + "loss": 1.5903, + "step": 10272 + }, + { + "epoch": 2.6995106250205274, + "grad_norm": 0.6422771215438843, + "learning_rate": 1.0022779043280183e-05, + "loss": 1.6063, + "step": 10274 + }, + { + "epoch": 2.7000361283541894, + "grad_norm": 0.6339772939682007, + "learning_rate": 1.0005256702295427e-05, + "loss": 1.6402, + "step": 10276 + }, + { + "epoch": 2.700561631687851, + "grad_norm": 0.6175129413604736, + "learning_rate": 9.987734361310671e-06, + "loss": 1.5768, + "step": 10278 + }, + { + "epoch": 2.701087135021513, + "grad_norm": 0.6692789793014526, + "learning_rate": 9.970212020325915e-06, + "loss": 1.6293, + "step": 10280 + }, + { + "epoch": 2.7016126383551744, + "grad_norm": 0.6754617094993591, + "learning_rate": 9.952689679341161e-06, + "loss": 1.6127, + "step": 10282 + }, + { + "epoch": 2.7021381416888364, + "grad_norm": 0.5878320336341858, + "learning_rate": 9.935167338356405e-06, + "loss": 1.6195, + "step": 10284 + }, + { + "epoch": 2.702663645022498, + "grad_norm": 0.9269865155220032, + "learning_rate": 9.91764499737165e-06, + "loss": 1.5903, + "step": 10286 + }, + { + "epoch": 2.70318914835616, + "grad_norm": 0.610824465751648, + "learning_rate": 9.900122656386894e-06, + "loss": 1.608, + "step": 10288 + }, + { + "epoch": 2.703714651689822, + "grad_norm": 0.587494432926178, + "learning_rate": 9.882600315402138e-06, + "loss": 1.547, + "step": 10290 + }, + { + "epoch": 2.7042401550234834, + "grad_norm": 0.6791782975196838, + "learning_rate": 9.865077974417384e-06, + "loss": 1.6039, + "step": 10292 + }, + { + "epoch": 2.7047656583571453, + "grad_norm": 0.5835421681404114, + "learning_rate": 9.847555633432628e-06, + "loss": 1.6378, + "step": 10294 + }, + { + "epoch": 2.705291161690807, + "grad_norm": 0.6221166253089905, + "learning_rate": 9.83003329244787e-06, + "loss": 1.5753, + "step": 10296 + }, + { + "epoch": 2.705816665024469, + "grad_norm": 0.5716819763183594, + "learning_rate": 9.812510951463116e-06, + "loss": 1.5876, + "step": 10298 + }, + { + "epoch": 2.7063421683581304, + "grad_norm": 0.725444495677948, + "learning_rate": 9.79498861047836e-06, + "loss": 1.6308, + "step": 10300 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.6209605932235718, + "learning_rate": 9.777466269493605e-06, + "loss": 1.5797, + "step": 10302 + }, + { + "epoch": 2.7073931750254543, + "grad_norm": 0.6639958620071411, + "learning_rate": 9.759943928508849e-06, + "loss": 1.6051, + "step": 10304 + }, + { + "epoch": 2.707918678359116, + "grad_norm": 0.6042879819869995, + "learning_rate": 9.742421587524093e-06, + "loss": 1.5985, + "step": 10306 + }, + { + "epoch": 2.7084441816927773, + "grad_norm": 0.5975037813186646, + "learning_rate": 9.724899246539339e-06, + "loss": 1.5945, + "step": 10308 + }, + { + "epoch": 2.7089696850264393, + "grad_norm": 0.5482746362686157, + "learning_rate": 9.707376905554583e-06, + "loss": 1.5645, + "step": 10310 + }, + { + "epoch": 2.7094951883601013, + "grad_norm": 0.6123608946800232, + "learning_rate": 9.689854564569827e-06, + "loss": 1.5862, + "step": 10312 + }, + { + "epoch": 2.710020691693763, + "grad_norm": 0.6144323348999023, + "learning_rate": 9.672332223585071e-06, + "loss": 1.6422, + "step": 10314 + }, + { + "epoch": 2.7105461950274248, + "grad_norm": 0.5697504878044128, + "learning_rate": 9.654809882600315e-06, + "loss": 1.6299, + "step": 10316 + }, + { + "epoch": 2.7110716983610867, + "grad_norm": 0.5491029024124146, + "learning_rate": 9.63728754161556e-06, + "loss": 1.5843, + "step": 10318 + }, + { + "epoch": 2.7115972016947483, + "grad_norm": 0.8691264390945435, + "learning_rate": 9.619765200630806e-06, + "loss": 1.6086, + "step": 10320 + }, + { + "epoch": 2.71212270502841, + "grad_norm": 0.5798594355583191, + "learning_rate": 9.60224285964605e-06, + "loss": 1.6205, + "step": 10322 + }, + { + "epoch": 2.7126482083620718, + "grad_norm": 0.5886126756668091, + "learning_rate": 9.584720518661294e-06, + "loss": 1.5923, + "step": 10324 + }, + { + "epoch": 2.7131737116957337, + "grad_norm": 0.7271785140037537, + "learning_rate": 9.567198177676538e-06, + "loss": 1.623, + "step": 10326 + }, + { + "epoch": 2.7136992150293953, + "grad_norm": 0.5842270255088806, + "learning_rate": 9.549675836691782e-06, + "loss": 1.6141, + "step": 10328 + }, + { + "epoch": 2.7142247183630572, + "grad_norm": 0.5803746581077576, + "learning_rate": 9.532153495707028e-06, + "loss": 1.611, + "step": 10330 + }, + { + "epoch": 2.7147502216967188, + "grad_norm": 0.6185279488563538, + "learning_rate": 9.51463115472227e-06, + "loss": 1.6061, + "step": 10332 + }, + { + "epoch": 2.7152757250303807, + "grad_norm": 0.5305478572845459, + "learning_rate": 9.497108813737515e-06, + "loss": 1.6341, + "step": 10334 + }, + { + "epoch": 2.7158012283640423, + "grad_norm": 0.5952558517456055, + "learning_rate": 9.47958647275276e-06, + "loss": 1.6062, + "step": 10336 + }, + { + "epoch": 2.7163267316977042, + "grad_norm": 0.5312665700912476, + "learning_rate": 9.462064131768005e-06, + "loss": 1.6027, + "step": 10338 + }, + { + "epoch": 2.716852235031366, + "grad_norm": 0.6712905168533325, + "learning_rate": 9.444541790783249e-06, + "loss": 1.5846, + "step": 10340 + }, + { + "epoch": 2.7173777383650277, + "grad_norm": 0.6250156164169312, + "learning_rate": 9.427019449798493e-06, + "loss": 1.6207, + "step": 10342 + }, + { + "epoch": 2.7179032416986897, + "grad_norm": 0.6365962624549866, + "learning_rate": 9.409497108813737e-06, + "loss": 1.614, + "step": 10344 + }, + { + "epoch": 2.718428745032351, + "grad_norm": 0.5564280152320862, + "learning_rate": 9.391974767828983e-06, + "loss": 1.6054, + "step": 10346 + }, + { + "epoch": 2.718954248366013, + "grad_norm": 0.599860668182373, + "learning_rate": 9.374452426844227e-06, + "loss": 1.5914, + "step": 10348 + }, + { + "epoch": 2.7194797516996747, + "grad_norm": 0.6647321581840515, + "learning_rate": 9.356930085859472e-06, + "loss": 1.6001, + "step": 10350 + }, + { + "epoch": 2.7200052550333367, + "grad_norm": 0.5299103260040283, + "learning_rate": 9.339407744874716e-06, + "loss": 1.6181, + "step": 10352 + }, + { + "epoch": 2.7205307583669986, + "grad_norm": 0.6260030269622803, + "learning_rate": 9.32188540388996e-06, + "loss": 1.5963, + "step": 10354 + }, + { + "epoch": 2.72105626170066, + "grad_norm": 0.6055660247802734, + "learning_rate": 9.304363062905204e-06, + "loss": 1.564, + "step": 10356 + }, + { + "epoch": 2.7215817650343217, + "grad_norm": 0.6588335633277893, + "learning_rate": 9.28684072192045e-06, + "loss": 1.5692, + "step": 10358 + }, + { + "epoch": 2.7221072683679837, + "grad_norm": 0.5844119787216187, + "learning_rate": 9.269318380935692e-06, + "loss": 1.5931, + "step": 10360 + }, + { + "epoch": 2.7226327717016456, + "grad_norm": 0.5909096598625183, + "learning_rate": 9.251796039950938e-06, + "loss": 1.6011, + "step": 10362 + }, + { + "epoch": 2.723158275035307, + "grad_norm": 0.5176669955253601, + "learning_rate": 9.234273698966182e-06, + "loss": 1.5484, + "step": 10364 + }, + { + "epoch": 2.723683778368969, + "grad_norm": 0.5780521035194397, + "learning_rate": 9.216751357981427e-06, + "loss": 1.5989, + "step": 10366 + }, + { + "epoch": 2.7242092817026307, + "grad_norm": 0.6192241311073303, + "learning_rate": 9.19922901699667e-06, + "loss": 1.5719, + "step": 10368 + }, + { + "epoch": 2.7247347850362926, + "grad_norm": 0.5565381646156311, + "learning_rate": 9.181706676011915e-06, + "loss": 1.6109, + "step": 10370 + }, + { + "epoch": 2.725260288369954, + "grad_norm": 0.6125465631484985, + "learning_rate": 9.16418433502716e-06, + "loss": 1.61, + "step": 10372 + }, + { + "epoch": 2.725785791703616, + "grad_norm": 0.597527265548706, + "learning_rate": 9.146661994042405e-06, + "loss": 1.6093, + "step": 10374 + }, + { + "epoch": 2.726311295037278, + "grad_norm": 0.633154034614563, + "learning_rate": 9.12913965305765e-06, + "loss": 1.6101, + "step": 10376 + }, + { + "epoch": 2.7268367983709396, + "grad_norm": 0.5533692240715027, + "learning_rate": 9.111617312072893e-06, + "loss": 1.5807, + "step": 10378 + }, + { + "epoch": 2.7273623017046016, + "grad_norm": 0.6450961232185364, + "learning_rate": 9.094094971088138e-06, + "loss": 1.5835, + "step": 10380 + }, + { + "epoch": 2.727887805038263, + "grad_norm": 0.629943311214447, + "learning_rate": 9.076572630103382e-06, + "loss": 1.5977, + "step": 10382 + }, + { + "epoch": 2.728413308371925, + "grad_norm": 0.6521432995796204, + "learning_rate": 9.059050289118628e-06, + "loss": 1.6267, + "step": 10384 + }, + { + "epoch": 2.7289388117055866, + "grad_norm": 0.5852161645889282, + "learning_rate": 9.041527948133872e-06, + "loss": 1.6112, + "step": 10386 + }, + { + "epoch": 2.7294643150392486, + "grad_norm": 0.6588388085365295, + "learning_rate": 9.024005607149114e-06, + "loss": 1.6181, + "step": 10388 + }, + { + "epoch": 2.7299898183729105, + "grad_norm": 0.6498621106147766, + "learning_rate": 9.00648326616436e-06, + "loss": 1.596, + "step": 10390 + }, + { + "epoch": 2.730515321706572, + "grad_norm": 0.5801547169685364, + "learning_rate": 8.988960925179604e-06, + "loss": 1.5948, + "step": 10392 + }, + { + "epoch": 2.7310408250402336, + "grad_norm": 0.6173077821731567, + "learning_rate": 8.97143858419485e-06, + "loss": 1.6164, + "step": 10394 + }, + { + "epoch": 2.7315663283738956, + "grad_norm": 0.5936638116836548, + "learning_rate": 8.953916243210093e-06, + "loss": 1.5946, + "step": 10396 + }, + { + "epoch": 2.7320918317075575, + "grad_norm": 0.6268817186355591, + "learning_rate": 8.936393902225337e-06, + "loss": 1.6422, + "step": 10398 + }, + { + "epoch": 2.732617335041219, + "grad_norm": 0.7213032245635986, + "learning_rate": 8.918871561240583e-06, + "loss": 1.6387, + "step": 10400 + }, + { + "epoch": 2.732617335041219, + "eval_loss": 1.649667501449585, + "eval_runtime": 487.15, + "eval_samples_per_second": 250.003, + "eval_steps_per_second": 31.251, + "step": 10400 + }, + { + "epoch": 2.733142838374881, + "grad_norm": 0.5801553130149841, + "learning_rate": 8.901349220255827e-06, + "loss": 1.5527, + "step": 10402 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.6011367440223694, + "learning_rate": 8.883826879271071e-06, + "loss": 1.5901, + "step": 10404 + }, + { + "epoch": 2.7341938450422045, + "grad_norm": 0.5433336496353149, + "learning_rate": 8.866304538286315e-06, + "loss": 1.605, + "step": 10406 + }, + { + "epoch": 2.734719348375866, + "grad_norm": 0.5924966335296631, + "learning_rate": 8.84878219730156e-06, + "loss": 1.594, + "step": 10408 + }, + { + "epoch": 2.735244851709528, + "grad_norm": 0.6247406601905823, + "learning_rate": 8.831259856316805e-06, + "loss": 1.5501, + "step": 10410 + }, + { + "epoch": 2.73577035504319, + "grad_norm": 0.8343258500099182, + "learning_rate": 8.81373751533205e-06, + "loss": 1.6041, + "step": 10412 + }, + { + "epoch": 2.7362958583768515, + "grad_norm": 0.5661430358886719, + "learning_rate": 8.796215174347294e-06, + "loss": 1.6153, + "step": 10414 + }, + { + "epoch": 2.7368213617105135, + "grad_norm": 0.6813891530036926, + "learning_rate": 8.778692833362538e-06, + "loss": 1.6211, + "step": 10416 + }, + { + "epoch": 2.737346865044175, + "grad_norm": 0.6394498348236084, + "learning_rate": 8.761170492377782e-06, + "loss": 1.5989, + "step": 10418 + }, + { + "epoch": 2.737872368377837, + "grad_norm": 0.6066767573356628, + "learning_rate": 8.743648151393026e-06, + "loss": 1.6183, + "step": 10420 + }, + { + "epoch": 2.7383978717114985, + "grad_norm": 0.5361884236335754, + "learning_rate": 8.726125810408272e-06, + "loss": 1.619, + "step": 10422 + }, + { + "epoch": 2.7389233750451605, + "grad_norm": 0.5502820014953613, + "learning_rate": 8.708603469423514e-06, + "loss": 1.5936, + "step": 10424 + }, + { + "epoch": 2.7394488783788224, + "grad_norm": 0.6126530170440674, + "learning_rate": 8.691081128438759e-06, + "loss": 1.5771, + "step": 10426 + }, + { + "epoch": 2.739974381712484, + "grad_norm": 0.5665082335472107, + "learning_rate": 8.673558787454005e-06, + "loss": 1.608, + "step": 10428 + }, + { + "epoch": 2.7404998850461455, + "grad_norm": 0.6264111399650574, + "learning_rate": 8.656036446469249e-06, + "loss": 1.6095, + "step": 10430 + }, + { + "epoch": 2.7410253883798075, + "grad_norm": 0.6684525609016418, + "learning_rate": 8.638514105484493e-06, + "loss": 1.5786, + "step": 10432 + }, + { + "epoch": 2.7415508917134694, + "grad_norm": 0.5436965823173523, + "learning_rate": 8.620991764499737e-06, + "loss": 1.5764, + "step": 10434 + }, + { + "epoch": 2.742076395047131, + "grad_norm": 0.5792893171310425, + "learning_rate": 8.603469423514981e-06, + "loss": 1.6078, + "step": 10436 + }, + { + "epoch": 2.742601898380793, + "grad_norm": 0.5539526343345642, + "learning_rate": 8.585947082530227e-06, + "loss": 1.5919, + "step": 10438 + }, + { + "epoch": 2.7431274017144545, + "grad_norm": 0.5818884968757629, + "learning_rate": 8.568424741545471e-06, + "loss": 1.5926, + "step": 10440 + }, + { + "epoch": 2.7436529050481164, + "grad_norm": 0.5815169811248779, + "learning_rate": 8.550902400560715e-06, + "loss": 1.5986, + "step": 10442 + }, + { + "epoch": 2.744178408381778, + "grad_norm": 0.6487326622009277, + "learning_rate": 8.53338005957596e-06, + "loss": 1.6054, + "step": 10444 + }, + { + "epoch": 2.74470391171544, + "grad_norm": 0.6841205358505249, + "learning_rate": 8.515857718591204e-06, + "loss": 1.6031, + "step": 10446 + }, + { + "epoch": 2.745229415049102, + "grad_norm": 0.6351028680801392, + "learning_rate": 8.49833537760645e-06, + "loss": 1.6272, + "step": 10448 + }, + { + "epoch": 2.7457549183827634, + "grad_norm": 0.5550417900085449, + "learning_rate": 8.480813036621694e-06, + "loss": 1.6228, + "step": 10450 + }, + { + "epoch": 2.7462804217164254, + "grad_norm": 0.6113694906234741, + "learning_rate": 8.463290695636936e-06, + "loss": 1.5799, + "step": 10452 + }, + { + "epoch": 2.746805925050087, + "grad_norm": 0.5771463513374329, + "learning_rate": 8.445768354652182e-06, + "loss": 1.577, + "step": 10454 + }, + { + "epoch": 2.747331428383749, + "grad_norm": 0.5693290829658508, + "learning_rate": 8.428246013667426e-06, + "loss": 1.5841, + "step": 10456 + }, + { + "epoch": 2.7478569317174104, + "grad_norm": 0.5757125020027161, + "learning_rate": 8.41072367268267e-06, + "loss": 1.5785, + "step": 10458 + }, + { + "epoch": 2.7483824350510724, + "grad_norm": 0.643251359462738, + "learning_rate": 8.393201331697915e-06, + "loss": 1.611, + "step": 10460 + }, + { + "epoch": 2.7489079383847344, + "grad_norm": 0.6383783221244812, + "learning_rate": 8.375678990713159e-06, + "loss": 1.6065, + "step": 10462 + }, + { + "epoch": 2.749433441718396, + "grad_norm": 0.5447094440460205, + "learning_rate": 8.358156649728405e-06, + "loss": 1.5887, + "step": 10464 + }, + { + "epoch": 2.7499589450520574, + "grad_norm": 0.5493614077568054, + "learning_rate": 8.340634308743649e-06, + "loss": 1.5946, + "step": 10466 + }, + { + "epoch": 2.7504844483857194, + "grad_norm": 0.7292418479919434, + "learning_rate": 8.323111967758893e-06, + "loss": 1.5853, + "step": 10468 + }, + { + "epoch": 2.7510099517193813, + "grad_norm": 0.5726646780967712, + "learning_rate": 8.305589626774137e-06, + "loss": 1.6051, + "step": 10470 + }, + { + "epoch": 2.751535455053043, + "grad_norm": 0.6066914200782776, + "learning_rate": 8.288067285789381e-06, + "loss": 1.5952, + "step": 10472 + }, + { + "epoch": 2.752060958386705, + "grad_norm": 0.6483511328697205, + "learning_rate": 8.270544944804626e-06, + "loss": 1.6212, + "step": 10474 + }, + { + "epoch": 2.752586461720367, + "grad_norm": 0.5741682648658752, + "learning_rate": 8.253022603819872e-06, + "loss": 1.6178, + "step": 10476 + }, + { + "epoch": 2.7531119650540283, + "grad_norm": 0.5253785848617554, + "learning_rate": 8.235500262835116e-06, + "loss": 1.5889, + "step": 10478 + }, + { + "epoch": 2.75363746838769, + "grad_norm": 0.557144820690155, + "learning_rate": 8.21797792185036e-06, + "loss": 1.6081, + "step": 10480 + }, + { + "epoch": 2.754162971721352, + "grad_norm": 0.5865209698677063, + "learning_rate": 8.200455580865604e-06, + "loss": 1.603, + "step": 10482 + }, + { + "epoch": 2.754688475055014, + "grad_norm": 0.5472773909568787, + "learning_rate": 8.182933239880848e-06, + "loss": 1.6022, + "step": 10484 + }, + { + "epoch": 2.7552139783886753, + "grad_norm": 0.6681110262870789, + "learning_rate": 8.165410898896094e-06, + "loss": 1.5927, + "step": 10486 + }, + { + "epoch": 2.7557394817223373, + "grad_norm": 0.6070604920387268, + "learning_rate": 8.147888557911337e-06, + "loss": 1.6051, + "step": 10488 + }, + { + "epoch": 2.756264985055999, + "grad_norm": 0.5712462663650513, + "learning_rate": 8.13036621692658e-06, + "loss": 1.6124, + "step": 10490 + }, + { + "epoch": 2.756790488389661, + "grad_norm": 0.5560539364814758, + "learning_rate": 8.112843875941827e-06, + "loss": 1.6116, + "step": 10492 + }, + { + "epoch": 2.7573159917233223, + "grad_norm": 0.6540628671646118, + "learning_rate": 8.09532153495707e-06, + "loss": 1.5916, + "step": 10494 + }, + { + "epoch": 2.7578414950569843, + "grad_norm": 0.5967172384262085, + "learning_rate": 8.077799193972315e-06, + "loss": 1.5789, + "step": 10496 + }, + { + "epoch": 2.7583669983906463, + "grad_norm": 0.6844902634620667, + "learning_rate": 8.060276852987559e-06, + "loss": 1.6098, + "step": 10498 + }, + { + "epoch": 2.7588925017243078, + "grad_norm": 0.6153303980827332, + "learning_rate": 8.042754512002803e-06, + "loss": 1.6149, + "step": 10500 + }, + { + "epoch": 2.7594180050579697, + "grad_norm": 0.5895024538040161, + "learning_rate": 8.02523217101805e-06, + "loss": 1.6097, + "step": 10502 + }, + { + "epoch": 2.7599435083916313, + "grad_norm": 0.6543148159980774, + "learning_rate": 8.007709830033293e-06, + "loss": 1.5865, + "step": 10504 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.6354900002479553, + "learning_rate": 7.990187489048538e-06, + "loss": 1.587, + "step": 10506 + }, + { + "epoch": 2.7609945150589548, + "grad_norm": 0.6232167482376099, + "learning_rate": 7.972665148063782e-06, + "loss": 1.622, + "step": 10508 + }, + { + "epoch": 2.7615200183926167, + "grad_norm": 0.5841950178146362, + "learning_rate": 7.955142807079026e-06, + "loss": 1.6159, + "step": 10510 + }, + { + "epoch": 2.7620455217262787, + "grad_norm": 0.5530897974967957, + "learning_rate": 7.93762046609427e-06, + "loss": 1.5696, + "step": 10512 + }, + { + "epoch": 2.7625710250599402, + "grad_norm": 0.5793708562850952, + "learning_rate": 7.920098125109516e-06, + "loss": 1.5672, + "step": 10514 + }, + { + "epoch": 2.7630965283936018, + "grad_norm": 0.595795750617981, + "learning_rate": 7.902575784124758e-06, + "loss": 1.6003, + "step": 10516 + }, + { + "epoch": 2.7636220317272637, + "grad_norm": 0.5570265054702759, + "learning_rate": 7.885053443140004e-06, + "loss": 1.5881, + "step": 10518 + }, + { + "epoch": 2.7641475350609257, + "grad_norm": 0.6062209606170654, + "learning_rate": 7.867531102155248e-06, + "loss": 1.5865, + "step": 10520 + }, + { + "epoch": 2.7646730383945872, + "grad_norm": 0.6273199319839478, + "learning_rate": 7.850008761170493e-06, + "loss": 1.6095, + "step": 10522 + }, + { + "epoch": 2.765198541728249, + "grad_norm": 0.6534038782119751, + "learning_rate": 7.832486420185737e-06, + "loss": 1.5834, + "step": 10524 + }, + { + "epoch": 2.7657240450619107, + "grad_norm": 0.7082603573799133, + "learning_rate": 7.814964079200981e-06, + "loss": 1.5977, + "step": 10526 + }, + { + "epoch": 2.7662495483955727, + "grad_norm": 0.8246460556983948, + "learning_rate": 7.797441738216225e-06, + "loss": 1.5952, + "step": 10528 + }, + { + "epoch": 2.766775051729234, + "grad_norm": 0.6671035289764404, + "learning_rate": 7.779919397231471e-06, + "loss": 1.6045, + "step": 10530 + }, + { + "epoch": 2.767300555062896, + "grad_norm": 0.6524311900138855, + "learning_rate": 7.762397056246715e-06, + "loss": 1.6305, + "step": 10532 + }, + { + "epoch": 2.767826058396558, + "grad_norm": 0.5577046275138855, + "learning_rate": 7.74487471526196e-06, + "loss": 1.5955, + "step": 10534 + }, + { + "epoch": 2.7683515617302197, + "grad_norm": 0.5540436506271362, + "learning_rate": 7.727352374277204e-06, + "loss": 1.6009, + "step": 10536 + }, + { + "epoch": 2.7688770650638816, + "grad_norm": 0.5906193256378174, + "learning_rate": 7.709830033292448e-06, + "loss": 1.5597, + "step": 10538 + }, + { + "epoch": 2.769402568397543, + "grad_norm": 0.7687748074531555, + "learning_rate": 7.692307692307694e-06, + "loss": 1.6085, + "step": 10540 + }, + { + "epoch": 2.769928071731205, + "grad_norm": 0.5762370824813843, + "learning_rate": 7.674785351322938e-06, + "loss": 1.6209, + "step": 10542 + }, + { + "epoch": 2.7704535750648667, + "grad_norm": 0.6532847285270691, + "learning_rate": 7.65726301033818e-06, + "loss": 1.5809, + "step": 10544 + }, + { + "epoch": 2.7709790783985286, + "grad_norm": 0.6120353937149048, + "learning_rate": 7.639740669353426e-06, + "loss": 1.5994, + "step": 10546 + }, + { + "epoch": 2.7715045817321906, + "grad_norm": 0.5683396458625793, + "learning_rate": 7.62221832836867e-06, + "loss": 1.6049, + "step": 10548 + }, + { + "epoch": 2.772030085065852, + "grad_norm": 0.6380746364593506, + "learning_rate": 7.604695987383915e-06, + "loss": 1.5922, + "step": 10550 + }, + { + "epoch": 2.7725555883995137, + "grad_norm": 0.619238018989563, + "learning_rate": 7.5871736463991595e-06, + "loss": 1.6036, + "step": 10552 + }, + { + "epoch": 2.7730810917331756, + "grad_norm": 0.6409256458282471, + "learning_rate": 7.569651305414404e-06, + "loss": 1.6094, + "step": 10554 + }, + { + "epoch": 2.7736065950668376, + "grad_norm": 0.6005852818489075, + "learning_rate": 7.552128964429649e-06, + "loss": 1.5932, + "step": 10556 + }, + { + "epoch": 2.774132098400499, + "grad_norm": 0.5933837890625, + "learning_rate": 7.534606623444893e-06, + "loss": 1.5956, + "step": 10558 + }, + { + "epoch": 2.774657601734161, + "grad_norm": 0.6530826687812805, + "learning_rate": 7.517084282460136e-06, + "loss": 1.6305, + "step": 10560 + }, + { + "epoch": 2.7751831050678226, + "grad_norm": 0.7053954601287842, + "learning_rate": 7.499561941475382e-06, + "loss": 1.6037, + "step": 10562 + }, + { + "epoch": 2.7757086084014846, + "grad_norm": 0.5595335364341736, + "learning_rate": 7.482039600490625e-06, + "loss": 1.6028, + "step": 10564 + }, + { + "epoch": 2.776234111735146, + "grad_norm": 0.6155191659927368, + "learning_rate": 7.464517259505871e-06, + "loss": 1.591, + "step": 10566 + }, + { + "epoch": 2.776759615068808, + "grad_norm": 0.5995579361915588, + "learning_rate": 7.446994918521115e-06, + "loss": 1.6206, + "step": 10568 + }, + { + "epoch": 2.77728511840247, + "grad_norm": 0.5798934698104858, + "learning_rate": 7.429472577536359e-06, + "loss": 1.5958, + "step": 10570 + }, + { + "epoch": 2.7778106217361316, + "grad_norm": 0.6855894923210144, + "learning_rate": 7.411950236551604e-06, + "loss": 1.6102, + "step": 10572 + }, + { + "epoch": 2.7783361250697935, + "grad_norm": 0.6109877228736877, + "learning_rate": 7.394427895566848e-06, + "loss": 1.5838, + "step": 10574 + }, + { + "epoch": 2.778861628403455, + "grad_norm": 0.5831050276756287, + "learning_rate": 7.376905554582092e-06, + "loss": 1.5936, + "step": 10576 + }, + { + "epoch": 2.779387131737117, + "grad_norm": 0.6219046711921692, + "learning_rate": 7.359383213597337e-06, + "loss": 1.6013, + "step": 10578 + }, + { + "epoch": 2.7799126350707786, + "grad_norm": 0.6402764916419983, + "learning_rate": 7.341860872612581e-06, + "loss": 1.6106, + "step": 10580 + }, + { + "epoch": 2.7804381384044405, + "grad_norm": 0.6024214625358582, + "learning_rate": 7.324338531627826e-06, + "loss": 1.6195, + "step": 10582 + }, + { + "epoch": 2.7809636417381025, + "grad_norm": 0.5932384729385376, + "learning_rate": 7.3068161906430705e-06, + "loss": 1.5976, + "step": 10584 + }, + { + "epoch": 2.781489145071764, + "grad_norm": 0.631115198135376, + "learning_rate": 7.289293849658315e-06, + "loss": 1.5967, + "step": 10586 + }, + { + "epoch": 2.7820146484054256, + "grad_norm": 0.6470726132392883, + "learning_rate": 7.27177150867356e-06, + "loss": 1.5992, + "step": 10588 + }, + { + "epoch": 2.7825401517390875, + "grad_norm": 0.6247832775115967, + "learning_rate": 7.254249167688804e-06, + "loss": 1.6331, + "step": 10590 + }, + { + "epoch": 2.7830656550727495, + "grad_norm": 0.5455314517021179, + "learning_rate": 7.236726826704047e-06, + "loss": 1.6089, + "step": 10592 + }, + { + "epoch": 2.783591158406411, + "grad_norm": 0.7327371835708618, + "learning_rate": 7.219204485719293e-06, + "loss": 1.5804, + "step": 10594 + }, + { + "epoch": 2.784116661740073, + "grad_norm": 0.6657344102859497, + "learning_rate": 7.201682144734536e-06, + "loss": 1.5925, + "step": 10596 + }, + { + "epoch": 2.7846421650737345, + "grad_norm": 0.6697341203689575, + "learning_rate": 7.184159803749781e-06, + "loss": 1.5906, + "step": 10598 + }, + { + "epoch": 2.7851676684073965, + "grad_norm": 0.6150529384613037, + "learning_rate": 7.166637462765026e-06, + "loss": 1.6017, + "step": 10600 + }, + { + "epoch": 2.785693171741058, + "grad_norm": 0.5665842890739441, + "learning_rate": 7.14911512178027e-06, + "loss": 1.581, + "step": 10602 + }, + { + "epoch": 2.78621867507472, + "grad_norm": 0.5935694575309753, + "learning_rate": 7.131592780795515e-06, + "loss": 1.5845, + "step": 10604 + }, + { + "epoch": 2.786744178408382, + "grad_norm": 0.6057782769203186, + "learning_rate": 7.114070439810759e-06, + "loss": 1.5985, + "step": 10606 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.5478567481040955, + "learning_rate": 7.096548098826003e-06, + "loss": 1.6067, + "step": 10608 + }, + { + "epoch": 2.7877951850757055, + "grad_norm": 0.5651966333389282, + "learning_rate": 7.079025757841248e-06, + "loss": 1.6157, + "step": 10610 + }, + { + "epoch": 2.788320688409367, + "grad_norm": 0.6189813017845154, + "learning_rate": 7.061503416856492e-06, + "loss": 1.5912, + "step": 10612 + }, + { + "epoch": 2.788846191743029, + "grad_norm": 0.6159524917602539, + "learning_rate": 7.0439810758717365e-06, + "loss": 1.622, + "step": 10614 + }, + { + "epoch": 2.7893716950766905, + "grad_norm": 0.5482856631278992, + "learning_rate": 7.0264587348869816e-06, + "loss": 1.6197, + "step": 10616 + }, + { + "epoch": 2.7898971984103524, + "grad_norm": 0.6579813957214355, + "learning_rate": 7.008936393902226e-06, + "loss": 1.6182, + "step": 10618 + }, + { + "epoch": 2.7904227017440144, + "grad_norm": 0.607364296913147, + "learning_rate": 6.991414052917471e-06, + "loss": 1.6183, + "step": 10620 + }, + { + "epoch": 2.790948205077676, + "grad_norm": 0.649757444858551, + "learning_rate": 6.973891711932715e-06, + "loss": 1.6179, + "step": 10622 + }, + { + "epoch": 2.7914737084113375, + "grad_norm": 0.5615342259407043, + "learning_rate": 6.956369370947958e-06, + "loss": 1.632, + "step": 10624 + }, + { + "epoch": 2.7919992117449994, + "grad_norm": 0.6041872501373291, + "learning_rate": 6.938847029963204e-06, + "loss": 1.5943, + "step": 10626 + }, + { + "epoch": 2.7925247150786614, + "grad_norm": 0.6032666563987732, + "learning_rate": 6.9213246889784475e-06, + "loss": 1.5552, + "step": 10628 + }, + { + "epoch": 2.793050218412323, + "grad_norm": 0.5664829611778259, + "learning_rate": 6.903802347993692e-06, + "loss": 1.5918, + "step": 10630 + }, + { + "epoch": 2.793575721745985, + "grad_norm": 0.633831262588501, + "learning_rate": 6.886280007008937e-06, + "loss": 1.5763, + "step": 10632 + }, + { + "epoch": 2.794101225079647, + "grad_norm": 0.5904597640037537, + "learning_rate": 6.868757666024181e-06, + "loss": 1.5659, + "step": 10634 + }, + { + "epoch": 2.7946267284133084, + "grad_norm": 0.556845486164093, + "learning_rate": 6.851235325039426e-06, + "loss": 1.5793, + "step": 10636 + }, + { + "epoch": 2.79515223174697, + "grad_norm": 0.6646136045455933, + "learning_rate": 6.83371298405467e-06, + "loss": 1.6201, + "step": 10638 + }, + { + "epoch": 2.795677735080632, + "grad_norm": 0.5443913340568542, + "learning_rate": 6.816190643069914e-06, + "loss": 1.6126, + "step": 10640 + }, + { + "epoch": 2.796203238414294, + "grad_norm": 0.5882983803749084, + "learning_rate": 6.798668302085159e-06, + "loss": 1.572, + "step": 10642 + }, + { + "epoch": 2.7967287417479554, + "grad_norm": 0.5482323169708252, + "learning_rate": 6.781145961100403e-06, + "loss": 1.6127, + "step": 10644 + }, + { + "epoch": 2.7972542450816174, + "grad_norm": 0.5871306657791138, + "learning_rate": 6.763623620115648e-06, + "loss": 1.6031, + "step": 10646 + }, + { + "epoch": 2.797779748415279, + "grad_norm": 0.6310222744941711, + "learning_rate": 6.746101279130893e-06, + "loss": 1.6017, + "step": 10648 + }, + { + "epoch": 2.798305251748941, + "grad_norm": 0.8194197416305542, + "learning_rate": 6.728578938146137e-06, + "loss": 1.5373, + "step": 10650 + }, + { + "epoch": 2.7988307550826024, + "grad_norm": 0.5392462015151978, + "learning_rate": 6.711056597161382e-06, + "loss": 1.6042, + "step": 10652 + }, + { + "epoch": 2.7993562584162643, + "grad_norm": 0.5910424590110779, + "learning_rate": 6.693534256176626e-06, + "loss": 1.6332, + "step": 10654 + }, + { + "epoch": 2.7998817617499263, + "grad_norm": 0.6581763029098511, + "learning_rate": 6.676011915191869e-06, + "loss": 1.617, + "step": 10656 + }, + { + "epoch": 2.800407265083588, + "grad_norm": 0.6362857222557068, + "learning_rate": 6.658489574207115e-06, + "loss": 1.5952, + "step": 10658 + }, + { + "epoch": 2.80093276841725, + "grad_norm": 0.6781132817268372, + "learning_rate": 6.6409672332223585e-06, + "loss": 1.571, + "step": 10660 + }, + { + "epoch": 2.8014582717509113, + "grad_norm": 0.609855055809021, + "learning_rate": 6.623444892237603e-06, + "loss": 1.6155, + "step": 10662 + }, + { + "epoch": 2.8019837750845733, + "grad_norm": 0.5758035182952881, + "learning_rate": 6.605922551252848e-06, + "loss": 1.5991, + "step": 10664 + }, + { + "epoch": 2.802509278418235, + "grad_norm": 0.6021389961242676, + "learning_rate": 6.588400210268092e-06, + "loss": 1.6027, + "step": 10666 + }, + { + "epoch": 2.803034781751897, + "grad_norm": 0.6531217098236084, + "learning_rate": 6.570877869283337e-06, + "loss": 1.6246, + "step": 10668 + }, + { + "epoch": 2.8035602850855588, + "grad_norm": 0.7034488916397095, + "learning_rate": 6.553355528298581e-06, + "loss": 1.599, + "step": 10670 + }, + { + "epoch": 2.8040857884192203, + "grad_norm": 0.5464747548103333, + "learning_rate": 6.535833187313825e-06, + "loss": 1.6301, + "step": 10672 + }, + { + "epoch": 2.804611291752882, + "grad_norm": 0.6323386430740356, + "learning_rate": 6.51831084632907e-06, + "loss": 1.634, + "step": 10674 + }, + { + "epoch": 2.805136795086544, + "grad_norm": 0.5574644207954407, + "learning_rate": 6.5007885053443144e-06, + "loss": 1.5634, + "step": 10676 + }, + { + "epoch": 2.8056622984202058, + "grad_norm": 0.641542911529541, + "learning_rate": 6.483266164359559e-06, + "loss": 1.5822, + "step": 10678 + }, + { + "epoch": 2.8061878017538673, + "grad_norm": 0.5666943788528442, + "learning_rate": 6.465743823374804e-06, + "loss": 1.5812, + "step": 10680 + }, + { + "epoch": 2.8067133050875293, + "grad_norm": 0.5939244627952576, + "learning_rate": 6.448221482390048e-06, + "loss": 1.5907, + "step": 10682 + }, + { + "epoch": 2.807238808421191, + "grad_norm": 0.5618119835853577, + "learning_rate": 6.430699141405291e-06, + "loss": 1.5859, + "step": 10684 + }, + { + "epoch": 2.8077643117548527, + "grad_norm": 0.5734422206878662, + "learning_rate": 6.413176800420537e-06, + "loss": 1.5904, + "step": 10686 + }, + { + "epoch": 2.8082898150885143, + "grad_norm": 0.5530012845993042, + "learning_rate": 6.39565445943578e-06, + "loss": 1.6041, + "step": 10688 + }, + { + "epoch": 2.8088153184221762, + "grad_norm": 0.6470913290977478, + "learning_rate": 6.378132118451026e-06, + "loss": 1.594, + "step": 10690 + }, + { + "epoch": 2.809340821755838, + "grad_norm": 0.6118698120117188, + "learning_rate": 6.3606097774662695e-06, + "loss": 1.625, + "step": 10692 + }, + { + "epoch": 2.8098663250894997, + "grad_norm": 0.6750172972679138, + "learning_rate": 6.343087436481514e-06, + "loss": 1.5389, + "step": 10694 + }, + { + "epoch": 2.8103918284231617, + "grad_norm": 0.6908929347991943, + "learning_rate": 6.325565095496759e-06, + "loss": 1.6215, + "step": 10696 + }, + { + "epoch": 2.8109173317568232, + "grad_norm": 0.6111420392990112, + "learning_rate": 6.308042754512003e-06, + "loss": 1.5701, + "step": 10698 + }, + { + "epoch": 2.811442835090485, + "grad_norm": 0.6819601655006409, + "learning_rate": 6.290520413527247e-06, + "loss": 1.5895, + "step": 10700 + }, + { + "epoch": 2.8119683384241467, + "grad_norm": 0.6061453819274902, + "learning_rate": 6.272998072542492e-06, + "loss": 1.6076, + "step": 10702 + }, + { + "epoch": 2.8124938417578087, + "grad_norm": 0.6342339515686035, + "learning_rate": 6.255475731557736e-06, + "loss": 1.6128, + "step": 10704 + }, + { + "epoch": 2.8130193450914707, + "grad_norm": 0.6219684481620789, + "learning_rate": 6.2379533905729805e-06, + "loss": 1.6264, + "step": 10706 + }, + { + "epoch": 2.813544848425132, + "grad_norm": 0.779617190361023, + "learning_rate": 6.2204310495882255e-06, + "loss": 1.6218, + "step": 10708 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.5725176334381104, + "learning_rate": 6.20290870860347e-06, + "loss": 1.5772, + "step": 10710 + }, + { + "epoch": 2.8145958550924557, + "grad_norm": 0.6861016750335693, + "learning_rate": 6.185386367618714e-06, + "loss": 1.604, + "step": 10712 + }, + { + "epoch": 2.8151213584261177, + "grad_norm": 0.6791368126869202, + "learning_rate": 6.167864026633959e-06, + "loss": 1.6244, + "step": 10714 + }, + { + "epoch": 2.815646861759779, + "grad_norm": 0.5471606850624084, + "learning_rate": 6.150341685649203e-06, + "loss": 1.589, + "step": 10716 + }, + { + "epoch": 2.816172365093441, + "grad_norm": 0.5449687242507935, + "learning_rate": 6.132819344664448e-06, + "loss": 1.6234, + "step": 10718 + }, + { + "epoch": 2.8166978684271027, + "grad_norm": 0.6119715571403503, + "learning_rate": 6.115297003679691e-06, + "loss": 1.5809, + "step": 10720 + }, + { + "epoch": 2.8172233717607646, + "grad_norm": 0.5771490931510925, + "learning_rate": 6.097774662694936e-06, + "loss": 1.5833, + "step": 10722 + }, + { + "epoch": 2.817748875094426, + "grad_norm": 0.5519348978996277, + "learning_rate": 6.0802523217101806e-06, + "loss": 1.6195, + "step": 10724 + }, + { + "epoch": 2.818274378428088, + "grad_norm": 0.7355902791023254, + "learning_rate": 6.062729980725426e-06, + "loss": 1.54, + "step": 10726 + }, + { + "epoch": 2.81879988176175, + "grad_norm": 0.5478994250297546, + "learning_rate": 6.04520763974067e-06, + "loss": 1.6225, + "step": 10728 + }, + { + "epoch": 2.8193253850954116, + "grad_norm": 0.5732230544090271, + "learning_rate": 6.027685298755914e-06, + "loss": 1.5899, + "step": 10730 + }, + { + "epoch": 2.8198508884290736, + "grad_norm": 0.6284743547439575, + "learning_rate": 6.010162957771159e-06, + "loss": 1.6116, + "step": 10732 + }, + { + "epoch": 2.820376391762735, + "grad_norm": 0.6271458864212036, + "learning_rate": 5.992640616786403e-06, + "loss": 1.5938, + "step": 10734 + }, + { + "epoch": 2.820901895096397, + "grad_norm": 0.5997607111930847, + "learning_rate": 5.975118275801647e-06, + "loss": 1.6035, + "step": 10736 + }, + { + "epoch": 2.8214273984300586, + "grad_norm": 0.6365160942077637, + "learning_rate": 5.9575959348168915e-06, + "loss": 1.6172, + "step": 10738 + }, + { + "epoch": 2.8219529017637206, + "grad_norm": 0.7437217235565186, + "learning_rate": 5.9400735938321365e-06, + "loss": 1.605, + "step": 10740 + }, + { + "epoch": 2.8224784050973826, + "grad_norm": 0.6684809923171997, + "learning_rate": 5.922551252847381e-06, + "loss": 1.6066, + "step": 10742 + }, + { + "epoch": 2.823003908431044, + "grad_norm": 0.5696150064468384, + "learning_rate": 5.905028911862625e-06, + "loss": 1.6137, + "step": 10744 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.6045554876327515, + "learning_rate": 5.88750657087787e-06, + "loss": 1.6169, + "step": 10746 + }, + { + "epoch": 2.8240549150983676, + "grad_norm": 0.5769687294960022, + "learning_rate": 5.869984229893114e-06, + "loss": 1.612, + "step": 10748 + }, + { + "epoch": 2.8245804184320296, + "grad_norm": 0.6565601825714111, + "learning_rate": 5.852461888908359e-06, + "loss": 1.6078, + "step": 10750 + }, + { + "epoch": 2.825105921765691, + "grad_norm": 0.614622175693512, + "learning_rate": 5.834939547923602e-06, + "loss": 1.6229, + "step": 10752 + }, + { + "epoch": 2.825631425099353, + "grad_norm": 0.5989246964454651, + "learning_rate": 5.8174172069388474e-06, + "loss": 1.6167, + "step": 10754 + }, + { + "epoch": 2.8261569284330146, + "grad_norm": 0.6011819243431091, + "learning_rate": 5.799894865954092e-06, + "loss": 1.5975, + "step": 10756 + }, + { + "epoch": 2.8266824317666766, + "grad_norm": 0.5927401185035706, + "learning_rate": 5.782372524969336e-06, + "loss": 1.6034, + "step": 10758 + }, + { + "epoch": 2.827207935100338, + "grad_norm": 0.6807177066802979, + "learning_rate": 5.764850183984581e-06, + "loss": 1.6005, + "step": 10760 + }, + { + "epoch": 2.827733438434, + "grad_norm": 0.637870728969574, + "learning_rate": 5.747327842999825e-06, + "loss": 1.6184, + "step": 10762 + }, + { + "epoch": 2.828258941767662, + "grad_norm": 0.5749346017837524, + "learning_rate": 5.72980550201507e-06, + "loss": 1.6022, + "step": 10764 + }, + { + "epoch": 2.8287844451013235, + "grad_norm": 0.5985031127929688, + "learning_rate": 5.712283161030313e-06, + "loss": 1.5792, + "step": 10766 + }, + { + "epoch": 2.8293099484349855, + "grad_norm": 0.5938543677330017, + "learning_rate": 5.694760820045558e-06, + "loss": 1.621, + "step": 10768 + }, + { + "epoch": 2.829835451768647, + "grad_norm": 0.6295526027679443, + "learning_rate": 5.6772384790608025e-06, + "loss": 1.6134, + "step": 10770 + }, + { + "epoch": 2.830360955102309, + "grad_norm": 0.5686202049255371, + "learning_rate": 5.6597161380760476e-06, + "loss": 1.5748, + "step": 10772 + }, + { + "epoch": 2.8308864584359705, + "grad_norm": 0.5600228309631348, + "learning_rate": 5.642193797091292e-06, + "loss": 1.5901, + "step": 10774 + }, + { + "epoch": 2.8314119617696325, + "grad_norm": 0.5651440024375916, + "learning_rate": 5.624671456106536e-06, + "loss": 1.6054, + "step": 10776 + }, + { + "epoch": 2.8319374651032945, + "grad_norm": 0.5856150984764099, + "learning_rate": 5.607149115121781e-06, + "loss": 1.5641, + "step": 10778 + }, + { + "epoch": 2.832462968436956, + "grad_norm": 0.6541823148727417, + "learning_rate": 5.589626774137025e-06, + "loss": 1.5969, + "step": 10780 + }, + { + "epoch": 2.8329884717706175, + "grad_norm": 0.577987551689148, + "learning_rate": 5.572104433152269e-06, + "loss": 1.5768, + "step": 10782 + }, + { + "epoch": 2.8335139751042795, + "grad_norm": 0.6325380206108093, + "learning_rate": 5.5545820921675135e-06, + "loss": 1.5626, + "step": 10784 + }, + { + "epoch": 2.8340394784379415, + "grad_norm": 0.5774263143539429, + "learning_rate": 5.5370597511827585e-06, + "loss": 1.6056, + "step": 10786 + }, + { + "epoch": 2.834564981771603, + "grad_norm": 0.5753619074821472, + "learning_rate": 5.519537410198003e-06, + "loss": 1.5734, + "step": 10788 + }, + { + "epoch": 2.835090485105265, + "grad_norm": 0.5787561535835266, + "learning_rate": 5.502015069213247e-06, + "loss": 1.6027, + "step": 10790 + }, + { + "epoch": 2.835615988438927, + "grad_norm": 0.616862952709198, + "learning_rate": 5.484492728228492e-06, + "loss": 1.6296, + "step": 10792 + }, + { + "epoch": 2.8361414917725885, + "grad_norm": 0.5734919905662537, + "learning_rate": 5.466970387243736e-06, + "loss": 1.5966, + "step": 10794 + }, + { + "epoch": 2.83666699510625, + "grad_norm": 0.5936645865440369, + "learning_rate": 5.449448046258981e-06, + "loss": 1.6018, + "step": 10796 + }, + { + "epoch": 2.837192498439912, + "grad_norm": 0.6087591052055359, + "learning_rate": 5.431925705274224e-06, + "loss": 1.6201, + "step": 10798 + }, + { + "epoch": 2.837718001773574, + "grad_norm": 0.609794557094574, + "learning_rate": 5.414403364289469e-06, + "loss": 1.6401, + "step": 10800 + }, + { + "epoch": 2.837718001773574, + "eval_loss": 1.6476643085479736, + "eval_runtime": 487.2582, + "eval_samples_per_second": 249.948, + "eval_steps_per_second": 31.244, + "step": 10800 + }, + { + "epoch": 2.8382435051072354, + "grad_norm": 0.8440867066383362, + "learning_rate": 5.3968810233047136e-06, + "loss": 1.613, + "step": 10802 + }, + { + "epoch": 2.8387690084408974, + "grad_norm": 0.6034703254699707, + "learning_rate": 5.379358682319959e-06, + "loss": 1.5959, + "step": 10804 + }, + { + "epoch": 2.839294511774559, + "grad_norm": 0.5443819165229797, + "learning_rate": 5.361836341335203e-06, + "loss": 1.5844, + "step": 10806 + }, + { + "epoch": 2.839820015108221, + "grad_norm": 0.7041229605674744, + "learning_rate": 5.344314000350447e-06, + "loss": 1.611, + "step": 10808 + }, + { + "epoch": 2.8403455184418824, + "grad_norm": 0.5578668713569641, + "learning_rate": 5.326791659365692e-06, + "loss": 1.6177, + "step": 10810 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.5947692394256592, + "learning_rate": 5.309269318380936e-06, + "loss": 1.6223, + "step": 10812 + }, + { + "epoch": 2.8413965251092064, + "grad_norm": 0.6893056631088257, + "learning_rate": 5.29174697739618e-06, + "loss": 1.6284, + "step": 10814 + }, + { + "epoch": 2.841922028442868, + "grad_norm": 0.6605462431907654, + "learning_rate": 5.2742246364114245e-06, + "loss": 1.6525, + "step": 10816 + }, + { + "epoch": 2.84244753177653, + "grad_norm": 0.5912427306175232, + "learning_rate": 5.2567022954266695e-06, + "loss": 1.5903, + "step": 10818 + }, + { + "epoch": 2.8429730351101914, + "grad_norm": 0.6220155954360962, + "learning_rate": 5.239179954441914e-06, + "loss": 1.6177, + "step": 10820 + }, + { + "epoch": 2.8434985384438534, + "grad_norm": 0.5568714141845703, + "learning_rate": 5.221657613457158e-06, + "loss": 1.5975, + "step": 10822 + }, + { + "epoch": 2.844024041777515, + "grad_norm": 0.6185790300369263, + "learning_rate": 5.204135272472403e-06, + "loss": 1.6108, + "step": 10824 + }, + { + "epoch": 2.844549545111177, + "grad_norm": 0.5641743540763855, + "learning_rate": 5.186612931487647e-06, + "loss": 1.5923, + "step": 10826 + }, + { + "epoch": 2.845075048444839, + "grad_norm": 0.5651496052742004, + "learning_rate": 5.169090590502892e-06, + "loss": 1.6242, + "step": 10828 + }, + { + "epoch": 2.8456005517785004, + "grad_norm": 0.5548931360244751, + "learning_rate": 5.151568249518135e-06, + "loss": 1.5939, + "step": 10830 + }, + { + "epoch": 2.846126055112162, + "grad_norm": 0.6729891896247864, + "learning_rate": 5.1340459085333804e-06, + "loss": 1.6279, + "step": 10832 + }, + { + "epoch": 2.846651558445824, + "grad_norm": 0.603522002696991, + "learning_rate": 5.116523567548625e-06, + "loss": 1.5939, + "step": 10834 + }, + { + "epoch": 2.847177061779486, + "grad_norm": 0.5417622923851013, + "learning_rate": 5.099001226563869e-06, + "loss": 1.5809, + "step": 10836 + }, + { + "epoch": 2.8477025651131473, + "grad_norm": 0.6103736162185669, + "learning_rate": 5.081478885579114e-06, + "loss": 1.5864, + "step": 10838 + }, + { + "epoch": 2.8482280684468093, + "grad_norm": 0.6213535666465759, + "learning_rate": 5.063956544594358e-06, + "loss": 1.6082, + "step": 10840 + }, + { + "epoch": 2.848753571780471, + "grad_norm": 0.5709004998207092, + "learning_rate": 5.046434203609603e-06, + "loss": 1.5648, + "step": 10842 + }, + { + "epoch": 2.849279075114133, + "grad_norm": 0.6318028569221497, + "learning_rate": 5.028911862624846e-06, + "loss": 1.6319, + "step": 10844 + }, + { + "epoch": 2.8498045784477943, + "grad_norm": 0.5908393263816833, + "learning_rate": 5.011389521640091e-06, + "loss": 1.63, + "step": 10846 + }, + { + "epoch": 2.8503300817814563, + "grad_norm": 0.5498826503753662, + "learning_rate": 4.9938671806553355e-06, + "loss": 1.6219, + "step": 10848 + }, + { + "epoch": 2.8508555851151183, + "grad_norm": 0.5806742906570435, + "learning_rate": 4.9763448396705806e-06, + "loss": 1.574, + "step": 10850 + }, + { + "epoch": 2.85138108844878, + "grad_norm": 0.5563404560089111, + "learning_rate": 4.958822498685825e-06, + "loss": 1.6111, + "step": 10852 + }, + { + "epoch": 2.8519065917824418, + "grad_norm": 0.5694655179977417, + "learning_rate": 4.941300157701069e-06, + "loss": 1.604, + "step": 10854 + }, + { + "epoch": 2.8524320951161033, + "grad_norm": 0.5293794274330139, + "learning_rate": 4.923777816716314e-06, + "loss": 1.624, + "step": 10856 + }, + { + "epoch": 2.8529575984497653, + "grad_norm": 0.5831487774848938, + "learning_rate": 4.906255475731558e-06, + "loss": 1.6151, + "step": 10858 + }, + { + "epoch": 2.853483101783427, + "grad_norm": 0.6377184987068176, + "learning_rate": 4.888733134746802e-06, + "loss": 1.5698, + "step": 10860 + }, + { + "epoch": 2.8540086051170888, + "grad_norm": 0.6772559881210327, + "learning_rate": 4.8712107937620465e-06, + "loss": 1.5918, + "step": 10862 + }, + { + "epoch": 2.8545341084507507, + "grad_norm": 0.6782848238945007, + "learning_rate": 4.8536884527772915e-06, + "loss": 1.6303, + "step": 10864 + }, + { + "epoch": 2.8550596117844123, + "grad_norm": 0.6130311489105225, + "learning_rate": 4.836166111792536e-06, + "loss": 1.6051, + "step": 10866 + }, + { + "epoch": 2.855585115118074, + "grad_norm": 0.6167743802070618, + "learning_rate": 4.81864377080778e-06, + "loss": 1.6214, + "step": 10868 + }, + { + "epoch": 2.8561106184517357, + "grad_norm": 0.5957754850387573, + "learning_rate": 4.801121429823025e-06, + "loss": 1.6118, + "step": 10870 + }, + { + "epoch": 2.8566361217853977, + "grad_norm": 0.5984094738960266, + "learning_rate": 4.783599088838269e-06, + "loss": 1.5894, + "step": 10872 + }, + { + "epoch": 2.8571616251190592, + "grad_norm": 0.6340751051902771, + "learning_rate": 4.766076747853514e-06, + "loss": 1.6018, + "step": 10874 + }, + { + "epoch": 2.857687128452721, + "grad_norm": 0.6450504660606384, + "learning_rate": 4.748554406868757e-06, + "loss": 1.6009, + "step": 10876 + }, + { + "epoch": 2.8582126317863827, + "grad_norm": 0.5570113062858582, + "learning_rate": 4.731032065884002e-06, + "loss": 1.5877, + "step": 10878 + }, + { + "epoch": 2.8587381351200447, + "grad_norm": 0.6482218503952026, + "learning_rate": 4.7135097248992466e-06, + "loss": 1.6192, + "step": 10880 + }, + { + "epoch": 2.8592636384537062, + "grad_norm": 0.5896407961845398, + "learning_rate": 4.695987383914492e-06, + "loss": 1.5885, + "step": 10882 + }, + { + "epoch": 2.859789141787368, + "grad_norm": 0.6039836406707764, + "learning_rate": 4.678465042929736e-06, + "loss": 1.6141, + "step": 10884 + }, + { + "epoch": 2.86031464512103, + "grad_norm": 0.6409982442855835, + "learning_rate": 4.66094270194498e-06, + "loss": 1.6277, + "step": 10886 + }, + { + "epoch": 2.8608401484546917, + "grad_norm": 0.5397644639015198, + "learning_rate": 4.643420360960225e-06, + "loss": 1.6035, + "step": 10888 + }, + { + "epoch": 2.8613656517883537, + "grad_norm": 0.587805986404419, + "learning_rate": 4.625898019975469e-06, + "loss": 1.578, + "step": 10890 + }, + { + "epoch": 2.861891155122015, + "grad_norm": 0.6628287434577942, + "learning_rate": 4.608375678990713e-06, + "loss": 1.6003, + "step": 10892 + }, + { + "epoch": 2.862416658455677, + "grad_norm": 0.5740228891372681, + "learning_rate": 4.5908533380059575e-06, + "loss": 1.6358, + "step": 10894 + }, + { + "epoch": 2.8629421617893387, + "grad_norm": 0.6140313148498535, + "learning_rate": 4.5733309970212025e-06, + "loss": 1.5789, + "step": 10896 + }, + { + "epoch": 2.8634676651230007, + "grad_norm": 0.6435587406158447, + "learning_rate": 4.555808656036447e-06, + "loss": 1.6386, + "step": 10898 + }, + { + "epoch": 2.8639931684566626, + "grad_norm": 0.6358049511909485, + "learning_rate": 4.538286315051691e-06, + "loss": 1.6128, + "step": 10900 + }, + { + "epoch": 2.864518671790324, + "grad_norm": 0.5897725224494934, + "learning_rate": 4.520763974066936e-06, + "loss": 1.5958, + "step": 10902 + }, + { + "epoch": 2.8650441751239857, + "grad_norm": 0.6232768893241882, + "learning_rate": 4.50324163308218e-06, + "loss": 1.6026, + "step": 10904 + }, + { + "epoch": 2.8655696784576477, + "grad_norm": 0.5649431347846985, + "learning_rate": 4.485719292097425e-06, + "loss": 1.6001, + "step": 10906 + }, + { + "epoch": 2.8660951817913096, + "grad_norm": 0.5962793827056885, + "learning_rate": 4.468196951112668e-06, + "loss": 1.5408, + "step": 10908 + }, + { + "epoch": 2.866620685124971, + "grad_norm": 0.5510377883911133, + "learning_rate": 4.4506746101279134e-06, + "loss": 1.6295, + "step": 10910 + }, + { + "epoch": 2.867146188458633, + "grad_norm": 0.5945908427238464, + "learning_rate": 4.433152269143158e-06, + "loss": 1.6192, + "step": 10912 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5374131798744202, + "learning_rate": 4.415629928158403e-06, + "loss": 1.6022, + "step": 10914 + }, + { + "epoch": 2.8681971951259566, + "grad_norm": 0.5591030716896057, + "learning_rate": 4.398107587173647e-06, + "loss": 1.6104, + "step": 10916 + }, + { + "epoch": 2.868722698459618, + "grad_norm": 0.6745641231536865, + "learning_rate": 4.380585246188891e-06, + "loss": 1.5931, + "step": 10918 + }, + { + "epoch": 2.86924820179328, + "grad_norm": 0.7577717900276184, + "learning_rate": 4.363062905204136e-06, + "loss": 1.6324, + "step": 10920 + }, + { + "epoch": 2.869773705126942, + "grad_norm": 0.7548211812973022, + "learning_rate": 4.345540564219379e-06, + "loss": 1.62, + "step": 10922 + }, + { + "epoch": 2.8702992084606036, + "grad_norm": 0.6899915337562561, + "learning_rate": 4.328018223234624e-06, + "loss": 1.5757, + "step": 10924 + }, + { + "epoch": 2.8708247117942656, + "grad_norm": 0.580837070941925, + "learning_rate": 4.3104958822498685e-06, + "loss": 1.5889, + "step": 10926 + }, + { + "epoch": 2.871350215127927, + "grad_norm": 0.6023684740066528, + "learning_rate": 4.2929735412651136e-06, + "loss": 1.5814, + "step": 10928 + }, + { + "epoch": 2.871875718461589, + "grad_norm": 0.63701331615448, + "learning_rate": 4.275451200280358e-06, + "loss": 1.6205, + "step": 10930 + }, + { + "epoch": 2.8724012217952506, + "grad_norm": 0.5388381481170654, + "learning_rate": 4.257928859295602e-06, + "loss": 1.6093, + "step": 10932 + }, + { + "epoch": 2.8729267251289126, + "grad_norm": 0.5374415516853333, + "learning_rate": 4.240406518310847e-06, + "loss": 1.6366, + "step": 10934 + }, + { + "epoch": 2.8734522284625745, + "grad_norm": 0.7570417523384094, + "learning_rate": 4.222884177326091e-06, + "loss": 1.6263, + "step": 10936 + }, + { + "epoch": 2.873977731796236, + "grad_norm": 0.6314011812210083, + "learning_rate": 4.205361836341335e-06, + "loss": 1.5977, + "step": 10938 + }, + { + "epoch": 2.8745032351298976, + "grad_norm": 0.649953305721283, + "learning_rate": 4.1878394953565794e-06, + "loss": 1.6034, + "step": 10940 + }, + { + "epoch": 2.8750287384635596, + "grad_norm": 0.6486716866493225, + "learning_rate": 4.1703171543718245e-06, + "loss": 1.5938, + "step": 10942 + }, + { + "epoch": 2.8755542417972215, + "grad_norm": 0.6346645951271057, + "learning_rate": 4.152794813387069e-06, + "loss": 1.5878, + "step": 10944 + }, + { + "epoch": 2.876079745130883, + "grad_norm": 0.624394953250885, + "learning_rate": 4.135272472402313e-06, + "loss": 1.6165, + "step": 10946 + }, + { + "epoch": 2.876605248464545, + "grad_norm": 0.5654635429382324, + "learning_rate": 4.117750131417558e-06, + "loss": 1.5565, + "step": 10948 + }, + { + "epoch": 2.877130751798207, + "grad_norm": 0.6420629024505615, + "learning_rate": 4.100227790432802e-06, + "loss": 1.6144, + "step": 10950 + }, + { + "epoch": 2.8776562551318685, + "grad_norm": 0.6588164567947388, + "learning_rate": 4.082705449448047e-06, + "loss": 1.6324, + "step": 10952 + }, + { + "epoch": 2.87818175846553, + "grad_norm": 0.5764179229736328, + "learning_rate": 4.06518310846329e-06, + "loss": 1.608, + "step": 10954 + }, + { + "epoch": 2.878707261799192, + "grad_norm": 0.5966598391532898, + "learning_rate": 4.047660767478535e-06, + "loss": 1.5683, + "step": 10956 + }, + { + "epoch": 2.879232765132854, + "grad_norm": 0.6094986796379089, + "learning_rate": 4.0301384264937796e-06, + "loss": 1.6101, + "step": 10958 + }, + { + "epoch": 2.8797582684665155, + "grad_norm": 0.6777652502059937, + "learning_rate": 4.012616085509025e-06, + "loss": 1.6454, + "step": 10960 + }, + { + "epoch": 2.8802837718001775, + "grad_norm": 0.54978346824646, + "learning_rate": 3.995093744524269e-06, + "loss": 1.5811, + "step": 10962 + }, + { + "epoch": 2.880809275133839, + "grad_norm": 0.7338440418243408, + "learning_rate": 3.977571403539513e-06, + "loss": 1.6004, + "step": 10964 + }, + { + "epoch": 2.881334778467501, + "grad_norm": 0.5676441192626953, + "learning_rate": 3.960049062554758e-06, + "loss": 1.6194, + "step": 10966 + }, + { + "epoch": 2.8818602818011625, + "grad_norm": 0.6784756183624268, + "learning_rate": 3.942526721570002e-06, + "loss": 1.5941, + "step": 10968 + }, + { + "epoch": 2.8823857851348245, + "grad_norm": 0.6024194955825806, + "learning_rate": 3.925004380585246e-06, + "loss": 1.6035, + "step": 10970 + }, + { + "epoch": 2.8829112884684864, + "grad_norm": 0.5432921051979065, + "learning_rate": 3.9074820396004905e-06, + "loss": 1.6307, + "step": 10972 + }, + { + "epoch": 2.883436791802148, + "grad_norm": 0.5449672937393188, + "learning_rate": 3.8899596986157355e-06, + "loss": 1.6078, + "step": 10974 + }, + { + "epoch": 2.88396229513581, + "grad_norm": 0.582300066947937, + "learning_rate": 3.87243735763098e-06, + "loss": 1.5869, + "step": 10976 + }, + { + "epoch": 2.8844877984694715, + "grad_norm": 0.5711895823478699, + "learning_rate": 3.854915016646224e-06, + "loss": 1.578, + "step": 10978 + }, + { + "epoch": 2.8850133018031334, + "grad_norm": 0.6558634638786316, + "learning_rate": 3.837392675661469e-06, + "loss": 1.6074, + "step": 10980 + }, + { + "epoch": 2.885538805136795, + "grad_norm": 0.6054571866989136, + "learning_rate": 3.819870334676713e-06, + "loss": 1.6015, + "step": 10982 + }, + { + "epoch": 2.886064308470457, + "grad_norm": 0.5859559178352356, + "learning_rate": 3.8023479936919577e-06, + "loss": 1.575, + "step": 10984 + }, + { + "epoch": 2.886589811804119, + "grad_norm": 0.7558565139770508, + "learning_rate": 3.784825652707202e-06, + "loss": 1.6048, + "step": 10986 + }, + { + "epoch": 2.8871153151377804, + "grad_norm": 0.5738922357559204, + "learning_rate": 3.7673033117224464e-06, + "loss": 1.5895, + "step": 10988 + }, + { + "epoch": 2.887640818471442, + "grad_norm": 0.6217196583747864, + "learning_rate": 3.749780970737691e-06, + "loss": 1.6017, + "step": 10990 + }, + { + "epoch": 2.888166321805104, + "grad_norm": 0.590238094329834, + "learning_rate": 3.7322586297529356e-06, + "loss": 1.5937, + "step": 10992 + }, + { + "epoch": 2.888691825138766, + "grad_norm": 0.5736632943153381, + "learning_rate": 3.7147362887681794e-06, + "loss": 1.6011, + "step": 10994 + }, + { + "epoch": 2.8892173284724274, + "grad_norm": 0.6404598951339722, + "learning_rate": 3.697213947783424e-06, + "loss": 1.6158, + "step": 10996 + }, + { + "epoch": 2.8897428318060894, + "grad_norm": 0.5618501305580139, + "learning_rate": 3.6796916067986686e-06, + "loss": 1.5836, + "step": 10998 + }, + { + "epoch": 2.890268335139751, + "grad_norm": 0.5837007761001587, + "learning_rate": 3.662169265813913e-06, + "loss": 1.6126, + "step": 11000 + }, + { + "epoch": 2.890793838473413, + "grad_norm": 0.6110183596611023, + "learning_rate": 3.6446469248291574e-06, + "loss": 1.5715, + "step": 11002 + }, + { + "epoch": 2.8913193418070744, + "grad_norm": 0.5909767746925354, + "learning_rate": 3.627124583844402e-06, + "loss": 1.5992, + "step": 11004 + }, + { + "epoch": 2.8918448451407364, + "grad_norm": 0.5684041976928711, + "learning_rate": 3.6096022428596465e-06, + "loss": 1.6002, + "step": 11006 + }, + { + "epoch": 2.8923703484743983, + "grad_norm": 0.5662478804588318, + "learning_rate": 3.5920799018748903e-06, + "loss": 1.6093, + "step": 11008 + }, + { + "epoch": 2.89289585180806, + "grad_norm": 0.5797233581542969, + "learning_rate": 3.574557560890135e-06, + "loss": 1.5847, + "step": 11010 + }, + { + "epoch": 2.893421355141722, + "grad_norm": 0.60663902759552, + "learning_rate": 3.5570352199053795e-06, + "loss": 1.5927, + "step": 11012 + }, + { + "epoch": 2.8939468584753834, + "grad_norm": 0.5697915554046631, + "learning_rate": 3.539512878920624e-06, + "loss": 1.5718, + "step": 11014 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.6556651592254639, + "learning_rate": 3.5219905379358683e-06, + "loss": 1.6116, + "step": 11016 + }, + { + "epoch": 2.894997865142707, + "grad_norm": 0.6983522176742554, + "learning_rate": 3.504468196951113e-06, + "loss": 1.6157, + "step": 11018 + }, + { + "epoch": 2.895523368476369, + "grad_norm": 0.5559704303741455, + "learning_rate": 3.4869458559663575e-06, + "loss": 1.5646, + "step": 11020 + }, + { + "epoch": 2.896048871810031, + "grad_norm": 0.5538905262947083, + "learning_rate": 3.469423514981602e-06, + "loss": 1.6231, + "step": 11022 + }, + { + "epoch": 2.8965743751436923, + "grad_norm": 0.5700134038925171, + "learning_rate": 3.451901173996846e-06, + "loss": 1.5936, + "step": 11024 + }, + { + "epoch": 2.897099878477354, + "grad_norm": 0.5542116761207581, + "learning_rate": 3.4343788330120904e-06, + "loss": 1.6293, + "step": 11026 + }, + { + "epoch": 2.897625381811016, + "grad_norm": 0.5474702715873718, + "learning_rate": 3.416856492027335e-06, + "loss": 1.6112, + "step": 11028 + }, + { + "epoch": 2.898150885144678, + "grad_norm": 0.6167372465133667, + "learning_rate": 3.3993341510425796e-06, + "loss": 1.5763, + "step": 11030 + }, + { + "epoch": 2.8986763884783393, + "grad_norm": 0.6581651568412781, + "learning_rate": 3.381811810057824e-06, + "loss": 1.6004, + "step": 11032 + }, + { + "epoch": 2.8992018918120013, + "grad_norm": 0.62739497423172, + "learning_rate": 3.3642894690730684e-06, + "loss": 1.6136, + "step": 11034 + }, + { + "epoch": 2.899727395145663, + "grad_norm": 0.5928983688354492, + "learning_rate": 3.346767128088313e-06, + "loss": 1.622, + "step": 11036 + }, + { + "epoch": 2.9002528984793248, + "grad_norm": 0.6407076716423035, + "learning_rate": 3.3292447871035576e-06, + "loss": 1.5815, + "step": 11038 + }, + { + "epoch": 2.9007784018129863, + "grad_norm": 0.6269816756248474, + "learning_rate": 3.3117224461188013e-06, + "loss": 1.6119, + "step": 11040 + }, + { + "epoch": 2.9013039051466483, + "grad_norm": 0.5729198455810547, + "learning_rate": 3.294200105134046e-06, + "loss": 1.6173, + "step": 11042 + }, + { + "epoch": 2.9018294084803102, + "grad_norm": 0.5712758302688599, + "learning_rate": 3.2766777641492905e-06, + "loss": 1.5912, + "step": 11044 + }, + { + "epoch": 2.9023549118139718, + "grad_norm": 0.6735134720802307, + "learning_rate": 3.259155423164535e-06, + "loss": 1.603, + "step": 11046 + }, + { + "epoch": 2.9028804151476337, + "grad_norm": 0.6073388457298279, + "learning_rate": 3.2416330821797793e-06, + "loss": 1.6013, + "step": 11048 + }, + { + "epoch": 2.9034059184812953, + "grad_norm": 0.611774742603302, + "learning_rate": 3.224110741195024e-06, + "loss": 1.6021, + "step": 11050 + }, + { + "epoch": 2.9039314218149572, + "grad_norm": 0.6439489722251892, + "learning_rate": 3.2065884002102685e-06, + "loss": 1.6037, + "step": 11052 + }, + { + "epoch": 2.9044569251486188, + "grad_norm": 0.572325587272644, + "learning_rate": 3.189066059225513e-06, + "loss": 1.5629, + "step": 11054 + }, + { + "epoch": 2.9049824284822807, + "grad_norm": 0.5802271962165833, + "learning_rate": 3.171543718240757e-06, + "loss": 1.5982, + "step": 11056 + }, + { + "epoch": 2.9055079318159427, + "grad_norm": 0.6686844825744629, + "learning_rate": 3.1540213772560015e-06, + "loss": 1.626, + "step": 11058 + }, + { + "epoch": 2.906033435149604, + "grad_norm": 0.5929732918739319, + "learning_rate": 3.136499036271246e-06, + "loss": 1.5946, + "step": 11060 + }, + { + "epoch": 2.9065589384832657, + "grad_norm": 0.5740990042686462, + "learning_rate": 3.1189766952864902e-06, + "loss": 1.6365, + "step": 11062 + }, + { + "epoch": 2.9070844418169277, + "grad_norm": 0.6125746369361877, + "learning_rate": 3.101454354301735e-06, + "loss": 1.5778, + "step": 11064 + }, + { + "epoch": 2.9076099451505897, + "grad_norm": 0.5508874654769897, + "learning_rate": 3.0839320133169794e-06, + "loss": 1.6019, + "step": 11066 + }, + { + "epoch": 2.908135448484251, + "grad_norm": 0.5743160247802734, + "learning_rate": 3.066409672332224e-06, + "loss": 1.6103, + "step": 11068 + }, + { + "epoch": 2.908660951817913, + "grad_norm": 0.6552396416664124, + "learning_rate": 3.048887331347468e-06, + "loss": 1.6245, + "step": 11070 + }, + { + "epoch": 2.9091864551515747, + "grad_norm": 0.5878915190696716, + "learning_rate": 3.031364990362713e-06, + "loss": 1.6111, + "step": 11072 + }, + { + "epoch": 2.9097119584852367, + "grad_norm": 0.5548945665359497, + "learning_rate": 3.013842649377957e-06, + "loss": 1.6289, + "step": 11074 + }, + { + "epoch": 2.910237461818898, + "grad_norm": 0.5816505551338196, + "learning_rate": 2.9963203083932016e-06, + "loss": 1.6, + "step": 11076 + }, + { + "epoch": 2.91076296515256, + "grad_norm": 0.5567723512649536, + "learning_rate": 2.9787979674084457e-06, + "loss": 1.6144, + "step": 11078 + }, + { + "epoch": 2.911288468486222, + "grad_norm": 0.542295515537262, + "learning_rate": 2.9612756264236903e-06, + "loss": 1.6051, + "step": 11080 + }, + { + "epoch": 2.9118139718198837, + "grad_norm": 0.6353909969329834, + "learning_rate": 2.943753285438935e-06, + "loss": 1.6135, + "step": 11082 + }, + { + "epoch": 2.9123394751535456, + "grad_norm": 0.5646031498908997, + "learning_rate": 2.9262309444541795e-06, + "loss": 1.5934, + "step": 11084 + }, + { + "epoch": 2.912864978487207, + "grad_norm": 0.5628175139427185, + "learning_rate": 2.9087086034694237e-06, + "loss": 1.5589, + "step": 11086 + }, + { + "epoch": 2.913390481820869, + "grad_norm": 0.6169978380203247, + "learning_rate": 2.891186262484668e-06, + "loss": 1.5969, + "step": 11088 + }, + { + "epoch": 2.9139159851545307, + "grad_norm": 0.5552890300750732, + "learning_rate": 2.8736639214999125e-06, + "loss": 1.6083, + "step": 11090 + }, + { + "epoch": 2.9144414884881926, + "grad_norm": 0.5658928751945496, + "learning_rate": 2.8561415805151567e-06, + "loss": 1.5807, + "step": 11092 + }, + { + "epoch": 2.9149669918218546, + "grad_norm": 0.5916785001754761, + "learning_rate": 2.8386192395304013e-06, + "loss": 1.6128, + "step": 11094 + }, + { + "epoch": 2.915492495155516, + "grad_norm": 0.5797626376152039, + "learning_rate": 2.821096898545646e-06, + "loss": 1.6042, + "step": 11096 + }, + { + "epoch": 2.9160179984891776, + "grad_norm": 0.661906361579895, + "learning_rate": 2.8035745575608905e-06, + "loss": 1.6357, + "step": 11098 + }, + { + "epoch": 2.9165435018228396, + "grad_norm": 0.5823834538459778, + "learning_rate": 2.7860522165761346e-06, + "loss": 1.5987, + "step": 11100 + }, + { + "epoch": 2.9170690051565016, + "grad_norm": 0.5549127459526062, + "learning_rate": 2.7685298755913792e-06, + "loss": 1.6271, + "step": 11102 + }, + { + "epoch": 2.917594508490163, + "grad_norm": 0.633703351020813, + "learning_rate": 2.7510075346066234e-06, + "loss": 1.6639, + "step": 11104 + }, + { + "epoch": 2.918120011823825, + "grad_norm": 0.6295391321182251, + "learning_rate": 2.733485193621868e-06, + "loss": 1.5803, + "step": 11106 + }, + { + "epoch": 2.918645515157487, + "grad_norm": 0.7048977017402649, + "learning_rate": 2.715962852637112e-06, + "loss": 1.6112, + "step": 11108 + }, + { + "epoch": 2.9191710184911486, + "grad_norm": 0.7996454238891602, + "learning_rate": 2.6984405116523568e-06, + "loss": 1.5931, + "step": 11110 + }, + { + "epoch": 2.91969652182481, + "grad_norm": 0.6191883683204651, + "learning_rate": 2.6809181706676014e-06, + "loss": 1.6265, + "step": 11112 + }, + { + "epoch": 2.920222025158472, + "grad_norm": 0.6032472252845764, + "learning_rate": 2.663395829682846e-06, + "loss": 1.6078, + "step": 11114 + }, + { + "epoch": 2.920747528492134, + "grad_norm": 0.5909383296966553, + "learning_rate": 2.64587348869809e-06, + "loss": 1.6047, + "step": 11116 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.741261899471283, + "learning_rate": 2.6283511477133348e-06, + "loss": 1.5909, + "step": 11118 + }, + { + "epoch": 2.9217985351594575, + "grad_norm": 0.5791400074958801, + "learning_rate": 2.610828806728579e-06, + "loss": 1.6037, + "step": 11120 + }, + { + "epoch": 2.922324038493119, + "grad_norm": 0.5476728677749634, + "learning_rate": 2.5933064657438235e-06, + "loss": 1.5841, + "step": 11122 + }, + { + "epoch": 2.922849541826781, + "grad_norm": 0.6799639463424683, + "learning_rate": 2.5757841247590677e-06, + "loss": 1.6055, + "step": 11124 + }, + { + "epoch": 2.9233750451604426, + "grad_norm": 0.6471995115280151, + "learning_rate": 2.5582617837743123e-06, + "loss": 1.614, + "step": 11126 + }, + { + "epoch": 2.9239005484941045, + "grad_norm": 0.5741569399833679, + "learning_rate": 2.540739442789557e-06, + "loss": 1.5667, + "step": 11128 + }, + { + "epoch": 2.9244260518277665, + "grad_norm": 0.5387163162231445, + "learning_rate": 2.5232171018048015e-06, + "loss": 1.617, + "step": 11130 + }, + { + "epoch": 2.924951555161428, + "grad_norm": 0.5384705066680908, + "learning_rate": 2.5056947608200457e-06, + "loss": 1.5939, + "step": 11132 + }, + { + "epoch": 2.92547705849509, + "grad_norm": 0.5429417490959167, + "learning_rate": 2.4881724198352903e-06, + "loss": 1.5842, + "step": 11134 + }, + { + "epoch": 2.9260025618287515, + "grad_norm": 0.6191485524177551, + "learning_rate": 2.4706500788505345e-06, + "loss": 1.6338, + "step": 11136 + }, + { + "epoch": 2.9265280651624135, + "grad_norm": 0.7055456042289734, + "learning_rate": 2.453127737865779e-06, + "loss": 1.6349, + "step": 11138 + }, + { + "epoch": 2.927053568496075, + "grad_norm": 0.5733761787414551, + "learning_rate": 2.4356053968810232e-06, + "loss": 1.5677, + "step": 11140 + }, + { + "epoch": 2.927579071829737, + "grad_norm": 0.6309702396392822, + "learning_rate": 2.418083055896268e-06, + "loss": 1.577, + "step": 11142 + }, + { + "epoch": 2.928104575163399, + "grad_norm": 0.56844162940979, + "learning_rate": 2.4005607149115124e-06, + "loss": 1.6017, + "step": 11144 + }, + { + "epoch": 2.9286300784970605, + "grad_norm": 0.5193071365356445, + "learning_rate": 2.383038373926757e-06, + "loss": 1.5855, + "step": 11146 + }, + { + "epoch": 2.929155581830722, + "grad_norm": 0.6680015325546265, + "learning_rate": 2.365516032942001e-06, + "loss": 1.6215, + "step": 11148 + }, + { + "epoch": 2.929681085164384, + "grad_norm": 0.6055448055267334, + "learning_rate": 2.347993691957246e-06, + "loss": 1.5748, + "step": 11150 + }, + { + "epoch": 2.930206588498046, + "grad_norm": 0.763596773147583, + "learning_rate": 2.33047135097249e-06, + "loss": 1.567, + "step": 11152 + }, + { + "epoch": 2.9307320918317075, + "grad_norm": 0.5230932235717773, + "learning_rate": 2.3129490099877346e-06, + "loss": 1.6069, + "step": 11154 + }, + { + "epoch": 2.9312575951653694, + "grad_norm": 0.5715333819389343, + "learning_rate": 2.2954266690029787e-06, + "loss": 1.5798, + "step": 11156 + }, + { + "epoch": 2.931783098499031, + "grad_norm": 0.5592380166053772, + "learning_rate": 2.2779043280182233e-06, + "loss": 1.616, + "step": 11158 + }, + { + "epoch": 2.932308601832693, + "grad_norm": 0.5946754813194275, + "learning_rate": 2.260381987033468e-06, + "loss": 1.5689, + "step": 11160 + }, + { + "epoch": 2.9328341051663545, + "grad_norm": 0.6536034345626831, + "learning_rate": 2.2428596460487125e-06, + "loss": 1.5949, + "step": 11162 + }, + { + "epoch": 2.9333596085000164, + "grad_norm": 0.5874665975570679, + "learning_rate": 2.2253373050639567e-06, + "loss": 1.6125, + "step": 11164 + }, + { + "epoch": 2.9338851118336784, + "grad_norm": 0.5944022536277771, + "learning_rate": 2.2078149640792013e-06, + "loss": 1.5724, + "step": 11166 + }, + { + "epoch": 2.93441061516734, + "grad_norm": 0.5689641237258911, + "learning_rate": 2.1902926230944455e-06, + "loss": 1.587, + "step": 11168 + }, + { + "epoch": 2.934936118501002, + "grad_norm": 0.6851060390472412, + "learning_rate": 2.1727702821096897e-06, + "loss": 1.5942, + "step": 11170 + }, + { + "epoch": 2.9354616218346634, + "grad_norm": 0.6660142540931702, + "learning_rate": 2.1552479411249343e-06, + "loss": 1.5892, + "step": 11172 + }, + { + "epoch": 2.9359871251683254, + "grad_norm": 0.6187747716903687, + "learning_rate": 2.137725600140179e-06, + "loss": 1.5622, + "step": 11174 + }, + { + "epoch": 2.936512628501987, + "grad_norm": 0.5741645097732544, + "learning_rate": 2.1202032591554235e-06, + "loss": 1.5856, + "step": 11176 + }, + { + "epoch": 2.937038131835649, + "grad_norm": 0.5658064484596252, + "learning_rate": 2.1026809181706676e-06, + "loss": 1.5984, + "step": 11178 + }, + { + "epoch": 2.937563635169311, + "grad_norm": 0.6636450886726379, + "learning_rate": 2.0851585771859122e-06, + "loss": 1.6533, + "step": 11180 + }, + { + "epoch": 2.9380891385029724, + "grad_norm": 0.5776405334472656, + "learning_rate": 2.0676362362011564e-06, + "loss": 1.567, + "step": 11182 + }, + { + "epoch": 2.938614641836634, + "grad_norm": 0.6802220940589905, + "learning_rate": 2.050113895216401e-06, + "loss": 1.6211, + "step": 11184 + }, + { + "epoch": 2.939140145170296, + "grad_norm": 0.6535943746566772, + "learning_rate": 2.032591554231645e-06, + "loss": 1.596, + "step": 11186 + }, + { + "epoch": 2.939665648503958, + "grad_norm": 0.5814671516418457, + "learning_rate": 2.0150692132468898e-06, + "loss": 1.5689, + "step": 11188 + }, + { + "epoch": 2.9401911518376194, + "grad_norm": 0.6042841076850891, + "learning_rate": 1.9975468722621344e-06, + "loss": 1.6142, + "step": 11190 + }, + { + "epoch": 2.9407166551712813, + "grad_norm": 0.6148901581764221, + "learning_rate": 1.980024531277379e-06, + "loss": 1.5813, + "step": 11192 + }, + { + "epoch": 2.941242158504943, + "grad_norm": 0.6761669516563416, + "learning_rate": 1.962502190292623e-06, + "loss": 1.5915, + "step": 11194 + }, + { + "epoch": 2.941767661838605, + "grad_norm": 0.6184157133102417, + "learning_rate": 1.9449798493078678e-06, + "loss": 1.6291, + "step": 11196 + }, + { + "epoch": 2.9422931651722664, + "grad_norm": 0.567001223564148, + "learning_rate": 1.927457508323112e-06, + "loss": 1.5796, + "step": 11198 + }, + { + "epoch": 2.9428186685059283, + "grad_norm": 0.5846920609474182, + "learning_rate": 1.9099351673383565e-06, + "loss": 1.6186, + "step": 11200 + }, + { + "epoch": 2.9428186685059283, + "eval_loss": 1.6465901136398315, + "eval_runtime": 487.2703, + "eval_samples_per_second": 249.941, + "eval_steps_per_second": 31.243, + "step": 11200 + }, + { + "epoch": 2.9433441718395903, + "grad_norm": 0.5758314728736877, + "learning_rate": 1.892412826353601e-06, + "loss": 1.609, + "step": 11202 + }, + { + "epoch": 2.943869675173252, + "grad_norm": 0.6517518758773804, + "learning_rate": 1.8748904853688455e-06, + "loss": 1.614, + "step": 11204 + }, + { + "epoch": 2.944395178506914, + "grad_norm": 0.5758547186851501, + "learning_rate": 1.8573681443840897e-06, + "loss": 1.5874, + "step": 11206 + }, + { + "epoch": 2.9449206818405753, + "grad_norm": 0.5640631914138794, + "learning_rate": 1.8398458033993343e-06, + "loss": 1.5723, + "step": 11208 + }, + { + "epoch": 2.9454461851742373, + "grad_norm": 0.6348847150802612, + "learning_rate": 1.8223234624145787e-06, + "loss": 1.5789, + "step": 11210 + }, + { + "epoch": 2.945971688507899, + "grad_norm": 0.6184114217758179, + "learning_rate": 1.8048011214298233e-06, + "loss": 1.5986, + "step": 11212 + }, + { + "epoch": 2.946497191841561, + "grad_norm": 0.8769078850746155, + "learning_rate": 1.7872787804450674e-06, + "loss": 1.6315, + "step": 11214 + }, + { + "epoch": 2.9470226951752228, + "grad_norm": 0.6675733923912048, + "learning_rate": 1.769756439460312e-06, + "loss": 1.6477, + "step": 11216 + }, + { + "epoch": 2.9475481985088843, + "grad_norm": 0.5792946219444275, + "learning_rate": 1.7522340984755564e-06, + "loss": 1.6266, + "step": 11218 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.6874922513961792, + "learning_rate": 1.734711757490801e-06, + "loss": 1.6014, + "step": 11220 + }, + { + "epoch": 2.9485992051762078, + "grad_norm": 0.6224451065063477, + "learning_rate": 1.7171894165060452e-06, + "loss": 1.5985, + "step": 11222 + }, + { + "epoch": 2.9491247085098697, + "grad_norm": 0.5994266271591187, + "learning_rate": 1.6996670755212898e-06, + "loss": 1.6229, + "step": 11224 + }, + { + "epoch": 2.9496502118435313, + "grad_norm": 0.6327415704727173, + "learning_rate": 1.6821447345365342e-06, + "loss": 1.5961, + "step": 11226 + }, + { + "epoch": 2.9501757151771932, + "grad_norm": 0.5327082276344299, + "learning_rate": 1.6646223935517788e-06, + "loss": 1.5832, + "step": 11228 + }, + { + "epoch": 2.9507012185108548, + "grad_norm": 0.6451945900917053, + "learning_rate": 1.647100052567023e-06, + "loss": 1.6072, + "step": 11230 + }, + { + "epoch": 2.9512267218445167, + "grad_norm": 0.6457750201225281, + "learning_rate": 1.6295777115822676e-06, + "loss": 1.59, + "step": 11232 + }, + { + "epoch": 2.9517522251781783, + "grad_norm": 0.7860383987426758, + "learning_rate": 1.612055370597512e-06, + "loss": 1.6288, + "step": 11234 + }, + { + "epoch": 2.9522777285118402, + "grad_norm": 0.5900551080703735, + "learning_rate": 1.5945330296127566e-06, + "loss": 1.6113, + "step": 11236 + }, + { + "epoch": 2.952803231845502, + "grad_norm": 0.606599748134613, + "learning_rate": 1.5770106886280007e-06, + "loss": 1.6201, + "step": 11238 + }, + { + "epoch": 2.9533287351791637, + "grad_norm": 0.6085255742073059, + "learning_rate": 1.5594883476432451e-06, + "loss": 1.5788, + "step": 11240 + }, + { + "epoch": 2.9538542385128257, + "grad_norm": 0.689579427242279, + "learning_rate": 1.5419660066584897e-06, + "loss": 1.5928, + "step": 11242 + }, + { + "epoch": 2.954379741846487, + "grad_norm": 0.5633424520492554, + "learning_rate": 1.524443665673734e-06, + "loss": 1.5989, + "step": 11244 + }, + { + "epoch": 2.954905245180149, + "grad_norm": 0.6903582811355591, + "learning_rate": 1.5069213246889785e-06, + "loss": 1.6527, + "step": 11246 + }, + { + "epoch": 2.9554307485138107, + "grad_norm": 0.6674286723136902, + "learning_rate": 1.4893989837042229e-06, + "loss": 1.6041, + "step": 11248 + }, + { + "epoch": 2.9559562518474727, + "grad_norm": 0.5618950724601746, + "learning_rate": 1.4718766427194675e-06, + "loss": 1.6209, + "step": 11250 + }, + { + "epoch": 2.9564817551811347, + "grad_norm": 0.60519939661026, + "learning_rate": 1.4543543017347119e-06, + "loss": 1.57, + "step": 11252 + }, + { + "epoch": 2.957007258514796, + "grad_norm": 0.6382578611373901, + "learning_rate": 1.4368319607499562e-06, + "loss": 1.6074, + "step": 11254 + }, + { + "epoch": 2.9575327618484577, + "grad_norm": 0.5759736895561218, + "learning_rate": 1.4193096197652006e-06, + "loss": 1.5892, + "step": 11256 + }, + { + "epoch": 2.9580582651821197, + "grad_norm": 0.5800794363021851, + "learning_rate": 1.4017872787804452e-06, + "loss": 1.643, + "step": 11258 + }, + { + "epoch": 2.9585837685157816, + "grad_norm": 0.5824633240699768, + "learning_rate": 1.3842649377956896e-06, + "loss": 1.5935, + "step": 11260 + }, + { + "epoch": 2.959109271849443, + "grad_norm": 0.5774669647216797, + "learning_rate": 1.366742596810934e-06, + "loss": 1.6421, + "step": 11262 + }, + { + "epoch": 2.959634775183105, + "grad_norm": 0.5631816387176514, + "learning_rate": 1.3492202558261784e-06, + "loss": 1.6194, + "step": 11264 + }, + { + "epoch": 2.960160278516767, + "grad_norm": 0.5980478525161743, + "learning_rate": 1.331697914841423e-06, + "loss": 1.6124, + "step": 11266 + }, + { + "epoch": 2.9606857818504286, + "grad_norm": 0.5869792699813843, + "learning_rate": 1.3141755738566674e-06, + "loss": 1.6033, + "step": 11268 + }, + { + "epoch": 2.96121128518409, + "grad_norm": 0.7216488122940063, + "learning_rate": 1.2966532328719118e-06, + "loss": 1.6396, + "step": 11270 + }, + { + "epoch": 2.961736788517752, + "grad_norm": 0.6487390398979187, + "learning_rate": 1.2791308918871562e-06, + "loss": 1.611, + "step": 11272 + }, + { + "epoch": 2.962262291851414, + "grad_norm": 0.6345615983009338, + "learning_rate": 1.2616085509024008e-06, + "loss": 1.575, + "step": 11274 + }, + { + "epoch": 2.9627877951850756, + "grad_norm": 0.692871630191803, + "learning_rate": 1.2440862099176451e-06, + "loss": 1.6005, + "step": 11276 + }, + { + "epoch": 2.9633132985187376, + "grad_norm": 0.6169196367263794, + "learning_rate": 1.2265638689328895e-06, + "loss": 1.5758, + "step": 11278 + }, + { + "epoch": 2.963838801852399, + "grad_norm": 0.5923426151275635, + "learning_rate": 1.209041527948134e-06, + "loss": 1.5737, + "step": 11280 + }, + { + "epoch": 2.964364305186061, + "grad_norm": 0.629149854183197, + "learning_rate": 1.1915191869633785e-06, + "loss": 1.6043, + "step": 11282 + }, + { + "epoch": 2.9648898085197226, + "grad_norm": 0.5396657586097717, + "learning_rate": 1.173996845978623e-06, + "loss": 1.5922, + "step": 11284 + }, + { + "epoch": 2.9654153118533846, + "grad_norm": 0.6270614862442017, + "learning_rate": 1.1564745049938673e-06, + "loss": 1.6032, + "step": 11286 + }, + { + "epoch": 2.9659408151870466, + "grad_norm": 0.6442562937736511, + "learning_rate": 1.1389521640091117e-06, + "loss": 1.5929, + "step": 11288 + }, + { + "epoch": 2.966466318520708, + "grad_norm": 0.5687601566314697, + "learning_rate": 1.1214298230243563e-06, + "loss": 1.5735, + "step": 11290 + }, + { + "epoch": 2.96699182185437, + "grad_norm": 0.6251326203346252, + "learning_rate": 1.1039074820396007e-06, + "loss": 1.5841, + "step": 11292 + }, + { + "epoch": 2.9675173251880316, + "grad_norm": 0.6409140229225159, + "learning_rate": 1.0863851410548448e-06, + "loss": 1.6202, + "step": 11294 + }, + { + "epoch": 2.9680428285216935, + "grad_norm": 0.5543487668037415, + "learning_rate": 1.0688628000700894e-06, + "loss": 1.6237, + "step": 11296 + }, + { + "epoch": 2.968568331855355, + "grad_norm": 0.6813648343086243, + "learning_rate": 1.0513404590853338e-06, + "loss": 1.6548, + "step": 11298 + }, + { + "epoch": 2.969093835189017, + "grad_norm": 0.5688581466674805, + "learning_rate": 1.0338181181005782e-06, + "loss": 1.597, + "step": 11300 + }, + { + "epoch": 2.969619338522679, + "grad_norm": 0.6080614328384399, + "learning_rate": 1.0162957771158226e-06, + "loss": 1.6439, + "step": 11302 + }, + { + "epoch": 2.9701448418563405, + "grad_norm": 0.5709330439567566, + "learning_rate": 9.987734361310672e-07, + "loss": 1.6394, + "step": 11304 + }, + { + "epoch": 2.970670345190002, + "grad_norm": 0.6443371772766113, + "learning_rate": 9.812510951463116e-07, + "loss": 1.6215, + "step": 11306 + }, + { + "epoch": 2.971195848523664, + "grad_norm": 0.6284047365188599, + "learning_rate": 9.63728754161556e-07, + "loss": 1.6182, + "step": 11308 + }, + { + "epoch": 2.971721351857326, + "grad_norm": 0.5524962544441223, + "learning_rate": 9.462064131768005e-07, + "loss": 1.5953, + "step": 11310 + }, + { + "epoch": 2.9722468551909875, + "grad_norm": 0.56756991147995, + "learning_rate": 9.286840721920448e-07, + "loss": 1.6048, + "step": 11312 + }, + { + "epoch": 2.9727723585246495, + "grad_norm": 0.729558527469635, + "learning_rate": 9.111617312072893e-07, + "loss": 1.6145, + "step": 11314 + }, + { + "epoch": 2.973297861858311, + "grad_norm": 0.6247385144233704, + "learning_rate": 8.936393902225337e-07, + "loss": 1.6176, + "step": 11316 + }, + { + "epoch": 2.973823365191973, + "grad_norm": 0.6063811779022217, + "learning_rate": 8.761170492377782e-07, + "loss": 1.5839, + "step": 11318 + }, + { + "epoch": 2.9743488685256345, + "grad_norm": 0.6044145226478577, + "learning_rate": 8.585947082530226e-07, + "loss": 1.6051, + "step": 11320 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.5743279457092285, + "learning_rate": 8.410723672682671e-07, + "loss": 1.6128, + "step": 11322 + }, + { + "epoch": 2.9753998751929585, + "grad_norm": 0.5868866443634033, + "learning_rate": 8.235500262835115e-07, + "loss": 1.6428, + "step": 11324 + }, + { + "epoch": 2.97592537852662, + "grad_norm": 0.5260694026947021, + "learning_rate": 8.06027685298756e-07, + "loss": 1.584, + "step": 11326 + }, + { + "epoch": 2.976450881860282, + "grad_norm": 0.6764076948165894, + "learning_rate": 7.885053443140004e-07, + "loss": 1.6163, + "step": 11328 + }, + { + "epoch": 2.9769763851939435, + "grad_norm": 0.5896710157394409, + "learning_rate": 7.709830033292449e-07, + "loss": 1.5717, + "step": 11330 + }, + { + "epoch": 2.9775018885276054, + "grad_norm": 0.6759716272354126, + "learning_rate": 7.534606623444892e-07, + "loss": 1.5856, + "step": 11332 + }, + { + "epoch": 2.978027391861267, + "grad_norm": 0.6693316102027893, + "learning_rate": 7.359383213597337e-07, + "loss": 1.6189, + "step": 11334 + }, + { + "epoch": 2.978552895194929, + "grad_norm": 0.5483260154724121, + "learning_rate": 7.184159803749781e-07, + "loss": 1.6106, + "step": 11336 + }, + { + "epoch": 2.979078398528591, + "grad_norm": 0.6094440817832947, + "learning_rate": 7.008936393902226e-07, + "loss": 1.6105, + "step": 11338 + }, + { + "epoch": 2.9796039018622524, + "grad_norm": 0.6723849773406982, + "learning_rate": 6.83371298405467e-07, + "loss": 1.5995, + "step": 11340 + }, + { + "epoch": 2.980129405195914, + "grad_norm": 0.6282168030738831, + "learning_rate": 6.658489574207115e-07, + "loss": 1.5797, + "step": 11342 + }, + { + "epoch": 2.980654908529576, + "grad_norm": 0.5661284923553467, + "learning_rate": 6.483266164359559e-07, + "loss": 1.5908, + "step": 11344 + }, + { + "epoch": 2.981180411863238, + "grad_norm": 0.598994255065918, + "learning_rate": 6.308042754512004e-07, + "loss": 1.592, + "step": 11346 + }, + { + "epoch": 2.9817059151968994, + "grad_norm": 0.6033176183700562, + "learning_rate": 6.132819344664448e-07, + "loss": 1.5935, + "step": 11348 + }, + { + "epoch": 2.9822314185305614, + "grad_norm": 0.6345306038856506, + "learning_rate": 5.957595934816893e-07, + "loss": 1.5789, + "step": 11350 + }, + { + "epoch": 2.982756921864223, + "grad_norm": 0.606160044670105, + "learning_rate": 5.782372524969336e-07, + "loss": 1.5949, + "step": 11352 + }, + { + "epoch": 2.983282425197885, + "grad_norm": 0.7126883268356323, + "learning_rate": 5.607149115121781e-07, + "loss": 1.5786, + "step": 11354 + }, + { + "epoch": 2.9838079285315464, + "grad_norm": 0.6453995704650879, + "learning_rate": 5.431925705274224e-07, + "loss": 1.6359, + "step": 11356 + }, + { + "epoch": 2.9843334318652084, + "grad_norm": 0.5591650605201721, + "learning_rate": 5.256702295426669e-07, + "loss": 1.596, + "step": 11358 + }, + { + "epoch": 2.9848589351988704, + "grad_norm": 0.5831902027130127, + "learning_rate": 5.081478885579113e-07, + "loss": 1.5785, + "step": 11360 + }, + { + "epoch": 2.985384438532532, + "grad_norm": 0.5874847173690796, + "learning_rate": 4.906255475731558e-07, + "loss": 1.6092, + "step": 11362 + }, + { + "epoch": 2.985909941866194, + "grad_norm": 0.6131449937820435, + "learning_rate": 4.7310320658840023e-07, + "loss": 1.6002, + "step": 11364 + }, + { + "epoch": 2.9864354451998554, + "grad_norm": 0.5965538620948792, + "learning_rate": 4.5558086560364467e-07, + "loss": 1.5937, + "step": 11366 + }, + { + "epoch": 2.9869609485335173, + "grad_norm": 0.6877683401107788, + "learning_rate": 4.380585246188891e-07, + "loss": 1.5981, + "step": 11368 + }, + { + "epoch": 2.987486451867179, + "grad_norm": 0.6720339059829712, + "learning_rate": 4.2053618363413355e-07, + "loss": 1.6071, + "step": 11370 + }, + { + "epoch": 2.988011955200841, + "grad_norm": 0.7178999185562134, + "learning_rate": 4.03013842649378e-07, + "loss": 1.6089, + "step": 11372 + }, + { + "epoch": 2.988537458534503, + "grad_norm": 0.5988190770149231, + "learning_rate": 3.8549150166462243e-07, + "loss": 1.6001, + "step": 11374 + }, + { + "epoch": 2.9890629618681643, + "grad_norm": 0.6235753297805786, + "learning_rate": 3.6796916067986687e-07, + "loss": 1.6124, + "step": 11376 + }, + { + "epoch": 2.989588465201826, + "grad_norm": 0.6031942367553711, + "learning_rate": 3.504468196951113e-07, + "loss": 1.6445, + "step": 11378 + }, + { + "epoch": 2.990113968535488, + "grad_norm": 0.6021082401275635, + "learning_rate": 3.3292447871035575e-07, + "loss": 1.6139, + "step": 11380 + }, + { + "epoch": 2.99063947186915, + "grad_norm": 0.5596811175346375, + "learning_rate": 3.154021377256002e-07, + "loss": 1.5923, + "step": 11382 + }, + { + "epoch": 2.9911649752028113, + "grad_norm": 0.5226958990097046, + "learning_rate": 2.9787979674084463e-07, + "loss": 1.6145, + "step": 11384 + }, + { + "epoch": 2.9916904785364733, + "grad_norm": 0.6241964101791382, + "learning_rate": 2.8035745575608907e-07, + "loss": 1.5997, + "step": 11386 + }, + { + "epoch": 2.992215981870135, + "grad_norm": 0.5991497039794922, + "learning_rate": 2.6283511477133345e-07, + "loss": 1.6279, + "step": 11388 + }, + { + "epoch": 2.992741485203797, + "grad_norm": 0.5512558817863464, + "learning_rate": 2.453127737865779e-07, + "loss": 1.5999, + "step": 11390 + }, + { + "epoch": 2.9932669885374583, + "grad_norm": 0.6609853506088257, + "learning_rate": 2.2779043280182233e-07, + "loss": 1.6145, + "step": 11392 + }, + { + "epoch": 2.9937924918711203, + "grad_norm": 0.6005377769470215, + "learning_rate": 2.1026809181706677e-07, + "loss": 1.5893, + "step": 11394 + }, + { + "epoch": 2.9943179952047823, + "grad_norm": 0.5884697437286377, + "learning_rate": 1.9274575083231121e-07, + "loss": 1.5813, + "step": 11396 + }, + { + "epoch": 2.994843498538444, + "grad_norm": 0.7103734612464905, + "learning_rate": 1.7522340984755565e-07, + "loss": 1.592, + "step": 11398 + }, + { + "epoch": 2.9953690018721058, + "grad_norm": 0.5749437808990479, + "learning_rate": 1.577010688628001e-07, + "loss": 1.6045, + "step": 11400 + }, + { + "epoch": 2.9958945052057673, + "grad_norm": 0.6688903570175171, + "learning_rate": 1.4017872787804453e-07, + "loss": 1.6188, + "step": 11402 + }, + { + "epoch": 2.9964200085394292, + "grad_norm": 0.6899111866950989, + "learning_rate": 1.2265638689328895e-07, + "loss": 1.6102, + "step": 11404 + }, + { + "epoch": 2.9969455118730908, + "grad_norm": 0.6025496125221252, + "learning_rate": 1.0513404590853339e-07, + "loss": 1.5927, + "step": 11406 + }, + { + "epoch": 2.9974710152067527, + "grad_norm": 0.8233180642127991, + "learning_rate": 8.761170492377783e-08, + "loss": 1.6267, + "step": 11408 + }, + { + "epoch": 2.9979965185404147, + "grad_norm": 0.633036196231842, + "learning_rate": 7.008936393902227e-08, + "loss": 1.6246, + "step": 11410 + }, + { + "epoch": 2.9985220218740762, + "grad_norm": 0.6616869568824768, + "learning_rate": 5.2567022954266694e-08, + "loss": 1.5989, + "step": 11412 + }, + { + "epoch": 2.9990475252077378, + "grad_norm": 0.7166438102722168, + "learning_rate": 3.5044681969511133e-08, + "loss": 1.6247, + "step": 11414 + }, + { + "epoch": 2.999310276874569, + "step": 11415, + "total_flos": 8.316227983158804e+17, + "train_loss": 1.7114452323838383, + "train_runtime": 54638.3985, + "train_samples_per_second": 53.496, + "train_steps_per_second": 0.209 + }, + { + "epoch": 2.999310276874569, + "eval_loss": 1.6470075845718384, + "eval_runtime": 487.9081, + "eval_samples_per_second": 249.615, + "eval_steps_per_second": 31.203, + "step": 11415 + }, + { + "epoch": 2.999310276874569, + "eval_loss": 1.6461076736450195, + "eval_runtime": 486.7382, + "eval_samples_per_second": 250.215, + "eval_steps_per_second": 31.278, + "step": 11415 + } + ], + "logging_steps": 2, + "max_steps": 11415, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.316227983158804e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}