{ "best_metric": 0.8029689974239149, "best_model_checkpoint": "./rubert_merged_and_weighted_cls_checkpoints/checkpoint-128445", "epoch": 9.0, "eval_steps": 500, "global_step": 231201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001946358363501888, "grad_norm": 7.56255578994751, "learning_rate": 4.998053641636498e-05, "loss": 1.4749, "step": 50 }, { "epoch": 0.003892716727003776, "grad_norm": 4.601037979125977, "learning_rate": 4.996224064774807e-05, "loss": 1.3811, "step": 100 }, { "epoch": 0.005839075090505664, "grad_norm": 19.44391632080078, "learning_rate": 4.994355560745845e-05, "loss": 1.3651, "step": 150 }, { "epoch": 0.007785433454007552, "grad_norm": 6.989410877227783, "learning_rate": 4.992409202382343e-05, "loss": 1.3036, "step": 200 }, { "epoch": 0.00973179181750944, "grad_norm": 13.325335502624512, "learning_rate": 4.990462844018841e-05, "loss": 1.2226, "step": 250 }, { "epoch": 0.011678150181011327, "grad_norm": 6.461861610412598, "learning_rate": 4.988516485655339e-05, "loss": 1.347, "step": 300 }, { "epoch": 0.013624508544513216, "grad_norm": 11.010049819946289, "learning_rate": 4.9865701272918375e-05, "loss": 1.3306, "step": 350 }, { "epoch": 0.015570866908015103, "grad_norm": 11.974625587463379, "learning_rate": 4.984623768928335e-05, "loss": 1.1592, "step": 400 }, { "epoch": 0.01751722527151699, "grad_norm": 6.983129978179932, "learning_rate": 4.982677410564833e-05, "loss": 1.2452, "step": 450 }, { "epoch": 0.01946358363501888, "grad_norm": 7.131897926330566, "learning_rate": 4.9807310522013314e-05, "loss": 1.2373, "step": 500 }, { "epoch": 0.021409941998520768, "grad_norm": 9.738677978515625, "learning_rate": 4.97878469383783e-05, "loss": 1.1345, "step": 550 }, { "epoch": 0.023356300362022655, "grad_norm": 6.072765350341797, "learning_rate": 4.9768383354743276e-05, "loss": 1.1992, "step": 600 }, { "epoch": 0.025302658725524542, "grad_norm": 7.039418697357178, 
"learning_rate": 4.974891977110826e-05, "loss": 1.1435, "step": 650 }, { "epoch": 0.027249017089026432, "grad_norm": 5.42138671875, "learning_rate": 4.972945618747324e-05, "loss": 1.1197, "step": 700 }, { "epoch": 0.02919537545252832, "grad_norm": 8.618386268615723, "learning_rate": 4.970999260383822e-05, "loss": 1.1184, "step": 750 }, { "epoch": 0.031141733816030207, "grad_norm": 15.724136352539062, "learning_rate": 4.96909182918759e-05, "loss": 1.0312, "step": 800 }, { "epoch": 0.0330880921795321, "grad_norm": 11.72488784790039, "learning_rate": 4.967145470824088e-05, "loss": 1.1396, "step": 850 }, { "epoch": 0.03503445054303398, "grad_norm": 11.833064079284668, "learning_rate": 4.9651991124605865e-05, "loss": 1.0926, "step": 900 }, { "epoch": 0.03698080890653587, "grad_norm": 2.5419535636901855, "learning_rate": 4.963252754097085e-05, "loss": 1.0798, "step": 950 }, { "epoch": 0.03892716727003776, "grad_norm": 10.202709197998047, "learning_rate": 4.961306395733583e-05, "loss": 1.074, "step": 1000 }, { "epoch": 0.040873525633539645, "grad_norm": 9.575445175170898, "learning_rate": 4.959360037370081e-05, "loss": 1.0328, "step": 1050 }, { "epoch": 0.042819883997041536, "grad_norm": 9.735647201538086, "learning_rate": 4.957413679006579e-05, "loss": 1.0577, "step": 1100 }, { "epoch": 0.044766242360543426, "grad_norm": 8.523118019104004, "learning_rate": 4.955467320643077e-05, "loss": 1.0899, "step": 1150 }, { "epoch": 0.04671260072404531, "grad_norm": 45.88571548461914, "learning_rate": 4.953520962279575e-05, "loss": 1.1176, "step": 1200 }, { "epoch": 0.0486589590875472, "grad_norm": 6.792192459106445, "learning_rate": 4.951574603916073e-05, "loss": 1.0438, "step": 1250 }, { "epoch": 0.050605317451049084, "grad_norm": 7.1662468910217285, "learning_rate": 4.949628245552572e-05, "loss": 1.0879, "step": 1300 }, { "epoch": 0.052551675814550974, "grad_norm": 34.963008880615234, "learning_rate": 4.94768188718907e-05, "loss": 1.0854, "step": 1350 }, { "epoch": 
0.054498034178052865, "grad_norm": 32.2481689453125, "learning_rate": 4.9457355288255675e-05, "loss": 1.0568, "step": 1400 }, { "epoch": 0.05644439254155475, "grad_norm": 11.33353042602539, "learning_rate": 4.943789170462066e-05, "loss": 1.0493, "step": 1450 }, { "epoch": 0.05839075090505664, "grad_norm": 21.23685073852539, "learning_rate": 4.941842812098564e-05, "loss": 0.9191, "step": 1500 }, { "epoch": 0.06033710926855853, "grad_norm": 10.951175689697266, "learning_rate": 4.9398964537350614e-05, "loss": 1.0626, "step": 1550 }, { "epoch": 0.06228346763206041, "grad_norm": 6.901663780212402, "learning_rate": 4.93795009537156e-05, "loss": 0.9369, "step": 1600 }, { "epoch": 0.0642298259955623, "grad_norm": 5.283642768859863, "learning_rate": 4.936003737008058e-05, "loss": 0.9684, "step": 1650 }, { "epoch": 0.0661761843590642, "grad_norm": 6.274130821228027, "learning_rate": 4.934057378644557e-05, "loss": 0.9818, "step": 1700 }, { "epoch": 0.06812254272256608, "grad_norm": 10.882516860961914, "learning_rate": 4.9321110202810545e-05, "loss": 0.8857, "step": 1750 }, { "epoch": 0.07006890108606796, "grad_norm": 7.680133819580078, "learning_rate": 4.930164661917552e-05, "loss": 0.9435, "step": 1800 }, { "epoch": 0.07201525944956985, "grad_norm": 2.6674492359161377, "learning_rate": 4.928218303554051e-05, "loss": 1.1362, "step": 1850 }, { "epoch": 0.07396161781307174, "grad_norm": 6.698825836181641, "learning_rate": 4.9262719451905485e-05, "loss": 1.1788, "step": 1900 }, { "epoch": 0.07590797617657363, "grad_norm": 5.092474937438965, "learning_rate": 4.924325586827047e-05, "loss": 1.0459, "step": 1950 }, { "epoch": 0.07785433454007552, "grad_norm": 13.385080337524414, "learning_rate": 4.922379228463545e-05, "loss": 0.8982, "step": 2000 }, { "epoch": 0.0798006929035774, "grad_norm": 6.494448184967041, "learning_rate": 4.920432870100043e-05, "loss": 0.9881, "step": 2050 }, { "epoch": 0.08174705126707929, "grad_norm": 10.139787673950195, "learning_rate": 
4.9184865117365415e-05, "loss": 0.9683, "step": 2100 }, { "epoch": 0.08369340963058118, "grad_norm": 6.303593158721924, "learning_rate": 4.916540153373039e-05, "loss": 0.9254, "step": 2150 }, { "epoch": 0.08563976799408307, "grad_norm": 8.52916431427002, "learning_rate": 4.914593795009537e-05, "loss": 1.0023, "step": 2200 }, { "epoch": 0.08758612635758496, "grad_norm": 12.403430938720703, "learning_rate": 4.9126474366460355e-05, "loss": 0.8974, "step": 2250 }, { "epoch": 0.08953248472108685, "grad_norm": 10.531347274780273, "learning_rate": 4.910701078282533e-05, "loss": 0.9991, "step": 2300 }, { "epoch": 0.09147884308458873, "grad_norm": 14.944234848022461, "learning_rate": 4.908754719919032e-05, "loss": 0.8804, "step": 2350 }, { "epoch": 0.09342520144809062, "grad_norm": 18.904951095581055, "learning_rate": 4.90680836155553e-05, "loss": 0.8598, "step": 2400 }, { "epoch": 0.09537155981159251, "grad_norm": 5.995104789733887, "learning_rate": 4.904862003192028e-05, "loss": 1.0032, "step": 2450 }, { "epoch": 0.0973179181750944, "grad_norm": 9.083269119262695, "learning_rate": 4.902915644828526e-05, "loss": 0.9923, "step": 2500 }, { "epoch": 0.09926427653859629, "grad_norm": 9.006484985351562, "learning_rate": 4.900969286465024e-05, "loss": 0.8828, "step": 2550 }, { "epoch": 0.10121063490209817, "grad_norm": 14.719727516174316, "learning_rate": 4.899022928101522e-05, "loss": 0.8716, "step": 2600 }, { "epoch": 0.10315699326560006, "grad_norm": 29.95701026916504, "learning_rate": 4.89707656973802e-05, "loss": 0.9057, "step": 2650 }, { "epoch": 0.10510335162910195, "grad_norm": 10.219870567321777, "learning_rate": 4.895130211374519e-05, "loss": 0.8555, "step": 2700 }, { "epoch": 0.10704970999260384, "grad_norm": 4.191556453704834, "learning_rate": 4.893183853011017e-05, "loss": 0.8952, "step": 2750 }, { "epoch": 0.10899606835610573, "grad_norm": 10.738969802856445, "learning_rate": 4.891237494647515e-05, "loss": 0.9109, "step": 2800 }, { "epoch": 0.11094242671960762, 
"grad_norm": 41.21760559082031, "learning_rate": 4.8892911362840126e-05, "loss": 0.9984, "step": 2850 }, { "epoch": 0.1128887850831095, "grad_norm": 13.713314056396484, "learning_rate": 4.887344777920511e-05, "loss": 0.8488, "step": 2900 }, { "epoch": 0.11483514344661139, "grad_norm": 18.241104125976562, "learning_rate": 4.885398419557009e-05, "loss": 0.9235, "step": 2950 }, { "epoch": 0.11678150181011328, "grad_norm": 12.40414810180664, "learning_rate": 4.883452061193507e-05, "loss": 0.9426, "step": 3000 }, { "epoch": 0.11872786017361517, "grad_norm": 10.660947799682617, "learning_rate": 4.881505702830006e-05, "loss": 0.898, "step": 3050 }, { "epoch": 0.12067421853711706, "grad_norm": 9.347933769226074, "learning_rate": 4.8795593444665035e-05, "loss": 0.9709, "step": 3100 }, { "epoch": 0.12262057690061894, "grad_norm": 7.6703200340271, "learning_rate": 4.877612986103002e-05, "loss": 0.9236, "step": 3150 }, { "epoch": 0.12456693526412083, "grad_norm": 12.222637176513672, "learning_rate": 4.8756666277395e-05, "loss": 0.9465, "step": 3200 }, { "epoch": 0.12651329362762273, "grad_norm": 15.256314277648926, "learning_rate": 4.8737202693759974e-05, "loss": 0.9741, "step": 3250 }, { "epoch": 0.1284596519911246, "grad_norm": 4.745419025421143, "learning_rate": 4.871773911012496e-05, "loss": 0.979, "step": 3300 }, { "epoch": 0.13040601035462648, "grad_norm": 12.130667686462402, "learning_rate": 4.8698275526489936e-05, "loss": 0.9298, "step": 3350 }, { "epoch": 0.1323523687181284, "grad_norm": 8.664134979248047, "learning_rate": 4.867881194285492e-05, "loss": 0.9185, "step": 3400 }, { "epoch": 0.13429872708163026, "grad_norm": 7.2126078605651855, "learning_rate": 4.8659348359219905e-05, "loss": 0.7964, "step": 3450 }, { "epoch": 0.13624508544513217, "grad_norm": 44.417240142822266, "learning_rate": 4.863988477558488e-05, "loss": 0.904, "step": 3500 }, { "epoch": 0.13819144380863405, "grad_norm": 13.76638126373291, "learning_rate": 4.862042119194987e-05, "loss": 0.8427, 
"step": 3550 }, { "epoch": 0.14013780217213592, "grad_norm": 13.140986442565918, "learning_rate": 4.8600957608314844e-05, "loss": 0.9465, "step": 3600 }, { "epoch": 0.14208416053563783, "grad_norm": 13.883295059204102, "learning_rate": 4.858149402467982e-05, "loss": 0.9357, "step": 3650 }, { "epoch": 0.1440305188991397, "grad_norm": 6.274234771728516, "learning_rate": 4.8562030441044806e-05, "loss": 0.84, "step": 3700 }, { "epoch": 0.1459768772626416, "grad_norm": 14.278850555419922, "learning_rate": 4.854256685740979e-05, "loss": 0.8809, "step": 3750 }, { "epoch": 0.14792323562614348, "grad_norm": 13.721392631530762, "learning_rate": 4.852310327377477e-05, "loss": 0.9212, "step": 3800 }, { "epoch": 0.14986959398964536, "grad_norm": 7.06493616104126, "learning_rate": 4.850363969013975e-05, "loss": 0.8513, "step": 3850 }, { "epoch": 0.15181595235314727, "grad_norm": 12.275819778442383, "learning_rate": 4.848417610650473e-05, "loss": 0.8939, "step": 3900 }, { "epoch": 0.15376231071664914, "grad_norm": 11.961417198181152, "learning_rate": 4.8464712522869715e-05, "loss": 0.9485, "step": 3950 }, { "epoch": 0.15570866908015105, "grad_norm": 6.35565710067749, "learning_rate": 4.844524893923469e-05, "loss": 0.9892, "step": 4000 }, { "epoch": 0.15765502744365292, "grad_norm": 6.395826816558838, "learning_rate": 4.8425785355599677e-05, "loss": 0.8684, "step": 4050 }, { "epoch": 0.1596013858071548, "grad_norm": 10.91559886932373, "learning_rate": 4.840632177196466e-05, "loss": 0.8335, "step": 4100 }, { "epoch": 0.1615477441706567, "grad_norm": 8.743356704711914, "learning_rate": 4.838685818832964e-05, "loss": 0.912, "step": 4150 }, { "epoch": 0.16349410253415858, "grad_norm": 21.774524688720703, "learning_rate": 4.836739460469462e-05, "loss": 0.8753, "step": 4200 }, { "epoch": 0.16544046089766049, "grad_norm": 12.699227333068848, "learning_rate": 4.83479310210596e-05, "loss": 0.8327, "step": 4250 }, { "epoch": 0.16738681926116236, "grad_norm": 12.212305068969727, 
"learning_rate": 4.832846743742458e-05, "loss": 0.7932, "step": 4300 }, { "epoch": 0.16933317762466427, "grad_norm": 14.349658966064453, "learning_rate": 4.830900385378956e-05, "loss": 0.9196, "step": 4350 }, { "epoch": 0.17127953598816614, "grad_norm": 9.126670837402344, "learning_rate": 4.828954027015454e-05, "loss": 0.8681, "step": 4400 }, { "epoch": 0.17322589435166802, "grad_norm": 14.85190200805664, "learning_rate": 4.8270076686519524e-05, "loss": 0.9667, "step": 4450 }, { "epoch": 0.17517225271516992, "grad_norm": 14.347668647766113, "learning_rate": 4.825061310288451e-05, "loss": 0.8344, "step": 4500 }, { "epoch": 0.1771186110786718, "grad_norm": 34.92314147949219, "learning_rate": 4.8231149519249486e-05, "loss": 0.8275, "step": 4550 }, { "epoch": 0.1790649694421737, "grad_norm": 11.294047355651855, "learning_rate": 4.821168593561447e-05, "loss": 0.9301, "step": 4600 }, { "epoch": 0.18101132780567558, "grad_norm": 9.225805282592773, "learning_rate": 4.819222235197945e-05, "loss": 0.7689, "step": 4650 }, { "epoch": 0.18295768616917746, "grad_norm": 16.026540756225586, "learning_rate": 4.8172758768344426e-05, "loss": 0.8356, "step": 4700 }, { "epoch": 0.18490404453267936, "grad_norm": 10.634355545043945, "learning_rate": 4.815329518470941e-05, "loss": 0.9164, "step": 4750 }, { "epoch": 0.18685040289618124, "grad_norm": 10.122540473937988, "learning_rate": 4.8133831601074394e-05, "loss": 0.8233, "step": 4800 }, { "epoch": 0.18879676125968314, "grad_norm": 18.178909301757812, "learning_rate": 4.811436801743937e-05, "loss": 0.8562, "step": 4850 }, { "epoch": 0.19074311962318502, "grad_norm": 9.000377655029297, "learning_rate": 4.8094904433804356e-05, "loss": 0.8604, "step": 4900 }, { "epoch": 0.1926894779866869, "grad_norm": 7.699686050415039, "learning_rate": 4.807583012184204e-05, "loss": 0.863, "step": 4950 }, { "epoch": 0.1946358363501888, "grad_norm": 22.39118194580078, "learning_rate": 4.8056366538207015e-05, "loss": 0.8673, "step": 5000 }, { "epoch": 
0.19658219471369068, "grad_norm": 15.446267127990723, "learning_rate": 4.8036902954572e-05, "loss": 0.8174, "step": 5050 }, { "epoch": 0.19852855307719258, "grad_norm": 9.779167175292969, "learning_rate": 4.801743937093698e-05, "loss": 0.8405, "step": 5100 }, { "epoch": 0.20047491144069446, "grad_norm": 10.70969009399414, "learning_rate": 4.799797578730196e-05, "loss": 0.8683, "step": 5150 }, { "epoch": 0.20242126980419634, "grad_norm": 15.108619689941406, "learning_rate": 4.7978512203666945e-05, "loss": 0.8907, "step": 5200 }, { "epoch": 0.20436762816769824, "grad_norm": 18.34493637084961, "learning_rate": 4.795904862003192e-05, "loss": 0.8412, "step": 5250 }, { "epoch": 0.20631398653120012, "grad_norm": 12.68906307220459, "learning_rate": 4.793958503639691e-05, "loss": 0.8375, "step": 5300 }, { "epoch": 0.20826034489470202, "grad_norm": 17.815834045410156, "learning_rate": 4.7920121452761885e-05, "loss": 0.9485, "step": 5350 }, { "epoch": 0.2102067032582039, "grad_norm": 8.562508583068848, "learning_rate": 4.790065786912686e-05, "loss": 0.7979, "step": 5400 }, { "epoch": 0.2121530616217058, "grad_norm": 12.47730541229248, "learning_rate": 4.788119428549185e-05, "loss": 0.8497, "step": 5450 }, { "epoch": 0.21409941998520768, "grad_norm": 16.748720169067383, "learning_rate": 4.7861730701856824e-05, "loss": 0.7939, "step": 5500 }, { "epoch": 0.21604577834870956, "grad_norm": 20.49860191345215, "learning_rate": 4.784226711822181e-05, "loss": 0.8163, "step": 5550 }, { "epoch": 0.21799213671221146, "grad_norm": 13.554031372070312, "learning_rate": 4.782280353458679e-05, "loss": 0.873, "step": 5600 }, { "epoch": 0.21993849507571334, "grad_norm": 4.266268253326416, "learning_rate": 4.780333995095177e-05, "loss": 0.8155, "step": 5650 }, { "epoch": 0.22188485343921524, "grad_norm": 12.320168495178223, "learning_rate": 4.7783876367316755e-05, "loss": 0.8421, "step": 5700 }, { "epoch": 0.22383121180271712, "grad_norm": 10.59591007232666, "learning_rate": 
4.776441278368173e-05, "loss": 0.8401, "step": 5750 }, { "epoch": 0.225777570166219, "grad_norm": 15.77294921875, "learning_rate": 4.774494920004671e-05, "loss": 0.8431, "step": 5800 }, { "epoch": 0.2277239285297209, "grad_norm": 8.8922700881958, "learning_rate": 4.7725485616411695e-05, "loss": 0.8496, "step": 5850 }, { "epoch": 0.22967028689322277, "grad_norm": 4.880230903625488, "learning_rate": 4.770602203277668e-05, "loss": 0.8579, "step": 5900 }, { "epoch": 0.23161664525672468, "grad_norm": 9.504624366760254, "learning_rate": 4.768655844914166e-05, "loss": 0.7776, "step": 5950 }, { "epoch": 0.23356300362022656, "grad_norm": 13.43596076965332, "learning_rate": 4.766709486550664e-05, "loss": 0.9415, "step": 6000 }, { "epoch": 0.23550936198372843, "grad_norm": 14.37273120880127, "learning_rate": 4.764763128187162e-05, "loss": 0.8348, "step": 6050 }, { "epoch": 0.23745572034723034, "grad_norm": 2.88686466217041, "learning_rate": 4.76281676982366e-05, "loss": 0.8214, "step": 6100 }, { "epoch": 0.2394020787107322, "grad_norm": 4.786400318145752, "learning_rate": 4.760870411460158e-05, "loss": 0.8672, "step": 6150 }, { "epoch": 0.24134843707423412, "grad_norm": 9.8790864944458, "learning_rate": 4.758924053096656e-05, "loss": 0.9012, "step": 6200 }, { "epoch": 0.243294795437736, "grad_norm": 7.362231731414795, "learning_rate": 4.756977694733154e-05, "loss": 0.8109, "step": 6250 }, { "epoch": 0.24524115380123787, "grad_norm": 5.234625816345215, "learning_rate": 4.755031336369653e-05, "loss": 0.8666, "step": 6300 }, { "epoch": 0.24718751216473978, "grad_norm": 7.8718767166137695, "learning_rate": 4.753084978006151e-05, "loss": 0.8385, "step": 6350 }, { "epoch": 0.24913387052824165, "grad_norm": 6.104279518127441, "learning_rate": 4.751138619642649e-05, "loss": 0.7976, "step": 6400 }, { "epoch": 0.25108022889174353, "grad_norm": 27.433591842651367, "learning_rate": 4.7491922612791466e-05, "loss": 0.7583, "step": 6450 }, { "epoch": 0.25302658725524546, "grad_norm": 
8.644813537597656, "learning_rate": 4.747245902915645e-05, "loss": 0.7867, "step": 6500 }, { "epoch": 0.25497294561874734, "grad_norm": 8.821413040161133, "learning_rate": 4.745299544552143e-05, "loss": 0.8405, "step": 6550 }, { "epoch": 0.2569193039822492, "grad_norm": 12.348808288574219, "learning_rate": 4.743353186188641e-05, "loss": 0.9449, "step": 6600 }, { "epoch": 0.2588656623457511, "grad_norm": 13.958788871765137, "learning_rate": 4.74140682782514e-05, "loss": 0.8606, "step": 6650 }, { "epoch": 0.26081202070925297, "grad_norm": 10.098516464233398, "learning_rate": 4.7394604694616375e-05, "loss": 0.8139, "step": 6700 }, { "epoch": 0.2627583790727549, "grad_norm": 13.42656135559082, "learning_rate": 4.737514111098136e-05, "loss": 0.8656, "step": 6750 }, { "epoch": 0.2647047374362568, "grad_norm": 22.668865203857422, "learning_rate": 4.7355677527346336e-05, "loss": 0.7569, "step": 6800 }, { "epoch": 0.26665109579975865, "grad_norm": 8.301971435546875, "learning_rate": 4.7336213943711314e-05, "loss": 0.8722, "step": 6850 }, { "epoch": 0.26859745416326053, "grad_norm": 9.278164863586426, "learning_rate": 4.73167503600763e-05, "loss": 0.7884, "step": 6900 }, { "epoch": 0.2705438125267624, "grad_norm": 6.844503402709961, "learning_rate": 4.729728677644128e-05, "loss": 0.9129, "step": 6950 }, { "epoch": 0.27249017089026434, "grad_norm": 3.8999664783477783, "learning_rate": 4.727782319280627e-05, "loss": 0.8357, "step": 7000 }, { "epoch": 0.2744365292537662, "grad_norm": 14.453765869140625, "learning_rate": 4.7258359609171245e-05, "loss": 0.8903, "step": 7050 }, { "epoch": 0.2763828876172681, "grad_norm": 13.608865737915039, "learning_rate": 4.723889602553622e-05, "loss": 0.7111, "step": 7100 }, { "epoch": 0.27832924598076997, "grad_norm": 5.950908184051514, "learning_rate": 4.721943244190121e-05, "loss": 0.7649, "step": 7150 }, { "epoch": 0.28027560434427184, "grad_norm": 9.839347839355469, "learning_rate": 4.7199968858266184e-05, "loss": 0.776, "step": 7200 }, { 
"epoch": 0.2822219627077738, "grad_norm": 9.80959415435791, "learning_rate": 4.718050527463116e-05, "loss": 0.8398, "step": 7250 }, { "epoch": 0.28416832107127565, "grad_norm": 10.288087844848633, "learning_rate": 4.7161041690996146e-05, "loss": 0.8525, "step": 7300 }, { "epoch": 0.28611467943477753, "grad_norm": 6.474511623382568, "learning_rate": 4.714157810736113e-05, "loss": 0.7646, "step": 7350 }, { "epoch": 0.2880610377982794, "grad_norm": 26.715721130371094, "learning_rate": 4.7122114523726115e-05, "loss": 0.8463, "step": 7400 }, { "epoch": 0.2900073961617813, "grad_norm": 12.851744651794434, "learning_rate": 4.710265094009109e-05, "loss": 0.8118, "step": 7450 }, { "epoch": 0.2919537545252832, "grad_norm": 14.158758163452148, "learning_rate": 4.708318735645607e-05, "loss": 0.752, "step": 7500 }, { "epoch": 0.2939001128887851, "grad_norm": 8.202895164489746, "learning_rate": 4.7063723772821054e-05, "loss": 0.8836, "step": 7550 }, { "epoch": 0.29584647125228697, "grad_norm": 21.108827590942383, "learning_rate": 4.704426018918603e-05, "loss": 0.8401, "step": 7600 }, { "epoch": 0.29779282961578885, "grad_norm": 11.068799018859863, "learning_rate": 4.7024796605551016e-05, "loss": 0.8865, "step": 7650 }, { "epoch": 0.2997391879792907, "grad_norm": 20.6379451751709, "learning_rate": 4.7005333021916e-05, "loss": 0.8474, "step": 7700 }, { "epoch": 0.30168554634279265, "grad_norm": 13.363690376281738, "learning_rate": 4.698586943828098e-05, "loss": 0.8342, "step": 7750 }, { "epoch": 0.30363190470629453, "grad_norm": 11.773860931396484, "learning_rate": 4.696640585464596e-05, "loss": 0.8456, "step": 7800 }, { "epoch": 0.3055782630697964, "grad_norm": 6.885359764099121, "learning_rate": 4.694694227101094e-05, "loss": 0.8377, "step": 7850 }, { "epoch": 0.3075246214332983, "grad_norm": 32.01542663574219, "learning_rate": 4.692747868737592e-05, "loss": 0.8477, "step": 7900 }, { "epoch": 0.30947097979680016, "grad_norm": 5.54678201675415, "learning_rate": 
4.69080151037409e-05, "loss": 0.803, "step": 7950 }, { "epoch": 0.3114173381603021, "grad_norm": 8.73697566986084, "learning_rate": 4.6888551520105887e-05, "loss": 0.8822, "step": 8000 }, { "epoch": 0.31336369652380397, "grad_norm": 9.326956748962402, "learning_rate": 4.686908793647087e-05, "loss": 0.7492, "step": 8050 }, { "epoch": 0.31531005488730585, "grad_norm": 12.44359016418457, "learning_rate": 4.684962435283585e-05, "loss": 0.7743, "step": 8100 }, { "epoch": 0.3172564132508077, "grad_norm": 11.388222694396973, "learning_rate": 4.6830160769200826e-05, "loss": 0.7719, "step": 8150 }, { "epoch": 0.3192027716143096, "grad_norm": 9.53879165649414, "learning_rate": 4.681069718556581e-05, "loss": 0.6956, "step": 8200 }, { "epoch": 0.32114912997781153, "grad_norm": 18.38358497619629, "learning_rate": 4.679123360193079e-05, "loss": 0.8205, "step": 8250 }, { "epoch": 0.3230954883413134, "grad_norm": 8.223372459411621, "learning_rate": 4.6771770018295766e-05, "loss": 0.9243, "step": 8300 }, { "epoch": 0.3250418467048153, "grad_norm": 5.088399410247803, "learning_rate": 4.675230643466075e-05, "loss": 0.7211, "step": 8350 }, { "epoch": 0.32698820506831716, "grad_norm": 5.690611362457275, "learning_rate": 4.6732842851025734e-05, "loss": 0.727, "step": 8400 }, { "epoch": 0.3289345634318191, "grad_norm": 8.803997993469238, "learning_rate": 4.671337926739072e-05, "loss": 0.8047, "step": 8450 }, { "epoch": 0.33088092179532097, "grad_norm": 16.774333953857422, "learning_rate": 4.6693915683755696e-05, "loss": 0.8122, "step": 8500 }, { "epoch": 0.33282728015882285, "grad_norm": 27.07183074951172, "learning_rate": 4.6674452100120674e-05, "loss": 0.7939, "step": 8550 }, { "epoch": 0.3347736385223247, "grad_norm": 16.549375534057617, "learning_rate": 4.665498851648566e-05, "loss": 0.8268, "step": 8600 }, { "epoch": 0.3367199968858266, "grad_norm": 6.0482096672058105, "learning_rate": 4.6635524932850636e-05, "loss": 0.7369, "step": 8650 }, { "epoch": 0.33866635524932853, 
"grad_norm": 29.78438949584961, "learning_rate": 4.661606134921562e-05, "loss": 0.8783, "step": 8700 }, { "epoch": 0.3406127136128304, "grad_norm": 7.376162528991699, "learning_rate": 4.6596597765580604e-05, "loss": 0.7627, "step": 8750 }, { "epoch": 0.3425590719763323, "grad_norm": 10.565337181091309, "learning_rate": 4.657713418194558e-05, "loss": 0.8418, "step": 8800 }, { "epoch": 0.34450543033983416, "grad_norm": 4.8138203620910645, "learning_rate": 4.6557670598310566e-05, "loss": 0.7113, "step": 8850 }, { "epoch": 0.34645178870333604, "grad_norm": 12.1827392578125, "learning_rate": 4.6538207014675544e-05, "loss": 0.7759, "step": 8900 }, { "epoch": 0.34839814706683797, "grad_norm": 21.5107364654541, "learning_rate": 4.651874343104052e-05, "loss": 0.6293, "step": 8950 }, { "epoch": 0.35034450543033985, "grad_norm": 31.061279296875, "learning_rate": 4.6499279847405506e-05, "loss": 0.7275, "step": 9000 }, { "epoch": 0.3522908637938417, "grad_norm": 20.567771911621094, "learning_rate": 4.648020553544319e-05, "loss": 0.7352, "step": 9050 }, { "epoch": 0.3542372221573436, "grad_norm": 19.547704696655273, "learning_rate": 4.646074195180817e-05, "loss": 0.7098, "step": 9100 }, { "epoch": 0.3561835805208455, "grad_norm": 16.899614334106445, "learning_rate": 4.6441278368173155e-05, "loss": 0.7319, "step": 9150 }, { "epoch": 0.3581299388843474, "grad_norm": 19.504152297973633, "learning_rate": 4.642181478453813e-05, "loss": 0.7402, "step": 9200 }, { "epoch": 0.3600762972478493, "grad_norm": 46.84148025512695, "learning_rate": 4.640235120090311e-05, "loss": 0.8547, "step": 9250 }, { "epoch": 0.36202265561135116, "grad_norm": 10.423336029052734, "learning_rate": 4.63832768889408e-05, "loss": 0.8625, "step": 9300 }, { "epoch": 0.36396901397485304, "grad_norm": 22.25065040588379, "learning_rate": 4.6363813305305776e-05, "loss": 0.7837, "step": 9350 }, { "epoch": 0.3659153723383549, "grad_norm": 5.14716100692749, "learning_rate": 4.634434972167075e-05, "loss": 0.8305, "step": 
9400 }, { "epoch": 0.36786173070185685, "grad_norm": 23.919570922851562, "learning_rate": 4.632488613803574e-05, "loss": 0.8229, "step": 9450 }, { "epoch": 0.3698080890653587, "grad_norm": 41.26445388793945, "learning_rate": 4.630581182607342e-05, "loss": 0.7834, "step": 9500 }, { "epoch": 0.3717544474288606, "grad_norm": 0.6982908844947815, "learning_rate": 4.6286348242438396e-05, "loss": 0.7622, "step": 9550 }, { "epoch": 0.3737008057923625, "grad_norm": 21.07360076904297, "learning_rate": 4.626688465880338e-05, "loss": 0.7879, "step": 9600 }, { "epoch": 0.37564716415586435, "grad_norm": 8.763337135314941, "learning_rate": 4.6247421075168365e-05, "loss": 0.8354, "step": 9650 }, { "epoch": 0.3775935225193663, "grad_norm": 14.109028816223145, "learning_rate": 4.622795749153334e-05, "loss": 0.7928, "step": 9700 }, { "epoch": 0.37953988088286816, "grad_norm": 4.796310901641846, "learning_rate": 4.6208493907898327e-05, "loss": 0.7752, "step": 9750 }, { "epoch": 0.38148623924637004, "grad_norm": 19.992938995361328, "learning_rate": 4.6189030324263304e-05, "loss": 0.7135, "step": 9800 }, { "epoch": 0.3834325976098719, "grad_norm": 10.536787033081055, "learning_rate": 4.616956674062829e-05, "loss": 0.7765, "step": 9850 }, { "epoch": 0.3853789559733738, "grad_norm": 7.374678134918213, "learning_rate": 4.6150103156993266e-05, "loss": 0.77, "step": 9900 }, { "epoch": 0.3873253143368757, "grad_norm": 14.167473793029785, "learning_rate": 4.613063957335825e-05, "loss": 0.7883, "step": 9950 }, { "epoch": 0.3892716727003776, "grad_norm": 6.783972263336182, "learning_rate": 4.6111175989723235e-05, "loss": 0.7852, "step": 10000 }, { "epoch": 0.3912180310638795, "grad_norm": 10.45738410949707, "learning_rate": 4.609171240608821e-05, "loss": 0.8194, "step": 10050 }, { "epoch": 0.39316438942738136, "grad_norm": 27.95968246459961, "learning_rate": 4.607224882245319e-05, "loss": 0.825, "step": 10100 }, { "epoch": 0.39511074779088323, "grad_norm": 15.53756332397461, "learning_rate": 
4.6052785238818174e-05, "loss": 0.6946, "step": 10150 }, { "epoch": 0.39705710615438516, "grad_norm": 9.468719482421875, "learning_rate": 4.603332165518315e-05, "loss": 0.813, "step": 10200 }, { "epoch": 0.39900346451788704, "grad_norm": 12.168792724609375, "learning_rate": 4.6013858071548136e-05, "loss": 0.8199, "step": 10250 }, { "epoch": 0.4009498228813889, "grad_norm": 19.081567764282227, "learning_rate": 4.5994394487913114e-05, "loss": 0.7956, "step": 10300 }, { "epoch": 0.4028961812448908, "grad_norm": 7.980998516082764, "learning_rate": 4.59749309042781e-05, "loss": 0.7499, "step": 10350 }, { "epoch": 0.40484253960839267, "grad_norm": 8.15658950805664, "learning_rate": 4.595546732064308e-05, "loss": 0.7524, "step": 10400 }, { "epoch": 0.4067888979718946, "grad_norm": 7.936160564422607, "learning_rate": 4.593600373700806e-05, "loss": 0.8904, "step": 10450 }, { "epoch": 0.4087352563353965, "grad_norm": 27.66614532470703, "learning_rate": 4.591654015337304e-05, "loss": 0.8299, "step": 10500 }, { "epoch": 0.41068161469889836, "grad_norm": 3.2807061672210693, "learning_rate": 4.589707656973802e-05, "loss": 0.736, "step": 10550 }, { "epoch": 0.41262797306240023, "grad_norm": 16.195850372314453, "learning_rate": 4.5877612986103e-05, "loss": 0.7984, "step": 10600 }, { "epoch": 0.4145743314259021, "grad_norm": 18.340042114257812, "learning_rate": 4.5858149402467984e-05, "loss": 0.7369, "step": 10650 }, { "epoch": 0.41652068978940404, "grad_norm": 18.507083892822266, "learning_rate": 4.583868581883297e-05, "loss": 0.8439, "step": 10700 }, { "epoch": 0.4184670481529059, "grad_norm": 2.7585103511810303, "learning_rate": 4.5819222235197946e-05, "loss": 0.7158, "step": 10750 }, { "epoch": 0.4204134065164078, "grad_norm": 18.177396774291992, "learning_rate": 4.579975865156293e-05, "loss": 0.81, "step": 10800 }, { "epoch": 0.42235976487990967, "grad_norm": 15.865877151489258, "learning_rate": 4.578029506792791e-05, "loss": 0.7257, "step": 10850 }, { "epoch": 
0.4243061232434116, "grad_norm": 21.182064056396484, "learning_rate": 4.5760831484292885e-05, "loss": 0.7532, "step": 10900 }, { "epoch": 0.4262524816069135, "grad_norm": 17.83367156982422, "learning_rate": 4.574136790065787e-05, "loss": 0.8048, "step": 10950 }, { "epoch": 0.42819883997041536, "grad_norm": 20.544261932373047, "learning_rate": 4.5721904317022854e-05, "loss": 0.8693, "step": 11000 }, { "epoch": 0.43014519833391723, "grad_norm": 9.830706596374512, "learning_rate": 4.570244073338784e-05, "loss": 0.7316, "step": 11050 }, { "epoch": 0.4320915566974191, "grad_norm": 12.76479434967041, "learning_rate": 4.5682977149752816e-05, "loss": 0.676, "step": 11100 }, { "epoch": 0.43403791506092104, "grad_norm": 11.760798454284668, "learning_rate": 4.5663513566117794e-05, "loss": 0.736, "step": 11150 }, { "epoch": 0.4359842734244229, "grad_norm": 10.957178115844727, "learning_rate": 4.564404998248278e-05, "loss": 0.7583, "step": 11200 }, { "epoch": 0.4379306317879248, "grad_norm": 36.927303314208984, "learning_rate": 4.5624586398847756e-05, "loss": 0.7115, "step": 11250 }, { "epoch": 0.4398769901514267, "grad_norm": 19.579509735107422, "learning_rate": 4.560512281521274e-05, "loss": 0.7315, "step": 11300 }, { "epoch": 0.44182334851492855, "grad_norm": 17.163299560546875, "learning_rate": 4.558565923157772e-05, "loss": 0.8268, "step": 11350 }, { "epoch": 0.4437697068784305, "grad_norm": 26.414487838745117, "learning_rate": 4.55661956479427e-05, "loss": 0.7966, "step": 11400 }, { "epoch": 0.44571606524193236, "grad_norm": 7.534037113189697, "learning_rate": 4.5546732064307686e-05, "loss": 0.774, "step": 11450 }, { "epoch": 0.44766242360543423, "grad_norm": 28.2991943359375, "learning_rate": 4.5527268480672664e-05, "loss": 0.8352, "step": 11500 }, { "epoch": 0.4496087819689361, "grad_norm": 13.67409896850586, "learning_rate": 4.550780489703764e-05, "loss": 0.8064, "step": 11550 }, { "epoch": 0.451555140332438, "grad_norm": 13.358135223388672, "learning_rate": 
4.5488341313402626e-05, "loss": 0.8366, "step": 11600 }, { "epoch": 0.4535014986959399, "grad_norm": 22.41477394104004, "learning_rate": 4.5468877729767603e-05, "loss": 0.7355, "step": 11650 }, { "epoch": 0.4554478570594418, "grad_norm": 21.003925323486328, "learning_rate": 4.544941414613259e-05, "loss": 0.7172, "step": 11700 }, { "epoch": 0.4573942154229437, "grad_norm": 15.303767204284668, "learning_rate": 4.542995056249757e-05, "loss": 0.7977, "step": 11750 }, { "epoch": 0.45934057378644555, "grad_norm": 4.509729862213135, "learning_rate": 4.541048697886255e-05, "loss": 0.6847, "step": 11800 }, { "epoch": 0.4612869321499474, "grad_norm": 21.702726364135742, "learning_rate": 4.5391023395227534e-05, "loss": 0.7982, "step": 11850 }, { "epoch": 0.46323329051344936, "grad_norm": 6.290675640106201, "learning_rate": 4.537155981159251e-05, "loss": 0.7502, "step": 11900 }, { "epoch": 0.46517964887695123, "grad_norm": 29.96541976928711, "learning_rate": 4.535209622795749e-05, "loss": 0.7175, "step": 11950 }, { "epoch": 0.4671260072404531, "grad_norm": 27.05795669555664, "learning_rate": 4.5332632644322474e-05, "loss": 0.694, "step": 12000 }, { "epoch": 0.469072365603955, "grad_norm": 8.5138521194458, "learning_rate": 4.531316906068746e-05, "loss": 0.8002, "step": 12050 }, { "epoch": 0.47101872396745686, "grad_norm": 5.764838218688965, "learning_rate": 4.529370547705244e-05, "loss": 0.7395, "step": 12100 }, { "epoch": 0.4729650823309588, "grad_norm": 13.710786819458008, "learning_rate": 4.527424189341742e-05, "loss": 0.827, "step": 12150 }, { "epoch": 0.4749114406944607, "grad_norm": 17.551198959350586, "learning_rate": 4.52547783097824e-05, "loss": 0.7598, "step": 12200 }, { "epoch": 0.47685779905796255, "grad_norm": 37.59514236450195, "learning_rate": 4.523531472614738e-05, "loss": 0.7078, "step": 12250 }, { "epoch": 0.4788041574214644, "grad_norm": 14.084299087524414, "learning_rate": 4.521585114251236e-05, "loss": 0.8207, "step": 12300 }, { "epoch": 0.4807505157849663, 
"grad_norm": 16.30020523071289, "learning_rate": 4.519638755887734e-05, "loss": 0.652, "step": 12350 }, { "epoch": 0.48269687414846824, "grad_norm": 6.346928119659424, "learning_rate": 4.517692397524232e-05, "loss": 0.8089, "step": 12400 }, { "epoch": 0.4846432325119701, "grad_norm": 46.6481819152832, "learning_rate": 4.5157460391607306e-05, "loss": 0.835, "step": 12450 }, { "epoch": 0.486589590875472, "grad_norm": 31.6607723236084, "learning_rate": 4.513799680797229e-05, "loss": 0.7864, "step": 12500 }, { "epoch": 0.48853594923897387, "grad_norm": 11.753110885620117, "learning_rate": 4.511853322433727e-05, "loss": 0.6675, "step": 12550 }, { "epoch": 0.49048230760247574, "grad_norm": 9.733305931091309, "learning_rate": 4.5099069640702245e-05, "loss": 0.763, "step": 12600 }, { "epoch": 0.4924286659659777, "grad_norm": 2.823514938354492, "learning_rate": 4.507960605706723e-05, "loss": 0.8165, "step": 12650 }, { "epoch": 0.49437502432947955, "grad_norm": 83.86141204833984, "learning_rate": 4.506014247343221e-05, "loss": 0.8281, "step": 12700 }, { "epoch": 0.4963213826929814, "grad_norm": 14.956427574157715, "learning_rate": 4.504067888979719e-05, "loss": 0.7906, "step": 12750 }, { "epoch": 0.4982677410564833, "grad_norm": 5.740167140960693, "learning_rate": 4.5021215306162176e-05, "loss": 0.7608, "step": 12800 }, { "epoch": 0.5002140994199852, "grad_norm": 6.208752632141113, "learning_rate": 4.5001751722527153e-05, "loss": 0.7619, "step": 12850 }, { "epoch": 0.5021604577834871, "grad_norm": 4.269346237182617, "learning_rate": 4.498228813889214e-05, "loss": 0.7654, "step": 12900 }, { "epoch": 0.5041068161469889, "grad_norm": 28.708641052246094, "learning_rate": 4.4962824555257115e-05, "loss": 0.7848, "step": 12950 }, { "epoch": 0.5060531745104909, "grad_norm": 5.881056785583496, "learning_rate": 4.494336097162209e-05, "loss": 0.6932, "step": 13000 }, { "epoch": 0.5079995328739928, "grad_norm": 10.74325180053711, "learning_rate": 4.492389738798708e-05, "loss": 0.7739, 
"step": 13050 }, { "epoch": 0.5099458912374947, "grad_norm": 15.692766189575195, "learning_rate": 4.490443380435206e-05, "loss": 0.7513, "step": 13100 }, { "epoch": 0.5118922496009966, "grad_norm": 8.959396362304688, "learning_rate": 4.4884970220717046e-05, "loss": 0.7351, "step": 13150 }, { "epoch": 0.5138386079644984, "grad_norm": 8.217195510864258, "learning_rate": 4.4865506637082024e-05, "loss": 0.7525, "step": 13200 }, { "epoch": 0.5157849663280003, "grad_norm": 7.837081432342529, "learning_rate": 4.4846043053447e-05, "loss": 0.7492, "step": 13250 }, { "epoch": 0.5177313246915022, "grad_norm": 4.801698207855225, "learning_rate": 4.4826579469811986e-05, "loss": 0.7556, "step": 13300 }, { "epoch": 0.5196776830550041, "grad_norm": 16.48876190185547, "learning_rate": 4.480711588617696e-05, "loss": 0.8078, "step": 13350 }, { "epoch": 0.5216240414185059, "grad_norm": 11.812597274780273, "learning_rate": 4.478765230254194e-05, "loss": 0.7334, "step": 13400 }, { "epoch": 0.5235703997820078, "grad_norm": 16.098819732666016, "learning_rate": 4.4768188718906925e-05, "loss": 0.649, "step": 13450 }, { "epoch": 0.5255167581455098, "grad_norm": 5.974759578704834, "learning_rate": 4.474872513527191e-05, "loss": 0.7583, "step": 13500 }, { "epoch": 0.5274631165090117, "grad_norm": 10.199329376220703, "learning_rate": 4.4729261551636894e-05, "loss": 0.7919, "step": 13550 }, { "epoch": 0.5294094748725136, "grad_norm": 24.56243324279785, "learning_rate": 4.470979796800187e-05, "loss": 0.7821, "step": 13600 }, { "epoch": 0.5313558332360154, "grad_norm": 5.828876495361328, "learning_rate": 4.469033438436685e-05, "loss": 0.7467, "step": 13650 }, { "epoch": 0.5333021915995173, "grad_norm": 12.34913444519043, "learning_rate": 4.467087080073183e-05, "loss": 0.7665, "step": 13700 }, { "epoch": 0.5352485499630192, "grad_norm": 32.360538482666016, "learning_rate": 4.4651796488769514e-05, "loss": 0.7145, "step": 13750 }, { "epoch": 0.5371949083265211, "grad_norm": 13.184320449829102, 
"learning_rate": 4.463233290513449e-05, "loss": 0.7994, "step": 13800 }, { "epoch": 0.5391412666900229, "grad_norm": 9.366595268249512, "learning_rate": 4.4612869321499476e-05, "loss": 0.7168, "step": 13850 }, { "epoch": 0.5410876250535248, "grad_norm": 7.465328216552734, "learning_rate": 4.459340573786446e-05, "loss": 0.835, "step": 13900 }, { "epoch": 0.5430339834170267, "grad_norm": 11.568052291870117, "learning_rate": 4.457394215422944e-05, "loss": 0.7071, "step": 13950 }, { "epoch": 0.5449803417805287, "grad_norm": 19.84809684753418, "learning_rate": 4.455447857059442e-05, "loss": 0.6983, "step": 14000 }, { "epoch": 0.5469267001440306, "grad_norm": 15.243165969848633, "learning_rate": 4.45350149869594e-05, "loss": 0.7691, "step": 14050 }, { "epoch": 0.5488730585075324, "grad_norm": 12.250027656555176, "learning_rate": 4.4515551403324384e-05, "loss": 0.8464, "step": 14100 }, { "epoch": 0.5508194168710343, "grad_norm": 24.135351181030273, "learning_rate": 4.449608781968936e-05, "loss": 0.6754, "step": 14150 }, { "epoch": 0.5527657752345362, "grad_norm": 11.8226318359375, "learning_rate": 4.447662423605434e-05, "loss": 0.763, "step": 14200 }, { "epoch": 0.5547121335980381, "grad_norm": 8.578081130981445, "learning_rate": 4.445716065241933e-05, "loss": 0.8545, "step": 14250 }, { "epoch": 0.5566584919615399, "grad_norm": 10.140195846557617, "learning_rate": 4.443769706878431e-05, "loss": 0.7233, "step": 14300 }, { "epoch": 0.5586048503250418, "grad_norm": 7.432424068450928, "learning_rate": 4.4418233485149286e-05, "loss": 0.7093, "step": 14350 }, { "epoch": 0.5605512086885437, "grad_norm": 13.604988098144531, "learning_rate": 4.439876990151427e-05, "loss": 0.7077, "step": 14400 }, { "epoch": 0.5624975670520457, "grad_norm": 7.397076606750488, "learning_rate": 4.437930631787925e-05, "loss": 0.7755, "step": 14450 }, { "epoch": 0.5644439254155476, "grad_norm": 9.328235626220703, "learning_rate": 4.435984273424423e-05, "loss": 0.6684, "step": 14500 }, { "epoch": 
0.5663902837790494, "grad_norm": 10.961734771728516, "learning_rate": 4.434037915060921e-05, "loss": 0.7946, "step": 14550 }, { "epoch": 0.5683366421425513, "grad_norm": 49.56478500366211, "learning_rate": 4.4320915566974194e-05, "loss": 0.7229, "step": 14600 }, { "epoch": 0.5702830005060532, "grad_norm": 8.345135688781738, "learning_rate": 4.430145198333918e-05, "loss": 0.7801, "step": 14650 }, { "epoch": 0.5722293588695551, "grad_norm": 10.84884262084961, "learning_rate": 4.4281988399704156e-05, "loss": 0.7927, "step": 14700 }, { "epoch": 0.5741757172330569, "grad_norm": 11.867048263549805, "learning_rate": 4.4262524816069134e-05, "loss": 0.7107, "step": 14750 }, { "epoch": 0.5761220755965588, "grad_norm": 16.31625747680664, "learning_rate": 4.424306123243412e-05, "loss": 0.7376, "step": 14800 }, { "epoch": 0.5780684339600607, "grad_norm": 9.733827590942383, "learning_rate": 4.4223597648799095e-05, "loss": 0.7059, "step": 14850 }, { "epoch": 0.5800147923235626, "grad_norm": 10.850849151611328, "learning_rate": 4.420413406516408e-05, "loss": 0.7652, "step": 14900 }, { "epoch": 0.5819611506870646, "grad_norm": 21.48041534423828, "learning_rate": 4.4184670481529064e-05, "loss": 0.5847, "step": 14950 }, { "epoch": 0.5839075090505664, "grad_norm": 10.605826377868652, "learning_rate": 4.416520689789404e-05, "loss": 0.8977, "step": 15000 }, { "epoch": 0.5858538674140683, "grad_norm": 9.879531860351562, "learning_rate": 4.4145743314259026e-05, "loss": 0.7421, "step": 15050 }, { "epoch": 0.5878002257775702, "grad_norm": 9.968884468078613, "learning_rate": 4.4126279730624004e-05, "loss": 0.7455, "step": 15100 }, { "epoch": 0.5897465841410721, "grad_norm": 15.170600891113281, "learning_rate": 4.410681614698899e-05, "loss": 0.7517, "step": 15150 }, { "epoch": 0.5916929425045739, "grad_norm": 7.045068264007568, "learning_rate": 4.4087352563353966e-05, "loss": 0.7323, "step": 15200 }, { "epoch": 0.5936393008680758, "grad_norm": 16.718570709228516, "learning_rate": 
4.406788897971894e-05, "loss": 0.7112, "step": 15250 }, { "epoch": 0.5955856592315777, "grad_norm": 23.08176040649414, "learning_rate": 4.4048425396083934e-05, "loss": 0.6778, "step": 15300 }, { "epoch": 0.5975320175950796, "grad_norm": 6.3371992111206055, "learning_rate": 4.402896181244891e-05, "loss": 0.7203, "step": 15350 }, { "epoch": 0.5994783759585814, "grad_norm": 9.724499702453613, "learning_rate": 4.400949822881389e-05, "loss": 0.8652, "step": 15400 }, { "epoch": 0.6014247343220834, "grad_norm": 25.625246047973633, "learning_rate": 4.399042391685158e-05, "loss": 0.6899, "step": 15450 }, { "epoch": 0.6033710926855853, "grad_norm": 11.967310905456543, "learning_rate": 4.3970960333216555e-05, "loss": 0.7222, "step": 15500 }, { "epoch": 0.6053174510490872, "grad_norm": 12.910189628601074, "learning_rate": 4.395149674958153e-05, "loss": 0.8043, "step": 15550 }, { "epoch": 0.6072638094125891, "grad_norm": 10.33430004119873, "learning_rate": 4.3932033165946517e-05, "loss": 0.8379, "step": 15600 }, { "epoch": 0.6092101677760909, "grad_norm": 26.118852615356445, "learning_rate": 4.3912569582311494e-05, "loss": 0.7116, "step": 15650 }, { "epoch": 0.6111565261395928, "grad_norm": 4.646867275238037, "learning_rate": 4.389310599867648e-05, "loss": 0.7376, "step": 15700 }, { "epoch": 0.6131028845030947, "grad_norm": 12.0819091796875, "learning_rate": 4.387364241504146e-05, "loss": 0.6882, "step": 15750 }, { "epoch": 0.6150492428665966, "grad_norm": 15.061481475830078, "learning_rate": 4.385417883140644e-05, "loss": 0.7819, "step": 15800 }, { "epoch": 0.6169956012300984, "grad_norm": 4.991694927215576, "learning_rate": 4.3834715247771425e-05, "loss": 0.7388, "step": 15850 }, { "epoch": 0.6189419595936003, "grad_norm": 28.870878219604492, "learning_rate": 4.38152516641364e-05, "loss": 0.7633, "step": 15900 }, { "epoch": 0.6208883179571023, "grad_norm": 10.489543914794922, "learning_rate": 4.379578808050138e-05, "loss": 0.8328, "step": 15950 }, { "epoch": 
0.6228346763206042, "grad_norm": 4.812399864196777, "learning_rate": 4.3776324496866364e-05, "loss": 0.7135, "step": 16000 }, { "epoch": 0.6247810346841061, "grad_norm": 17.787609100341797, "learning_rate": 4.375686091323135e-05, "loss": 0.8574, "step": 16050 }, { "epoch": 0.6267273930476079, "grad_norm": 63.12671661376953, "learning_rate": 4.3737397329596326e-05, "loss": 0.808, "step": 16100 }, { "epoch": 0.6286737514111098, "grad_norm": 7.184403419494629, "learning_rate": 4.371793374596131e-05, "loss": 0.7543, "step": 16150 }, { "epoch": 0.6306201097746117, "grad_norm": 93.47250366210938, "learning_rate": 4.369847016232629e-05, "loss": 0.7149, "step": 16200 }, { "epoch": 0.6325664681381136, "grad_norm": 14.896015167236328, "learning_rate": 4.367900657869127e-05, "loss": 0.7987, "step": 16250 }, { "epoch": 0.6345128265016154, "grad_norm": 41.22991180419922, "learning_rate": 4.365954299505625e-05, "loss": 0.8745, "step": 16300 }, { "epoch": 0.6364591848651173, "grad_norm": 18.766292572021484, "learning_rate": 4.364007941142123e-05, "loss": 0.7728, "step": 16350 }, { "epoch": 0.6384055432286192, "grad_norm": 6.061841011047363, "learning_rate": 4.362061582778621e-05, "loss": 0.6964, "step": 16400 }, { "epoch": 0.6403519015921212, "grad_norm": 8.97814655303955, "learning_rate": 4.3601152244151196e-05, "loss": 0.8024, "step": 16450 }, { "epoch": 0.6422982599556231, "grad_norm": 7.129734992980957, "learning_rate": 4.358168866051618e-05, "loss": 0.7625, "step": 16500 }, { "epoch": 0.6442446183191249, "grad_norm": 145.56500244140625, "learning_rate": 4.356222507688116e-05, "loss": 0.7151, "step": 16550 }, { "epoch": 0.6461909766826268, "grad_norm": 5.052350997924805, "learning_rate": 4.3542761493246136e-05, "loss": 0.6768, "step": 16600 }, { "epoch": 0.6481373350461287, "grad_norm": 7.0590081214904785, "learning_rate": 4.352329790961112e-05, "loss": 0.7019, "step": 16650 }, { "epoch": 0.6500836934096306, "grad_norm": 11.310042381286621, "learning_rate": 
4.35038343259761e-05, "loss": 0.7401, "step": 16700 }, { "epoch": 0.6520300517731324, "grad_norm": 19.27460479736328, "learning_rate": 4.348437074234108e-05, "loss": 0.8641, "step": 16750 }, { "epoch": 0.6539764101366343, "grad_norm": 11.066648483276367, "learning_rate": 4.346490715870607e-05, "loss": 0.728, "step": 16800 }, { "epoch": 0.6559227685001362, "grad_norm": 3.734243392944336, "learning_rate": 4.3445443575071044e-05, "loss": 0.7899, "step": 16850 }, { "epoch": 0.6578691268636382, "grad_norm": 9.622673034667969, "learning_rate": 4.342597999143603e-05, "loss": 0.7222, "step": 16900 }, { "epoch": 0.6598154852271401, "grad_norm": 12.402290344238281, "learning_rate": 4.3406516407801006e-05, "loss": 0.8085, "step": 16950 }, { "epoch": 0.6617618435906419, "grad_norm": 10.547155380249023, "learning_rate": 4.3387052824165984e-05, "loss": 0.7407, "step": 17000 }, { "epoch": 0.6637082019541438, "grad_norm": 24.279090881347656, "learning_rate": 4.336758924053097e-05, "loss": 0.7062, "step": 17050 }, { "epoch": 0.6656545603176457, "grad_norm": 7.916913986206055, "learning_rate": 4.334812565689595e-05, "loss": 0.7409, "step": 17100 }, { "epoch": 0.6676009186811476, "grad_norm": 15.824557304382324, "learning_rate": 4.332866207326093e-05, "loss": 0.7512, "step": 17150 }, { "epoch": 0.6695472770446494, "grad_norm": 9.764187812805176, "learning_rate": 4.3309198489625914e-05, "loss": 0.7328, "step": 17200 }, { "epoch": 0.6714936354081513, "grad_norm": 25.365703582763672, "learning_rate": 4.328973490599089e-05, "loss": 0.7844, "step": 17250 }, { "epoch": 0.6734399937716532, "grad_norm": 10.425263404846191, "learning_rate": 4.3270271322355876e-05, "loss": 0.7159, "step": 17300 }, { "epoch": 0.6753863521351551, "grad_norm": 10.696349143981934, "learning_rate": 4.3250807738720854e-05, "loss": 0.6779, "step": 17350 }, { "epoch": 0.6773327104986571, "grad_norm": 11.495389938354492, "learning_rate": 4.323134415508583e-05, "loss": 0.7337, "step": 17400 }, { "epoch": 
0.6792790688621589, "grad_norm": 34.4336051940918, "learning_rate": 4.3211880571450816e-05, "loss": 0.7545, "step": 17450 }, { "epoch": 0.6812254272256608, "grad_norm": 13.71507453918457, "learning_rate": 4.31924169878158e-05, "loss": 0.7139, "step": 17500 }, { "epoch": 0.6831717855891627, "grad_norm": 15.37479019165039, "learning_rate": 4.3172953404180785e-05, "loss": 0.7091, "step": 17550 }, { "epoch": 0.6851181439526646, "grad_norm": 20.00337028503418, "learning_rate": 4.315348982054576e-05, "loss": 0.7525, "step": 17600 }, { "epoch": 0.6870645023161664, "grad_norm": 9.934649467468262, "learning_rate": 4.313402623691074e-05, "loss": 0.7365, "step": 17650 }, { "epoch": 0.6890108606796683, "grad_norm": 5.76163387298584, "learning_rate": 4.3114562653275724e-05, "loss": 0.8124, "step": 17700 }, { "epoch": 0.6909572190431702, "grad_norm": 15.189363479614258, "learning_rate": 4.30950990696407e-05, "loss": 0.7319, "step": 17750 }, { "epoch": 0.6929035774066721, "grad_norm": 13.084677696228027, "learning_rate": 4.3075635486005686e-05, "loss": 0.8377, "step": 17800 }, { "epoch": 0.694849935770174, "grad_norm": 9.929899215698242, "learning_rate": 4.305656117404337e-05, "loss": 0.7488, "step": 17850 }, { "epoch": 0.6967962941336759, "grad_norm": 25.551347732543945, "learning_rate": 4.303709759040835e-05, "loss": 0.7938, "step": 17900 }, { "epoch": 0.6987426524971778, "grad_norm": 19.51130485534668, "learning_rate": 4.301763400677333e-05, "loss": 0.7087, "step": 17950 }, { "epoch": 0.7006890108606797, "grad_norm": 9.172579765319824, "learning_rate": 4.299817042313831e-05, "loss": 0.7618, "step": 18000 }, { "epoch": 0.7026353692241816, "grad_norm": 13.391602516174316, "learning_rate": 4.297870683950329e-05, "loss": 0.7518, "step": 18050 }, { "epoch": 0.7045817275876834, "grad_norm": 4.889591217041016, "learning_rate": 4.295924325586827e-05, "loss": 0.6688, "step": 18100 }, { "epoch": 0.7065280859511853, "grad_norm": 3.8700320720672607, "learning_rate": 4.293977967223325e-05, 
"loss": 0.6338, "step": 18150 }, { "epoch": 0.7084744443146872, "grad_norm": 7.195404052734375, "learning_rate": 4.292031608859824e-05, "loss": 0.759, "step": 18200 }, { "epoch": 0.7104208026781891, "grad_norm": 66.3180160522461, "learning_rate": 4.290085250496322e-05, "loss": 0.6909, "step": 18250 }, { "epoch": 0.712367161041691, "grad_norm": 9.272618293762207, "learning_rate": 4.28813889213282e-05, "loss": 0.773, "step": 18300 }, { "epoch": 0.7143135194051928, "grad_norm": 11.745140075683594, "learning_rate": 4.2861925337693176e-05, "loss": 0.7195, "step": 18350 }, { "epoch": 0.7162598777686948, "grad_norm": 16.27741813659668, "learning_rate": 4.284246175405816e-05, "loss": 0.8114, "step": 18400 }, { "epoch": 0.7182062361321967, "grad_norm": 32.083641052246094, "learning_rate": 4.282299817042314e-05, "loss": 0.7465, "step": 18450 }, { "epoch": 0.7201525944956986, "grad_norm": 36.44049072265625, "learning_rate": 4.2803534586788116e-05, "loss": 0.6399, "step": 18500 }, { "epoch": 0.7220989528592004, "grad_norm": 5.762078762054443, "learning_rate": 4.27840710031531e-05, "loss": 0.7146, "step": 18550 }, { "epoch": 0.7240453112227023, "grad_norm": 22.881813049316406, "learning_rate": 4.2764607419518085e-05, "loss": 0.7936, "step": 18600 }, { "epoch": 0.7259916695862042, "grad_norm": 8.416080474853516, "learning_rate": 4.274514383588307e-05, "loss": 0.7514, "step": 18650 }, { "epoch": 0.7279380279497061, "grad_norm": 14.003064155578613, "learning_rate": 4.272568025224805e-05, "loss": 0.7678, "step": 18700 }, { "epoch": 0.729884386313208, "grad_norm": 12.491682052612305, "learning_rate": 4.2706216668613024e-05, "loss": 0.7462, "step": 18750 }, { "epoch": 0.7318307446767098, "grad_norm": 8.797147750854492, "learning_rate": 4.268675308497801e-05, "loss": 0.7157, "step": 18800 }, { "epoch": 0.7337771030402117, "grad_norm": 16.05181884765625, "learning_rate": 4.2667289501342986e-05, "loss": 0.7938, "step": 18850 }, { "epoch": 0.7357234614037137, "grad_norm": 
8.482451438903809, "learning_rate": 4.264782591770797e-05, "loss": 0.7152, "step": 18900 }, { "epoch": 0.7376698197672156, "grad_norm": 16.760271072387695, "learning_rate": 4.2628362334072955e-05, "loss": 0.7923, "step": 18950 }, { "epoch": 0.7396161781307174, "grad_norm": 13.47537612915039, "learning_rate": 4.260889875043793e-05, "loss": 0.7364, "step": 19000 }, { "epoch": 0.7415625364942193, "grad_norm": 11.111624717712402, "learning_rate": 4.258943516680292e-05, "loss": 0.8148, "step": 19050 }, { "epoch": 0.7435088948577212, "grad_norm": 14.903639793395996, "learning_rate": 4.2569971583167894e-05, "loss": 0.8243, "step": 19100 }, { "epoch": 0.7454552532212231, "grad_norm": 7.4388957023620605, "learning_rate": 4.255050799953287e-05, "loss": 0.7872, "step": 19150 }, { "epoch": 0.747401611584725, "grad_norm": 13.052556991577148, "learning_rate": 4.2531044415897856e-05, "loss": 0.6827, "step": 19200 }, { "epoch": 0.7493479699482268, "grad_norm": 13.374312400817871, "learning_rate": 4.2511580832262834e-05, "loss": 0.8523, "step": 19250 }, { "epoch": 0.7512943283117287, "grad_norm": 126.33373260498047, "learning_rate": 4.2492117248627825e-05, "loss": 0.734, "step": 19300 }, { "epoch": 0.7532406866752307, "grad_norm": 17.77503204345703, "learning_rate": 4.24726536649928e-05, "loss": 0.7235, "step": 19350 }, { "epoch": 0.7551870450387326, "grad_norm": 8.419774055480957, "learning_rate": 4.245319008135778e-05, "loss": 0.7409, "step": 19400 }, { "epoch": 0.7571334034022345, "grad_norm": 17.862178802490234, "learning_rate": 4.2433726497722765e-05, "loss": 0.7159, "step": 19450 }, { "epoch": 0.7590797617657363, "grad_norm": 17.305519104003906, "learning_rate": 4.241426291408774e-05, "loss": 0.6636, "step": 19500 }, { "epoch": 0.7610261201292382, "grad_norm": 4.077340602874756, "learning_rate": 4.239479933045272e-05, "loss": 0.7131, "step": 19550 }, { "epoch": 0.7629724784927401, "grad_norm": 5.903844356536865, "learning_rate": 4.2375335746817704e-05, "loss": 0.7636, "step": 
19600 }, { "epoch": 0.764918836856242, "grad_norm": 13.105559349060059, "learning_rate": 4.235587216318269e-05, "loss": 0.6865, "step": 19650 }, { "epoch": 0.7668651952197438, "grad_norm": 50.021175384521484, "learning_rate": 4.233640857954767e-05, "loss": 0.7578, "step": 19700 }, { "epoch": 0.7688115535832457, "grad_norm": 21.004024505615234, "learning_rate": 4.231694499591265e-05, "loss": 0.7384, "step": 19750 }, { "epoch": 0.7707579119467476, "grad_norm": 17.335832595825195, "learning_rate": 4.229748141227763e-05, "loss": 0.7028, "step": 19800 }, { "epoch": 0.7727042703102496, "grad_norm": 18.504657745361328, "learning_rate": 4.227801782864261e-05, "loss": 0.7791, "step": 19850 }, { "epoch": 0.7746506286737515, "grad_norm": 11.166783332824707, "learning_rate": 4.225855424500759e-05, "loss": 0.7326, "step": 19900 }, { "epoch": 0.7765969870372533, "grad_norm": 12.100536346435547, "learning_rate": 4.2239090661372574e-05, "loss": 0.6964, "step": 19950 }, { "epoch": 0.7785433454007552, "grad_norm": 6.65763521194458, "learning_rate": 4.221962707773756e-05, "loss": 0.7727, "step": 20000 }, { "epoch": 0.7804897037642571, "grad_norm": 29.17792510986328, "learning_rate": 4.2200163494102536e-05, "loss": 0.6754, "step": 20050 }, { "epoch": 0.782436062127759, "grad_norm": 28.765995025634766, "learning_rate": 4.218069991046752e-05, "loss": 0.7357, "step": 20100 }, { "epoch": 0.7843824204912608, "grad_norm": 12.131084442138672, "learning_rate": 4.21612363268325e-05, "loss": 0.7338, "step": 20150 }, { "epoch": 0.7863287788547627, "grad_norm": 15.52925968170166, "learning_rate": 4.2141772743197476e-05, "loss": 0.7437, "step": 20200 }, { "epoch": 0.7882751372182646, "grad_norm": 6.228748798370361, "learning_rate": 4.212230915956246e-05, "loss": 0.6674, "step": 20250 }, { "epoch": 0.7902214955817665, "grad_norm": 10.853662490844727, "learning_rate": 4.210284557592744e-05, "loss": 0.7877, "step": 20300 }, { "epoch": 0.7921678539452685, "grad_norm": 22.440872192382812, 
"learning_rate": 4.208338199229243e-05, "loss": 0.6872, "step": 20350 }, { "epoch": 0.7941142123087703, "grad_norm": 14.012978553771973, "learning_rate": 4.2063918408657406e-05, "loss": 0.7313, "step": 20400 }, { "epoch": 0.7960605706722722, "grad_norm": 14.384893417358398, "learning_rate": 4.2044454825022384e-05, "loss": 0.7558, "step": 20450 }, { "epoch": 0.7980069290357741, "grad_norm": 16.413166046142578, "learning_rate": 4.202499124138737e-05, "loss": 0.7495, "step": 20500 }, { "epoch": 0.799953287399276, "grad_norm": 8.508787155151367, "learning_rate": 4.2005527657752346e-05, "loss": 0.6537, "step": 20550 }, { "epoch": 0.8018996457627778, "grad_norm": 7.591757297515869, "learning_rate": 4.1986064074117324e-05, "loss": 0.6588, "step": 20600 }, { "epoch": 0.8038460041262797, "grad_norm": 20.607555389404297, "learning_rate": 4.196660049048231e-05, "loss": 0.708, "step": 20650 }, { "epoch": 0.8057923624897816, "grad_norm": 16.927520751953125, "learning_rate": 4.194713690684729e-05, "loss": 0.7488, "step": 20700 }, { "epoch": 0.8077387208532835, "grad_norm": 5.596468448638916, "learning_rate": 4.1927673323212277e-05, "loss": 0.7331, "step": 20750 }, { "epoch": 0.8096850792167853, "grad_norm": 12.319095611572266, "learning_rate": 4.1908209739577254e-05, "loss": 0.8668, "step": 20800 }, { "epoch": 0.8116314375802873, "grad_norm": 6.9447526931762695, "learning_rate": 4.188874615594223e-05, "loss": 0.8025, "step": 20850 }, { "epoch": 0.8135777959437892, "grad_norm": 11.16087818145752, "learning_rate": 4.1869282572307216e-05, "loss": 0.6614, "step": 20900 }, { "epoch": 0.8155241543072911, "grad_norm": 14.913068771362305, "learning_rate": 4.1849818988672194e-05, "loss": 0.6782, "step": 20950 }, { "epoch": 0.817470512670793, "grad_norm": 13.312978744506836, "learning_rate": 4.183035540503718e-05, "loss": 0.6456, "step": 21000 }, { "epoch": 0.8194168710342948, "grad_norm": 13.253778457641602, "learning_rate": 4.181089182140216e-05, "loss": 0.6752, "step": 21050 }, { 
"epoch": 0.8213632293977967, "grad_norm": 4.332753658294678, "learning_rate": 4.179142823776714e-05, "loss": 0.737, "step": 21100 }, { "epoch": 0.8233095877612986, "grad_norm": 5.357513904571533, "learning_rate": 4.1771964654132124e-05, "loss": 0.7643, "step": 21150 }, { "epoch": 0.8252559461248005, "grad_norm": 2.791322946548462, "learning_rate": 4.17525010704971e-05, "loss": 0.6661, "step": 21200 }, { "epoch": 0.8272023044883023, "grad_norm": 13.960010528564453, "learning_rate": 4.173303748686208e-05, "loss": 0.7515, "step": 21250 }, { "epoch": 0.8291486628518042, "grad_norm": 5.775780200958252, "learning_rate": 4.171396317489977e-05, "loss": 0.739, "step": 21300 }, { "epoch": 0.8310950212153062, "grad_norm": 11.577250480651855, "learning_rate": 4.1694499591264745e-05, "loss": 0.7292, "step": 21350 }, { "epoch": 0.8330413795788081, "grad_norm": 8.533802032470703, "learning_rate": 4.167503600762972e-05, "loss": 0.8219, "step": 21400 }, { "epoch": 0.83498773794231, "grad_norm": 11.08761215209961, "learning_rate": 4.1655572423994707e-05, "loss": 0.8026, "step": 21450 }, { "epoch": 0.8369340963058118, "grad_norm": 12.353090286254883, "learning_rate": 4.163610884035969e-05, "loss": 0.8036, "step": 21500 }, { "epoch": 0.8388804546693137, "grad_norm": 8.356585502624512, "learning_rate": 4.161664525672467e-05, "loss": 0.7487, "step": 21550 }, { "epoch": 0.8408268130328156, "grad_norm": 9.948888778686523, "learning_rate": 4.159718167308965e-05, "loss": 0.5866, "step": 21600 }, { "epoch": 0.8427731713963175, "grad_norm": 4.336802959442139, "learning_rate": 4.157771808945463e-05, "loss": 0.7756, "step": 21650 }, { "epoch": 0.8447195297598193, "grad_norm": 12.014235496520996, "learning_rate": 4.1558254505819615e-05, "loss": 0.6883, "step": 21700 }, { "epoch": 0.8466658881233212, "grad_norm": 15.13467788696289, "learning_rate": 4.153879092218459e-05, "loss": 0.7168, "step": 21750 }, { "epoch": 0.8486122464868232, "grad_norm": 14.484478950500488, "learning_rate": 
4.151932733854958e-05, "loss": 0.7217, "step": 21800 }, { "epoch": 0.8505586048503251, "grad_norm": 20.778038024902344, "learning_rate": 4.149986375491456e-05, "loss": 0.7473, "step": 21850 }, { "epoch": 0.852504963213827, "grad_norm": 19.673484802246094, "learning_rate": 4.148040017127954e-05, "loss": 0.735, "step": 21900 }, { "epoch": 0.8544513215773288, "grad_norm": 3.628446102142334, "learning_rate": 4.1460936587644516e-05, "loss": 0.6981, "step": 21950 }, { "epoch": 0.8563976799408307, "grad_norm": 5.1977152824401855, "learning_rate": 4.14414730040095e-05, "loss": 0.7716, "step": 22000 }, { "epoch": 0.8583440383043326, "grad_norm": 20.602136611938477, "learning_rate": 4.142200942037448e-05, "loss": 0.7012, "step": 22050 }, { "epoch": 0.8602903966678345, "grad_norm": 13.175037384033203, "learning_rate": 4.140254583673946e-05, "loss": 0.7023, "step": 22100 }, { "epoch": 0.8622367550313363, "grad_norm": 7.099280834197998, "learning_rate": 4.138308225310445e-05, "loss": 0.6601, "step": 22150 }, { "epoch": 0.8641831133948382, "grad_norm": 9.846458435058594, "learning_rate": 4.1363618669469425e-05, "loss": 0.6589, "step": 22200 }, { "epoch": 0.8661294717583401, "grad_norm": 6.346736907958984, "learning_rate": 4.134415508583441e-05, "loss": 0.7549, "step": 22250 }, { "epoch": 0.8680758301218421, "grad_norm": 18.881092071533203, "learning_rate": 4.1324691502199386e-05, "loss": 0.6648, "step": 22300 }, { "epoch": 0.870022188485344, "grad_norm": 11.129849433898926, "learning_rate": 4.1305227918564364e-05, "loss": 0.7403, "step": 22350 }, { "epoch": 0.8719685468488458, "grad_norm": 35.84275436401367, "learning_rate": 4.128576433492935e-05, "loss": 0.7098, "step": 22400 }, { "epoch": 0.8739149052123477, "grad_norm": 6.525640487670898, "learning_rate": 4.1266300751294326e-05, "loss": 0.7484, "step": 22450 }, { "epoch": 0.8758612635758496, "grad_norm": 36.4077262878418, "learning_rate": 4.124683716765931e-05, "loss": 0.7404, "step": 22500 }, { "epoch": 0.8778076219393515, 
"grad_norm": 6.708271503448486, "learning_rate": 4.1227373584024295e-05, "loss": 0.718, "step": 22550 }, { "epoch": 0.8797539803028533, "grad_norm": 15.29634952545166, "learning_rate": 4.120791000038927e-05, "loss": 0.6399, "step": 22600 }, { "epoch": 0.8817003386663552, "grad_norm": 11.063860893249512, "learning_rate": 4.118844641675426e-05, "loss": 0.7525, "step": 22650 }, { "epoch": 0.8836466970298571, "grad_norm": 15.845246315002441, "learning_rate": 4.1168982833119234e-05, "loss": 0.6654, "step": 22700 }, { "epoch": 0.885593055393359, "grad_norm": 13.998644828796387, "learning_rate": 4.114951924948422e-05, "loss": 0.6452, "step": 22750 }, { "epoch": 0.887539413756861, "grad_norm": 6.16065788269043, "learning_rate": 4.1130055665849196e-05, "loss": 0.6761, "step": 22800 }, { "epoch": 0.8894857721203628, "grad_norm": 29.386442184448242, "learning_rate": 4.111059208221418e-05, "loss": 0.7276, "step": 22850 }, { "epoch": 0.8914321304838647, "grad_norm": 21.921226501464844, "learning_rate": 4.1091128498579165e-05, "loss": 0.7062, "step": 22900 }, { "epoch": 0.8933784888473666, "grad_norm": 6.034702777862549, "learning_rate": 4.107166491494414e-05, "loss": 0.7074, "step": 22950 }, { "epoch": 0.8953248472108685, "grad_norm": 36.694908142089844, "learning_rate": 4.105220133130912e-05, "loss": 0.6683, "step": 23000 }, { "epoch": 0.8972712055743703, "grad_norm": 7.700848579406738, "learning_rate": 4.1032737747674104e-05, "loss": 0.7228, "step": 23050 }, { "epoch": 0.8992175639378722, "grad_norm": 10.832247734069824, "learning_rate": 4.101327416403908e-05, "loss": 0.6721, "step": 23100 }, { "epoch": 0.9011639223013741, "grad_norm": 10.93287181854248, "learning_rate": 4.0993810580404066e-05, "loss": 0.6103, "step": 23150 }, { "epoch": 0.903110280664876, "grad_norm": 13.139593124389648, "learning_rate": 4.097434699676905e-05, "loss": 0.7195, "step": 23200 }, { "epoch": 0.9050566390283779, "grad_norm": 10.643058776855469, "learning_rate": 4.095488341313403e-05, "loss": 
0.6724, "step": 23250 }, { "epoch": 0.9070029973918798, "grad_norm": 24.261667251586914, "learning_rate": 4.093541982949901e-05, "loss": 0.8004, "step": 23300 }, { "epoch": 0.9089493557553817, "grad_norm": 14.239994049072266, "learning_rate": 4.091595624586399e-05, "loss": 0.6885, "step": 23350 }, { "epoch": 0.9108957141188836, "grad_norm": 8.599918365478516, "learning_rate": 4.089649266222897e-05, "loss": 0.6852, "step": 23400 }, { "epoch": 0.9128420724823855, "grad_norm": 20.846200942993164, "learning_rate": 4.087702907859395e-05, "loss": 0.7008, "step": 23450 }, { "epoch": 0.9147884308458873, "grad_norm": 16.060100555419922, "learning_rate": 4.085756549495893e-05, "loss": 0.7553, "step": 23500 }, { "epoch": 0.9167347892093892, "grad_norm": 17.530363082885742, "learning_rate": 4.0838101911323914e-05, "loss": 0.692, "step": 23550 }, { "epoch": 0.9186811475728911, "grad_norm": 25.222034454345703, "learning_rate": 4.08186383276889e-05, "loss": 0.744, "step": 23600 }, { "epoch": 0.920627505936393, "grad_norm": 61.249298095703125, "learning_rate": 4.0799174744053876e-05, "loss": 0.6196, "step": 23650 }, { "epoch": 0.9225738642998949, "grad_norm": 12.622259140014648, "learning_rate": 4.077971116041886e-05, "loss": 0.8147, "step": 23700 }, { "epoch": 0.9245202226633968, "grad_norm": 2.536447048187256, "learning_rate": 4.076024757678384e-05, "loss": 0.6731, "step": 23750 }, { "epoch": 0.9264665810268987, "grad_norm": 10.33336353302002, "learning_rate": 4.0740783993148816e-05, "loss": 0.8562, "step": 23800 }, { "epoch": 0.9284129393904006, "grad_norm": 19.090290069580078, "learning_rate": 4.07213204095138e-05, "loss": 0.7623, "step": 23850 }, { "epoch": 0.9303592977539025, "grad_norm": 19.48790168762207, "learning_rate": 4.0701856825878784e-05, "loss": 0.7312, "step": 23900 }, { "epoch": 0.9323056561174043, "grad_norm": 19.644702911376953, "learning_rate": 4.068239324224377e-05, "loss": 0.6501, "step": 23950 }, { "epoch": 0.9342520144809062, "grad_norm": 
6.179945945739746, "learning_rate": 4.0662929658608746e-05, "loss": 0.7323, "step": 24000 }, { "epoch": 0.9361983728444081, "grad_norm": 6.483129024505615, "learning_rate": 4.0643466074973724e-05, "loss": 0.7294, "step": 24050 }, { "epoch": 0.93814473120791, "grad_norm": 24.40945816040039, "learning_rate": 4.062400249133871e-05, "loss": 0.7455, "step": 24100 }, { "epoch": 0.9400910895714119, "grad_norm": 9.607022285461426, "learning_rate": 4.0604538907703686e-05, "loss": 0.678, "step": 24150 }, { "epoch": 0.9420374479349137, "grad_norm": 10.24763011932373, "learning_rate": 4.058507532406867e-05, "loss": 0.7797, "step": 24200 }, { "epoch": 0.9439838062984157, "grad_norm": 28.262149810791016, "learning_rate": 4.0565611740433654e-05, "loss": 0.747, "step": 24250 }, { "epoch": 0.9459301646619176, "grad_norm": 9.676417350769043, "learning_rate": 4.054614815679863e-05, "loss": 0.6823, "step": 24300 }, { "epoch": 0.9478765230254195, "grad_norm": 12.27448844909668, "learning_rate": 4.0526684573163616e-05, "loss": 0.6004, "step": 24350 }, { "epoch": 0.9498228813889213, "grad_norm": 6.199162006378174, "learning_rate": 4.0507220989528594e-05, "loss": 0.7384, "step": 24400 }, { "epoch": 0.9517692397524232, "grad_norm": 8.001496315002441, "learning_rate": 4.048775740589357e-05, "loss": 0.7128, "step": 24450 }, { "epoch": 0.9537155981159251, "grad_norm": 29.52419090270996, "learning_rate": 4.0468293822258556e-05, "loss": 0.8061, "step": 24500 }, { "epoch": 0.955661956479427, "grad_norm": 63.508384704589844, "learning_rate": 4.0448830238623533e-05, "loss": 0.5707, "step": 24550 }, { "epoch": 0.9576083148429289, "grad_norm": 12.524792671203613, "learning_rate": 4.042936665498852e-05, "loss": 0.7287, "step": 24600 }, { "epoch": 0.9595546732064307, "grad_norm": 20.709444046020508, "learning_rate": 4.04099030713535e-05, "loss": 0.724, "step": 24650 }, { "epoch": 0.9615010315699326, "grad_norm": 17.139657974243164, "learning_rate": 4.039082875939118e-05, "loss": 0.6787, "step": 24700 
}, { "epoch": 0.9634473899334346, "grad_norm": 10.646354675292969, "learning_rate": 4.037136517575616e-05, "loss": 0.6813, "step": 24750 }, { "epoch": 0.9653937482969365, "grad_norm": 28.307188034057617, "learning_rate": 4.0351901592121145e-05, "loss": 0.8106, "step": 24800 }, { "epoch": 0.9673401066604383, "grad_norm": 11.407912254333496, "learning_rate": 4.033243800848612e-05, "loss": 0.7224, "step": 24850 }, { "epoch": 0.9692864650239402, "grad_norm": 31.392183303833008, "learning_rate": 4.031297442485111e-05, "loss": 0.7476, "step": 24900 }, { "epoch": 0.9712328233874421, "grad_norm": 11.617532730102539, "learning_rate": 4.0293510841216084e-05, "loss": 0.6295, "step": 24950 }, { "epoch": 0.973179181750944, "grad_norm": 44.32483673095703, "learning_rate": 4.027404725758107e-05, "loss": 0.6916, "step": 25000 }, { "epoch": 0.9751255401144459, "grad_norm": 9.769405364990234, "learning_rate": 4.025458367394605e-05, "loss": 0.6997, "step": 25050 }, { "epoch": 0.9770718984779477, "grad_norm": 13.01120376586914, "learning_rate": 4.023512009031103e-05, "loss": 0.7304, "step": 25100 }, { "epoch": 0.9790182568414496, "grad_norm": 4.7533278465271, "learning_rate": 4.0215656506676015e-05, "loss": 0.6972, "step": 25150 }, { "epoch": 0.9809646152049515, "grad_norm": 6.3509907722473145, "learning_rate": 4.019619292304099e-05, "loss": 0.6946, "step": 25200 }, { "epoch": 0.9829109735684535, "grad_norm": 6.284741401672363, "learning_rate": 4.017672933940597e-05, "loss": 0.6904, "step": 25250 }, { "epoch": 0.9848573319319553, "grad_norm": 14.752326965332031, "learning_rate": 4.0157265755770955e-05, "loss": 0.6729, "step": 25300 }, { "epoch": 0.9868036902954572, "grad_norm": 10.41714859008789, "learning_rate": 4.013780217213593e-05, "loss": 0.7109, "step": 25350 }, { "epoch": 0.9887500486589591, "grad_norm": 16.713977813720703, "learning_rate": 4.0118338588500917e-05, "loss": 0.6333, "step": 25400 }, { "epoch": 0.990696407022461, "grad_norm": 11.51398754119873, "learning_rate": 
4.00988750048659e-05, "loss": 0.6733, "step": 25450 }, { "epoch": 0.9926427653859629, "grad_norm": 8.69009780883789, "learning_rate": 4.007941142123088e-05, "loss": 0.7484, "step": 25500 }, { "epoch": 0.9945891237494647, "grad_norm": 9.150976181030273, "learning_rate": 4.005994783759586e-05, "loss": 0.6647, "step": 25550 }, { "epoch": 0.9965354821129666, "grad_norm": 25.82608985900879, "learning_rate": 4.004048425396084e-05, "loss": 0.7622, "step": 25600 }, { "epoch": 0.9984818404764685, "grad_norm": 8.156515121459961, "learning_rate": 4.002102067032582e-05, "loss": 0.7877, "step": 25650 }, { "epoch": 1.0, "eval_accuracy": 0.7449881272139827, "eval_f1_macro": 0.6969577852819083, "eval_f1_weighted": 0.7418939805532414, "eval_loss": 0.703120768070221, "eval_roc_auc": 0.9389065243381043, "eval_runtime": 27.0354, "eval_samples_per_second": 950.2, "eval_steps_per_second": 118.807, "step": 25689 }, { "epoch": 1.0004281988399704, "grad_norm": 14.577681541442871, "learning_rate": 4.00015570866908e-05, "loss": 0.6018, "step": 25700 }, { "epoch": 1.0023745572034723, "grad_norm": 35.07035446166992, "learning_rate": 3.998209350305579e-05, "loss": 0.6477, "step": 25750 }, { "epoch": 1.0043209155669741, "grad_norm": 12.141730308532715, "learning_rate": 3.9962629919420764e-05, "loss": 0.6388, "step": 25800 }, { "epoch": 1.006267273930476, "grad_norm": 15.19098949432373, "learning_rate": 3.994316633578575e-05, "loss": 0.6475, "step": 25850 }, { "epoch": 1.0082136322939779, "grad_norm": 23.924997329711914, "learning_rate": 3.9923702752150726e-05, "loss": 0.7003, "step": 25900 }, { "epoch": 1.0101599906574799, "grad_norm": 25.04294204711914, "learning_rate": 3.990423916851571e-05, "loss": 0.5822, "step": 25950 }, { "epoch": 1.0121063490209818, "grad_norm": 12.006014823913574, "learning_rate": 3.988477558488069e-05, "loss": 0.7355, "step": 26000 }, { "epoch": 1.0140527073844836, "grad_norm": 9.703333854675293, "learning_rate": 3.986531200124567e-05, "loss": 0.6238, "step": 26050 }, { 
"epoch": 1.0159990657479856, "grad_norm": 15.69824504852295, "learning_rate": 3.984584841761066e-05, "loss": 0.6892, "step": 26100 }, { "epoch": 1.0179454241114874, "grad_norm": 64.83596801757812, "learning_rate": 3.9826384833975634e-05, "loss": 0.5863, "step": 26150 }, { "epoch": 1.0198917824749894, "grad_norm": 4.981834411621094, "learning_rate": 3.980692125034061e-05, "loss": 0.7082, "step": 26200 }, { "epoch": 1.0218381408384911, "grad_norm": 36.35984420776367, "learning_rate": 3.9787457666705596e-05, "loss": 0.666, "step": 26250 }, { "epoch": 1.023784499201993, "grad_norm": 4.4853997230529785, "learning_rate": 3.9767994083070574e-05, "loss": 0.6785, "step": 26300 }, { "epoch": 1.0257308575654949, "grad_norm": 54.25199508666992, "learning_rate": 3.974853049943556e-05, "loss": 0.696, "step": 26350 }, { "epoch": 1.0276772159289969, "grad_norm": 16.701147079467773, "learning_rate": 3.9729066915800536e-05, "loss": 0.6005, "step": 26400 }, { "epoch": 1.0296235742924988, "grad_norm": 17.23859214782715, "learning_rate": 3.970960333216552e-05, "loss": 0.6069, "step": 26450 }, { "epoch": 1.0315699326560006, "grad_norm": 44.267364501953125, "learning_rate": 3.9690139748530505e-05, "loss": 0.6833, "step": 26500 }, { "epoch": 1.0335162910195026, "grad_norm": 15.629772186279297, "learning_rate": 3.967067616489548e-05, "loss": 0.6557, "step": 26550 }, { "epoch": 1.0354626493830044, "grad_norm": 31.496387481689453, "learning_rate": 3.9651212581260467e-05, "loss": 0.6241, "step": 26600 }, { "epoch": 1.0374090077465064, "grad_norm": 35.068687438964844, "learning_rate": 3.963213826929815e-05, "loss": 0.6572, "step": 26650 }, { "epoch": 1.0393553661100081, "grad_norm": 25.12656593322754, "learning_rate": 3.9612674685663125e-05, "loss": 0.6921, "step": 26700 }, { "epoch": 1.04130172447351, "grad_norm": 274.9530944824219, "learning_rate": 3.95932111020281e-05, "loss": 0.6076, "step": 26750 }, { "epoch": 1.0432480828370119, "grad_norm": 19.99907112121582, "learning_rate": 
3.957374751839309e-05, "loss": 0.7065, "step": 26800 }, { "epoch": 1.0451944412005139, "grad_norm": 5.122546195983887, "learning_rate": 3.955428393475807e-05, "loss": 0.7176, "step": 26850 }, { "epoch": 1.0471407995640156, "grad_norm": 9.226968765258789, "learning_rate": 3.9534820351123056e-05, "loss": 0.6555, "step": 26900 }, { "epoch": 1.0490871579275176, "grad_norm": 18.128637313842773, "learning_rate": 3.951535676748803e-05, "loss": 0.6658, "step": 26950 }, { "epoch": 1.0510335162910196, "grad_norm": 15.379231452941895, "learning_rate": 3.949589318385301e-05, "loss": 0.7304, "step": 27000 }, { "epoch": 1.0529798746545214, "grad_norm": 34.52772903442383, "learning_rate": 3.9476429600217995e-05, "loss": 0.6149, "step": 27050 }, { "epoch": 1.0549262330180234, "grad_norm": 65.75399780273438, "learning_rate": 3.945696601658297e-05, "loss": 0.6619, "step": 27100 }, { "epoch": 1.0568725913815251, "grad_norm": 10.57369613647461, "learning_rate": 3.943750243294796e-05, "loss": 0.6265, "step": 27150 }, { "epoch": 1.058818949745027, "grad_norm": 4.673130035400391, "learning_rate": 3.941803884931294e-05, "loss": 0.5332, "step": 27200 }, { "epoch": 1.0607653081085289, "grad_norm": 23.92059326171875, "learning_rate": 3.939857526567792e-05, "loss": 0.6006, "step": 27250 }, { "epoch": 1.0627116664720309, "grad_norm": 5.338550567626953, "learning_rate": 3.93791116820429e-05, "loss": 0.7753, "step": 27300 }, { "epoch": 1.0646580248355326, "grad_norm": 8.852982521057129, "learning_rate": 3.935964809840788e-05, "loss": 0.6249, "step": 27350 }, { "epoch": 1.0666043831990346, "grad_norm": 16.647966384887695, "learning_rate": 3.934018451477286e-05, "loss": 0.6366, "step": 27400 }, { "epoch": 1.0685507415625364, "grad_norm": 28.924148559570312, "learning_rate": 3.932072093113784e-05, "loss": 0.7681, "step": 27450 }, { "epoch": 1.0704970999260384, "grad_norm": 5.967368125915527, "learning_rate": 3.930125734750282e-05, "loss": 0.7139, "step": 27500 }, { "epoch": 1.0724434582895404, 
"grad_norm": 11.518197059631348, "learning_rate": 3.9281793763867805e-05, "loss": 0.5984, "step": 27550 }, { "epoch": 1.0743898166530421, "grad_norm": 6.728857517242432, "learning_rate": 3.926233018023279e-05, "loss": 0.8228, "step": 27600 }, { "epoch": 1.076336175016544, "grad_norm": 78.30040740966797, "learning_rate": 3.924286659659777e-05, "loss": 0.77, "step": 27650 }, { "epoch": 1.0782825333800459, "grad_norm": 20.562726974487305, "learning_rate": 3.922340301296275e-05, "loss": 0.7085, "step": 27700 }, { "epoch": 1.0802288917435479, "grad_norm": 4.975492477416992, "learning_rate": 3.920393942932773e-05, "loss": 0.6186, "step": 27750 }, { "epoch": 1.0821752501070496, "grad_norm": 8.30207633972168, "learning_rate": 3.9184475845692706e-05, "loss": 0.606, "step": 27800 }, { "epoch": 1.0841216084705516, "grad_norm": 127.28559112548828, "learning_rate": 3.916501226205769e-05, "loss": 0.7417, "step": 27850 }, { "epoch": 1.0860679668340536, "grad_norm": 9.118617057800293, "learning_rate": 3.9145548678422675e-05, "loss": 0.7673, "step": 27900 }, { "epoch": 1.0880143251975554, "grad_norm": 20.277854919433594, "learning_rate": 3.912608509478766e-05, "loss": 0.6452, "step": 27950 }, { "epoch": 1.0899606835610574, "grad_norm": 16.53668212890625, "learning_rate": 3.910662151115264e-05, "loss": 0.6743, "step": 28000 }, { "epoch": 1.0919070419245591, "grad_norm": 29.85817527770996, "learning_rate": 3.9087157927517615e-05, "loss": 0.6513, "step": 28050 }, { "epoch": 1.093853400288061, "grad_norm": 15.49708366394043, "learning_rate": 3.90676943438826e-05, "loss": 0.6282, "step": 28100 }, { "epoch": 1.0957997586515629, "grad_norm": 77.18496704101562, "learning_rate": 3.9048230760247576e-05, "loss": 0.7091, "step": 28150 }, { "epoch": 1.0977461170150649, "grad_norm": 16.014463424682617, "learning_rate": 3.9028767176612554e-05, "loss": 0.6537, "step": 28200 }, { "epoch": 1.0996924753785666, "grad_norm": 50.62329864501953, "learning_rate": 3.9009303592977545e-05, "loss": 0.6938, 
"step": 28250 }, { "epoch": 1.1016388337420686, "grad_norm": 7.959646224975586, "learning_rate": 3.898984000934252e-05, "loss": 0.6884, "step": 28300 }, { "epoch": 1.1035851921055704, "grad_norm": 20.376602172851562, "learning_rate": 3.897037642570751e-05, "loss": 0.648, "step": 28350 }, { "epoch": 1.1055315504690724, "grad_norm": 5.660374164581299, "learning_rate": 3.8950912842072485e-05, "loss": 0.6413, "step": 28400 }, { "epoch": 1.1074779088325744, "grad_norm": 25.169357299804688, "learning_rate": 3.893144925843746e-05, "loss": 0.6181, "step": 28450 }, { "epoch": 1.1094242671960761, "grad_norm": 45.559993743896484, "learning_rate": 3.891198567480245e-05, "loss": 0.7445, "step": 28500 }, { "epoch": 1.111370625559578, "grad_norm": 30.599302291870117, "learning_rate": 3.8892522091167424e-05, "loss": 0.6905, "step": 28550 }, { "epoch": 1.1133169839230799, "grad_norm": 20.952903747558594, "learning_rate": 3.887305850753241e-05, "loss": 0.6144, "step": 28600 }, { "epoch": 1.1152633422865819, "grad_norm": 44.161624908447266, "learning_rate": 3.885359492389739e-05, "loss": 0.6454, "step": 28650 }, { "epoch": 1.1172097006500836, "grad_norm": 11.840255737304688, "learning_rate": 3.883413134026237e-05, "loss": 0.7741, "step": 28700 }, { "epoch": 1.1191560590135856, "grad_norm": 14.251753807067871, "learning_rate": 3.8814667756627355e-05, "loss": 0.7141, "step": 28750 }, { "epoch": 1.1211024173770874, "grad_norm": 3.2870490550994873, "learning_rate": 3.879520417299233e-05, "loss": 0.6397, "step": 28800 }, { "epoch": 1.1230487757405894, "grad_norm": 29.445972442626953, "learning_rate": 3.877574058935731e-05, "loss": 0.6355, "step": 28850 }, { "epoch": 1.1249951341040911, "grad_norm": 21.126012802124023, "learning_rate": 3.8756277005722294e-05, "loss": 0.5937, "step": 28900 }, { "epoch": 1.1269414924675931, "grad_norm": 11.885201454162598, "learning_rate": 3.873681342208728e-05, "loss": 0.6729, "step": 28950 }, { "epoch": 1.128887850831095, "grad_norm": 24.67503547668457, 
"learning_rate": 3.871734983845226e-05, "loss": 0.6358, "step": 29000 }, { "epoch": 1.1308342091945969, "grad_norm": 2.1587953567504883, "learning_rate": 3.869788625481724e-05, "loss": 0.585, "step": 29050 }, { "epoch": 1.1327805675580989, "grad_norm": 15.891336441040039, "learning_rate": 3.867842267118222e-05, "loss": 0.6855, "step": 29100 }, { "epoch": 1.1347269259216006, "grad_norm": 10.612324714660645, "learning_rate": 3.86589590875472e-05, "loss": 0.685, "step": 29150 }, { "epoch": 1.1366732842851026, "grad_norm": 10.291898727416992, "learning_rate": 3.863949550391218e-05, "loss": 0.6927, "step": 29200 }, { "epoch": 1.1386196426486044, "grad_norm": 11.972062110900879, "learning_rate": 3.862003192027716e-05, "loss": 0.5328, "step": 29250 }, { "epoch": 1.1405660010121064, "grad_norm": 33.76552200317383, "learning_rate": 3.860056833664215e-05, "loss": 0.7107, "step": 29300 }, { "epoch": 1.1425123593756084, "grad_norm": 94.32827758789062, "learning_rate": 3.8581104753007127e-05, "loss": 0.6386, "step": 29350 }, { "epoch": 1.1444587177391101, "grad_norm": 10.383445739746094, "learning_rate": 3.856164116937211e-05, "loss": 0.699, "step": 29400 }, { "epoch": 1.146405076102612, "grad_norm": 15.138472557067871, "learning_rate": 3.854217758573709e-05, "loss": 0.7649, "step": 29450 }, { "epoch": 1.1483514344661139, "grad_norm": 42.89930725097656, "learning_rate": 3.8522714002102066e-05, "loss": 0.6683, "step": 29500 }, { "epoch": 1.1502977928296159, "grad_norm": 13.869772911071777, "learning_rate": 3.850325041846705e-05, "loss": 0.6595, "step": 29550 }, { "epoch": 1.1522441511931176, "grad_norm": 12.339373588562012, "learning_rate": 3.848378683483203e-05, "loss": 0.6663, "step": 29600 }, { "epoch": 1.1541905095566196, "grad_norm": 4.027193546295166, "learning_rate": 3.846432325119701e-05, "loss": 0.6194, "step": 29650 }, { "epoch": 1.1561368679201214, "grad_norm": 300.1123962402344, "learning_rate": 3.8444859667562e-05, "loss": 0.5334, "step": 29700 }, { "epoch": 
1.1580832262836234, "grad_norm": 10.1583251953125, "learning_rate": 3.8425396083926974e-05, "loss": 0.61, "step": 29750 }, { "epoch": 1.1600295846471251, "grad_norm": 9.423940658569336, "learning_rate": 3.840593250029196e-05, "loss": 0.6444, "step": 29800 }, { "epoch": 1.1619759430106271, "grad_norm": 14.869356155395508, "learning_rate": 3.8386468916656936e-05, "loss": 0.6223, "step": 29850 }, { "epoch": 1.163922301374129, "grad_norm": 23.436697006225586, "learning_rate": 3.8367005333021914e-05, "loss": 0.6774, "step": 29900 }, { "epoch": 1.1658686597376309, "grad_norm": 101.20919799804688, "learning_rate": 3.83475417493869e-05, "loss": 0.6052, "step": 29950 }, { "epoch": 1.1678150181011329, "grad_norm": 22.73941421508789, "learning_rate": 3.832807816575188e-05, "loss": 0.6801, "step": 30000 }, { "epoch": 1.1697613764646346, "grad_norm": 15.480627059936523, "learning_rate": 3.830861458211686e-05, "loss": 0.6507, "step": 30050 }, { "epoch": 1.1717077348281366, "grad_norm": 32.418949127197266, "learning_rate": 3.8289150998481844e-05, "loss": 0.7054, "step": 30100 }, { "epoch": 1.1736540931916384, "grad_norm": 20.57775115966797, "learning_rate": 3.826968741484682e-05, "loss": 0.7045, "step": 30150 }, { "epoch": 1.1756004515551404, "grad_norm": 32.964935302734375, "learning_rate": 3.8250223831211806e-05, "loss": 0.6157, "step": 30200 }, { "epoch": 1.1775468099186421, "grad_norm": 25.92686653137207, "learning_rate": 3.8230760247576784e-05, "loss": 0.6766, "step": 30250 }, { "epoch": 1.1794931682821441, "grad_norm": 7.484406471252441, "learning_rate": 3.821129666394176e-05, "loss": 0.6577, "step": 30300 }, { "epoch": 1.1814395266456459, "grad_norm": 58.94056701660156, "learning_rate": 3.8191833080306746e-05, "loss": 0.581, "step": 30350 }, { "epoch": 1.1833858850091479, "grad_norm": 8.59178638458252, "learning_rate": 3.817236949667173e-05, "loss": 0.7376, "step": 30400 }, { "epoch": 1.1853322433726499, "grad_norm": 36.66286087036133, "learning_rate": 
3.8152905913036715e-05, "loss": 0.5869, "step": 30450 }, { "epoch": 1.1872786017361516, "grad_norm": 24.386655807495117, "learning_rate": 3.813344232940169e-05, "loss": 0.6585, "step": 30500 }, { "epoch": 1.1892249600996536, "grad_norm": 12.961252212524414, "learning_rate": 3.811397874576667e-05, "loss": 0.6884, "step": 30550 }, { "epoch": 1.1911713184631554, "grad_norm": 7.110319137573242, "learning_rate": 3.8094515162131654e-05, "loss": 0.6619, "step": 30600 }, { "epoch": 1.1931176768266574, "grad_norm": 15.833562850952148, "learning_rate": 3.807505157849663e-05, "loss": 0.6697, "step": 30650 }, { "epoch": 1.1950640351901591, "grad_norm": 44.593631744384766, "learning_rate": 3.8055587994861616e-05, "loss": 0.5965, "step": 30700 }, { "epoch": 1.1970103935536611, "grad_norm": 12.795648574829102, "learning_rate": 3.80361244112266e-05, "loss": 0.6434, "step": 30750 }, { "epoch": 1.198956751917163, "grad_norm": 3.8210175037384033, "learning_rate": 3.801666082759158e-05, "loss": 0.6537, "step": 30800 }, { "epoch": 1.2009031102806649, "grad_norm": 15.508024215698242, "learning_rate": 3.799719724395656e-05, "loss": 0.746, "step": 30850 }, { "epoch": 1.2028494686441666, "grad_norm": 55.087154388427734, "learning_rate": 3.797773366032154e-05, "loss": 0.8172, "step": 30900 }, { "epoch": 1.2047958270076686, "grad_norm": 16.399255752563477, "learning_rate": 3.795827007668652e-05, "loss": 0.6676, "step": 30950 }, { "epoch": 1.2067421853711706, "grad_norm": 7.380814552307129, "learning_rate": 3.79388064930515e-05, "loss": 0.6815, "step": 31000 }, { "epoch": 1.2086885437346724, "grad_norm": 20.366552352905273, "learning_rate": 3.7919342909416486e-05, "loss": 0.7256, "step": 31050 }, { "epoch": 1.2106349020981744, "grad_norm": 8.70262336730957, "learning_rate": 3.7899879325781464e-05, "loss": 0.7467, "step": 31100 }, { "epoch": 1.2125812604616761, "grad_norm": 5.155397891998291, "learning_rate": 3.788041574214645e-05, "loss": 0.6932, "step": 31150 }, { "epoch": 
1.2145276188251781, "grad_norm": 9.524085998535156, "learning_rate": 3.7860952158511426e-05, "loss": 0.6194, "step": 31200 }, { "epoch": 1.21647397718868, "grad_norm": 11.964740753173828, "learning_rate": 3.784148857487641e-05, "loss": 0.7454, "step": 31250 }, { "epoch": 1.2184203355521819, "grad_norm": 25.15473175048828, "learning_rate": 3.782202499124139e-05, "loss": 0.6627, "step": 31300 }, { "epoch": 1.2203666939156839, "grad_norm": 13.250192642211914, "learning_rate": 3.7802561407606365e-05, "loss": 0.6186, "step": 31350 }, { "epoch": 1.2223130522791856, "grad_norm": 172.26531982421875, "learning_rate": 3.778309782397135e-05, "loss": 0.6505, "step": 31400 }, { "epoch": 1.2242594106426876, "grad_norm": 13.819783210754395, "learning_rate": 3.7763634240336334e-05, "loss": 0.615, "step": 31450 }, { "epoch": 1.2262057690061894, "grad_norm": 56.694149017333984, "learning_rate": 3.774417065670131e-05, "loss": 0.6041, "step": 31500 }, { "epoch": 1.2281521273696914, "grad_norm": 16.94282341003418, "learning_rate": 3.7724707073066296e-05, "loss": 0.7611, "step": 31550 }, { "epoch": 1.2300984857331931, "grad_norm": 23.03072738647461, "learning_rate": 3.7705243489431274e-05, "loss": 0.6434, "step": 31600 }, { "epoch": 1.2320448440966951, "grad_norm": 7.091614723205566, "learning_rate": 3.768577990579626e-05, "loss": 0.5719, "step": 31650 }, { "epoch": 1.233991202460197, "grad_norm": 39.62540054321289, "learning_rate": 3.7666316322161235e-05, "loss": 0.8562, "step": 31700 }, { "epoch": 1.2359375608236989, "grad_norm": 37.3956184387207, "learning_rate": 3.764685273852622e-05, "loss": 0.6948, "step": 31750 }, { "epoch": 1.2378839191872006, "grad_norm": 25.477008819580078, "learning_rate": 3.7627389154891204e-05, "loss": 0.7201, "step": 31800 }, { "epoch": 1.2398302775507026, "grad_norm": 25.95840835571289, "learning_rate": 3.760792557125618e-05, "loss": 0.6596, "step": 31850 }, { "epoch": 1.2417766359142046, "grad_norm": 23.408517837524414, "learning_rate": 
3.7588461987621166e-05, "loss": 0.7, "step": 31900 }, { "epoch": 1.2437229942777064, "grad_norm": 39.24456787109375, "learning_rate": 3.7568998403986144e-05, "loss": 0.8017, "step": 31950 }, { "epoch": 1.2456693526412084, "grad_norm": 12.013161659240723, "learning_rate": 3.754953482035112e-05, "loss": 0.6046, "step": 32000 }, { "epoch": 1.2476157110047101, "grad_norm": 9.616015434265137, "learning_rate": 3.7530071236716106e-05, "loss": 0.6354, "step": 32050 }, { "epoch": 1.2495620693682121, "grad_norm": 7.879519462585449, "learning_rate": 3.751060765308109e-05, "loss": 0.6104, "step": 32100 }, { "epoch": 1.251508427731714, "grad_norm": 8.281095504760742, "learning_rate": 3.749114406944607e-05, "loss": 0.6634, "step": 32150 }, { "epoch": 1.2534547860952159, "grad_norm": 13.312612533569336, "learning_rate": 3.747168048581105e-05, "loss": 0.6611, "step": 32200 }, { "epoch": 1.2554011444587179, "grad_norm": 5.163415431976318, "learning_rate": 3.745221690217603e-05, "loss": 0.6803, "step": 32250 }, { "epoch": 1.2573475028222196, "grad_norm": 18.449356079101562, "learning_rate": 3.7432753318541014e-05, "loss": 0.7035, "step": 32300 }, { "epoch": 1.2592938611857214, "grad_norm": 3.8862197399139404, "learning_rate": 3.741328973490599e-05, "loss": 0.7096, "step": 32350 }, { "epoch": 1.2612402195492234, "grad_norm": 32.95891571044922, "learning_rate": 3.739382615127097e-05, "loss": 0.6579, "step": 32400 }, { "epoch": 1.2631865779127254, "grad_norm": 37.064971923828125, "learning_rate": 3.7374362567635953e-05, "loss": 0.7162, "step": 32450 }, { "epoch": 1.2651329362762271, "grad_norm": 10.136852264404297, "learning_rate": 3.735489898400094e-05, "loss": 0.77, "step": 32500 }, { "epoch": 1.2670792946397291, "grad_norm": 20.00455093383789, "learning_rate": 3.733582467203862e-05, "loss": 0.5971, "step": 32550 }, { "epoch": 1.269025653003231, "grad_norm": 59.05356216430664, "learning_rate": 3.73163610884036e-05, "loss": 0.573, "step": 32600 }, { "epoch": 1.2709720113667329, 
"grad_norm": 15.34808349609375, "learning_rate": 3.729689750476858e-05, "loss": 0.6384, "step": 32650 }, { "epoch": 1.2729183697302346, "grad_norm": 18.196151733398438, "learning_rate": 3.727743392113356e-05, "loss": 0.6625, "step": 32700 }, { "epoch": 1.2748647280937366, "grad_norm": 15.862820625305176, "learning_rate": 3.725797033749854e-05, "loss": 0.6385, "step": 32750 }, { "epoch": 1.2768110864572386, "grad_norm": 8.629034042358398, "learning_rate": 3.723850675386352e-05, "loss": 0.6551, "step": 32800 }, { "epoch": 1.2787574448207404, "grad_norm": 14.009358406066895, "learning_rate": 3.7219043170228504e-05, "loss": 0.5911, "step": 32850 }, { "epoch": 1.2807038031842422, "grad_norm": 22.001821517944336, "learning_rate": 3.719957958659349e-05, "loss": 0.6024, "step": 32900 }, { "epoch": 1.2826501615477441, "grad_norm": 25.052412033081055, "learning_rate": 3.7180116002958466e-05, "loss": 0.5819, "step": 32950 }, { "epoch": 1.2845965199112461, "grad_norm": 24.96440887451172, "learning_rate": 3.716065241932345e-05, "loss": 0.6699, "step": 33000 }, { "epoch": 1.286542878274748, "grad_norm": 18.00562858581543, "learning_rate": 3.714118883568843e-05, "loss": 0.7022, "step": 33050 }, { "epoch": 1.2884892366382499, "grad_norm": 31.0590763092041, "learning_rate": 3.7121725252053406e-05, "loss": 0.6223, "step": 33100 }, { "epoch": 1.2904355950017516, "grad_norm": 15.350159645080566, "learning_rate": 3.710226166841839e-05, "loss": 0.6297, "step": 33150 }, { "epoch": 1.2923819533652536, "grad_norm": 84.01378631591797, "learning_rate": 3.708279808478337e-05, "loss": 0.5614, "step": 33200 }, { "epoch": 1.2943283117287554, "grad_norm": 25.175607681274414, "learning_rate": 3.706333450114836e-05, "loss": 0.7177, "step": 33250 }, { "epoch": 1.2962746700922574, "grad_norm": 18.434240341186523, "learning_rate": 3.7043870917513336e-05, "loss": 0.6096, "step": 33300 }, { "epoch": 1.2982210284557594, "grad_norm": 15.432963371276855, "learning_rate": 3.7024407333878314e-05, "loss": 
0.6589, "step": 33350 }, { "epoch": 1.3001673868192611, "grad_norm": 36.6771240234375, "learning_rate": 3.70049437502433e-05, "loss": 0.5892, "step": 33400 }, { "epoch": 1.3021137451827631, "grad_norm": 8.170853614807129, "learning_rate": 3.6985480166608276e-05, "loss": 0.6595, "step": 33450 }, { "epoch": 1.304060103546265, "grad_norm": 27.628053665161133, "learning_rate": 3.6966016582973254e-05, "loss": 0.6267, "step": 33500 }, { "epoch": 1.3060064619097669, "grad_norm": 6.108870983123779, "learning_rate": 3.694655299933824e-05, "loss": 0.6458, "step": 33550 }, { "epoch": 1.3079528202732686, "grad_norm": 13.700627326965332, "learning_rate": 3.692708941570322e-05, "loss": 0.6441, "step": 33600 }, { "epoch": 1.3098991786367706, "grad_norm": 21.229265213012695, "learning_rate": 3.690762583206821e-05, "loss": 0.7144, "step": 33650 }, { "epoch": 1.3118455370002726, "grad_norm": 30.280641555786133, "learning_rate": 3.6888162248433184e-05, "loss": 0.544, "step": 33700 }, { "epoch": 1.3137918953637744, "grad_norm": 15.00646686553955, "learning_rate": 3.686869866479816e-05, "loss": 0.6104, "step": 33750 }, { "epoch": 1.3157382537272762, "grad_norm": 16.879486083984375, "learning_rate": 3.6849235081163146e-05, "loss": 0.5744, "step": 33800 }, { "epoch": 1.3176846120907781, "grad_norm": 9.873910903930664, "learning_rate": 3.6829771497528124e-05, "loss": 0.6403, "step": 33850 }, { "epoch": 1.3196309704542801, "grad_norm": 30.345134735107422, "learning_rate": 3.681030791389311e-05, "loss": 0.7084, "step": 33900 }, { "epoch": 1.321577328817782, "grad_norm": 114.48217010498047, "learning_rate": 3.679084433025809e-05, "loss": 0.7107, "step": 33950 }, { "epoch": 1.3235236871812839, "grad_norm": 3.1867921352386475, "learning_rate": 3.677138074662307e-05, "loss": 0.6913, "step": 34000 }, { "epoch": 1.3254700455447856, "grad_norm": 56.6605339050293, "learning_rate": 3.6751917162988054e-05, "loss": 0.7056, "step": 34050 }, { "epoch": 1.3274164039082876, "grad_norm": 
14.920662879943848, "learning_rate": 3.673245357935303e-05, "loss": 0.6717, "step": 34100 }, { "epoch": 1.3293627622717894, "grad_norm": 35.867488861083984, "learning_rate": 3.671298999571801e-05, "loss": 0.5472, "step": 34150 }, { "epoch": 1.3313091206352914, "grad_norm": 21.453571319580078, "learning_rate": 3.6693526412082994e-05, "loss": 0.7175, "step": 34200 }, { "epoch": 1.3332554789987934, "grad_norm": 10.702232360839844, "learning_rate": 3.667406282844797e-05, "loss": 0.6242, "step": 34250 }, { "epoch": 1.3352018373622951, "grad_norm": 12.998764991760254, "learning_rate": 3.665459924481296e-05, "loss": 0.6624, "step": 34300 }, { "epoch": 1.337148195725797, "grad_norm": 5.75769567489624, "learning_rate": 3.663513566117794e-05, "loss": 0.6916, "step": 34350 }, { "epoch": 1.339094554089299, "grad_norm": 9.57388687133789, "learning_rate": 3.661567207754292e-05, "loss": 0.6032, "step": 34400 }, { "epoch": 1.3410409124528009, "grad_norm": 11.879281997680664, "learning_rate": 3.65962084939079e-05, "loss": 0.6344, "step": 34450 }, { "epoch": 1.3429872708163026, "grad_norm": 25.007797241210938, "learning_rate": 3.657674491027288e-05, "loss": 0.6603, "step": 34500 }, { "epoch": 1.3449336291798046, "grad_norm": 54.79502868652344, "learning_rate": 3.655728132663786e-05, "loss": 0.7009, "step": 34550 }, { "epoch": 1.3468799875433064, "grad_norm": 5.102016925811768, "learning_rate": 3.6538207014675545e-05, "loss": 0.6349, "step": 34600 }, { "epoch": 1.3488263459068084, "grad_norm": 9.561335563659668, "learning_rate": 3.651874343104052e-05, "loss": 0.6462, "step": 34650 }, { "epoch": 1.3507727042703102, "grad_norm": 42.900550842285156, "learning_rate": 3.649927984740551e-05, "loss": 0.5929, "step": 34700 }, { "epoch": 1.3527190626338121, "grad_norm": 26.465988159179688, "learning_rate": 3.647981626377049e-05, "loss": 0.5082, "step": 34750 }, { "epoch": 1.3546654209973141, "grad_norm": 14.65463638305664, "learning_rate": 3.646035268013547e-05, "loss": 0.6623, "step": 34800 
}, { "epoch": 1.356611779360816, "grad_norm": 6.745665550231934, "learning_rate": 3.6440889096500446e-05, "loss": 0.5903, "step": 34850 }, { "epoch": 1.3585581377243179, "grad_norm": 22.312650680541992, "learning_rate": 3.642142551286543e-05, "loss": 0.6915, "step": 34900 }, { "epoch": 1.3605044960878196, "grad_norm": 25.509965896606445, "learning_rate": 3.640196192923041e-05, "loss": 0.6065, "step": 34950 }, { "epoch": 1.3624508544513216, "grad_norm": 24.00339698791504, "learning_rate": 3.638249834559539e-05, "loss": 0.6471, "step": 35000 }, { "epoch": 1.3643972128148234, "grad_norm": 6.224186420440674, "learning_rate": 3.636303476196038e-05, "loss": 0.6338, "step": 35050 }, { "epoch": 1.3663435711783254, "grad_norm": 18.130130767822266, "learning_rate": 3.6343571178325355e-05, "loss": 0.7378, "step": 35100 }, { "epoch": 1.3682899295418274, "grad_norm": 26.097824096679688, "learning_rate": 3.632410759469034e-05, "loss": 0.6725, "step": 35150 }, { "epoch": 1.3702362879053291, "grad_norm": 18.883941650390625, "learning_rate": 3.6304644011055317e-05, "loss": 0.6363, "step": 35200 }, { "epoch": 1.372182646268831, "grad_norm": 13.148035049438477, "learning_rate": 3.62851804274203e-05, "loss": 0.6633, "step": 35250 }, { "epoch": 1.374129004632333, "grad_norm": 5.090790271759033, "learning_rate": 3.626571684378528e-05, "loss": 0.5983, "step": 35300 }, { "epoch": 1.3760753629958349, "grad_norm": 18.486387252807617, "learning_rate": 3.6246253260150256e-05, "loss": 0.5946, "step": 35350 }, { "epoch": 1.3780217213593366, "grad_norm": 11.340799331665039, "learning_rate": 3.622678967651524e-05, "loss": 0.5981, "step": 35400 }, { "epoch": 1.3799680797228386, "grad_norm": 23.8195743560791, "learning_rate": 3.6207326092880225e-05, "loss": 0.6683, "step": 35450 }, { "epoch": 1.3819144380863404, "grad_norm": 11.623726844787598, "learning_rate": 3.61878625092452e-05, "loss": 0.5942, "step": 35500 }, { "epoch": 1.3838607964498424, "grad_norm": 17.490236282348633, "learning_rate": 
3.616839892561019e-05, "loss": 0.6233, "step": 35550 }, { "epoch": 1.3858071548133442, "grad_norm": 31.9366397857666, "learning_rate": 3.6148935341975164e-05, "loss": 0.6396, "step": 35600 }, { "epoch": 1.3877535131768461, "grad_norm": 32.56630325317383, "learning_rate": 3.612947175834015e-05, "loss": 0.5952, "step": 35650 }, { "epoch": 1.3896998715403481, "grad_norm": 18.059593200683594, "learning_rate": 3.6110008174705126e-05, "loss": 0.712, "step": 35700 }, { "epoch": 1.39164622990385, "grad_norm": 13.18091106414795, "learning_rate": 3.609054459107011e-05, "loss": 0.6906, "step": 35750 }, { "epoch": 1.3935925882673517, "grad_norm": 4.848202228546143, "learning_rate": 3.6071081007435095e-05, "loss": 0.69, "step": 35800 }, { "epoch": 1.3955389466308536, "grad_norm": 11.284486770629883, "learning_rate": 3.605161742380007e-05, "loss": 0.6235, "step": 35850 }, { "epoch": 1.3974853049943556, "grad_norm": 23.607465744018555, "learning_rate": 3.603215384016505e-05, "loss": 0.6985, "step": 35900 }, { "epoch": 1.3994316633578574, "grad_norm": 16.28246307373047, "learning_rate": 3.6012690256530034e-05, "loss": 0.6979, "step": 35950 }, { "epoch": 1.4013780217213594, "grad_norm": 14.341331481933594, "learning_rate": 3.599322667289501e-05, "loss": 0.6447, "step": 36000 }, { "epoch": 1.4033243800848612, "grad_norm": 18.49776268005371, "learning_rate": 3.5973763089259996e-05, "loss": 0.6097, "step": 36050 }, { "epoch": 1.4052707384483631, "grad_norm": 15.384420394897461, "learning_rate": 3.595429950562498e-05, "loss": 0.6894, "step": 36100 }, { "epoch": 1.407217096811865, "grad_norm": 14.834940910339355, "learning_rate": 3.593483592198996e-05, "loss": 0.6029, "step": 36150 }, { "epoch": 1.409163455175367, "grad_norm": 17.7336483001709, "learning_rate": 3.591537233835494e-05, "loss": 0.6185, "step": 36200 }, { "epoch": 1.4111098135388689, "grad_norm": 10.513029098510742, "learning_rate": 3.589590875471992e-05, "loss": 0.6249, "step": 36250 }, { "epoch": 1.4130561719023707, 
"grad_norm": 34.12071990966797, "learning_rate": 3.58764451710849e-05, "loss": 0.7187, "step": 36300 }, { "epoch": 1.4150025302658724, "grad_norm": 14.284405708312988, "learning_rate": 3.585698158744988e-05, "loss": 0.6003, "step": 36350 }, { "epoch": 1.4169488886293744, "grad_norm": 10.528316497802734, "learning_rate": 3.583751800381486e-05, "loss": 0.6761, "step": 36400 }, { "epoch": 1.4188952469928764, "grad_norm": 8.760578155517578, "learning_rate": 3.5818054420179844e-05, "loss": 0.6074, "step": 36450 }, { "epoch": 1.4208416053563782, "grad_norm": 13.310457229614258, "learning_rate": 3.579859083654483e-05, "loss": 0.7531, "step": 36500 }, { "epoch": 1.4227879637198801, "grad_norm": 49.12535858154297, "learning_rate": 3.5779127252909806e-05, "loss": 0.6718, "step": 36550 }, { "epoch": 1.424734322083382, "grad_norm": 19.785152435302734, "learning_rate": 3.575966366927479e-05, "loss": 0.5779, "step": 36600 }, { "epoch": 1.426680680446884, "grad_norm": 15.406665802001953, "learning_rate": 3.574020008563977e-05, "loss": 0.7238, "step": 36650 }, { "epoch": 1.4286270388103857, "grad_norm": 7.360702037811279, "learning_rate": 3.572073650200475e-05, "loss": 0.7093, "step": 36700 }, { "epoch": 1.4305733971738877, "grad_norm": 5.521740913391113, "learning_rate": 3.570127291836973e-05, "loss": 0.5615, "step": 36750 }, { "epoch": 1.4325197555373896, "grad_norm": 20.201677322387695, "learning_rate": 3.5681809334734714e-05, "loss": 0.5628, "step": 36800 }, { "epoch": 1.4344661139008914, "grad_norm": 12.886085510253906, "learning_rate": 3.56623457510997e-05, "loss": 0.6395, "step": 36850 }, { "epoch": 1.4364124722643934, "grad_norm": 28.534385681152344, "learning_rate": 3.5642882167464676e-05, "loss": 0.6163, "step": 36900 }, { "epoch": 1.4383588306278952, "grad_norm": 25.887035369873047, "learning_rate": 3.5623418583829654e-05, "loss": 0.6712, "step": 36950 }, { "epoch": 1.4403051889913971, "grad_norm": 9.261523246765137, "learning_rate": 3.560395500019464e-05, "loss": 
0.6119, "step": 37000 }, { "epoch": 1.442251547354899, "grad_norm": 18.233558654785156, "learning_rate": 3.5584491416559616e-05, "loss": 0.7636, "step": 37050 }, { "epoch": 1.444197905718401, "grad_norm": 5.661403656005859, "learning_rate": 3.55650278329246e-05, "loss": 0.6062, "step": 37100 }, { "epoch": 1.4461442640819029, "grad_norm": 9.447407722473145, "learning_rate": 3.5545564249289585e-05, "loss": 0.549, "step": 37150 }, { "epoch": 1.4480906224454047, "grad_norm": 52.83908462524414, "learning_rate": 3.552610066565456e-05, "loss": 0.6849, "step": 37200 }, { "epoch": 1.4500369808089064, "grad_norm": 13.534957885742188, "learning_rate": 3.5506637082019546e-05, "loss": 0.6589, "step": 37250 }, { "epoch": 1.4519833391724084, "grad_norm": 19.959447860717773, "learning_rate": 3.5487173498384524e-05, "loss": 0.7006, "step": 37300 }, { "epoch": 1.4539296975359104, "grad_norm": 16.741853713989258, "learning_rate": 3.54677099147495e-05, "loss": 0.6768, "step": 37350 }, { "epoch": 1.4558760558994122, "grad_norm": 18.33676528930664, "learning_rate": 3.5448246331114486e-05, "loss": 0.5861, "step": 37400 }, { "epoch": 1.4578224142629141, "grad_norm": 23.227108001708984, "learning_rate": 3.5428782747479464e-05, "loss": 0.7115, "step": 37450 }, { "epoch": 1.459768772626416, "grad_norm": 15.384740829467773, "learning_rate": 3.540931916384445e-05, "loss": 0.7023, "step": 37500 }, { "epoch": 1.461715130989918, "grad_norm": 26.545534133911133, "learning_rate": 3.538985558020943e-05, "loss": 0.6947, "step": 37550 }, { "epoch": 1.4636614893534197, "grad_norm": 28.865934371948242, "learning_rate": 3.537039199657441e-05, "loss": 0.6535, "step": 37600 }, { "epoch": 1.4656078477169217, "grad_norm": 17.92835807800293, "learning_rate": 3.5350928412939394e-05, "loss": 0.6034, "step": 37650 }, { "epoch": 1.4675542060804236, "grad_norm": 17.21173858642578, "learning_rate": 3.533146482930437e-05, "loss": 0.7055, "step": 37700 }, { "epoch": 1.4695005644439254, "grad_norm": 20.9925537109375, 
"learning_rate": 3.531200124566935e-05, "loss": 0.5264, "step": 37750 }, { "epoch": 1.4714469228074272, "grad_norm": 30.676984786987305, "learning_rate": 3.5292537662034334e-05, "loss": 0.7891, "step": 37800 }, { "epoch": 1.4733932811709292, "grad_norm": 26.742504119873047, "learning_rate": 3.527307407839932e-05, "loss": 0.6402, "step": 37850 }, { "epoch": 1.4753396395344311, "grad_norm": 21.7519474029541, "learning_rate": 3.52536104947643e-05, "loss": 0.7496, "step": 37900 }, { "epoch": 1.477285997897933, "grad_norm": 11.474286079406738, "learning_rate": 3.523414691112928e-05, "loss": 0.6331, "step": 37950 }, { "epoch": 1.479232356261435, "grad_norm": 6.052927017211914, "learning_rate": 3.521468332749426e-05, "loss": 0.7566, "step": 38000 }, { "epoch": 1.4811787146249367, "grad_norm": 17.886445999145508, "learning_rate": 3.519521974385924e-05, "loss": 0.5928, "step": 38050 }, { "epoch": 1.4831250729884387, "grad_norm": 22.491378784179688, "learning_rate": 3.517575616022422e-05, "loss": 0.6304, "step": 38100 }, { "epoch": 1.4850714313519404, "grad_norm": 5.91766357421875, "learning_rate": 3.5156292576589204e-05, "loss": 0.6092, "step": 38150 }, { "epoch": 1.4870177897154424, "grad_norm": 22.568275451660156, "learning_rate": 3.513682899295419e-05, "loss": 0.6432, "step": 38200 }, { "epoch": 1.4889641480789444, "grad_norm": 38.50241470336914, "learning_rate": 3.5117365409319166e-05, "loss": 0.597, "step": 38250 }, { "epoch": 1.4909105064424462, "grad_norm": 78.09729766845703, "learning_rate": 3.509790182568415e-05, "loss": 0.6066, "step": 38300 }, { "epoch": 1.4928568648059481, "grad_norm": 9.007063865661621, "learning_rate": 3.507843824204913e-05, "loss": 0.6771, "step": 38350 }, { "epoch": 1.49480322316945, "grad_norm": 21.275802612304688, "learning_rate": 3.5058974658414105e-05, "loss": 0.6316, "step": 38400 }, { "epoch": 1.496749581532952, "grad_norm": 48.907283782958984, "learning_rate": 3.503951107477909e-05, "loss": 0.57, "step": 38450 }, { "epoch": 
1.4986959398964537, "grad_norm": 8.10950756072998, "learning_rate": 3.502004749114407e-05, "loss": 0.5425, "step": 38500 }, { "epoch": 1.5006422982599557, "grad_norm": 53.618202209472656, "learning_rate": 3.500058390750905e-05, "loss": 0.7218, "step": 38550 }, { "epoch": 1.5025886566234576, "grad_norm": 24.029069900512695, "learning_rate": 3.498150959554673e-05, "loss": 0.6234, "step": 38600 }, { "epoch": 1.5045350149869594, "grad_norm": 9.205044746398926, "learning_rate": 3.496204601191172e-05, "loss": 0.7217, "step": 38650 }, { "epoch": 1.5064813733504612, "grad_norm": 15.576617240905762, "learning_rate": 3.4942582428276694e-05, "loss": 0.5204, "step": 38700 }, { "epoch": 1.5084277317139632, "grad_norm": 11.751370429992676, "learning_rate": 3.492311884464168e-05, "loss": 0.6734, "step": 38750 }, { "epoch": 1.5103740900774651, "grad_norm": 10.517868995666504, "learning_rate": 3.4903655261006656e-05, "loss": 0.6424, "step": 38800 }, { "epoch": 1.512320448440967, "grad_norm": 18.950918197631836, "learning_rate": 3.488419167737164e-05, "loss": 0.6743, "step": 38850 }, { "epoch": 1.5142668068044687, "grad_norm": 4.481653213500977, "learning_rate": 3.486472809373662e-05, "loss": 0.6439, "step": 38900 }, { "epoch": 1.5162131651679709, "grad_norm": 41.65850830078125, "learning_rate": 3.48452645101016e-05, "loss": 0.6649, "step": 38950 }, { "epoch": 1.5181595235314727, "grad_norm": 28.37667465209961, "learning_rate": 3.482580092646659e-05, "loss": 0.6802, "step": 39000 }, { "epoch": 1.5201058818949744, "grad_norm": 20.680389404296875, "learning_rate": 3.4806337342831565e-05, "loss": 0.6842, "step": 39050 }, { "epoch": 1.5220522402584764, "grad_norm": 4.483341217041016, "learning_rate": 3.478687375919655e-05, "loss": 0.6348, "step": 39100 }, { "epoch": 1.5239985986219784, "grad_norm": 18.663856506347656, "learning_rate": 3.4767410175561526e-05, "loss": 0.6446, "step": 39150 }, { "epoch": 1.5259449569854802, "grad_norm": 10.918580055236816, "learning_rate": 
3.4747946591926504e-05, "loss": 0.7604, "step": 39200 }, { "epoch": 1.527891315348982, "grad_norm": 14.077402114868164, "learning_rate": 3.472848300829149e-05, "loss": 0.629, "step": 39250 }, { "epoch": 1.529837673712484, "grad_norm": 5.575888156890869, "learning_rate": 3.4709019424656466e-05, "loss": 0.6763, "step": 39300 }, { "epoch": 1.531784032075986, "grad_norm": 19.619409561157227, "learning_rate": 3.468955584102145e-05, "loss": 0.6855, "step": 39350 }, { "epoch": 1.5337303904394877, "grad_norm": 10.749835968017578, "learning_rate": 3.4670092257386435e-05, "loss": 0.6107, "step": 39400 }, { "epoch": 1.5356767488029897, "grad_norm": 10.375340461730957, "learning_rate": 3.465062867375141e-05, "loss": 0.6222, "step": 39450 }, { "epoch": 1.5376231071664916, "grad_norm": 10.285493850708008, "learning_rate": 3.46311650901164e-05, "loss": 0.6339, "step": 39500 }, { "epoch": 1.5395694655299934, "grad_norm": 23.2364444732666, "learning_rate": 3.4611701506481374e-05, "loss": 0.6468, "step": 39550 }, { "epoch": 1.5415158238934952, "grad_norm": 46.003414154052734, "learning_rate": 3.459223792284635e-05, "loss": 0.57, "step": 39600 }, { "epoch": 1.5434621822569972, "grad_norm": 16.965803146362305, "learning_rate": 3.4572774339211336e-05, "loss": 0.6671, "step": 39650 }, { "epoch": 1.5454085406204991, "grad_norm": 4.043222427368164, "learning_rate": 3.455331075557632e-05, "loss": 0.711, "step": 39700 }, { "epoch": 1.547354898984001, "grad_norm": 66.6514663696289, "learning_rate": 3.45338471719413e-05, "loss": 0.654, "step": 39750 }, { "epoch": 1.5493012573475027, "grad_norm": 7.554368019104004, "learning_rate": 3.451438358830628e-05, "loss": 0.6066, "step": 39800 }, { "epoch": 1.5512476157110047, "grad_norm": 27.968488693237305, "learning_rate": 3.449492000467126e-05, "loss": 0.5539, "step": 39850 }, { "epoch": 1.5531939740745067, "grad_norm": 14.104660034179688, "learning_rate": 3.4475456421036244e-05, "loss": 0.7776, "step": 39900 }, { "epoch": 1.5551403324380084, 
"grad_norm": 6.206201076507568, "learning_rate": 3.445599283740122e-05, "loss": 0.7091, "step": 39950 }, { "epoch": 1.5570866908015104, "grad_norm": 20.62215232849121, "learning_rate": 3.4436529253766206e-05, "loss": 0.6554, "step": 40000 }, { "epoch": 1.5590330491650124, "grad_norm": 12.475567817687988, "learning_rate": 3.441706567013119e-05, "loss": 0.6414, "step": 40050 }, { "epoch": 1.5609794075285142, "grad_norm": 28.36690330505371, "learning_rate": 3.439760208649617e-05, "loss": 0.5814, "step": 40100 }, { "epoch": 1.562925765892016, "grad_norm": 18.327392578125, "learning_rate": 3.4378138502861146e-05, "loss": 0.6548, "step": 40150 }, { "epoch": 1.564872124255518, "grad_norm": 13.208046913146973, "learning_rate": 3.435867491922613e-05, "loss": 0.563, "step": 40200 }, { "epoch": 1.56681848261902, "grad_norm": 36.65834426879883, "learning_rate": 3.433921133559111e-05, "loss": 0.6737, "step": 40250 }, { "epoch": 1.5687648409825217, "grad_norm": 14.143219947814941, "learning_rate": 3.431974775195609e-05, "loss": 0.5341, "step": 40300 }, { "epoch": 1.5707111993460234, "grad_norm": 6.191122055053711, "learning_rate": 3.430028416832107e-05, "loss": 0.6433, "step": 40350 }, { "epoch": 1.5726575577095256, "grad_norm": 7.541060924530029, "learning_rate": 3.4280820584686054e-05, "loss": 0.7395, "step": 40400 }, { "epoch": 1.5746039160730274, "grad_norm": 3.1459977626800537, "learning_rate": 3.426135700105104e-05, "loss": 0.6117, "step": 40450 }, { "epoch": 1.5765502744365292, "grad_norm": 9.351485252380371, "learning_rate": 3.4241893417416016e-05, "loss": 0.5868, "step": 40500 }, { "epoch": 1.5784966328000312, "grad_norm": 8.417421340942383, "learning_rate": 3.4222429833781e-05, "loss": 0.5356, "step": 40550 }, { "epoch": 1.5804429911635332, "grad_norm": 20.70960807800293, "learning_rate": 3.420296625014598e-05, "loss": 0.6204, "step": 40600 }, { "epoch": 1.582389349527035, "grad_norm": 7.025996208190918, "learning_rate": 3.4183502666510956e-05, "loss": 0.6682, "step": 
40650 }, { "epoch": 1.5843357078905367, "grad_norm": 12.55179214477539, "learning_rate": 3.416403908287594e-05, "loss": 0.5848, "step": 40700 }, { "epoch": 1.5862820662540387, "grad_norm": 10.313057899475098, "learning_rate": 3.4144575499240924e-05, "loss": 0.6374, "step": 40750 }, { "epoch": 1.5882284246175407, "grad_norm": 29.018314361572266, "learning_rate": 3.41251119156059e-05, "loss": 0.6548, "step": 40800 }, { "epoch": 1.5901747829810424, "grad_norm": 8.622284889221191, "learning_rate": 3.4105648331970886e-05, "loss": 0.6159, "step": 40850 }, { "epoch": 1.5921211413445444, "grad_norm": 15.068008422851562, "learning_rate": 3.4086184748335864e-05, "loss": 0.6186, "step": 40900 }, { "epoch": 1.5940674997080464, "grad_norm": 36.47996520996094, "learning_rate": 3.406672116470085e-05, "loss": 0.6297, "step": 40950 }, { "epoch": 1.5960138580715482, "grad_norm": 22.160200119018555, "learning_rate": 3.4047257581065826e-05, "loss": 0.6692, "step": 41000 }, { "epoch": 1.59796021643505, "grad_norm": 4.7639384269714355, "learning_rate": 3.402779399743081e-05, "loss": 0.6598, "step": 41050 }, { "epoch": 1.599906574798552, "grad_norm": 19.96210479736328, "learning_rate": 3.4008330413795794e-05, "loss": 0.6811, "step": 41100 }, { "epoch": 1.601852933162054, "grad_norm": 5.365239143371582, "learning_rate": 3.398886683016077e-05, "loss": 0.6787, "step": 41150 }, { "epoch": 1.6037992915255557, "grad_norm": 23.604095458984375, "learning_rate": 3.396940324652575e-05, "loss": 0.615, "step": 41200 }, { "epoch": 1.6057456498890574, "grad_norm": 6.665222644805908, "learning_rate": 3.3949939662890734e-05, "loss": 0.6301, "step": 41250 }, { "epoch": 1.6076920082525594, "grad_norm": 14.375205039978027, "learning_rate": 3.393047607925571e-05, "loss": 0.6448, "step": 41300 }, { "epoch": 1.6096383666160614, "grad_norm": 6.348198413848877, "learning_rate": 3.3911012495620696e-05, "loss": 0.6664, "step": 41350 }, { "epoch": 1.6115847249795632, "grad_norm": 16.64923095703125, 
"learning_rate": 3.3891548911985674e-05, "loss": 0.6818, "step": 41400 }, { "epoch": 1.6135310833430652, "grad_norm": 4.109652519226074, "learning_rate": 3.387208532835066e-05, "loss": 0.749, "step": 41450 }, { "epoch": 1.6154774417065672, "grad_norm": 11.53089427947998, "learning_rate": 3.385262174471564e-05, "loss": 0.6453, "step": 41500 }, { "epoch": 1.617423800070069, "grad_norm": 23.84912109375, "learning_rate": 3.383315816108062e-05, "loss": 0.6673, "step": 41550 }, { "epoch": 1.6193701584335707, "grad_norm": 19.092859268188477, "learning_rate": 3.38136945774456e-05, "loss": 0.63, "step": 41600 }, { "epoch": 1.6213165167970727, "grad_norm": 19.391376495361328, "learning_rate": 3.379423099381058e-05, "loss": 0.6369, "step": 41650 }, { "epoch": 1.6232628751605747, "grad_norm": 25.417146682739258, "learning_rate": 3.377476741017556e-05, "loss": 0.5437, "step": 41700 }, { "epoch": 1.6252092335240764, "grad_norm": 14.560564041137695, "learning_rate": 3.3755303826540544e-05, "loss": 0.7017, "step": 41750 }, { "epoch": 1.6271555918875782, "grad_norm": 11.145736694335938, "learning_rate": 3.373584024290553e-05, "loss": 0.6192, "step": 41800 }, { "epoch": 1.6291019502510802, "grad_norm": 40.10994338989258, "learning_rate": 3.3716376659270506e-05, "loss": 0.65, "step": 41850 }, { "epoch": 1.6310483086145822, "grad_norm": 29.94738006591797, "learning_rate": 3.369691307563549e-05, "loss": 0.6454, "step": 41900 }, { "epoch": 1.632994666978084, "grad_norm": 19.944231033325195, "learning_rate": 3.367744949200047e-05, "loss": 0.5991, "step": 41950 }, { "epoch": 1.634941025341586, "grad_norm": 5.2453107833862305, "learning_rate": 3.365798590836545e-05, "loss": 0.6428, "step": 42000 }, { "epoch": 1.636887383705088, "grad_norm": 16.973224639892578, "learning_rate": 3.363852232473043e-05, "loss": 0.6052, "step": 42050 }, { "epoch": 1.6388337420685897, "grad_norm": 12.557331085205078, "learning_rate": 3.3619058741095414e-05, "loss": 0.6361, "step": 42100 }, { "epoch": 
1.6407801004320914, "grad_norm": 3.965310573577881, "learning_rate": 3.35995951574604e-05, "loss": 0.5842, "step": 42150 }, { "epoch": 1.6427264587955934, "grad_norm": 27.116952896118164, "learning_rate": 3.3580131573825376e-05, "loss": 0.5914, "step": 42200 }, { "epoch": 1.6446728171590954, "grad_norm": 13.965898513793945, "learning_rate": 3.3560667990190353e-05, "loss": 0.6626, "step": 42250 }, { "epoch": 1.6466191755225972, "grad_norm": 143.52365112304688, "learning_rate": 3.354120440655534e-05, "loss": 0.6501, "step": 42300 }, { "epoch": 1.648565533886099, "grad_norm": 10.905447959899902, "learning_rate": 3.352213009459302e-05, "loss": 0.6554, "step": 42350 }, { "epoch": 1.6505118922496012, "grad_norm": 8.113410949707031, "learning_rate": 3.3502666510957996e-05, "loss": 0.6754, "step": 42400 }, { "epoch": 1.652458250613103, "grad_norm": 39.31525421142578, "learning_rate": 3.348320292732298e-05, "loss": 0.5845, "step": 42450 }, { "epoch": 1.6544046089766047, "grad_norm": 8.0653715133667, "learning_rate": 3.346373934368796e-05, "loss": 0.6257, "step": 42500 }, { "epoch": 1.6563509673401067, "grad_norm": 14.867386817932129, "learning_rate": 3.344427576005294e-05, "loss": 0.6827, "step": 42550 }, { "epoch": 1.6582973257036087, "grad_norm": 31.453580856323242, "learning_rate": 3.342481217641793e-05, "loss": 0.6397, "step": 42600 }, { "epoch": 1.6602436840671104, "grad_norm": 73.83524322509766, "learning_rate": 3.3405348592782904e-05, "loss": 0.6023, "step": 42650 }, { "epoch": 1.6621900424306122, "grad_norm": 8.53406047821045, "learning_rate": 3.338588500914789e-05, "loss": 0.6498, "step": 42700 }, { "epoch": 1.6641364007941142, "grad_norm": 17.859333038330078, "learning_rate": 3.3366421425512866e-05, "loss": 0.6529, "step": 42750 }, { "epoch": 1.6660827591576162, "grad_norm": 4.08618688583374, "learning_rate": 3.3346957841877844e-05, "loss": 0.6811, "step": 42800 }, { "epoch": 1.668029117521118, "grad_norm": 13.59468936920166, "learning_rate": 
3.332749425824283e-05, "loss": 0.6496, "step": 42850 }, { "epoch": 1.66997547588462, "grad_norm": 3.8985610008239746, "learning_rate": 3.330803067460781e-05, "loss": 0.5401, "step": 42900 }, { "epoch": 1.671921834248122, "grad_norm": 12.570989608764648, "learning_rate": 3.32885670909728e-05, "loss": 0.5436, "step": 42950 }, { "epoch": 1.6738681926116237, "grad_norm": 7.856734275817871, "learning_rate": 3.3269103507337775e-05, "loss": 0.6477, "step": 43000 }, { "epoch": 1.6758145509751254, "grad_norm": 14.801864624023438, "learning_rate": 3.324963992370275e-05, "loss": 0.584, "step": 43050 }, { "epoch": 1.6777609093386274, "grad_norm": 11.930379867553711, "learning_rate": 3.3230176340067736e-05, "loss": 0.6002, "step": 43100 }, { "epoch": 1.6797072677021294, "grad_norm": 22.97522735595703, "learning_rate": 3.3210712756432714e-05, "loss": 0.6195, "step": 43150 }, { "epoch": 1.6816536260656312, "grad_norm": 8.452421188354492, "learning_rate": 3.319124917279769e-05, "loss": 0.703, "step": 43200 }, { "epoch": 1.683599984429133, "grad_norm": 7.180862903594971, "learning_rate": 3.317178558916268e-05, "loss": 0.637, "step": 43250 }, { "epoch": 1.685546342792635, "grad_norm": 5.882069110870361, "learning_rate": 3.315232200552766e-05, "loss": 0.6313, "step": 43300 }, { "epoch": 1.687492701156137, "grad_norm": 20.898656845092773, "learning_rate": 3.3132858421892645e-05, "loss": 0.6956, "step": 43350 }, { "epoch": 1.6894390595196387, "grad_norm": 23.192819595336914, "learning_rate": 3.311339483825762e-05, "loss": 0.5513, "step": 43400 }, { "epoch": 1.6913854178831407, "grad_norm": 19.101221084594727, "learning_rate": 3.30939312546226e-05, "loss": 0.7058, "step": 43450 }, { "epoch": 1.6933317762466427, "grad_norm": 123.71995544433594, "learning_rate": 3.3074467670987584e-05, "loss": 0.7148, "step": 43500 }, { "epoch": 1.6952781346101444, "grad_norm": 2.960566997528076, "learning_rate": 3.305500408735256e-05, "loss": 0.6693, "step": 43550 }, { "epoch": 1.6972244929736462, 
"grad_norm": 17.402097702026367, "learning_rate": 3.3035540503717546e-05, "loss": 0.6656, "step": 43600 }, { "epoch": 1.6991708513371482, "grad_norm": 19.012680053710938, "learning_rate": 3.301607692008253e-05, "loss": 0.648, "step": 43650 }, { "epoch": 1.7011172097006502, "grad_norm": 6.4097771644592285, "learning_rate": 3.299661333644751e-05, "loss": 0.6412, "step": 43700 }, { "epoch": 1.703063568064152, "grad_norm": 6.353412628173828, "learning_rate": 3.297714975281249e-05, "loss": 0.6347, "step": 43750 }, { "epoch": 1.7050099264276537, "grad_norm": 35.246334075927734, "learning_rate": 3.295768616917747e-05, "loss": 0.7076, "step": 43800 }, { "epoch": 1.706956284791156, "grad_norm": 68.9794692993164, "learning_rate": 3.293822258554245e-05, "loss": 0.5799, "step": 43850 }, { "epoch": 1.7089026431546577, "grad_norm": 159.63157653808594, "learning_rate": 3.291875900190743e-05, "loss": 0.7182, "step": 43900 }, { "epoch": 1.7108490015181594, "grad_norm": 19.759716033935547, "learning_rate": 3.2899295418272416e-05, "loss": 0.6283, "step": 43950 }, { "epoch": 1.7127953598816614, "grad_norm": 9.91257095336914, "learning_rate": 3.2879831834637394e-05, "loss": 0.7006, "step": 44000 }, { "epoch": 1.7147417182451634, "grad_norm": 52.58943176269531, "learning_rate": 3.286036825100238e-05, "loss": 0.6504, "step": 44050 }, { "epoch": 1.7166880766086652, "grad_norm": 4.800380229949951, "learning_rate": 3.2840904667367356e-05, "loss": 0.6658, "step": 44100 }, { "epoch": 1.718634434972167, "grad_norm": 12.198222160339355, "learning_rate": 3.282144108373234e-05, "loss": 0.6534, "step": 44150 }, { "epoch": 1.720580793335669, "grad_norm": 21.233123779296875, "learning_rate": 3.280197750009732e-05, "loss": 0.6849, "step": 44200 }, { "epoch": 1.722527151699171, "grad_norm": 12.844585418701172, "learning_rate": 3.2782513916462295e-05, "loss": 0.6745, "step": 44250 }, { "epoch": 1.7244735100626727, "grad_norm": 23.284297943115234, "learning_rate": 3.276305033282728e-05, "loss": 0.6505, 
"step": 44300 }, { "epoch": 1.7264198684261747, "grad_norm": 19.108293533325195, "learning_rate": 3.2743586749192264e-05, "loss": 0.7469, "step": 44350 }, { "epoch": 1.7283662267896767, "grad_norm": 11.593676567077637, "learning_rate": 3.272412316555725e-05, "loss": 0.6567, "step": 44400 }, { "epoch": 1.7303125851531784, "grad_norm": 19.396406173706055, "learning_rate": 3.2704659581922226e-05, "loss": 0.6372, "step": 44450 }, { "epoch": 1.7322589435166802, "grad_norm": 3.8881468772888184, "learning_rate": 3.2685195998287204e-05, "loss": 0.5602, "step": 44500 }, { "epoch": 1.7342053018801822, "grad_norm": 15.071468353271484, "learning_rate": 3.266573241465219e-05, "loss": 0.7532, "step": 44550 }, { "epoch": 1.7361516602436842, "grad_norm": 26.150928497314453, "learning_rate": 3.2646268831017166e-05, "loss": 0.6701, "step": 44600 }, { "epoch": 1.738098018607186, "grad_norm": 27.02860450744629, "learning_rate": 3.262680524738215e-05, "loss": 0.7129, "step": 44650 }, { "epoch": 1.7400443769706877, "grad_norm": 7.594332695007324, "learning_rate": 3.2607341663747134e-05, "loss": 0.5981, "step": 44700 }, { "epoch": 1.7419907353341897, "grad_norm": 30.662446975708008, "learning_rate": 3.258787808011211e-05, "loss": 0.59, "step": 44750 }, { "epoch": 1.7439370936976917, "grad_norm": 9.138285636901855, "learning_rate": 3.2568414496477096e-05, "loss": 0.6169, "step": 44800 }, { "epoch": 1.7458834520611934, "grad_norm": 17.97589683532715, "learning_rate": 3.2548950912842074e-05, "loss": 0.6749, "step": 44850 }, { "epoch": 1.7478298104246954, "grad_norm": 9.2334566116333, "learning_rate": 3.252948732920705e-05, "loss": 0.605, "step": 44900 }, { "epoch": 1.7497761687881974, "grad_norm": 16.177980422973633, "learning_rate": 3.2510023745572036e-05, "loss": 0.7335, "step": 44950 }, { "epoch": 1.7517225271516992, "grad_norm": 15.157333374023438, "learning_rate": 3.249056016193702e-05, "loss": 0.6568, "step": 45000 }, { "epoch": 1.753668885515201, "grad_norm": 58.45502853393555, 
"learning_rate": 3.2471096578302e-05, "loss": 0.6346, "step": 45050 }, { "epoch": 1.755615243878703, "grad_norm": 20.656431198120117, "learning_rate": 3.245163299466698e-05, "loss": 0.657, "step": 45100 }, { "epoch": 1.757561602242205, "grad_norm": 26.48894691467285, "learning_rate": 3.243255868270466e-05, "loss": 0.6399, "step": 45150 }, { "epoch": 1.7595079606057067, "grad_norm": 20.236629486083984, "learning_rate": 3.241309509906964e-05, "loss": 0.5923, "step": 45200 }, { "epoch": 1.7614543189692085, "grad_norm": 12.983643531799316, "learning_rate": 3.2393631515434625e-05, "loss": 0.6815, "step": 45250 }, { "epoch": 1.7634006773327107, "grad_norm": 46.368865966796875, "learning_rate": 3.23741679317996e-05, "loss": 0.6912, "step": 45300 }, { "epoch": 1.7653470356962124, "grad_norm": 8.254097938537598, "learning_rate": 3.235470434816458e-05, "loss": 0.6053, "step": 45350 }, { "epoch": 1.7672933940597142, "grad_norm": 26.602148056030273, "learning_rate": 3.2335240764529564e-05, "loss": 0.621, "step": 45400 }, { "epoch": 1.7692397524232162, "grad_norm": 23.168476104736328, "learning_rate": 3.231577718089455e-05, "loss": 0.6962, "step": 45450 }, { "epoch": 1.7711861107867182, "grad_norm": 32.12530517578125, "learning_rate": 3.229631359725953e-05, "loss": 0.6076, "step": 45500 }, { "epoch": 1.77313246915022, "grad_norm": 16.383962631225586, "learning_rate": 3.227685001362451e-05, "loss": 0.7166, "step": 45550 }, { "epoch": 1.7750788275137217, "grad_norm": 4.311821937561035, "learning_rate": 3.225738642998949e-05, "loss": 0.5843, "step": 45600 }, { "epoch": 1.7770251858772237, "grad_norm": 33.63228225708008, "learning_rate": 3.223792284635447e-05, "loss": 0.6279, "step": 45650 }, { "epoch": 1.7789715442407257, "grad_norm": 13.47040843963623, "learning_rate": 3.221845926271945e-05, "loss": 0.6172, "step": 45700 }, { "epoch": 1.7809179026042274, "grad_norm": 14.264995574951172, "learning_rate": 3.2198995679084434e-05, "loss": 0.7033, "step": 45750 }, { "epoch": 
1.7828642609677294, "grad_norm": 18.09751319885254, "learning_rate": 3.217953209544942e-05, "loss": 0.5833, "step": 45800 }, { "epoch": 1.7848106193312314, "grad_norm": 18.299312591552734, "learning_rate": 3.2160068511814396e-05, "loss": 0.6096, "step": 45850 }, { "epoch": 1.7867569776947332, "grad_norm": 20.845361709594727, "learning_rate": 3.214060492817938e-05, "loss": 0.7217, "step": 45900 }, { "epoch": 1.788703336058235, "grad_norm": 6.954010009765625, "learning_rate": 3.212114134454436e-05, "loss": 0.6337, "step": 45950 }, { "epoch": 1.790649694421737, "grad_norm": 9.469720840454102, "learning_rate": 3.2101677760909336e-05, "loss": 0.638, "step": 46000 }, { "epoch": 1.792596052785239, "grad_norm": 7.228301525115967, "learning_rate": 3.208221417727432e-05, "loss": 0.7075, "step": 46050 }, { "epoch": 1.7945424111487407, "grad_norm": 23.982351303100586, "learning_rate": 3.2062750593639305e-05, "loss": 0.5256, "step": 46100 }, { "epoch": 1.7964887695122425, "grad_norm": 28.82716178894043, "learning_rate": 3.204328701000429e-05, "loss": 0.5775, "step": 46150 }, { "epoch": 1.7984351278757444, "grad_norm": 20.36063575744629, "learning_rate": 3.202421269804197e-05, "loss": 0.7338, "step": 46200 }, { "epoch": 1.8003814862392464, "grad_norm": 30.550411224365234, "learning_rate": 3.200474911440695e-05, "loss": 0.5996, "step": 46250 }, { "epoch": 1.8023278446027482, "grad_norm": 5.496059417724609, "learning_rate": 3.1985285530771925e-05, "loss": 0.7492, "step": 46300 }, { "epoch": 1.8042742029662502, "grad_norm": 14.580866813659668, "learning_rate": 3.196582194713691e-05, "loss": 0.5773, "step": 46350 }, { "epoch": 1.8062205613297522, "grad_norm": 13.151415824890137, "learning_rate": 3.194635836350189e-05, "loss": 0.5629, "step": 46400 }, { "epoch": 1.808166919693254, "grad_norm": 10.927295684814453, "learning_rate": 3.192689477986687e-05, "loss": 0.6103, "step": 46450 }, { "epoch": 1.8101132780567557, "grad_norm": 43.06269454956055, "learning_rate": 
3.190743119623185e-05, "loss": 0.6338, "step": 46500 }, { "epoch": 1.8120596364202577, "grad_norm": 6.7707343101501465, "learning_rate": 3.188796761259683e-05, "loss": 0.5793, "step": 46550 }, { "epoch": 1.8140059947837597, "grad_norm": 11.158266067504883, "learning_rate": 3.186850402896182e-05, "loss": 0.6843, "step": 46600 }, { "epoch": 1.8159523531472614, "grad_norm": 54.30908203125, "learning_rate": 3.1849040445326795e-05, "loss": 0.6515, "step": 46650 }, { "epoch": 1.8178987115107632, "grad_norm": 80.3941650390625, "learning_rate": 3.182957686169178e-05, "loss": 0.6313, "step": 46700 }, { "epoch": 1.8198450698742652, "grad_norm": 7.776123523712158, "learning_rate": 3.181011327805676e-05, "loss": 0.5401, "step": 46750 }, { "epoch": 1.8217914282377672, "grad_norm": 98.66846466064453, "learning_rate": 3.1790649694421735e-05, "loss": 0.6268, "step": 46800 }, { "epoch": 1.823737786601269, "grad_norm": 21.368833541870117, "learning_rate": 3.177118611078672e-05, "loss": 0.6528, "step": 46850 }, { "epoch": 1.825684144964771, "grad_norm": 13.133437156677246, "learning_rate": 3.17517225271517e-05, "loss": 0.6296, "step": 46900 }, { "epoch": 1.827630503328273, "grad_norm": 27.631587982177734, "learning_rate": 3.173225894351668e-05, "loss": 0.6221, "step": 46950 }, { "epoch": 1.8295768616917747, "grad_norm": 9.664813041687012, "learning_rate": 3.1712795359881665e-05, "loss": 0.6765, "step": 47000 }, { "epoch": 1.8315232200552765, "grad_norm": 114.55805206298828, "learning_rate": 3.169333177624664e-05, "loss": 0.6833, "step": 47050 }, { "epoch": 1.8334695784187784, "grad_norm": 12.664616584777832, "learning_rate": 3.167386819261163e-05, "loss": 0.7272, "step": 47100 }, { "epoch": 1.8354159367822804, "grad_norm": 17.974599838256836, "learning_rate": 3.1654404608976605e-05, "loss": 0.5987, "step": 47150 }, { "epoch": 1.8373622951457822, "grad_norm": 24.91884422302246, "learning_rate": 3.163494102534158e-05, "loss": 0.6174, "step": 47200 }, { "epoch": 1.839308653509284, 
"grad_norm": 6.305722713470459, "learning_rate": 3.1615477441706573e-05, "loss": 0.5337, "step": 47250 }, { "epoch": 1.8412550118727862, "grad_norm": 12.780620574951172, "learning_rate": 3.159601385807155e-05, "loss": 0.6529, "step": 47300 }, { "epoch": 1.843201370236288, "grad_norm": 11.825821876525879, "learning_rate": 3.157655027443653e-05, "loss": 0.6319, "step": 47350 }, { "epoch": 1.8451477285997897, "grad_norm": 3.605128288269043, "learning_rate": 3.155708669080151e-05, "loss": 0.6426, "step": 47400 }, { "epoch": 1.8470940869632917, "grad_norm": 7.201091289520264, "learning_rate": 3.153762310716649e-05, "loss": 0.52, "step": 47450 }, { "epoch": 1.8490404453267937, "grad_norm": 14.191445350646973, "learning_rate": 3.1518159523531475e-05, "loss": 0.6618, "step": 47500 }, { "epoch": 1.8509868036902954, "grad_norm": 44.98381042480469, "learning_rate": 3.149869593989645e-05, "loss": 0.7115, "step": 47550 }, { "epoch": 1.8529331620537972, "grad_norm": 17.752029418945312, "learning_rate": 3.147923235626144e-05, "loss": 0.6583, "step": 47600 }, { "epoch": 1.8548795204172992, "grad_norm": 12.982268333435059, "learning_rate": 3.145976877262642e-05, "loss": 0.569, "step": 47650 }, { "epoch": 1.8568258787808012, "grad_norm": 5.3733062744140625, "learning_rate": 3.14403051889914e-05, "loss": 0.5254, "step": 47700 }, { "epoch": 1.858772237144303, "grad_norm": 37.37739562988281, "learning_rate": 3.1420841605356376e-05, "loss": 0.5901, "step": 47750 }, { "epoch": 1.860718595507805, "grad_norm": 24.81436538696289, "learning_rate": 3.140137802172136e-05, "loss": 0.6819, "step": 47800 }, { "epoch": 1.862664953871307, "grad_norm": 7.0834527015686035, "learning_rate": 3.138191443808634e-05, "loss": 0.6177, "step": 47850 }, { "epoch": 1.8646113122348087, "grad_norm": 8.853228569030762, "learning_rate": 3.136245085445132e-05, "loss": 0.6403, "step": 47900 }, { "epoch": 1.8665576705983105, "grad_norm": 11.595633506774902, "learning_rate": 3.134298727081631e-05, "loss": 0.6129, 
"step": 47950 }, { "epoch": 1.8685040289618124, "grad_norm": 11.003334045410156, "learning_rate": 3.1323523687181285e-05, "loss": 0.6193, "step": 48000 }, { "epoch": 1.8704503873253144, "grad_norm": 13.160431861877441, "learning_rate": 3.130406010354627e-05, "loss": 0.5909, "step": 48050 }, { "epoch": 1.8723967456888162, "grad_norm": 18.65304183959961, "learning_rate": 3.1284596519911247e-05, "loss": 0.5555, "step": 48100 }, { "epoch": 1.874343104052318, "grad_norm": 11.042593002319336, "learning_rate": 3.126513293627623e-05, "loss": 0.7243, "step": 48150 }, { "epoch": 1.87628946241582, "grad_norm": 11.163121223449707, "learning_rate": 3.124566935264121e-05, "loss": 0.603, "step": 48200 }, { "epoch": 1.878235820779322, "grad_norm": 40.173580169677734, "learning_rate": 3.1226205769006186e-05, "loss": 0.713, "step": 48250 }, { "epoch": 1.8801821791428237, "grad_norm": 24.671205520629883, "learning_rate": 3.120674218537118e-05, "loss": 0.6264, "step": 48300 }, { "epoch": 1.8821285375063257, "grad_norm": 17.38442611694336, "learning_rate": 3.1187278601736155e-05, "loss": 0.6425, "step": 48350 }, { "epoch": 1.8840748958698277, "grad_norm": 35.746726989746094, "learning_rate": 3.116781501810113e-05, "loss": 0.631, "step": 48400 }, { "epoch": 1.8860212542333294, "grad_norm": 14.038824081420898, "learning_rate": 3.114835143446612e-05, "loss": 0.6762, "step": 48450 }, { "epoch": 1.8879676125968312, "grad_norm": 2.4392940998077393, "learning_rate": 3.1128887850831094e-05, "loss": 0.6374, "step": 48500 }, { "epoch": 1.8899139709603332, "grad_norm": 9.120672225952148, "learning_rate": 3.110942426719608e-05, "loss": 0.6114, "step": 48550 }, { "epoch": 1.8918603293238352, "grad_norm": 3.8188068866729736, "learning_rate": 3.1089960683561056e-05, "loss": 0.6156, "step": 48600 }, { "epoch": 1.893806687687337, "grad_norm": 22.31060791015625, "learning_rate": 3.107049709992604e-05, "loss": 0.5672, "step": 48650 }, { "epoch": 1.8957530460508387, "grad_norm": 7.84792947769165, 
"learning_rate": 3.1051033516291025e-05, "loss": 0.5889, "step": 48700 }, { "epoch": 1.897699404414341, "grad_norm": 5.405965328216553, "learning_rate": 3.1031569932656e-05, "loss": 0.5499, "step": 48750 }, { "epoch": 1.8996457627778427, "grad_norm": 10.002777099609375, "learning_rate": 3.101210634902098e-05, "loss": 0.642, "step": 48800 }, { "epoch": 1.9015921211413445, "grad_norm": 20.25397300720215, "learning_rate": 3.099303203705867e-05, "loss": 0.6415, "step": 48850 }, { "epoch": 1.9035384795048464, "grad_norm": 21.4931640625, "learning_rate": 3.0973568453423645e-05, "loss": 0.6384, "step": 48900 }, { "epoch": 1.9054848378683484, "grad_norm": 27.349008560180664, "learning_rate": 3.095410486978862e-05, "loss": 0.6655, "step": 48950 }, { "epoch": 1.9074311962318502, "grad_norm": 6.929286003112793, "learning_rate": 3.093464128615361e-05, "loss": 0.6654, "step": 49000 }, { "epoch": 1.909377554595352, "grad_norm": 6.163684844970703, "learning_rate": 3.091517770251859e-05, "loss": 0.5644, "step": 49050 }, { "epoch": 1.911323912958854, "grad_norm": 12.192011833190918, "learning_rate": 3.0895714118883576e-05, "loss": 0.6454, "step": 49100 }, { "epoch": 1.913270271322356, "grad_norm": 9.885323524475098, "learning_rate": 3.0876250535248553e-05, "loss": 0.6796, "step": 49150 }, { "epoch": 1.9152166296858577, "grad_norm": 5.621720790863037, "learning_rate": 3.085678695161353e-05, "loss": 0.5349, "step": 49200 }, { "epoch": 1.9171629880493597, "grad_norm": 6.426109313964844, "learning_rate": 3.0837323367978515e-05, "loss": 0.6792, "step": 49250 }, { "epoch": 1.9191093464128617, "grad_norm": 35.79526901245117, "learning_rate": 3.081785978434349e-05, "loss": 0.5889, "step": 49300 }, { "epoch": 1.9210557047763634, "grad_norm": 2.708463430404663, "learning_rate": 3.079839620070847e-05, "loss": 0.6352, "step": 49350 }, { "epoch": 1.9230020631398652, "grad_norm": 13.67793083190918, "learning_rate": 3.0778932617073455e-05, "loss": 0.6673, "step": 49400 }, { "epoch": 
1.9249484215033672, "grad_norm": 6.640336513519287, "learning_rate": 3.075946903343844e-05, "loss": 0.6021, "step": 49450 }, { "epoch": 1.9268947798668692, "grad_norm": 7.4698710441589355, "learning_rate": 3.0740005449803424e-05, "loss": 0.6311, "step": 49500 }, { "epoch": 1.928841138230371, "grad_norm": 47.44767761230469, "learning_rate": 3.07205418661684e-05, "loss": 0.559, "step": 49550 }, { "epoch": 1.9307874965938727, "grad_norm": 1.724997639656067, "learning_rate": 3.070107828253338e-05, "loss": 0.5835, "step": 49600 }, { "epoch": 1.9327338549573747, "grad_norm": 6.836215019226074, "learning_rate": 3.068161469889836e-05, "loss": 0.6593, "step": 49650 }, { "epoch": 1.9346802133208767, "grad_norm": 8.789504051208496, "learning_rate": 3.066215111526334e-05, "loss": 0.6112, "step": 49700 }, { "epoch": 1.9366265716843785, "grad_norm": 11.758174896240234, "learning_rate": 3.0642687531628325e-05, "loss": 0.6491, "step": 49750 }, { "epoch": 1.9385729300478804, "grad_norm": 6.510067462921143, "learning_rate": 3.062322394799331e-05, "loss": 0.7502, "step": 49800 }, { "epoch": 1.9405192884113824, "grad_norm": 20.4722900390625, "learning_rate": 3.060376036435829e-05, "loss": 0.6498, "step": 49850 }, { "epoch": 1.9424656467748842, "grad_norm": 2.776576280593872, "learning_rate": 3.058429678072327e-05, "loss": 0.5751, "step": 49900 }, { "epoch": 1.944412005138386, "grad_norm": 10.676612854003906, "learning_rate": 3.056483319708825e-05, "loss": 0.6157, "step": 49950 }, { "epoch": 1.946358363501888, "grad_norm": 16.13102149963379, "learning_rate": 3.0545369613453227e-05, "loss": 0.7086, "step": 50000 }, { "epoch": 1.94830472186539, "grad_norm": 42.112613677978516, "learning_rate": 3.052590602981821e-05, "loss": 0.536, "step": 50050 }, { "epoch": 1.9502510802288917, "grad_norm": 24.60944175720215, "learning_rate": 3.0506442446183192e-05, "loss": 0.5617, "step": 50100 }, { "epoch": 1.9521974385923935, "grad_norm": 8.113758087158203, "learning_rate": 3.048697886254817e-05, 
"loss": 0.6635, "step": 50150 }, { "epoch": 1.9541437969558957, "grad_norm": 25.426332473754883, "learning_rate": 3.0467515278913157e-05, "loss": 0.6343, "step": 50200 }, { "epoch": 1.9560901553193975, "grad_norm": 20.29708480834961, "learning_rate": 3.0448051695278135e-05, "loss": 0.5633, "step": 50250 }, { "epoch": 1.9580365136828992, "grad_norm": 13.312697410583496, "learning_rate": 3.042858811164312e-05, "loss": 0.6025, "step": 50300 }, { "epoch": 1.9599828720464012, "grad_norm": 11.712089538574219, "learning_rate": 3.0409124528008097e-05, "loss": 0.6277, "step": 50350 }, { "epoch": 1.9619292304099032, "grad_norm": 5.127732753753662, "learning_rate": 3.0389660944373078e-05, "loss": 0.533, "step": 50400 }, { "epoch": 1.963875588773405, "grad_norm": 7.2708587646484375, "learning_rate": 3.0370197360738062e-05, "loss": 0.6458, "step": 50450 }, { "epoch": 1.9658219471369067, "grad_norm": 24.060832977294922, "learning_rate": 3.035073377710304e-05, "loss": 0.6851, "step": 50500 }, { "epoch": 1.9677683055004087, "grad_norm": 11.673768043518066, "learning_rate": 3.0331270193468024e-05, "loss": 0.6812, "step": 50550 }, { "epoch": 1.9697146638639107, "grad_norm": 24.654735565185547, "learning_rate": 3.0311806609833005e-05, "loss": 0.5844, "step": 50600 }, { "epoch": 1.9716610222274125, "grad_norm": 18.49933624267578, "learning_rate": 3.0292343026197983e-05, "loss": 0.6153, "step": 50650 }, { "epoch": 1.9736073805909145, "grad_norm": 5.433356761932373, "learning_rate": 3.0272879442562967e-05, "loss": 0.6904, "step": 50700 }, { "epoch": 1.9755537389544164, "grad_norm": 14.306107521057129, "learning_rate": 3.0253415858927948e-05, "loss": 0.5678, "step": 50750 }, { "epoch": 1.9775000973179182, "grad_norm": 43.65245819091797, "learning_rate": 3.0233952275292925e-05, "loss": 0.5237, "step": 50800 }, { "epoch": 1.97944645568142, "grad_norm": 9.075907707214355, "learning_rate": 3.021448869165791e-05, "loss": 0.6016, "step": 50850 }, { "epoch": 1.981392814044922, "grad_norm": 
18.92595672607422, "learning_rate": 3.019502510802289e-05, "loss": 0.6133, "step": 50900 }, { "epoch": 1.983339172408424, "grad_norm": 14.473649978637695, "learning_rate": 3.0175561524387875e-05, "loss": 0.5555, "step": 50950 }, { "epoch": 1.9852855307719257, "grad_norm": 10.031424522399902, "learning_rate": 3.0156097940752853e-05, "loss": 0.6867, "step": 51000 }, { "epoch": 1.9872318891354275, "grad_norm": 17.073915481567383, "learning_rate": 3.0136634357117834e-05, "loss": 0.5919, "step": 51050 }, { "epoch": 1.9891782474989295, "grad_norm": 5.591005802154541, "learning_rate": 3.0117170773482818e-05, "loss": 0.6182, "step": 51100 }, { "epoch": 1.9911246058624315, "grad_norm": 13.550150871276855, "learning_rate": 3.0097707189847796e-05, "loss": 0.457, "step": 51150 }, { "epoch": 1.9930709642259332, "grad_norm": 60.49889373779297, "learning_rate": 3.0078243606212773e-05, "loss": 0.6347, "step": 51200 }, { "epoch": 1.9950173225894352, "grad_norm": 7.604905128479004, "learning_rate": 3.005878002257776e-05, "loss": 0.6814, "step": 51250 }, { "epoch": 1.9969636809529372, "grad_norm": 29.387537002563477, "learning_rate": 3.003931643894274e-05, "loss": 0.6898, "step": 51300 }, { "epoch": 1.998910039316439, "grad_norm": 5.087653160095215, "learning_rate": 3.0019852855307723e-05, "loss": 0.6589, "step": 51350 }, { "epoch": 2.0, "eval_accuracy": 0.7737163766592705, "eval_f1_macro": 0.7229216076954589, "eval_f1_weighted": 0.7698159304253903, "eval_loss": 0.6461913585662842, "eval_roc_auc": 0.9461542596310664, "eval_runtime": 26.0737, "eval_samples_per_second": 985.244, "eval_steps_per_second": 123.189, "step": 51378 }, { "epoch": 2.0008563976799407, "grad_norm": 5.689671993255615, "learning_rate": 3.00003892716727e-05, "loss": 0.6316, "step": 51400 }, { "epoch": 2.002802756043443, "grad_norm": 19.76540184020996, "learning_rate": 2.998092568803768e-05, "loss": 0.5968, "step": 51450 }, { "epoch": 2.0047491144069447, "grad_norm": 20.951391220092773, "learning_rate": 
2.9961462104402666e-05, "loss": 0.6136, "step": 51500 }, { "epoch": 2.0066954727704465, "grad_norm": 27.283966064453125, "learning_rate": 2.9941998520767643e-05, "loss": 0.5586, "step": 51550 }, { "epoch": 2.0086418311339482, "grad_norm": 3.5430080890655518, "learning_rate": 2.9922534937132624e-05, "loss": 0.6073, "step": 51600 }, { "epoch": 2.0105881894974504, "grad_norm": 5.316318035125732, "learning_rate": 2.990307135349761e-05, "loss": 0.5814, "step": 51650 }, { "epoch": 2.012534547860952, "grad_norm": 22.576169967651367, "learning_rate": 2.9883607769862586e-05, "loss": 0.5864, "step": 51700 }, { "epoch": 2.014480906224454, "grad_norm": 28.686248779296875, "learning_rate": 2.986414418622757e-05, "loss": 0.6734, "step": 51750 }, { "epoch": 2.0164272645879557, "grad_norm": 19.677955627441406, "learning_rate": 2.984468060259255e-05, "loss": 0.5502, "step": 51800 }, { "epoch": 2.018373622951458, "grad_norm": 24.950286865234375, "learning_rate": 2.982521701895753e-05, "loss": 0.5667, "step": 51850 }, { "epoch": 2.0203199813149597, "grad_norm": 5.69610595703125, "learning_rate": 2.9805753435322514e-05, "loss": 0.5979, "step": 51900 }, { "epoch": 2.0222663396784615, "grad_norm": 77.0417251586914, "learning_rate": 2.9786289851687495e-05, "loss": 0.5531, "step": 51950 }, { "epoch": 2.0242126980419637, "grad_norm": 2.097442150115967, "learning_rate": 2.976682626805248e-05, "loss": 0.5466, "step": 52000 }, { "epoch": 2.0261590564054655, "grad_norm": 24.78962516784668, "learning_rate": 2.9747362684417457e-05, "loss": 0.5211, "step": 52050 }, { "epoch": 2.028105414768967, "grad_norm": 8.230952262878418, "learning_rate": 2.9727899100782438e-05, "loss": 0.6087, "step": 52100 }, { "epoch": 2.030051773132469, "grad_norm": 15.653302192687988, "learning_rate": 2.9708435517147422e-05, "loss": 0.578, "step": 52150 }, { "epoch": 2.031998131495971, "grad_norm": 17.88194465637207, "learning_rate": 2.96889719335124e-05, "loss": 0.6051, "step": 52200 }, { "epoch": 2.033944489859473, 
"grad_norm": 12.239280700683594, "learning_rate": 2.9669508349877377e-05, "loss": 0.6138, "step": 52250 }, { "epoch": 2.0358908482229747, "grad_norm": 25.323143005371094, "learning_rate": 2.965004476624236e-05, "loss": 0.5684, "step": 52300 }, { "epoch": 2.0378372065864765, "grad_norm": 16.554807662963867, "learning_rate": 2.9630581182607342e-05, "loss": 0.556, "step": 52350 }, { "epoch": 2.0397835649499787, "grad_norm": 24.392778396606445, "learning_rate": 2.9611117598972327e-05, "loss": 0.586, "step": 52400 }, { "epoch": 2.0417299233134805, "grad_norm": 29.97602081298828, "learning_rate": 2.9591654015337304e-05, "loss": 0.5072, "step": 52450 }, { "epoch": 2.0436762816769822, "grad_norm": 14.967512130737305, "learning_rate": 2.9572190431702285e-05, "loss": 0.6156, "step": 52500 }, { "epoch": 2.0456226400404844, "grad_norm": 15.501233100891113, "learning_rate": 2.955272684806727e-05, "loss": 0.4743, "step": 52550 }, { "epoch": 2.047568998403986, "grad_norm": 17.823915481567383, "learning_rate": 2.9533263264432247e-05, "loss": 0.5985, "step": 52600 }, { "epoch": 2.049515356767488, "grad_norm": 7.528595924377441, "learning_rate": 2.9513799680797228e-05, "loss": 0.5974, "step": 52650 }, { "epoch": 2.0514617151309897, "grad_norm": 6.566778182983398, "learning_rate": 2.9494336097162213e-05, "loss": 0.5491, "step": 52700 }, { "epoch": 2.053408073494492, "grad_norm": 4.719038963317871, "learning_rate": 2.947487251352719e-05, "loss": 0.5449, "step": 52750 }, { "epoch": 2.0553544318579937, "grad_norm": 10.979540824890137, "learning_rate": 2.9455408929892174e-05, "loss": 0.6161, "step": 52800 }, { "epoch": 2.0573007902214955, "grad_norm": 20.325965881347656, "learning_rate": 2.9435945346257155e-05, "loss": 0.5414, "step": 52850 }, { "epoch": 2.0592471485849977, "grad_norm": 38.64848327636719, "learning_rate": 2.9416481762622133e-05, "loss": 0.6009, "step": 52900 }, { "epoch": 2.0611935069484995, "grad_norm": 62.28468704223633, "learning_rate": 2.939740745065982e-05, "loss": 
0.6231, "step": 52950 }, { "epoch": 2.063139865312001, "grad_norm": 22.000192642211914, "learning_rate": 2.9377943867024798e-05, "loss": 0.5777, "step": 53000 }, { "epoch": 2.065086223675503, "grad_norm": 19.796693801879883, "learning_rate": 2.935848028338978e-05, "loss": 0.5028, "step": 53050 }, { "epoch": 2.067032582039005, "grad_norm": 4.130773544311523, "learning_rate": 2.9339016699754763e-05, "loss": 0.607, "step": 53100 }, { "epoch": 2.068978940402507, "grad_norm": 3.1777868270874023, "learning_rate": 2.9319942387792444e-05, "loss": 0.5929, "step": 53150 }, { "epoch": 2.0709252987660087, "grad_norm": 69.64038848876953, "learning_rate": 2.9300478804157422e-05, "loss": 0.612, "step": 53200 }, { "epoch": 2.0728716571295105, "grad_norm": 5.487189769744873, "learning_rate": 2.9281015220522406e-05, "loss": 0.6076, "step": 53250 }, { "epoch": 2.0748180154930127, "grad_norm": 13.81440258026123, "learning_rate": 2.9261551636887387e-05, "loss": 0.5965, "step": 53300 }, { "epoch": 2.0767643738565145, "grad_norm": 34.77814483642578, "learning_rate": 2.9242088053252365e-05, "loss": 0.491, "step": 53350 }, { "epoch": 2.0787107322200162, "grad_norm": 28.347227096557617, "learning_rate": 2.922262446961735e-05, "loss": 0.4312, "step": 53400 }, { "epoch": 2.0806570905835184, "grad_norm": 5.333169937133789, "learning_rate": 2.9203160885982327e-05, "loss": 0.6982, "step": 53450 }, { "epoch": 2.08260344894702, "grad_norm": 13.304935455322266, "learning_rate": 2.9183697302347308e-05, "loss": 0.5584, "step": 53500 }, { "epoch": 2.084549807310522, "grad_norm": 7.216817378997803, "learning_rate": 2.9164233718712292e-05, "loss": 0.5128, "step": 53550 }, { "epoch": 2.0864961656740237, "grad_norm": 11.369471549987793, "learning_rate": 2.914477013507727e-05, "loss": 0.6169, "step": 53600 }, { "epoch": 2.088442524037526, "grad_norm": 13.658463478088379, "learning_rate": 2.9125306551442254e-05, "loss": 0.5247, "step": 53650 }, { "epoch": 2.0903888824010277, "grad_norm": 14.956033706665039, 
"learning_rate": 2.9105842967807235e-05, "loss": 0.4936, "step": 53700 }, { "epoch": 2.0923352407645295, "grad_norm": 18.098590850830078, "learning_rate": 2.9086379384172212e-05, "loss": 0.5413, "step": 53750 }, { "epoch": 2.0942815991280312, "grad_norm": 25.9637393951416, "learning_rate": 2.9066915800537197e-05, "loss": 0.7275, "step": 53800 }, { "epoch": 2.0962279574915335, "grad_norm": 12.264385223388672, "learning_rate": 2.9047452216902178e-05, "loss": 0.5859, "step": 53850 }, { "epoch": 2.098174315855035, "grad_norm": 3.335156202316284, "learning_rate": 2.9027988633267155e-05, "loss": 0.5545, "step": 53900 }, { "epoch": 2.100120674218537, "grad_norm": 47.84690475463867, "learning_rate": 2.900852504963214e-05, "loss": 0.6385, "step": 53950 }, { "epoch": 2.102067032582039, "grad_norm": 8.796411514282227, "learning_rate": 2.898906146599712e-05, "loss": 0.6037, "step": 54000 }, { "epoch": 2.104013390945541, "grad_norm": 13.429327011108398, "learning_rate": 2.8969597882362105e-05, "loss": 0.5298, "step": 54050 }, { "epoch": 2.1059597493090427, "grad_norm": 20.816314697265625, "learning_rate": 2.8950134298727083e-05, "loss": 0.5732, "step": 54100 }, { "epoch": 2.1079061076725445, "grad_norm": 15.441302299499512, "learning_rate": 2.8930670715092064e-05, "loss": 0.4856, "step": 54150 }, { "epoch": 2.1098524660360467, "grad_norm": 78.61820220947266, "learning_rate": 2.8911207131457048e-05, "loss": 0.5522, "step": 54200 }, { "epoch": 2.1117988243995485, "grad_norm": 6.467291831970215, "learning_rate": 2.8891743547822026e-05, "loss": 0.5603, "step": 54250 }, { "epoch": 2.1137451827630502, "grad_norm": 26.693876266479492, "learning_rate": 2.887227996418701e-05, "loss": 0.5965, "step": 54300 }, { "epoch": 2.1156915411265524, "grad_norm": 1.6628177165985107, "learning_rate": 2.8852816380551987e-05, "loss": 0.4467, "step": 54350 }, { "epoch": 2.117637899490054, "grad_norm": 26.832683563232422, "learning_rate": 2.883335279691697e-05, "loss": 0.57, "step": 54400 }, { "epoch": 
2.119584257853556, "grad_norm": 3.420207977294922, "learning_rate": 2.8813889213281953e-05, "loss": 0.5984, "step": 54450 }, { "epoch": 2.1215306162170577, "grad_norm": 45.87754440307617, "learning_rate": 2.879442562964693e-05, "loss": 0.5181, "step": 54500 }, { "epoch": 2.12347697458056, "grad_norm": 10.076251983642578, "learning_rate": 2.877496204601191e-05, "loss": 0.693, "step": 54550 }, { "epoch": 2.1254233329440617, "grad_norm": 22.976442337036133, "learning_rate": 2.8755498462376896e-05, "loss": 0.5104, "step": 54600 }, { "epoch": 2.1273696913075635, "grad_norm": 36.7247200012207, "learning_rate": 2.8736034878741873e-05, "loss": 0.6249, "step": 54650 }, { "epoch": 2.1293160496710652, "grad_norm": 24.524677276611328, "learning_rate": 2.8716571295106858e-05, "loss": 0.5927, "step": 54700 }, { "epoch": 2.1312624080345675, "grad_norm": 20.642854690551758, "learning_rate": 2.869710771147184e-05, "loss": 0.6129, "step": 54750 }, { "epoch": 2.1332087663980692, "grad_norm": 10.939338684082031, "learning_rate": 2.8677644127836816e-05, "loss": 0.5911, "step": 54800 }, { "epoch": 2.135155124761571, "grad_norm": 39.195526123046875, "learning_rate": 2.86581805442018e-05, "loss": 0.5808, "step": 54850 }, { "epoch": 2.1371014831250728, "grad_norm": 15.959467887878418, "learning_rate": 2.863871696056678e-05, "loss": 0.5285, "step": 54900 }, { "epoch": 2.139047841488575, "grad_norm": 21.316715240478516, "learning_rate": 2.861925337693176e-05, "loss": 0.6106, "step": 54950 }, { "epoch": 2.1409941998520767, "grad_norm": 17.284530639648438, "learning_rate": 2.8599789793296743e-05, "loss": 0.5247, "step": 55000 }, { "epoch": 2.1429405582155785, "grad_norm": 17.983993530273438, "learning_rate": 2.8580326209661724e-05, "loss": 0.4987, "step": 55050 }, { "epoch": 2.1448869165790807, "grad_norm": 23.051467895507812, "learning_rate": 2.856086262602671e-05, "loss": 0.5598, "step": 55100 }, { "epoch": 2.1468332749425825, "grad_norm": 5.550172328948975, "learning_rate": 
2.8541399042391686e-05, "loss": 0.4817, "step": 55150 }, { "epoch": 2.1487796333060842, "grad_norm": 31.626354217529297, "learning_rate": 2.8521935458756664e-05, "loss": 0.5925, "step": 55200 }, { "epoch": 2.150725991669586, "grad_norm": 25.689552307128906, "learning_rate": 2.8502471875121652e-05, "loss": 0.6334, "step": 55250 }, { "epoch": 2.152672350033088, "grad_norm": 31.3800106048584, "learning_rate": 2.848300829148663e-05, "loss": 0.6121, "step": 55300 }, { "epoch": 2.15461870839659, "grad_norm": 29.860572814941406, "learning_rate": 2.8463544707851614e-05, "loss": 0.5272, "step": 55350 }, { "epoch": 2.1565650667600917, "grad_norm": 9.451452255249023, "learning_rate": 2.844408112421659e-05, "loss": 0.5445, "step": 55400 }, { "epoch": 2.158511425123594, "grad_norm": 22.7726993560791, "learning_rate": 2.8424617540581572e-05, "loss": 0.5346, "step": 55450 }, { "epoch": 2.1604577834870957, "grad_norm": 7.104035377502441, "learning_rate": 2.8405153956946557e-05, "loss": 0.5799, "step": 55500 }, { "epoch": 2.1624041418505975, "grad_norm": 13.191107749938965, "learning_rate": 2.8385690373311534e-05, "loss": 0.6823, "step": 55550 }, { "epoch": 2.1643505002140992, "grad_norm": 28.219919204711914, "learning_rate": 2.8366226789676515e-05, "loss": 0.6119, "step": 55600 }, { "epoch": 2.1662968585776015, "grad_norm": 7.374736785888672, "learning_rate": 2.83467632060415e-05, "loss": 0.5837, "step": 55650 }, { "epoch": 2.1682432169411032, "grad_norm": 7.324770927429199, "learning_rate": 2.8327299622406477e-05, "loss": 0.5509, "step": 55700 }, { "epoch": 2.170189575304605, "grad_norm": 31.97715950012207, "learning_rate": 2.830783603877146e-05, "loss": 0.6221, "step": 55750 }, { "epoch": 2.172135933668107, "grad_norm": 23.755449295043945, "learning_rate": 2.8288372455136442e-05, "loss": 0.6192, "step": 55800 }, { "epoch": 2.174082292031609, "grad_norm": 0.6099250912666321, "learning_rate": 2.826890887150142e-05, "loss": 0.5872, "step": 55850 }, { "epoch": 2.1760286503951107, 
"grad_norm": 9.792630195617676, "learning_rate": 2.8249445287866404e-05, "loss": 0.5027, "step": 55900 }, { "epoch": 2.1779750087586125, "grad_norm": 10.416454315185547, "learning_rate": 2.8229981704231385e-05, "loss": 0.6413, "step": 55950 }, { "epoch": 2.1799213671221147, "grad_norm": 17.33498191833496, "learning_rate": 2.8210518120596363e-05, "loss": 0.6023, "step": 56000 }, { "epoch": 2.1818677254856165, "grad_norm": 7.310183525085449, "learning_rate": 2.8191054536961347e-05, "loss": 0.5768, "step": 56050 }, { "epoch": 2.1838140838491182, "grad_norm": 6.330143928527832, "learning_rate": 2.8171590953326328e-05, "loss": 0.6224, "step": 56100 }, { "epoch": 2.18576044221262, "grad_norm": 21.384601593017578, "learning_rate": 2.8152127369691313e-05, "loss": 0.5293, "step": 56150 }, { "epoch": 2.187706800576122, "grad_norm": 31.702720642089844, "learning_rate": 2.813266378605629e-05, "loss": 0.6428, "step": 56200 }, { "epoch": 2.189653158939624, "grad_norm": 14.322802543640137, "learning_rate": 2.8113200202421268e-05, "loss": 0.5632, "step": 56250 }, { "epoch": 2.1915995173031257, "grad_norm": 8.354218482971191, "learning_rate": 2.809412589045895e-05, "loss": 0.623, "step": 56300 }, { "epoch": 2.1935458756666275, "grad_norm": 23.4522762298584, "learning_rate": 2.8074662306823933e-05, "loss": 0.5802, "step": 56350 }, { "epoch": 2.1954922340301297, "grad_norm": 18.079788208007812, "learning_rate": 2.8055198723188914e-05, "loss": 0.6231, "step": 56400 }, { "epoch": 2.1974385923936315, "grad_norm": 13.644038200378418, "learning_rate": 2.8035735139553898e-05, "loss": 0.5348, "step": 56450 }, { "epoch": 2.1993849507571333, "grad_norm": 23.219881057739258, "learning_rate": 2.8016271555918876e-05, "loss": 0.5698, "step": 56500 }, { "epoch": 2.2013313091206355, "grad_norm": 10.476996421813965, "learning_rate": 2.7996807972283857e-05, "loss": 0.5823, "step": 56550 }, { "epoch": 2.2032776674841372, "grad_norm": 36.06291961669922, "learning_rate": 2.797734438864884e-05, "loss": 
0.5152, "step": 56600 }, { "epoch": 2.205224025847639, "grad_norm": 3.8125925064086914, "learning_rate": 2.795788080501382e-05, "loss": 0.5193, "step": 56650 }, { "epoch": 2.2071703842111408, "grad_norm": 8.895625114440918, "learning_rate": 2.7938417221378803e-05, "loss": 0.6182, "step": 56700 }, { "epoch": 2.209116742574643, "grad_norm": 23.124698638916016, "learning_rate": 2.7918953637743784e-05, "loss": 0.5398, "step": 56750 }, { "epoch": 2.2110631009381447, "grad_norm": 2.5478899478912354, "learning_rate": 2.789949005410876e-05, "loss": 0.61, "step": 56800 }, { "epoch": 2.2130094593016465, "grad_norm": 17.31949234008789, "learning_rate": 2.7880026470473746e-05, "loss": 0.6121, "step": 56850 }, { "epoch": 2.2149558176651487, "grad_norm": 10.137930870056152, "learning_rate": 2.7860562886838727e-05, "loss": 0.5487, "step": 56900 }, { "epoch": 2.2169021760286505, "grad_norm": 16.173480987548828, "learning_rate": 2.7841099303203704e-05, "loss": 0.6913, "step": 56950 }, { "epoch": 2.2188485343921522, "grad_norm": 20.57544708251953, "learning_rate": 2.782163571956869e-05, "loss": 0.6087, "step": 57000 }, { "epoch": 2.220794892755654, "grad_norm": 47.49404525756836, "learning_rate": 2.780217213593367e-05, "loss": 0.6687, "step": 57050 }, { "epoch": 2.222741251119156, "grad_norm": 38.83674240112305, "learning_rate": 2.7782708552298654e-05, "loss": 0.6314, "step": 57100 }, { "epoch": 2.224687609482658, "grad_norm": 6.05210018157959, "learning_rate": 2.7763244968663632e-05, "loss": 0.609, "step": 57150 }, { "epoch": 2.2266339678461597, "grad_norm": 5.9965362548828125, "learning_rate": 2.774378138502861e-05, "loss": 0.5669, "step": 57200 }, { "epoch": 2.228580326209662, "grad_norm": 40.08223342895508, "learning_rate": 2.7724317801393597e-05, "loss": 0.5614, "step": 57250 }, { "epoch": 2.2305266845731637, "grad_norm": 23.709110260009766, "learning_rate": 2.7704854217758575e-05, "loss": 0.5487, "step": 57300 }, { "epoch": 2.2324730429366655, "grad_norm": 8.868976593017578, 
"learning_rate": 2.7685390634123552e-05, "loss": 0.6049, "step": 57350 }, { "epoch": 2.2344194013001673, "grad_norm": 0.41209474205970764, "learning_rate": 2.7665927050488537e-05, "loss": 0.556, "step": 57400 }, { "epoch": 2.2363657596636695, "grad_norm": 7.829455852508545, "learning_rate": 2.7646463466853518e-05, "loss": 0.5187, "step": 57450 }, { "epoch": 2.2383121180271712, "grad_norm": 12.621383666992188, "learning_rate": 2.7626999883218502e-05, "loss": 0.6037, "step": 57500 }, { "epoch": 2.240258476390673, "grad_norm": 17.137496948242188, "learning_rate": 2.760753629958348e-05, "loss": 0.5516, "step": 57550 }, { "epoch": 2.2422048347541748, "grad_norm": 19.438697814941406, "learning_rate": 2.758807271594846e-05, "loss": 0.6041, "step": 57600 }, { "epoch": 2.244151193117677, "grad_norm": 39.467769622802734, "learning_rate": 2.7568609132313445e-05, "loss": 0.6032, "step": 57650 }, { "epoch": 2.2460975514811787, "grad_norm": 25.654226303100586, "learning_rate": 2.7549145548678422e-05, "loss": 0.5093, "step": 57700 }, { "epoch": 2.2480439098446805, "grad_norm": 29.690357208251953, "learning_rate": 2.7529681965043403e-05, "loss": 0.6973, "step": 57750 }, { "epoch": 2.2499902682081823, "grad_norm": 16.75621795654297, "learning_rate": 2.7510218381408388e-05, "loss": 0.5497, "step": 57800 }, { "epoch": 2.2519366265716845, "grad_norm": 19.21369743347168, "learning_rate": 2.7490754797773365e-05, "loss": 0.6079, "step": 57850 }, { "epoch": 2.2538829849351862, "grad_norm": 21.246784210205078, "learning_rate": 2.747129121413835e-05, "loss": 0.5224, "step": 57900 }, { "epoch": 2.255829343298688, "grad_norm": 5.209725856781006, "learning_rate": 2.745182763050333e-05, "loss": 0.479, "step": 57950 }, { "epoch": 2.25777570166219, "grad_norm": 21.560047149658203, "learning_rate": 2.7432364046868308e-05, "loss": 0.4879, "step": 58000 }, { "epoch": 2.259722060025692, "grad_norm": 20.20037078857422, "learning_rate": 2.7412900463233293e-05, "loss": 0.5609, "step": 58050 }, { 
"epoch": 2.2616684183891937, "grad_norm": 78.23851013183594, "learning_rate": 2.7393436879598274e-05, "loss": 0.5232, "step": 58100 }, { "epoch": 2.2636147767526955, "grad_norm": 21.22826385498047, "learning_rate": 2.7373973295963258e-05, "loss": 0.5727, "step": 58150 }, { "epoch": 2.2655611351161977, "grad_norm": 13.096561431884766, "learning_rate": 2.7354509712328236e-05, "loss": 0.5608, "step": 58200 }, { "epoch": 2.2675074934796995, "grad_norm": 15.57311725616455, "learning_rate": 2.7335046128693213e-05, "loss": 0.6819, "step": 58250 }, { "epoch": 2.2694538518432013, "grad_norm": 13.421225547790527, "learning_rate": 2.73155825450582e-05, "loss": 0.5781, "step": 58300 }, { "epoch": 2.2714002102067035, "grad_norm": 12.305198669433594, "learning_rate": 2.729611896142318e-05, "loss": 0.5858, "step": 58350 }, { "epoch": 2.2733465685702052, "grad_norm": 32.70185470581055, "learning_rate": 2.7276655377788156e-05, "loss": 0.5924, "step": 58400 }, { "epoch": 2.275292926933707, "grad_norm": 3.7276499271392822, "learning_rate": 2.725719179415314e-05, "loss": 0.532, "step": 58450 }, { "epoch": 2.2772392852972088, "grad_norm": 27.530439376831055, "learning_rate": 2.723772821051812e-05, "loss": 0.5712, "step": 58500 }, { "epoch": 2.279185643660711, "grad_norm": 6.0480732917785645, "learning_rate": 2.7218264626883106e-05, "loss": 0.5452, "step": 58550 }, { "epoch": 2.2811320020242127, "grad_norm": 65.64759826660156, "learning_rate": 2.7198801043248083e-05, "loss": 0.5635, "step": 58600 }, { "epoch": 2.2830783603877145, "grad_norm": 8.518084526062012, "learning_rate": 2.7179337459613064e-05, "loss": 0.6192, "step": 58650 }, { "epoch": 2.2850247187512167, "grad_norm": 16.8560791015625, "learning_rate": 2.715987387597805e-05, "loss": 0.6768, "step": 58700 }, { "epoch": 2.2869710771147185, "grad_norm": 11.05721378326416, "learning_rate": 2.7140410292343026e-05, "loss": 0.5287, "step": 58750 }, { "epoch": 2.2889174354782202, "grad_norm": 11.521369934082031, "learning_rate": 
2.7120946708708007e-05, "loss": 0.5699, "step": 58800 }, { "epoch": 2.290863793841722, "grad_norm": 19.95265769958496, "learning_rate": 2.710148312507299e-05, "loss": 0.5812, "step": 58850 }, { "epoch": 2.292810152205224, "grad_norm": 22.240602493286133, "learning_rate": 2.708201954143797e-05, "loss": 0.6368, "step": 58900 }, { "epoch": 2.294756510568726, "grad_norm": 10.322290420532227, "learning_rate": 2.7062555957802953e-05, "loss": 0.5247, "step": 58950 }, { "epoch": 2.2967028689322277, "grad_norm": 23.392765045166016, "learning_rate": 2.7043092374167934e-05, "loss": 0.5816, "step": 59000 }, { "epoch": 2.2986492272957295, "grad_norm": 12.684326171875, "learning_rate": 2.7023628790532912e-05, "loss": 0.5479, "step": 59050 }, { "epoch": 2.3005955856592317, "grad_norm": 8.390941619873047, "learning_rate": 2.7004165206897896e-05, "loss": 0.5342, "step": 59100 }, { "epoch": 2.3025419440227335, "grad_norm": 6.13254976272583, "learning_rate": 2.6984701623262877e-05, "loss": 0.6108, "step": 59150 }, { "epoch": 2.3044883023862353, "grad_norm": 11.719011306762695, "learning_rate": 2.696523803962786e-05, "loss": 0.5734, "step": 59200 }, { "epoch": 2.306434660749737, "grad_norm": 15.797625541687012, "learning_rate": 2.694577445599284e-05, "loss": 0.6165, "step": 59250 }, { "epoch": 2.3083810191132392, "grad_norm": 36.1708869934082, "learning_rate": 2.6926310872357817e-05, "loss": 0.4839, "step": 59300 }, { "epoch": 2.310327377476741, "grad_norm": 8.173049926757812, "learning_rate": 2.6906847288722805e-05, "loss": 0.4744, "step": 59350 }, { "epoch": 2.3122737358402428, "grad_norm": 39.86005401611328, "learning_rate": 2.6887383705087782e-05, "loss": 0.5004, "step": 59400 }, { "epoch": 2.314220094203745, "grad_norm": 13.561505317687988, "learning_rate": 2.686792012145276e-05, "loss": 0.5515, "step": 59450 }, { "epoch": 2.3161664525672467, "grad_norm": 10.231904029846191, "learning_rate": 2.6848456537817744e-05, "loss": 0.9276, "step": 59500 }, { "epoch": 2.3181128109307485, 
"grad_norm": 11.346729278564453, "learning_rate": 2.6828992954182725e-05, "loss": 0.6566, "step": 59550 }, { "epoch": 2.3200591692942503, "grad_norm": 19.543004989624023, "learning_rate": 2.6809918642220406e-05, "loss": 0.5645, "step": 59600 }, { "epoch": 2.3220055276577525, "grad_norm": 17.82452392578125, "learning_rate": 2.679045505858539e-05, "loss": 0.5385, "step": 59650 }, { "epoch": 2.3239518860212542, "grad_norm": 4.003356456756592, "learning_rate": 2.6770991474950368e-05, "loss": 0.5139, "step": 59700 }, { "epoch": 2.325898244384756, "grad_norm": 8.438146591186523, "learning_rate": 2.675152789131535e-05, "loss": 0.5475, "step": 59750 }, { "epoch": 2.327844602748258, "grad_norm": 21.45138931274414, "learning_rate": 2.6732064307680333e-05, "loss": 0.6367, "step": 59800 }, { "epoch": 2.32979096111176, "grad_norm": 9.837311744689941, "learning_rate": 2.671260072404531e-05, "loss": 0.4908, "step": 59850 }, { "epoch": 2.3317373194752617, "grad_norm": 8.567156791687012, "learning_rate": 2.6693137140410295e-05, "loss": 0.5582, "step": 59900 }, { "epoch": 2.3336836778387635, "grad_norm": 18.257999420166016, "learning_rate": 2.6673673556775276e-05, "loss": 0.5788, "step": 59950 }, { "epoch": 2.3356300362022657, "grad_norm": 5.399444580078125, "learning_rate": 2.6654209973140254e-05, "loss": 0.5908, "step": 60000 }, { "epoch": 2.3375763945657675, "grad_norm": 25.929563522338867, "learning_rate": 2.6634746389505238e-05, "loss": 0.5621, "step": 60050 }, { "epoch": 2.3395227529292693, "grad_norm": 20.525115966796875, "learning_rate": 2.661528280587022e-05, "loss": 0.4895, "step": 60100 }, { "epoch": 2.3414691112927715, "grad_norm": 898.8723754882812, "learning_rate": 2.6595819222235197e-05, "loss": 0.4877, "step": 60150 }, { "epoch": 2.3434154696562732, "grad_norm": 32.06769943237305, "learning_rate": 2.657635563860018e-05, "loss": 0.5399, "step": 60200 }, { "epoch": 2.345361828019775, "grad_norm": 17.192026138305664, "learning_rate": 2.655689205496516e-05, "loss": 
0.629, "step": 60250 }, { "epoch": 2.3473081863832768, "grad_norm": 46.93745803833008, "learning_rate": 2.6537428471330146e-05, "loss": 0.6335, "step": 60300 }, { "epoch": 2.3492545447467785, "grad_norm": 40.49058151245117, "learning_rate": 2.6517964887695124e-05, "loss": 0.5425, "step": 60350 }, { "epoch": 2.3512009031102807, "grad_norm": 31.499408721923828, "learning_rate": 2.64985013040601e-05, "loss": 0.5252, "step": 60400 }, { "epoch": 2.3531472614737825, "grad_norm": 19.36634063720703, "learning_rate": 2.6479037720425086e-05, "loss": 0.484, "step": 60450 }, { "epoch": 2.3550936198372843, "grad_norm": 5.937497615814209, "learning_rate": 2.6459574136790067e-05, "loss": 0.598, "step": 60500 }, { "epoch": 2.3570399782007865, "grad_norm": 38.68523406982422, "learning_rate": 2.644011055315505e-05, "loss": 0.5854, "step": 60550 }, { "epoch": 2.3589863365642882, "grad_norm": 10.430084228515625, "learning_rate": 2.642064696952003e-05, "loss": 0.4749, "step": 60600 }, { "epoch": 2.36093269492779, "grad_norm": 11.106812477111816, "learning_rate": 2.640118338588501e-05, "loss": 0.5964, "step": 60650 }, { "epoch": 2.3628790532912918, "grad_norm": 18.978046417236328, "learning_rate": 2.6381719802249994e-05, "loss": 0.5461, "step": 60700 }, { "epoch": 2.364825411654794, "grad_norm": 5.686960220336914, "learning_rate": 2.636225621861497e-05, "loss": 0.581, "step": 60750 }, { "epoch": 2.3667717700182958, "grad_norm": 14.132088661193848, "learning_rate": 2.6342792634979953e-05, "loss": 0.6508, "step": 60800 }, { "epoch": 2.3687181283817975, "grad_norm": 14.356369972229004, "learning_rate": 2.6323329051344937e-05, "loss": 0.6616, "step": 60850 }, { "epoch": 2.3706644867452997, "grad_norm": 22.98474884033203, "learning_rate": 2.6303865467709914e-05, "loss": 0.5692, "step": 60900 }, { "epoch": 2.3726108451088015, "grad_norm": 8.788293838500977, "learning_rate": 2.62844018840749e-05, "loss": 0.64, "step": 60950 }, { "epoch": 2.3745572034723033, "grad_norm": 16.29184913635254, 
"learning_rate": 2.626493830043988e-05, "loss": 0.611, "step": 61000 }, { "epoch": 2.376503561835805, "grad_norm": 41.976314544677734, "learning_rate": 2.6245474716804857e-05, "loss": 0.5603, "step": 61050 }, { "epoch": 2.3784499201993072, "grad_norm": 15.925715446472168, "learning_rate": 2.6226011133169842e-05, "loss": 0.562, "step": 61100 }, { "epoch": 2.380396278562809, "grad_norm": 59.78548812866211, "learning_rate": 2.6206547549534823e-05, "loss": 0.6057, "step": 61150 }, { "epoch": 2.3823426369263108, "grad_norm": 15.637125015258789, "learning_rate": 2.61870839658998e-05, "loss": 0.5729, "step": 61200 }, { "epoch": 2.384288995289813, "grad_norm": 1.3441723585128784, "learning_rate": 2.6167620382264785e-05, "loss": 0.4338, "step": 61250 }, { "epoch": 2.3862353536533147, "grad_norm": 29.00841522216797, "learning_rate": 2.6148156798629762e-05, "loss": 0.6318, "step": 61300 }, { "epoch": 2.3881817120168165, "grad_norm": 30.59292221069336, "learning_rate": 2.612869321499475e-05, "loss": 0.6245, "step": 61350 }, { "epoch": 2.3901280703803183, "grad_norm": 16.62345314025879, "learning_rate": 2.6109229631359728e-05, "loss": 0.5533, "step": 61400 }, { "epoch": 2.3920744287438205, "grad_norm": 10.3213529586792, "learning_rate": 2.6089766047724705e-05, "loss": 0.5108, "step": 61450 }, { "epoch": 2.3940207871073222, "grad_norm": 58.100730895996094, "learning_rate": 2.607030246408969e-05, "loss": 0.4926, "step": 61500 }, { "epoch": 2.395967145470824, "grad_norm": 11.50496768951416, "learning_rate": 2.605083888045467e-05, "loss": 0.6153, "step": 61550 }, { "epoch": 2.397913503834326, "grad_norm": 13.698942184448242, "learning_rate": 2.6031375296819648e-05, "loss": 0.6535, "step": 61600 }, { "epoch": 2.399859862197828, "grad_norm": 20.92922019958496, "learning_rate": 2.6011911713184632e-05, "loss": 0.4765, "step": 61650 }, { "epoch": 2.4018062205613298, "grad_norm": 8.27878189086914, "learning_rate": 2.5992448129549613e-05, "loss": 0.6268, "step": 61700 }, { "epoch": 
2.4037525789248315, "grad_norm": 17.331438064575195, "learning_rate": 2.5972984545914598e-05, "loss": 0.5588, "step": 61750 }, { "epoch": 2.4056989372883333, "grad_norm": 43.85171890258789, "learning_rate": 2.5953520962279575e-05, "loss": 0.5779, "step": 61800 }, { "epoch": 2.4076452956518355, "grad_norm": 41.51362991333008, "learning_rate": 2.5934057378644556e-05, "loss": 0.5422, "step": 61850 }, { "epoch": 2.4095916540153373, "grad_norm": 16.486940383911133, "learning_rate": 2.591459379500954e-05, "loss": 0.5262, "step": 61900 }, { "epoch": 2.411538012378839, "grad_norm": 12.562714576721191, "learning_rate": 2.5895130211374518e-05, "loss": 0.551, "step": 61950 }, { "epoch": 2.4134843707423412, "grad_norm": 13.096254348754883, "learning_rate": 2.5875666627739503e-05, "loss": 0.5978, "step": 62000 }, { "epoch": 2.415430729105843, "grad_norm": 13.751553535461426, "learning_rate": 2.5856203044104484e-05, "loss": 0.5271, "step": 62050 }, { "epoch": 2.4173770874693448, "grad_norm": 26.160202026367188, "learning_rate": 2.583673946046946e-05, "loss": 0.5567, "step": 62100 }, { "epoch": 2.4193234458328465, "grad_norm": 19.561321258544922, "learning_rate": 2.5817275876834445e-05, "loss": 0.5096, "step": 62150 }, { "epoch": 2.4212698041963487, "grad_norm": 10.068018913269043, "learning_rate": 2.5797812293199426e-05, "loss": 0.4419, "step": 62200 }, { "epoch": 2.4232161625598505, "grad_norm": 18.259418487548828, "learning_rate": 2.5778348709564404e-05, "loss": 0.63, "step": 62250 }, { "epoch": 2.4251625209233523, "grad_norm": 13.354029655456543, "learning_rate": 2.575888512592939e-05, "loss": 0.4908, "step": 62300 }, { "epoch": 2.4271088792868545, "grad_norm": 9.134654998779297, "learning_rate": 2.5739421542294366e-05, "loss": 0.5148, "step": 62350 }, { "epoch": 2.4290552376503562, "grad_norm": 9.271320343017578, "learning_rate": 2.571995795865935e-05, "loss": 0.5907, "step": 62400 }, { "epoch": 2.431001596013858, "grad_norm": 32.005455017089844, "learning_rate": 
2.570049437502433e-05, "loss": 0.6524, "step": 62450 }, { "epoch": 2.43294795437736, "grad_norm": 23.31838035583496, "learning_rate": 2.568103079138931e-05, "loss": 0.4596, "step": 62500 }, { "epoch": 2.434894312740862, "grad_norm": 7.321031093597412, "learning_rate": 2.5661567207754293e-05, "loss": 0.5629, "step": 62550 }, { "epoch": 2.4368406711043638, "grad_norm": 13.157580375671387, "learning_rate": 2.5642103624119274e-05, "loss": 0.5774, "step": 62600 }, { "epoch": 2.4387870294678655, "grad_norm": 22.033737182617188, "learning_rate": 2.5622640040484252e-05, "loss": 0.5844, "step": 62650 }, { "epoch": 2.4407333878313677, "grad_norm": 18.39310073852539, "learning_rate": 2.5603176456849236e-05, "loss": 0.4906, "step": 62700 }, { "epoch": 2.4426797461948695, "grad_norm": 14.769813537597656, "learning_rate": 2.5583712873214217e-05, "loss": 0.5591, "step": 62750 }, { "epoch": 2.4446261045583713, "grad_norm": 55.39284133911133, "learning_rate": 2.55642492895792e-05, "loss": 0.6035, "step": 62800 }, { "epoch": 2.446572462921873, "grad_norm": 2.6999547481536865, "learning_rate": 2.554478570594418e-05, "loss": 0.5516, "step": 62850 }, { "epoch": 2.4485188212853752, "grad_norm": 19.894437789916992, "learning_rate": 2.552532212230916e-05, "loss": 0.5903, "step": 62900 }, { "epoch": 2.450465179648877, "grad_norm": 12.625886917114258, "learning_rate": 2.5505858538674144e-05, "loss": 0.5129, "step": 62950 }, { "epoch": 2.4524115380123788, "grad_norm": 22.07157325744629, "learning_rate": 2.5486394955039122e-05, "loss": 0.5508, "step": 63000 }, { "epoch": 2.454357896375881, "grad_norm": 16.41398811340332, "learning_rate": 2.5466931371404106e-05, "loss": 0.563, "step": 63050 }, { "epoch": 2.4563042547393827, "grad_norm": 32.290157318115234, "learning_rate": 2.5447467787769087e-05, "loss": 0.5229, "step": 63100 }, { "epoch": 2.4582506131028845, "grad_norm": 10.294833183288574, "learning_rate": 2.5428004204134065e-05, "loss": 0.5337, "step": 63150 }, { "epoch": 
2.4601969714663863, "grad_norm": 18.237449645996094, "learning_rate": 2.540854062049905e-05, "loss": 0.674, "step": 63200 }, { "epoch": 2.462143329829888, "grad_norm": 30.17572784423828, "learning_rate": 2.5389077036864027e-05, "loss": 0.5418, "step": 63250 }, { "epoch": 2.4640896881933902, "grad_norm": 25.6109676361084, "learning_rate": 2.5369613453229008e-05, "loss": 0.58, "step": 63300 }, { "epoch": 2.466036046556892, "grad_norm": 21.3280029296875, "learning_rate": 2.5350149869593992e-05, "loss": 0.497, "step": 63350 }, { "epoch": 2.467982404920394, "grad_norm": 14.814033508300781, "learning_rate": 2.533068628595897e-05, "loss": 0.4461, "step": 63400 }, { "epoch": 2.469928763283896, "grad_norm": 39.753116607666016, "learning_rate": 2.5311222702323954e-05, "loss": 0.6074, "step": 63450 }, { "epoch": 2.4718751216473978, "grad_norm": 6.742668628692627, "learning_rate": 2.5291759118688935e-05, "loss": 0.4999, "step": 63500 }, { "epoch": 2.4738214800108995, "grad_norm": 33.80112838745117, "learning_rate": 2.5272295535053913e-05, "loss": 0.5526, "step": 63550 }, { "epoch": 2.4757678383744013, "grad_norm": 22.939037322998047, "learning_rate": 2.5252831951418897e-05, "loss": 0.5259, "step": 63600 }, { "epoch": 2.4777141967379035, "grad_norm": 28.222105026245117, "learning_rate": 2.5233368367783878e-05, "loss": 0.5663, "step": 63650 }, { "epoch": 2.4796605551014053, "grad_norm": 5.831964015960693, "learning_rate": 2.5213904784148856e-05, "loss": 0.6388, "step": 63700 }, { "epoch": 2.481606913464907, "grad_norm": 170.19024658203125, "learning_rate": 2.519444120051384e-05, "loss": 0.4954, "step": 63750 }, { "epoch": 2.4835532718284092, "grad_norm": 57.314456939697266, "learning_rate": 2.517497761687882e-05, "loss": 0.5948, "step": 63800 }, { "epoch": 2.485499630191911, "grad_norm": 8.98520565032959, "learning_rate": 2.5155514033243805e-05, "loss": 0.5989, "step": 63850 }, { "epoch": 2.4874459885554128, "grad_norm": 8.92178726196289, "learning_rate": 2.5136050449608783e-05, 
"loss": 0.6241, "step": 63900 }, { "epoch": 2.4893923469189145, "grad_norm": 37.40452575683594, "learning_rate": 2.5116586865973764e-05, "loss": 0.5148, "step": 63950 }, { "epoch": 2.4913387052824167, "grad_norm": 15.853340148925781, "learning_rate": 2.5097123282338748e-05, "loss": 0.5525, "step": 64000 }, { "epoch": 2.4932850636459185, "grad_norm": 14.418586730957031, "learning_rate": 2.5077659698703726e-05, "loss": 0.5492, "step": 64050 }, { "epoch": 2.4952314220094203, "grad_norm": 347.91998291015625, "learning_rate": 2.5058196115068707e-05, "loss": 0.5725, "step": 64100 }, { "epoch": 2.4971777803729225, "grad_norm": 6.751857280731201, "learning_rate": 2.503873253143369e-05, "loss": 0.5487, "step": 64150 }, { "epoch": 2.4991241387364242, "grad_norm": 15.643485069274902, "learning_rate": 2.501926894779867e-05, "loss": 0.5397, "step": 64200 }, { "epoch": 2.501070497099926, "grad_norm": 20.34881019592285, "learning_rate": 2.499980536416365e-05, "loss": 0.6085, "step": 64250 }, { "epoch": 2.503016855463428, "grad_norm": 29.934282302856445, "learning_rate": 2.498034178052863e-05, "loss": 0.5686, "step": 64300 }, { "epoch": 2.5049632138269295, "grad_norm": 11.598827362060547, "learning_rate": 2.4960878196893615e-05, "loss": 0.5543, "step": 64350 }, { "epoch": 2.5069095721904318, "grad_norm": 6.270168304443359, "learning_rate": 2.4941414613258596e-05, "loss": 0.5192, "step": 64400 }, { "epoch": 2.5088559305539335, "grad_norm": 4.7610554695129395, "learning_rate": 2.4921951029623573e-05, "loss": 0.5976, "step": 64450 }, { "epoch": 2.5108022889174357, "grad_norm": 16.913850784301758, "learning_rate": 2.4902487445988558e-05, "loss": 0.5963, "step": 64500 }, { "epoch": 2.5127486472809375, "grad_norm": 40.02470397949219, "learning_rate": 2.488302386235354e-05, "loss": 0.528, "step": 64550 }, { "epoch": 2.5146950056444393, "grad_norm": 67.5257568359375, "learning_rate": 2.486356027871852e-05, "loss": 0.5846, "step": 64600 }, { "epoch": 2.516641364007941, "grad_norm": 
10.313309669494629, "learning_rate": 2.48440966950835e-05, "loss": 0.5307, "step": 64650 }, { "epoch": 2.518587722371443, "grad_norm": 54.307090759277344, "learning_rate": 2.4824633111448482e-05, "loss": 0.4918, "step": 64700 }, { "epoch": 2.520534080734945, "grad_norm": 13.13563346862793, "learning_rate": 2.4805169527813463e-05, "loss": 0.5206, "step": 64750 }, { "epoch": 2.5224804390984468, "grad_norm": 35.22493362426758, "learning_rate": 2.4785705944178444e-05, "loss": 0.6548, "step": 64800 }, { "epoch": 2.5244267974619485, "grad_norm": 34.032325744628906, "learning_rate": 2.4766242360543425e-05, "loss": 0.5553, "step": 64850 }, { "epoch": 2.5263731558254507, "grad_norm": 16.939971923828125, "learning_rate": 2.4746778776908406e-05, "loss": 0.5799, "step": 64900 }, { "epoch": 2.5283195141889525, "grad_norm": 31.05055046081543, "learning_rate": 2.4727315193273387e-05, "loss": 0.5144, "step": 64950 }, { "epoch": 2.5302658725524543, "grad_norm": 7.136385917663574, "learning_rate": 2.4707851609638368e-05, "loss": 0.5643, "step": 65000 }, { "epoch": 2.532212230915956, "grad_norm": 15.558509826660156, "learning_rate": 2.468838802600335e-05, "loss": 0.5316, "step": 65050 }, { "epoch": 2.5341585892794583, "grad_norm": 10.731505393981934, "learning_rate": 2.466892444236833e-05, "loss": 0.6296, "step": 65100 }, { "epoch": 2.53610494764296, "grad_norm": 32.11027526855469, "learning_rate": 2.464946085873331e-05, "loss": 0.6929, "step": 65150 }, { "epoch": 2.538051306006462, "grad_norm": 13.802175521850586, "learning_rate": 2.4629997275098295e-05, "loss": 0.5114, "step": 65200 }, { "epoch": 2.539997664369964, "grad_norm": 16.469348907470703, "learning_rate": 2.4610533691463272e-05, "loss": 0.6105, "step": 65250 }, { "epoch": 2.5419440227334658, "grad_norm": 4.4951324462890625, "learning_rate": 2.4591070107828253e-05, "loss": 0.5174, "step": 65300 }, { "epoch": 2.5438903810969675, "grad_norm": 16.808975219726562, "learning_rate": 2.4571606524193234e-05, "loss": 0.5534, "step": 
65350 }, { "epoch": 2.5458367394604693, "grad_norm": 9.593555450439453, "learning_rate": 2.455214294055822e-05, "loss": 0.6568, "step": 65400 }, { "epoch": 2.5477830978239715, "grad_norm": 34.4417724609375, "learning_rate": 2.4532679356923196e-05, "loss": 0.5252, "step": 65450 }, { "epoch": 2.5497294561874733, "grad_norm": 25.91260528564453, "learning_rate": 2.4513215773288177e-05, "loss": 0.5411, "step": 65500 }, { "epoch": 2.551675814550975, "grad_norm": 21.93971824645996, "learning_rate": 2.449375218965316e-05, "loss": 0.7136, "step": 65550 }, { "epoch": 2.5536221729144772, "grad_norm": 3.1900734901428223, "learning_rate": 2.4474288606018143e-05, "loss": 0.568, "step": 65600 }, { "epoch": 2.555568531277979, "grad_norm": 21.25973129272461, "learning_rate": 2.445482502238312e-05, "loss": 0.6328, "step": 65650 }, { "epoch": 2.5575148896414808, "grad_norm": 22.059242248535156, "learning_rate": 2.44353614387481e-05, "loss": 0.5428, "step": 65700 }, { "epoch": 2.5594612480049825, "grad_norm": 3.39290452003479, "learning_rate": 2.4415897855113086e-05, "loss": 0.5353, "step": 65750 }, { "epoch": 2.5614076063684843, "grad_norm": 15.536284446716309, "learning_rate": 2.4396434271478066e-05, "loss": 0.6377, "step": 65800 }, { "epoch": 2.5633539647319865, "grad_norm": 11.289787292480469, "learning_rate": 2.4376970687843047e-05, "loss": 0.5066, "step": 65850 }, { "epoch": 2.5653003230954883, "grad_norm": 20.887380599975586, "learning_rate": 2.435750710420803e-05, "loss": 0.6174, "step": 65900 }, { "epoch": 2.5672466814589905, "grad_norm": 48.976375579833984, "learning_rate": 2.433804352057301e-05, "loss": 0.6247, "step": 65950 }, { "epoch": 2.5691930398224923, "grad_norm": 10.934263229370117, "learning_rate": 2.431857993693799e-05, "loss": 0.6379, "step": 66000 }, { "epoch": 2.571139398185994, "grad_norm": 6.00993013381958, "learning_rate": 2.429911635330297e-05, "loss": 0.534, "step": 66050 }, { "epoch": 2.573085756549496, "grad_norm": 19.14497947692871, "learning_rate": 
2.4279652769667952e-05, "loss": 0.6411, "step": 66100 }, { "epoch": 2.5750321149129975, "grad_norm": 7.02522611618042, "learning_rate": 2.4260578457705636e-05, "loss": 0.5744, "step": 66150 }, { "epoch": 2.5769784732764998, "grad_norm": 5.243693828582764, "learning_rate": 2.4241114874070614e-05, "loss": 0.5071, "step": 66200 }, { "epoch": 2.5789248316400015, "grad_norm": 4.916667938232422, "learning_rate": 2.4221651290435595e-05, "loss": 0.5958, "step": 66250 }, { "epoch": 2.5808711900035033, "grad_norm": 56.90110397338867, "learning_rate": 2.4202187706800576e-05, "loss": 0.5174, "step": 66300 }, { "epoch": 2.5828175483670055, "grad_norm": 6.805826187133789, "learning_rate": 2.418272412316556e-05, "loss": 0.4951, "step": 66350 }, { "epoch": 2.5847639067305073, "grad_norm": 34.80731964111328, "learning_rate": 2.4163260539530538e-05, "loss": 0.5927, "step": 66400 }, { "epoch": 2.586710265094009, "grad_norm": 18.449344635009766, "learning_rate": 2.414379695589552e-05, "loss": 0.5749, "step": 66450 }, { "epoch": 2.588656623457511, "grad_norm": 16.757694244384766, "learning_rate": 2.4124333372260503e-05, "loss": 0.6219, "step": 66500 }, { "epoch": 2.590602981821013, "grad_norm": 20.08295249938965, "learning_rate": 2.4104869788625484e-05, "loss": 0.6013, "step": 66550 }, { "epoch": 2.5925493401845148, "grad_norm": 4.350827693939209, "learning_rate": 2.4085406204990462e-05, "loss": 0.573, "step": 66600 }, { "epoch": 2.5944956985480165, "grad_norm": 7.054327964782715, "learning_rate": 2.4065942621355446e-05, "loss": 0.5886, "step": 66650 }, { "epoch": 2.5964420569115187, "grad_norm": 4.51849889755249, "learning_rate": 2.4046479037720427e-05, "loss": 0.6009, "step": 66700 }, { "epoch": 2.5983884152750205, "grad_norm": 16.424898147583008, "learning_rate": 2.4027015454085408e-05, "loss": 0.5317, "step": 66750 }, { "epoch": 2.6003347736385223, "grad_norm": 12.883808135986328, "learning_rate": 2.4007551870450386e-05, "loss": 0.6652, "step": 66800 }, { "epoch": 
2.602281132002024, "grad_norm": 36.89630126953125, "learning_rate": 2.398808828681537e-05, "loss": 0.4647, "step": 66850 }, { "epoch": 2.6042274903655263, "grad_norm": 25.707841873168945, "learning_rate": 2.396862470318035e-05, "loss": 0.5325, "step": 66900 }, { "epoch": 2.606173848729028, "grad_norm": 20.153709411621094, "learning_rate": 2.3949161119545332e-05, "loss": 0.4707, "step": 66950 }, { "epoch": 2.60812020709253, "grad_norm": 8.54288387298584, "learning_rate": 2.3929697535910313e-05, "loss": 0.5841, "step": 67000 }, { "epoch": 2.610066565456032, "grad_norm": 300.13067626953125, "learning_rate": 2.3910233952275294e-05, "loss": 0.589, "step": 67050 }, { "epoch": 2.6120129238195338, "grad_norm": 13.120031356811523, "learning_rate": 2.3890770368640275e-05, "loss": 0.5702, "step": 67100 }, { "epoch": 2.6139592821830355, "grad_norm": 13.52970027923584, "learning_rate": 2.3871306785005256e-05, "loss": 0.579, "step": 67150 }, { "epoch": 2.6159056405465373, "grad_norm": 13.045538902282715, "learning_rate": 2.385184320137024e-05, "loss": 0.5809, "step": 67200 }, { "epoch": 2.617851998910039, "grad_norm": 27.501998901367188, "learning_rate": 2.3832379617735218e-05, "loss": 0.617, "step": 67250 }, { "epoch": 2.6197983572735413, "grad_norm": 17.684951782226562, "learning_rate": 2.38129160341002e-05, "loss": 0.6035, "step": 67300 }, { "epoch": 2.621744715637043, "grad_norm": 7.844106674194336, "learning_rate": 2.379345245046518e-05, "loss": 0.4959, "step": 67350 }, { "epoch": 2.6236910740005452, "grad_norm": 14.171538352966309, "learning_rate": 2.3773988866830164e-05, "loss": 0.447, "step": 67400 }, { "epoch": 2.625637432364047, "grad_norm": 86.60087585449219, "learning_rate": 2.375452528319514e-05, "loss": 0.5889, "step": 67450 }, { "epoch": 2.6275837907275488, "grad_norm": 21.38888931274414, "learning_rate": 2.3735061699560123e-05, "loss": 0.5071, "step": 67500 }, { "epoch": 2.6295301490910505, "grad_norm": 43.21151351928711, "learning_rate": 2.3715598115925107e-05, 
"loss": 0.5675, "step": 67550 }, { "epoch": 2.6314765074545523, "grad_norm": 10.768310546875, "learning_rate": 2.3696523803962788e-05, "loss": 0.6087, "step": 67600 }, { "epoch": 2.6334228658180545, "grad_norm": 10.908760070800781, "learning_rate": 2.367706022032777e-05, "loss": 0.6118, "step": 67650 }, { "epoch": 2.6353692241815563, "grad_norm": 3.4283225536346436, "learning_rate": 2.365759663669275e-05, "loss": 0.5192, "step": 67700 }, { "epoch": 2.637315582545058, "grad_norm": 24.067216873168945, "learning_rate": 2.3638133053057727e-05, "loss": 0.5745, "step": 67750 }, { "epoch": 2.6392619409085603, "grad_norm": 52.83134460449219, "learning_rate": 2.361866946942271e-05, "loss": 0.6573, "step": 67800 }, { "epoch": 2.641208299272062, "grad_norm": 14.856293678283691, "learning_rate": 2.3599205885787693e-05, "loss": 0.5007, "step": 67850 }, { "epoch": 2.643154657635564, "grad_norm": 11.778996467590332, "learning_rate": 2.3579742302152674e-05, "loss": 0.5212, "step": 67900 }, { "epoch": 2.6451010159990656, "grad_norm": 55.12577438354492, "learning_rate": 2.3560278718517655e-05, "loss": 0.6058, "step": 67950 }, { "epoch": 2.6470473743625678, "grad_norm": 2.112765073776245, "learning_rate": 2.3540815134882635e-05, "loss": 0.52, "step": 68000 }, { "epoch": 2.6489937327260695, "grad_norm": 21.850738525390625, "learning_rate": 2.3521351551247616e-05, "loss": 0.4727, "step": 68050 }, { "epoch": 2.6509400910895713, "grad_norm": 27.86273193359375, "learning_rate": 2.3501887967612597e-05, "loss": 0.5234, "step": 68100 }, { "epoch": 2.6528864494530735, "grad_norm": 0.7597148418426514, "learning_rate": 2.3482424383977582e-05, "loss": 0.5066, "step": 68150 }, { "epoch": 2.6548328078165753, "grad_norm": 12.95754623413086, "learning_rate": 2.346296080034256e-05, "loss": 0.717, "step": 68200 }, { "epoch": 2.656779166180077, "grad_norm": 17.306852340698242, "learning_rate": 2.344349721670754e-05, "loss": 0.5363, "step": 68250 }, { "epoch": 2.658725524543579, "grad_norm": 
13.701223373413086, "learning_rate": 2.342403363307252e-05, "loss": 0.5885, "step": 68300 }, { "epoch": 2.660671882907081, "grad_norm": 17.2751522064209, "learning_rate": 2.3404570049437506e-05, "loss": 0.4723, "step": 68350 }, { "epoch": 2.6626182412705828, "grad_norm": 9.467267036437988, "learning_rate": 2.3385106465802483e-05, "loss": 0.4794, "step": 68400 }, { "epoch": 2.6645645996340845, "grad_norm": 11.665079116821289, "learning_rate": 2.3365642882167464e-05, "loss": 0.523, "step": 68450 }, { "epoch": 2.6665109579975868, "grad_norm": 32.19440841674805, "learning_rate": 2.334617929853245e-05, "loss": 0.5524, "step": 68500 }, { "epoch": 2.6684573163610885, "grad_norm": 14.884758949279785, "learning_rate": 2.332671571489743e-05, "loss": 0.4947, "step": 68550 }, { "epoch": 2.6704036747245903, "grad_norm": 22.707279205322266, "learning_rate": 2.3307252131262407e-05, "loss": 0.6041, "step": 68600 }, { "epoch": 2.672350033088092, "grad_norm": 21.368370056152344, "learning_rate": 2.328778854762739e-05, "loss": 0.5006, "step": 68650 }, { "epoch": 2.674296391451594, "grad_norm": 6.495297908782959, "learning_rate": 2.3268324963992372e-05, "loss": 0.5849, "step": 68700 }, { "epoch": 2.676242749815096, "grad_norm": 10.624874114990234, "learning_rate": 2.3248861380357353e-05, "loss": 0.4853, "step": 68750 }, { "epoch": 2.678189108178598, "grad_norm": 13.924145698547363, "learning_rate": 2.322939779672233e-05, "loss": 0.5616, "step": 68800 }, { "epoch": 2.6801354665421, "grad_norm": 8.935267448425293, "learning_rate": 2.3209934213087315e-05, "loss": 0.5376, "step": 68850 }, { "epoch": 2.6820818249056018, "grad_norm": 169.68377685546875, "learning_rate": 2.3190470629452296e-05, "loss": 0.5875, "step": 68900 }, { "epoch": 2.6840281832691035, "grad_norm": 5.473453044891357, "learning_rate": 2.3171007045817277e-05, "loss": 0.4686, "step": 68950 }, { "epoch": 2.6859745416326053, "grad_norm": 60.169010162353516, "learning_rate": 2.3151543462182258e-05, "loss": 0.4685, "step": 
69000 }, { "epoch": 2.687920899996107, "grad_norm": 11.92808723449707, "learning_rate": 2.313207987854724e-05, "loss": 0.4891, "step": 69050 }, { "epoch": 2.6898672583596093, "grad_norm": 10.702982902526855, "learning_rate": 2.311261629491222e-05, "loss": 0.6011, "step": 69100 }, { "epoch": 2.691813616723111, "grad_norm": 40.390830993652344, "learning_rate": 2.30931527112772e-05, "loss": 0.5457, "step": 69150 }, { "epoch": 2.693759975086613, "grad_norm": 12.831649780273438, "learning_rate": 2.3073689127642182e-05, "loss": 0.6619, "step": 69200 }, { "epoch": 2.695706333450115, "grad_norm": 13.396260261535645, "learning_rate": 2.3054225544007163e-05, "loss": 0.5062, "step": 69250 }, { "epoch": 2.6976526918136168, "grad_norm": 7.836676597595215, "learning_rate": 2.3034761960372144e-05, "loss": 0.5924, "step": 69300 }, { "epoch": 2.6995990501771185, "grad_norm": 11.53304386138916, "learning_rate": 2.3015298376737125e-05, "loss": 0.5212, "step": 69350 }, { "epoch": 2.7015454085406203, "grad_norm": 10.424164772033691, "learning_rate": 2.299583479310211e-05, "loss": 0.6097, "step": 69400 }, { "epoch": 2.7034917669041225, "grad_norm": 21.44893455505371, "learning_rate": 2.2976371209467087e-05, "loss": 0.5436, "step": 69450 }, { "epoch": 2.7054381252676243, "grad_norm": 22.725017547607422, "learning_rate": 2.2956907625832068e-05, "loss": 0.4383, "step": 69500 }, { "epoch": 2.707384483631126, "grad_norm": 3.391291618347168, "learning_rate": 2.2937444042197052e-05, "loss": 0.5859, "step": 69550 }, { "epoch": 2.7093308419946283, "grad_norm": 1.6060655117034912, "learning_rate": 2.2918369730234733e-05, "loss": 0.5572, "step": 69600 }, { "epoch": 2.71127720035813, "grad_norm": 8.559743881225586, "learning_rate": 2.2898906146599714e-05, "loss": 0.5935, "step": 69650 }, { "epoch": 2.713223558721632, "grad_norm": 22.139394760131836, "learning_rate": 2.2879442562964695e-05, "loss": 0.5733, "step": 69700 }, { "epoch": 2.7151699170851336, "grad_norm": 20.233442306518555, 
"learning_rate": 2.2859978979329673e-05, "loss": 0.5216, "step": 69750 }, { "epoch": 2.7171162754486358, "grad_norm": 7.153957843780518, "learning_rate": 2.2840515395694657e-05, "loss": 0.6699, "step": 69800 }, { "epoch": 2.7190626338121375, "grad_norm": 19.483787536621094, "learning_rate": 2.2821051812059638e-05, "loss": 0.4598, "step": 69850 }, { "epoch": 2.7210089921756393, "grad_norm": 40.41450119018555, "learning_rate": 2.280158822842462e-05, "loss": 0.4522, "step": 69900 }, { "epoch": 2.7229553505391415, "grad_norm": 12.804308891296387, "learning_rate": 2.27821246447896e-05, "loss": 0.6517, "step": 69950 }, { "epoch": 2.7249017089026433, "grad_norm": 18.493148803710938, "learning_rate": 2.276266106115458e-05, "loss": 0.5522, "step": 70000 }, { "epoch": 2.726848067266145, "grad_norm": 32.01470947265625, "learning_rate": 2.2743197477519562e-05, "loss": 0.5321, "step": 70050 }, { "epoch": 2.728794425629647, "grad_norm": 11.052045822143555, "learning_rate": 2.2723733893884543e-05, "loss": 0.5202, "step": 70100 }, { "epoch": 2.7307407839931486, "grad_norm": 9.517524719238281, "learning_rate": 2.2704270310249524e-05, "loss": 0.5727, "step": 70150 }, { "epoch": 2.7326871423566508, "grad_norm": 27.8685302734375, "learning_rate": 2.2684806726614505e-05, "loss": 0.6681, "step": 70200 }, { "epoch": 2.7346335007201525, "grad_norm": 9.710700035095215, "learning_rate": 2.2665343142979486e-05, "loss": 0.5525, "step": 70250 }, { "epoch": 2.7365798590836548, "grad_norm": 19.894451141357422, "learning_rate": 2.264587955934447e-05, "loss": 0.5693, "step": 70300 }, { "epoch": 2.7385262174471565, "grad_norm": 12.87425422668457, "learning_rate": 2.2626415975709448e-05, "loss": 0.5822, "step": 70350 }, { "epoch": 2.7404725758106583, "grad_norm": 6.128742218017578, "learning_rate": 2.260695239207443e-05, "loss": 0.6138, "step": 70400 }, { "epoch": 2.74241893417416, "grad_norm": 47.86465072631836, "learning_rate": 2.258748880843941e-05, "loss": 0.5837, "step": 70450 }, { "epoch": 
2.744365292537662, "grad_norm": 13.290578842163086, "learning_rate": 2.2568025224804394e-05, "loss": 0.674, "step": 70500 }, { "epoch": 2.746311650901164, "grad_norm": 14.747990608215332, "learning_rate": 2.2548561641169375e-05, "loss": 0.5126, "step": 70550 }, { "epoch": 2.748258009264666, "grad_norm": 3.8355839252471924, "learning_rate": 2.2529098057534352e-05, "loss": 0.5905, "step": 70600 }, { "epoch": 2.7502043676281676, "grad_norm": 16.76650047302246, "learning_rate": 2.2509634473899337e-05, "loss": 0.5046, "step": 70650 }, { "epoch": 2.7521507259916698, "grad_norm": 140.04237365722656, "learning_rate": 2.2490170890264318e-05, "loss": 0.572, "step": 70700 }, { "epoch": 2.7540970843551715, "grad_norm": 15.58576488494873, "learning_rate": 2.24707073066293e-05, "loss": 0.5037, "step": 70750 }, { "epoch": 2.7560434427186733, "grad_norm": 15.735386848449707, "learning_rate": 2.2451243722994276e-05, "loss": 0.5641, "step": 70800 }, { "epoch": 2.757989801082175, "grad_norm": 4.819324970245361, "learning_rate": 2.243178013935926e-05, "loss": 0.5805, "step": 70850 }, { "epoch": 2.7599361594456773, "grad_norm": 24.187326431274414, "learning_rate": 2.241231655572424e-05, "loss": 0.5997, "step": 70900 }, { "epoch": 2.761882517809179, "grad_norm": 11.869930267333984, "learning_rate": 2.2392852972089223e-05, "loss": 0.4783, "step": 70950 }, { "epoch": 2.763828876172681, "grad_norm": 8.527299880981445, "learning_rate": 2.2373389388454204e-05, "loss": 0.6576, "step": 71000 }, { "epoch": 2.765775234536183, "grad_norm": 10.338357925415039, "learning_rate": 2.2353925804819185e-05, "loss": 0.7262, "step": 71050 }, { "epoch": 2.767721592899685, "grad_norm": 11.733469009399414, "learning_rate": 2.2334462221184166e-05, "loss": 0.6113, "step": 71100 }, { "epoch": 2.7696679512631865, "grad_norm": 0.730152428150177, "learning_rate": 2.2314998637549147e-05, "loss": 0.5273, "step": 71150 }, { "epoch": 2.7716143096266883, "grad_norm": 74.51802825927734, "learning_rate": 
2.2295535053914128e-05, "loss": 0.5019, "step": 71200 }, { "epoch": 2.7735606679901905, "grad_norm": 5.555564880371094, "learning_rate": 2.227607147027911e-05, "loss": 0.6407, "step": 71250 }, { "epoch": 2.7755070263536923, "grad_norm": 113.8477783203125, "learning_rate": 2.225660788664409e-05, "loss": 0.5464, "step": 71300 }, { "epoch": 2.777453384717194, "grad_norm": 13.764908790588379, "learning_rate": 2.223714430300907e-05, "loss": 0.5323, "step": 71350 }, { "epoch": 2.7793997430806963, "grad_norm": 5.706633567810059, "learning_rate": 2.221768071937405e-05, "loss": 0.5061, "step": 71400 }, { "epoch": 2.781346101444198, "grad_norm": 12.058531761169434, "learning_rate": 2.2198217135739032e-05, "loss": 0.6096, "step": 71450 }, { "epoch": 2.7832924598077, "grad_norm": 13.848576545715332, "learning_rate": 2.2178753552104013e-05, "loss": 0.6192, "step": 71500 }, { "epoch": 2.7852388181712016, "grad_norm": 58.06562805175781, "learning_rate": 2.2159289968468998e-05, "loss": 0.4994, "step": 71550 }, { "epoch": 2.7871851765347033, "grad_norm": 18.642051696777344, "learning_rate": 2.2139826384833975e-05, "loss": 0.5801, "step": 71600 }, { "epoch": 2.7891315348982055, "grad_norm": 20.672319412231445, "learning_rate": 2.212075207287166e-05, "loss": 0.4963, "step": 71650 }, { "epoch": 2.7910778932617073, "grad_norm": 6.7827677726745605, "learning_rate": 2.210128848923664e-05, "loss": 0.4895, "step": 71700 }, { "epoch": 2.7930242516252095, "grad_norm": 8.469505310058594, "learning_rate": 2.208182490560162e-05, "loss": 0.524, "step": 71750 }, { "epoch": 2.7949706099887113, "grad_norm": 5.165273189544678, "learning_rate": 2.2062361321966602e-05, "loss": 0.5644, "step": 71800 }, { "epoch": 2.796916968352213, "grad_norm": 29.827129364013672, "learning_rate": 2.2042897738331583e-05, "loss": 0.5257, "step": 71850 }, { "epoch": 2.798863326715715, "grad_norm": 7.851189136505127, "learning_rate": 2.2023434154696564e-05, "loss": 0.5035, "step": 71900 }, { "epoch": 2.8008096850792166, 
"grad_norm": 88.9677505493164, "learning_rate": 2.2003970571061545e-05, "loss": 0.4902, "step": 71950 }, { "epoch": 2.802756043442719, "grad_norm": 13.825353622436523, "learning_rate": 2.1984506987426526e-05, "loss": 0.5658, "step": 72000 }, { "epoch": 2.8047024018062205, "grad_norm": 28.517993927001953, "learning_rate": 2.1965043403791507e-05, "loss": 0.4992, "step": 72050 }, { "epoch": 2.8066487601697223, "grad_norm": 3.133042097091675, "learning_rate": 2.1945579820156488e-05, "loss": 0.5517, "step": 72100 }, { "epoch": 2.8085951185332245, "grad_norm": 14.259151458740234, "learning_rate": 2.192611623652147e-05, "loss": 0.5733, "step": 72150 }, { "epoch": 2.8105414768967263, "grad_norm": 10.55013656616211, "learning_rate": 2.190665265288645e-05, "loss": 0.485, "step": 72200 }, { "epoch": 2.812487835260228, "grad_norm": 21.911157608032227, "learning_rate": 2.188718906925143e-05, "loss": 0.5746, "step": 72250 }, { "epoch": 2.81443419362373, "grad_norm": 12.012662887573242, "learning_rate": 2.1867725485616415e-05, "loss": 0.4978, "step": 72300 }, { "epoch": 2.816380551987232, "grad_norm": 10.22600269317627, "learning_rate": 2.1848261901981393e-05, "loss": 0.6547, "step": 72350 }, { "epoch": 2.818326910350734, "grad_norm": 34.39353942871094, "learning_rate": 2.1828798318346374e-05, "loss": 0.601, "step": 72400 }, { "epoch": 2.8202732687142356, "grad_norm": 2.3234455585479736, "learning_rate": 2.1809334734711355e-05, "loss": 0.6319, "step": 72450 }, { "epoch": 2.8222196270777378, "grad_norm": 9.056440353393555, "learning_rate": 2.178987115107634e-05, "loss": 0.4516, "step": 72500 }, { "epoch": 2.8241659854412395, "grad_norm": 32.581031799316406, "learning_rate": 2.1770407567441317e-05, "loss": 0.5686, "step": 72550 }, { "epoch": 2.8261123438047413, "grad_norm": 3.7688870429992676, "learning_rate": 2.1750943983806298e-05, "loss": 0.4785, "step": 72600 }, { "epoch": 2.828058702168243, "grad_norm": 2.9451708793640137, "learning_rate": 2.1731480400171282e-05, "loss": 
0.5603, "step": 72650 }, { "epoch": 2.830005060531745, "grad_norm": 26.34090805053711, "learning_rate": 2.1712016816536263e-05, "loss": 0.5538, "step": 72700 }, { "epoch": 2.831951418895247, "grad_norm": 18.88164710998535, "learning_rate": 2.169255323290124e-05, "loss": 0.5125, "step": 72750 }, { "epoch": 2.833897777258749, "grad_norm": 24.33026885986328, "learning_rate": 2.1673089649266222e-05, "loss": 0.5602, "step": 72800 }, { "epoch": 2.835844135622251, "grad_norm": 9.198532104492188, "learning_rate": 2.1653626065631206e-05, "loss": 0.4839, "step": 72850 }, { "epoch": 2.837790493985753, "grad_norm": 6.660632133483887, "learning_rate": 2.1634162481996187e-05, "loss": 0.5929, "step": 72900 }, { "epoch": 2.8397368523492545, "grad_norm": 2.15732741355896, "learning_rate": 2.1614698898361168e-05, "loss": 0.4533, "step": 72950 }, { "epoch": 2.8416832107127563, "grad_norm": 16.598106384277344, "learning_rate": 2.159523531472615e-05, "loss": 0.4882, "step": 73000 }, { "epoch": 2.843629569076258, "grad_norm": 3.8987197875976562, "learning_rate": 2.157577173109113e-05, "loss": 0.5192, "step": 73050 }, { "epoch": 2.8455759274397603, "grad_norm": 13.79594612121582, "learning_rate": 2.155630814745611e-05, "loss": 0.4734, "step": 73100 }, { "epoch": 2.847522285803262, "grad_norm": 6.378300666809082, "learning_rate": 2.1536844563821092e-05, "loss": 0.6149, "step": 73150 }, { "epoch": 2.849468644166764, "grad_norm": 7.007735729217529, "learning_rate": 2.1517380980186073e-05, "loss": 0.4984, "step": 73200 }, { "epoch": 2.851415002530266, "grad_norm": 105.3277587890625, "learning_rate": 2.1497917396551054e-05, "loss": 0.5954, "step": 73250 }, { "epoch": 2.853361360893768, "grad_norm": 37.22242736816406, "learning_rate": 2.1478453812916035e-05, "loss": 0.6125, "step": 73300 }, { "epoch": 2.8553077192572696, "grad_norm": 21.837495803833008, "learning_rate": 2.1458990229281016e-05, "loss": 0.6268, "step": 73350 }, { "epoch": 2.8572540776207713, "grad_norm": 45.69218444824219, 
"learning_rate": 2.1439526645645997e-05, "loss": 0.5652, "step": 73400 }, { "epoch": 2.8592004359842735, "grad_norm": 37.758846282958984, "learning_rate": 2.1420063062010978e-05, "loss": 0.5277, "step": 73450 }, { "epoch": 2.8611467943477753, "grad_norm": 21.433246612548828, "learning_rate": 2.140059947837596e-05, "loss": 0.6263, "step": 73500 }, { "epoch": 2.863093152711277, "grad_norm": 32.41407012939453, "learning_rate": 2.1381135894740943e-05, "loss": 0.5764, "step": 73550 }, { "epoch": 2.8650395110747793, "grad_norm": 13.34726333618164, "learning_rate": 2.136167231110592e-05, "loss": 0.5702, "step": 73600 }, { "epoch": 2.866985869438281, "grad_norm": 5.143879413604736, "learning_rate": 2.13422087274709e-05, "loss": 0.5374, "step": 73650 }, { "epoch": 2.868932227801783, "grad_norm": 15.54714584350586, "learning_rate": 2.1322745143835886e-05, "loss": 0.5202, "step": 73700 }, { "epoch": 2.8708785861652846, "grad_norm": 14.201764106750488, "learning_rate": 2.1303281560200867e-05, "loss": 0.5658, "step": 73750 }, { "epoch": 2.872824944528787, "grad_norm": 14.26060962677002, "learning_rate": 2.1283817976565845e-05, "loss": 0.5765, "step": 73800 }, { "epoch": 2.8747713028922885, "grad_norm": 21.087759017944336, "learning_rate": 2.1264354392930825e-05, "loss": 0.5907, "step": 73850 }, { "epoch": 2.8767176612557903, "grad_norm": 22.039127349853516, "learning_rate": 2.124489080929581e-05, "loss": 0.5557, "step": 73900 }, { "epoch": 2.8786640196192925, "grad_norm": 98.8547592163086, "learning_rate": 2.122581649733349e-05, "loss": 0.5709, "step": 73950 }, { "epoch": 2.8806103779827943, "grad_norm": 47.808509826660156, "learning_rate": 2.120635291369847e-05, "loss": 0.5946, "step": 74000 }, { "epoch": 2.882556736346296, "grad_norm": 12.724800109863281, "learning_rate": 2.1186889330063453e-05, "loss": 0.5857, "step": 74050 }, { "epoch": 2.884503094709798, "grad_norm": 3.2288455963134766, "learning_rate": 2.1167425746428433e-05, "loss": 0.509, "step": 74100 }, { "epoch": 
2.8864494530732996, "grad_norm": 48.66091537475586, "learning_rate": 2.1147962162793414e-05, "loss": 0.5915, "step": 74150 }, { "epoch": 2.888395811436802, "grad_norm": 35.32694625854492, "learning_rate": 2.1128498579158395e-05, "loss": 0.5644, "step": 74200 }, { "epoch": 2.8903421698003036, "grad_norm": 16.189754486083984, "learning_rate": 2.1109034995523376e-05, "loss": 0.5843, "step": 74250 }, { "epoch": 2.8922885281638058, "grad_norm": 4.124429225921631, "learning_rate": 2.108957141188836e-05, "loss": 0.5341, "step": 74300 }, { "epoch": 2.8942348865273075, "grad_norm": 76.46636199951172, "learning_rate": 2.107010782825334e-05, "loss": 0.5634, "step": 74350 }, { "epoch": 2.8961812448908093, "grad_norm": 84.87883758544922, "learning_rate": 2.105064424461832e-05, "loss": 0.6298, "step": 74400 }, { "epoch": 2.898127603254311, "grad_norm": 12.338223457336426, "learning_rate": 2.10311806609833e-05, "loss": 0.5399, "step": 74450 }, { "epoch": 2.900073961617813, "grad_norm": 5.213508605957031, "learning_rate": 2.1011717077348285e-05, "loss": 0.6384, "step": 74500 }, { "epoch": 2.902020319981315, "grad_norm": 7.685599327087402, "learning_rate": 2.0992253493713262e-05, "loss": 0.5978, "step": 74550 }, { "epoch": 2.903966678344817, "grad_norm": 16.107698440551758, "learning_rate": 2.0972789910078243e-05, "loss": 0.5832, "step": 74600 }, { "epoch": 2.9059130367083186, "grad_norm": 34.20879364013672, "learning_rate": 2.0953326326443228e-05, "loss": 0.5521, "step": 74650 }, { "epoch": 2.907859395071821, "grad_norm": 6.708345890045166, "learning_rate": 2.093386274280821e-05, "loss": 0.5799, "step": 74700 }, { "epoch": 2.9098057534353226, "grad_norm": 32.071075439453125, "learning_rate": 2.0914399159173186e-05, "loss": 0.6609, "step": 74750 }, { "epoch": 2.9117521117988243, "grad_norm": 9.595416069030762, "learning_rate": 2.0894935575538167e-05, "loss": 0.5903, "step": 74800 }, { "epoch": 2.913698470162326, "grad_norm": 35.93272018432617, "learning_rate": 
2.087547199190315e-05, "loss": 0.5511, "step": 74850 }, { "epoch": 2.9156448285258283, "grad_norm": 12.679047584533691, "learning_rate": 2.0856008408268132e-05, "loss": 0.5498, "step": 74900 }, { "epoch": 2.91759118688933, "grad_norm": 21.00086212158203, "learning_rate": 2.083654482463311e-05, "loss": 0.5104, "step": 74950 }, { "epoch": 2.919537545252832, "grad_norm": 27.580059051513672, "learning_rate": 2.0817081240998094e-05, "loss": 0.6052, "step": 75000 }, { "epoch": 2.921483903616334, "grad_norm": 12.343066215515137, "learning_rate": 2.0797617657363075e-05, "loss": 0.5872, "step": 75050 }, { "epoch": 2.923430261979836, "grad_norm": 7.58252477645874, "learning_rate": 2.0778154073728056e-05, "loss": 0.5074, "step": 75100 }, { "epoch": 2.9253766203433376, "grad_norm": 13.442036628723145, "learning_rate": 2.0758690490093037e-05, "loss": 0.5864, "step": 75150 }, { "epoch": 2.9273229787068393, "grad_norm": 38.34627914428711, "learning_rate": 2.0739226906458018e-05, "loss": 0.5922, "step": 75200 }, { "epoch": 2.9292693370703415, "grad_norm": 14.47314167022705, "learning_rate": 2.0719763322823e-05, "loss": 0.594, "step": 75250 }, { "epoch": 2.9312156954338433, "grad_norm": 21.154664993286133, "learning_rate": 2.070029973918798e-05, "loss": 0.5841, "step": 75300 }, { "epoch": 2.933162053797345, "grad_norm": 10.4031343460083, "learning_rate": 2.068083615555296e-05, "loss": 0.6007, "step": 75350 }, { "epoch": 2.9351084121608473, "grad_norm": 11.218421936035156, "learning_rate": 2.0661372571917942e-05, "loss": 0.5429, "step": 75400 }, { "epoch": 2.937054770524349, "grad_norm": 11.067743301391602, "learning_rate": 2.0641908988282923e-05, "loss": 0.6014, "step": 75450 }, { "epoch": 2.939001128887851, "grad_norm": 24.498441696166992, "learning_rate": 2.0622445404647904e-05, "loss": 0.6378, "step": 75500 }, { "epoch": 2.9409474872513526, "grad_norm": 17.871280670166016, "learning_rate": 2.060298182101289e-05, "loss": 0.5649, "step": 75550 }, { "epoch": 2.9428938456148543, 
"grad_norm": 15.951749801635742, "learning_rate": 2.0583518237377866e-05, "loss": 0.5598, "step": 75600 }, { "epoch": 2.9448402039783566, "grad_norm": 4.7176513671875, "learning_rate": 2.0564054653742847e-05, "loss": 0.4844, "step": 75650 }, { "epoch": 2.9467865623418583, "grad_norm": 23.091100692749023, "learning_rate": 2.054459107010783e-05, "loss": 0.5441, "step": 75700 }, { "epoch": 2.9487329207053605, "grad_norm": 14.94837760925293, "learning_rate": 2.0525127486472812e-05, "loss": 0.5567, "step": 75750 }, { "epoch": 2.9506792790688623, "grad_norm": 11.347490310668945, "learning_rate": 2.050566390283779e-05, "loss": 0.49, "step": 75800 }, { "epoch": 2.952625637432364, "grad_norm": 6.734121799468994, "learning_rate": 2.048620031920277e-05, "loss": 0.5365, "step": 75850 }, { "epoch": 2.954571995795866, "grad_norm": 6.402181625366211, "learning_rate": 2.0466736735567755e-05, "loss": 0.597, "step": 75900 }, { "epoch": 2.9565183541593676, "grad_norm": 15.866073608398438, "learning_rate": 2.0447273151932736e-05, "loss": 0.5431, "step": 75950 }, { "epoch": 2.95846471252287, "grad_norm": 5.543972969055176, "learning_rate": 2.0427809568297714e-05, "loss": 0.4414, "step": 76000 }, { "epoch": 2.9604110708863716, "grad_norm": 17.960554122924805, "learning_rate": 2.0408345984662698e-05, "loss": 0.4803, "step": 76050 }, { "epoch": 2.9623574292498733, "grad_norm": 3.8636868000030518, "learning_rate": 2.038888240102768e-05, "loss": 0.4737, "step": 76100 }, { "epoch": 2.9643037876133755, "grad_norm": 35.52202606201172, "learning_rate": 2.036941881739266e-05, "loss": 0.5494, "step": 76150 }, { "epoch": 2.9662501459768773, "grad_norm": 61.159305572509766, "learning_rate": 2.035034450543034e-05, "loss": 0.6296, "step": 76200 }, { "epoch": 2.968196504340379, "grad_norm": 34.21897506713867, "learning_rate": 2.0330880921795322e-05, "loss": 0.5708, "step": 76250 }, { "epoch": 2.970142862703881, "grad_norm": 7.910594940185547, "learning_rate": 2.0311417338160303e-05, "loss": 0.5465, 
"step": 76300 }, { "epoch": 2.972089221067383, "grad_norm": 2.639056921005249, "learning_rate": 2.0291953754525284e-05, "loss": 0.6043, "step": 76350 }, { "epoch": 2.974035579430885, "grad_norm": 9.456559181213379, "learning_rate": 2.0272490170890265e-05, "loss": 0.5258, "step": 76400 }, { "epoch": 2.9759819377943866, "grad_norm": 13.229592323303223, "learning_rate": 2.0253026587255246e-05, "loss": 0.5822, "step": 76450 }, { "epoch": 2.977928296157889, "grad_norm": 19.787858963012695, "learning_rate": 2.0233563003620227e-05, "loss": 0.6648, "step": 76500 }, { "epoch": 2.9798746545213906, "grad_norm": 9.876517295837402, "learning_rate": 2.0214099419985208e-05, "loss": 0.5366, "step": 76550 }, { "epoch": 2.9818210128848923, "grad_norm": 26.27078628540039, "learning_rate": 2.019463583635019e-05, "loss": 0.562, "step": 76600 }, { "epoch": 2.983767371248394, "grad_norm": 18.37929344177246, "learning_rate": 2.0175172252715173e-05, "loss": 0.5399, "step": 76650 }, { "epoch": 2.9857137296118963, "grad_norm": 19.099201202392578, "learning_rate": 2.0155708669080154e-05, "loss": 0.553, "step": 76700 }, { "epoch": 2.987660087975398, "grad_norm": 11.066075325012207, "learning_rate": 2.013624508544513e-05, "loss": 0.4731, "step": 76750 }, { "epoch": 2.9896064463389, "grad_norm": 19.65965461730957, "learning_rate": 2.0116781501810112e-05, "loss": 0.5182, "step": 76800 }, { "epoch": 2.991552804702402, "grad_norm": 4.790431499481201, "learning_rate": 2.0097317918175097e-05, "loss": 0.5336, "step": 76850 }, { "epoch": 2.993499163065904, "grad_norm": 38.61946487426758, "learning_rate": 2.0077854334540078e-05, "loss": 0.5447, "step": 76900 }, { "epoch": 2.9954455214294056, "grad_norm": 43.4021110534668, "learning_rate": 2.0058390750905055e-05, "loss": 0.5555, "step": 76950 }, { "epoch": 2.9973918797929073, "grad_norm": 11.164488792419434, "learning_rate": 2.003892716727004e-05, "loss": 0.5006, "step": 77000 }, { "epoch": 2.999338238156409, "grad_norm": 0.4316921830177307, 
"learning_rate": 2.001946358363502e-05, "loss": 0.5405, "step": 77050 }, { "epoch": 3.0, "eval_accuracy": 0.7911557475962474, "eval_f1_macro": 0.7458585031060708, "eval_f1_weighted": 0.7886391119152261, "eval_loss": 0.6513015627861023, "eval_roc_auc": 0.951678935179261, "eval_runtime": 27.5736, "eval_samples_per_second": 931.652, "eval_steps_per_second": 116.488, "step": 77067 }, { "epoch": 3.0012845965199113, "grad_norm": 11.428071022033691, "learning_rate": 2e-05, "loss": 0.6095, "step": 77100 }, { "epoch": 3.003230954883413, "grad_norm": 23.76935577392578, "learning_rate": 1.9980536416364983e-05, "loss": 0.5697, "step": 77150 }, { "epoch": 3.005177313246915, "grad_norm": 7.279968738555908, "learning_rate": 1.9961072832729964e-05, "loss": 0.4915, "step": 77200 }, { "epoch": 3.007123671610417, "grad_norm": 23.73040008544922, "learning_rate": 1.9941609249094945e-05, "loss": 0.5331, "step": 77250 }, { "epoch": 3.009070029973919, "grad_norm": 21.295827865600586, "learning_rate": 1.9922145665459926e-05, "loss": 0.4341, "step": 77300 }, { "epoch": 3.0110163883374206, "grad_norm": 18.624008178710938, "learning_rate": 1.9902682081824906e-05, "loss": 0.4985, "step": 77350 }, { "epoch": 3.012962746700923, "grad_norm": 12.03382396697998, "learning_rate": 1.9883218498189887e-05, "loss": 0.4595, "step": 77400 }, { "epoch": 3.0149091050644246, "grad_norm": 40.933921813964844, "learning_rate": 1.986375491455487e-05, "loss": 0.5259, "step": 77450 }, { "epoch": 3.0168554634279263, "grad_norm": 8.203516960144043, "learning_rate": 1.984429133091985e-05, "loss": 0.4801, "step": 77500 }, { "epoch": 3.018801821791428, "grad_norm": 18.686450958251953, "learning_rate": 1.982482774728483e-05, "loss": 0.4462, "step": 77550 }, { "epoch": 3.0207481801549303, "grad_norm": 10.924481391906738, "learning_rate": 1.980536416364981e-05, "loss": 0.4882, "step": 77600 }, { "epoch": 3.022694538518432, "grad_norm": 17.74546241760254, "learning_rate": 1.9785900580014792e-05, "loss": 0.5082, "step": 
77650 }, { "epoch": 3.024640896881934, "grad_norm": 41.24504852294922, "learning_rate": 1.9766436996379777e-05, "loss": 0.4677, "step": 77700 }, { "epoch": 3.0265872552454356, "grad_norm": 11.686141967773438, "learning_rate": 1.9746973412744754e-05, "loss": 0.5001, "step": 77750 }, { "epoch": 3.028533613608938, "grad_norm": 20.286235809326172, "learning_rate": 1.9727509829109735e-05, "loss": 0.5772, "step": 77800 }, { "epoch": 3.0304799719724396, "grad_norm": 54.215641021728516, "learning_rate": 1.9708046245474716e-05, "loss": 0.4799, "step": 77850 }, { "epoch": 3.0324263303359413, "grad_norm": 25.828012466430664, "learning_rate": 1.96885826618397e-05, "loss": 0.458, "step": 77900 }, { "epoch": 3.0343726886994435, "grad_norm": 18.02463150024414, "learning_rate": 1.966911907820468e-05, "loss": 0.473, "step": 77950 }, { "epoch": 3.0363190470629453, "grad_norm": 2.963188409805298, "learning_rate": 1.964965549456966e-05, "loss": 0.4678, "step": 78000 }, { "epoch": 3.038265405426447, "grad_norm": 8.645845413208008, "learning_rate": 1.9630191910934643e-05, "loss": 0.4976, "step": 78050 }, { "epoch": 3.040211763789949, "grad_norm": 12.508298873901367, "learning_rate": 1.9610728327299624e-05, "loss": 0.541, "step": 78100 }, { "epoch": 3.042158122153451, "grad_norm": 14.969858169555664, "learning_rate": 1.9591264743664605e-05, "loss": 0.4708, "step": 78150 }, { "epoch": 3.044104480516953, "grad_norm": 200.67343139648438, "learning_rate": 1.9571801160029586e-05, "loss": 0.4879, "step": 78200 }, { "epoch": 3.0460508388804546, "grad_norm": 350.2611083984375, "learning_rate": 1.9552337576394567e-05, "loss": 0.5473, "step": 78250 }, { "epoch": 3.0479971972439563, "grad_norm": 15.808113098144531, "learning_rate": 1.953287399275955e-05, "loss": 0.5972, "step": 78300 }, { "epoch": 3.0499435556074586, "grad_norm": 26.784873962402344, "learning_rate": 1.951341040912453e-05, "loss": 0.5579, "step": 78350 }, { "epoch": 3.0518899139709603, "grad_norm": 67.9161376953125, "learning_rate": 
1.949394682548951e-05, "loss": 0.5287, "step": 78400 }, { "epoch": 3.053836272334462, "grad_norm": 48.711875915527344, "learning_rate": 1.947448324185449e-05, "loss": 0.473, "step": 78450 }, { "epoch": 3.0557826306979643, "grad_norm": 15.290504455566406, "learning_rate": 1.9455019658219472e-05, "loss": 0.3991, "step": 78500 }, { "epoch": 3.057728989061466, "grad_norm": 76.05422973632812, "learning_rate": 1.9435556074584453e-05, "loss": 0.5462, "step": 78550 }, { "epoch": 3.059675347424968, "grad_norm": 12.220948219299316, "learning_rate": 1.9416092490949434e-05, "loss": 0.5093, "step": 78600 }, { "epoch": 3.0616217057884696, "grad_norm": 5.989344120025635, "learning_rate": 1.9396628907314415e-05, "loss": 0.518, "step": 78650 }, { "epoch": 3.063568064151972, "grad_norm": 6.037346839904785, "learning_rate": 1.9377165323679396e-05, "loss": 0.4882, "step": 78700 }, { "epoch": 3.0655144225154736, "grad_norm": 28.46247673034668, "learning_rate": 1.935770174004438e-05, "loss": 0.4712, "step": 78750 }, { "epoch": 3.0674607808789753, "grad_norm": 7.438333511352539, "learning_rate": 1.9338238156409358e-05, "loss": 0.5338, "step": 78800 }, { "epoch": 3.069407139242477, "grad_norm": 15.007311820983887, "learning_rate": 1.931877457277434e-05, "loss": 0.4184, "step": 78850 }, { "epoch": 3.0713534976059793, "grad_norm": 40.89399337768555, "learning_rate": 1.929931098913932e-05, "loss": 0.528, "step": 78900 }, { "epoch": 3.073299855969481, "grad_norm": 18.118614196777344, "learning_rate": 1.9279847405504304e-05, "loss": 0.5264, "step": 78950 }, { "epoch": 3.075246214332983, "grad_norm": 16.78095817565918, "learning_rate": 1.9260383821869282e-05, "loss": 0.4072, "step": 79000 }, { "epoch": 3.077192572696485, "grad_norm": 2.0887844562530518, "learning_rate": 1.9240920238234263e-05, "loss": 0.3747, "step": 79050 }, { "epoch": 3.079138931059987, "grad_norm": 58.29981994628906, "learning_rate": 1.9221456654599247e-05, "loss": 0.5357, "step": 79100 }, { "epoch": 3.0810852894234886, 
"grad_norm": 30.621421813964844, "learning_rate": 1.9201993070964228e-05, "loss": 0.415, "step": 79150 }, { "epoch": 3.0830316477869903, "grad_norm": 0.9029637575149536, "learning_rate": 1.9182529487329206e-05, "loss": 0.4607, "step": 79200 }, { "epoch": 3.0849780061504926, "grad_norm": 0.2026718705892563, "learning_rate": 1.9163065903694187e-05, "loss": 0.5082, "step": 79250 }, { "epoch": 3.0869243645139943, "grad_norm": 38.04810333251953, "learning_rate": 1.914360232005917e-05, "loss": 0.5961, "step": 79300 }, { "epoch": 3.088870722877496, "grad_norm": 49.892147064208984, "learning_rate": 1.9124138736424152e-05, "loss": 0.4428, "step": 79350 }, { "epoch": 3.0908170812409983, "grad_norm": 8.498678207397461, "learning_rate": 1.9104675152789133e-05, "loss": 0.426, "step": 79400 }, { "epoch": 3.0927634396045, "grad_norm": 13.9781494140625, "learning_rate": 1.9085211569154114e-05, "loss": 0.4385, "step": 79450 }, { "epoch": 3.094709797968002, "grad_norm": 24.182893753051758, "learning_rate": 1.9065747985519095e-05, "loss": 0.4566, "step": 79500 }, { "epoch": 3.0966561563315036, "grad_norm": 15.523015022277832, "learning_rate": 1.9046673673556776e-05, "loss": 0.4555, "step": 79550 }, { "epoch": 3.098602514695006, "grad_norm": 40.458900451660156, "learning_rate": 1.9027210089921757e-05, "loss": 0.424, "step": 79600 }, { "epoch": 3.1005488730585076, "grad_norm": 25.219158172607422, "learning_rate": 1.9007746506286738e-05, "loss": 0.4523, "step": 79650 }, { "epoch": 3.1024952314220093, "grad_norm": 3.413469076156616, "learning_rate": 1.8988282922651722e-05, "loss": 0.5217, "step": 79700 }, { "epoch": 3.104441589785511, "grad_norm": 1.65800940990448, "learning_rate": 1.8969208610689403e-05, "loss": 0.4206, "step": 79750 }, { "epoch": 3.1063879481490133, "grad_norm": 9.823315620422363, "learning_rate": 1.8949745027054384e-05, "loss": 0.5119, "step": 79800 }, { "epoch": 3.108334306512515, "grad_norm": 26.40736198425293, "learning_rate": 1.893028144341936e-05, "loss": 0.4736, 
"step": 79850 }, { "epoch": 3.110280664876017, "grad_norm": 4.507157802581787, "learning_rate": 1.8910817859784342e-05, "loss": 0.4923, "step": 79900 }, { "epoch": 3.112227023239519, "grad_norm": 16.55584144592285, "learning_rate": 1.8891354276149327e-05, "loss": 0.5203, "step": 79950 }, { "epoch": 3.114173381603021, "grad_norm": 12.428496360778809, "learning_rate": 1.8871890692514308e-05, "loss": 0.4937, "step": 80000 }, { "epoch": 3.1161197399665226, "grad_norm": 4.2584662437438965, "learning_rate": 1.8852427108879285e-05, "loss": 0.4888, "step": 80050 }, { "epoch": 3.1180660983300243, "grad_norm": 13.264801025390625, "learning_rate": 1.883296352524427e-05, "loss": 0.447, "step": 80100 }, { "epoch": 3.1200124566935266, "grad_norm": 20.110061645507812, "learning_rate": 1.881349994160925e-05, "loss": 0.4592, "step": 80150 }, { "epoch": 3.1219588150570283, "grad_norm": 8.246827125549316, "learning_rate": 1.879403635797423e-05, "loss": 0.5923, "step": 80200 }, { "epoch": 3.12390517342053, "grad_norm": 96.33789825439453, "learning_rate": 1.8774572774339212e-05, "loss": 0.4382, "step": 80250 }, { "epoch": 3.125851531784032, "grad_norm": 20.25374984741211, "learning_rate": 1.8755109190704193e-05, "loss": 0.5402, "step": 80300 }, { "epoch": 3.127797890147534, "grad_norm": 100.00299072265625, "learning_rate": 1.8735645607069174e-05, "loss": 0.5429, "step": 80350 }, { "epoch": 3.129744248511036, "grad_norm": 15.827549934387207, "learning_rate": 1.8716182023434155e-05, "loss": 0.501, "step": 80400 }, { "epoch": 3.1316906068745376, "grad_norm": 15.041054725646973, "learning_rate": 1.8696718439799136e-05, "loss": 0.5215, "step": 80450 }, { "epoch": 3.13363696523804, "grad_norm": 176.82833862304688, "learning_rate": 1.8677254856164117e-05, "loss": 0.4418, "step": 80500 }, { "epoch": 3.1355833236015416, "grad_norm": 3.8265488147735596, "learning_rate": 1.8657791272529098e-05, "loss": 0.5447, "step": 80550 }, { "epoch": 3.1375296819650433, "grad_norm": 58.53923034667969, 
"learning_rate": 1.863832768889408e-05, "loss": 0.5693, "step": 80600 }, { "epoch": 3.139476040328545, "grad_norm": 23.215343475341797, "learning_rate": 1.8618864105259064e-05, "loss": 0.5006, "step": 80650 }, { "epoch": 3.1414223986920473, "grad_norm": 73.66390991210938, "learning_rate": 1.859940052162404e-05, "loss": 0.4995, "step": 80700 }, { "epoch": 3.143368757055549, "grad_norm": 56.97943115234375, "learning_rate": 1.8579936937989022e-05, "loss": 0.4712, "step": 80750 }, { "epoch": 3.145315115419051, "grad_norm": 6.458110332489014, "learning_rate": 1.8560473354354007e-05, "loss": 0.4626, "step": 80800 }, { "epoch": 3.147261473782553, "grad_norm": 1.0450634956359863, "learning_rate": 1.8541009770718987e-05, "loss": 0.4413, "step": 80850 }, { "epoch": 3.149207832146055, "grad_norm": 16.697589874267578, "learning_rate": 1.8521546187083965e-05, "loss": 0.5393, "step": 80900 }, { "epoch": 3.1511541905095566, "grad_norm": 39.846153259277344, "learning_rate": 1.8502082603448946e-05, "loss": 0.4983, "step": 80950 }, { "epoch": 3.1531005488730584, "grad_norm": 74.1617431640625, "learning_rate": 1.848261901981393e-05, "loss": 0.5159, "step": 81000 }, { "epoch": 3.1550469072365606, "grad_norm": 150.62716674804688, "learning_rate": 1.846315543617891e-05, "loss": 0.5099, "step": 81050 }, { "epoch": 3.1569932656000623, "grad_norm": 15.482856750488281, "learning_rate": 1.844369185254389e-05, "loss": 0.4401, "step": 81100 }, { "epoch": 3.158939623963564, "grad_norm": 11.664920806884766, "learning_rate": 1.8424228268908873e-05, "loss": 0.521, "step": 81150 }, { "epoch": 3.160885982327066, "grad_norm": 14.082572937011719, "learning_rate": 1.8404764685273854e-05, "loss": 0.5646, "step": 81200 }, { "epoch": 3.162832340690568, "grad_norm": 28.738656997680664, "learning_rate": 1.8385301101638835e-05, "loss": 0.5311, "step": 81250 }, { "epoch": 3.16477869905407, "grad_norm": 12.015096664428711, "learning_rate": 1.8365837518003813e-05, "loss": 0.4798, "step": 81300 }, { "epoch": 
3.1667250574175716, "grad_norm": 10.081704139709473, "learning_rate": 1.8346373934368797e-05, "loss": 0.618, "step": 81350 }, { "epoch": 3.168671415781074, "grad_norm": 2.2845330238342285, "learning_rate": 1.8326910350733778e-05, "loss": 0.5101, "step": 81400 }, { "epoch": 3.1706177741445756, "grad_norm": 15.001742362976074, "learning_rate": 1.830744676709876e-05, "loss": 0.5734, "step": 81450 }, { "epoch": 3.1725641325080773, "grad_norm": 82.33760833740234, "learning_rate": 1.828798318346374e-05, "loss": 0.4903, "step": 81500 }, { "epoch": 3.174510490871579, "grad_norm": 15.481046676635742, "learning_rate": 1.826851959982872e-05, "loss": 0.5296, "step": 81550 }, { "epoch": 3.1764568492350813, "grad_norm": 388.97882080078125, "learning_rate": 1.8249056016193702e-05, "loss": 0.5938, "step": 81600 }, { "epoch": 3.178403207598583, "grad_norm": 45.19728469848633, "learning_rate": 1.8229592432558683e-05, "loss": 0.5167, "step": 81650 }, { "epoch": 3.180349565962085, "grad_norm": 23.02707862854004, "learning_rate": 1.8210128848923667e-05, "loss": 0.4616, "step": 81700 }, { "epoch": 3.1822959243255866, "grad_norm": 5.246418476104736, "learning_rate": 1.8190665265288645e-05, "loss": 0.4981, "step": 81750 }, { "epoch": 3.184242282689089, "grad_norm": 2.9148266315460205, "learning_rate": 1.8171201681653626e-05, "loss": 0.5327, "step": 81800 }, { "epoch": 3.1861886410525906, "grad_norm": 6.815219879150391, "learning_rate": 1.8151738098018607e-05, "loss": 0.5084, "step": 81850 }, { "epoch": 3.1881349994160924, "grad_norm": 13.483736038208008, "learning_rate": 1.813227451438359e-05, "loss": 0.4857, "step": 81900 }, { "epoch": 3.1900813577795946, "grad_norm": 35.191925048828125, "learning_rate": 1.811281093074857e-05, "loss": 0.509, "step": 81950 }, { "epoch": 3.1920277161430963, "grad_norm": 12.267219543457031, "learning_rate": 1.809334734711355e-05, "loss": 0.4884, "step": 82000 }, { "epoch": 3.193974074506598, "grad_norm": 59.42839813232422, "learning_rate": 
1.8073883763478534e-05, "loss": 0.6088, "step": 82050 }, { "epoch": 3.1959204328701, "grad_norm": 32.859981536865234, "learning_rate": 1.8054420179843515e-05, "loss": 0.4294, "step": 82100 }, { "epoch": 3.197866791233602, "grad_norm": 29.06385040283203, "learning_rate": 1.8034956596208493e-05, "loss": 0.4848, "step": 82150 }, { "epoch": 3.199813149597104, "grad_norm": 32.723793029785156, "learning_rate": 1.8015493012573477e-05, "loss": 0.4987, "step": 82200 }, { "epoch": 3.2017595079606056, "grad_norm": 27.024803161621094, "learning_rate": 1.7996029428938458e-05, "loss": 0.4051, "step": 82250 }, { "epoch": 3.203705866324108, "grad_norm": 23.775087356567383, "learning_rate": 1.797656584530344e-05, "loss": 0.4756, "step": 82300 }, { "epoch": 3.2056522246876096, "grad_norm": 27.507732391357422, "learning_rate": 1.7957102261668417e-05, "loss": 0.5316, "step": 82350 }, { "epoch": 3.2075985830511113, "grad_norm": 21.146055221557617, "learning_rate": 1.79376386780334e-05, "loss": 0.5632, "step": 82400 }, { "epoch": 3.209544941414613, "grad_norm": 19.14077377319336, "learning_rate": 1.7918175094398382e-05, "loss": 0.4783, "step": 82450 }, { "epoch": 3.2114912997781153, "grad_norm": 22.936565399169922, "learning_rate": 1.7898711510763363e-05, "loss": 0.4275, "step": 82500 }, { "epoch": 3.213437658141617, "grad_norm": 0.9977525472640991, "learning_rate": 1.7879247927128344e-05, "loss": 0.3983, "step": 82550 }, { "epoch": 3.215384016505119, "grad_norm": 3.478424072265625, "learning_rate": 1.7859784343493325e-05, "loss": 0.5438, "step": 82600 }, { "epoch": 3.2173303748686206, "grad_norm": 39.202945709228516, "learning_rate": 1.7840320759858306e-05, "loss": 0.5613, "step": 82650 }, { "epoch": 3.219276733232123, "grad_norm": 14.357657432556152, "learning_rate": 1.7820857176223287e-05, "loss": 0.4868, "step": 82700 }, { "epoch": 3.2212230915956246, "grad_norm": 18.849241256713867, "learning_rate": 1.7801393592588268e-05, "loss": 0.4798, "step": 82750 }, { "epoch": 
3.2231694499591264, "grad_norm": 148.13580322265625, "learning_rate": 1.778193000895325e-05, "loss": 0.4994, "step": 82800 }, { "epoch": 3.2251158083226286, "grad_norm": 24.0234317779541, "learning_rate": 1.776246642531823e-05, "loss": 0.4105, "step": 82850 }, { "epoch": 3.2270621666861303, "grad_norm": 27.903730392456055, "learning_rate": 1.774300284168321e-05, "loss": 0.5489, "step": 82900 }, { "epoch": 3.229008525049632, "grad_norm": 15.72659969329834, "learning_rate": 1.7723539258048195e-05, "loss": 0.4246, "step": 82950 }, { "epoch": 3.230954883413134, "grad_norm": 18.573537826538086, "learning_rate": 1.7704075674413173e-05, "loss": 0.5257, "step": 83000 }, { "epoch": 3.232901241776636, "grad_norm": 5.624817848205566, "learning_rate": 1.7684612090778154e-05, "loss": 0.5236, "step": 83050 }, { "epoch": 3.234847600140138, "grad_norm": 27.57298469543457, "learning_rate": 1.7665148507143138e-05, "loss": 0.4562, "step": 83100 }, { "epoch": 3.2367939585036396, "grad_norm": 20.729084014892578, "learning_rate": 1.764568492350812e-05, "loss": 0.5711, "step": 83150 }, { "epoch": 3.2387403168671414, "grad_norm": 16.5264892578125, "learning_rate": 1.7626221339873096e-05, "loss": 0.4743, "step": 83200 }, { "epoch": 3.2406866752306436, "grad_norm": 17.68093490600586, "learning_rate": 1.760675775623808e-05, "loss": 0.5235, "step": 83250 }, { "epoch": 3.2426330335941453, "grad_norm": 19.455875396728516, "learning_rate": 1.7587294172603062e-05, "loss": 0.4915, "step": 83300 }, { "epoch": 3.244579391957647, "grad_norm": 12.819408416748047, "learning_rate": 1.7567830588968043e-05, "loss": 0.4696, "step": 83350 }, { "epoch": 3.2465257503211493, "grad_norm": 3.413386583328247, "learning_rate": 1.754836700533302e-05, "loss": 0.5177, "step": 83400 }, { "epoch": 3.248472108684651, "grad_norm": 21.20757293701172, "learning_rate": 1.7528903421698005e-05, "loss": 0.579, "step": 83450 }, { "epoch": 3.250418467048153, "grad_norm": 14.022490501403809, "learning_rate": 
1.7509439838062986e-05, "loss": 0.4886, "step": 83500 }, { "epoch": 3.2523648254116546, "grad_norm": 11.81977367401123, "learning_rate": 1.7489976254427967e-05, "loss": 0.4937, "step": 83550 }, { "epoch": 3.254311183775157, "grad_norm": 25.633989334106445, "learning_rate": 1.7470512670792948e-05, "loss": 0.4661, "step": 83600 }, { "epoch": 3.2562575421386586, "grad_norm": 37.28684616088867, "learning_rate": 1.745104908715793e-05, "loss": 0.3369, "step": 83650 }, { "epoch": 3.2582039005021604, "grad_norm": 47.74785232543945, "learning_rate": 1.743158550352291e-05, "loss": 0.4469, "step": 83700 }, { "epoch": 3.2601502588656626, "grad_norm": 28.25569725036621, "learning_rate": 1.741212191988789e-05, "loss": 0.4547, "step": 83750 }, { "epoch": 3.2620966172291643, "grad_norm": 2.9699912071228027, "learning_rate": 1.739265833625287e-05, "loss": 0.4535, "step": 83800 }, { "epoch": 3.264042975592666, "grad_norm": 24.99479866027832, "learning_rate": 1.7373194752617852e-05, "loss": 0.4653, "step": 83850 }, { "epoch": 3.265989333956168, "grad_norm": 21.05978012084961, "learning_rate": 1.7353731168982833e-05, "loss": 0.4486, "step": 83900 }, { "epoch": 3.26793569231967, "grad_norm": 49.076072692871094, "learning_rate": 1.7334267585347814e-05, "loss": 0.5173, "step": 83950 }, { "epoch": 3.269882050683172, "grad_norm": 25.810136795043945, "learning_rate": 1.7314804001712795e-05, "loss": 0.3715, "step": 84000 }, { "epoch": 3.2718284090466736, "grad_norm": 6.307109355926514, "learning_rate": 1.7295340418077776e-05, "loss": 0.5169, "step": 84050 }, { "epoch": 3.2737747674101754, "grad_norm": 18.627384185791016, "learning_rate": 1.7275876834442757e-05, "loss": 0.5132, "step": 84100 }, { "epoch": 3.2757211257736776, "grad_norm": 17.11471176147461, "learning_rate": 1.7256413250807742e-05, "loss": 0.4948, "step": 84150 }, { "epoch": 3.2776674841371793, "grad_norm": 14.81241226196289, "learning_rate": 1.723694966717272e-05, "loss": 0.5912, "step": 84200 }, { "epoch": 3.279613842500681, 
"grad_norm": 34.87680435180664, "learning_rate": 1.72174860835377e-05, "loss": 0.456, "step": 84250 }, { "epoch": 3.281560200864183, "grad_norm": 9.187166213989258, "learning_rate": 1.719802249990268e-05, "loss": 0.5117, "step": 84300 }, { "epoch": 3.283506559227685, "grad_norm": 12.002781867980957, "learning_rate": 1.7178558916267666e-05, "loss": 0.5657, "step": 84350 }, { "epoch": 3.285452917591187, "grad_norm": 58.14558029174805, "learning_rate": 1.7159095332632647e-05, "loss": 0.417, "step": 84400 }, { "epoch": 3.2873992759546886, "grad_norm": 3.72061824798584, "learning_rate": 1.7139631748997624e-05, "loss": 0.5899, "step": 84450 }, { "epoch": 3.289345634318191, "grad_norm": 24.68319320678711, "learning_rate": 1.712016816536261e-05, "loss": 0.6123, "step": 84500 }, { "epoch": 3.2912919926816926, "grad_norm": 18.541675567626953, "learning_rate": 1.710070458172759e-05, "loss": 0.5274, "step": 84550 }, { "epoch": 3.2932383510451944, "grad_norm": 7.046708583831787, "learning_rate": 1.708124099809257e-05, "loss": 0.4805, "step": 84600 }, { "epoch": 3.295184709408696, "grad_norm": 14.895744323730469, "learning_rate": 1.706177741445755e-05, "loss": 0.4298, "step": 84650 }, { "epoch": 3.2971310677721983, "grad_norm": 28.868318557739258, "learning_rate": 1.7042313830822532e-05, "loss": 0.5117, "step": 84700 }, { "epoch": 3.2990774261357, "grad_norm": 27.220046997070312, "learning_rate": 1.7022850247187513e-05, "loss": 0.4715, "step": 84750 }, { "epoch": 3.301023784499202, "grad_norm": 16.41514778137207, "learning_rate": 1.7003386663552494e-05, "loss": 0.4822, "step": 84800 }, { "epoch": 3.302970142862704, "grad_norm": 16.698837280273438, "learning_rate": 1.6983923079917475e-05, "loss": 0.5548, "step": 84850 }, { "epoch": 3.304916501226206, "grad_norm": 12.242215156555176, "learning_rate": 1.6964459496282456e-05, "loss": 0.4286, "step": 84900 }, { "epoch": 3.3068628595897076, "grad_norm": 17.816787719726562, "learning_rate": 1.6944995912647437e-05, "loss": 0.5168, 
"step": 84950 }, { "epoch": 3.3088092179532094, "grad_norm": 6.191596984863281, "learning_rate": 1.6925532329012418e-05, "loss": 0.4257, "step": 85000 }, { "epoch": 3.3107555763167116, "grad_norm": 16.207738876342773, "learning_rate": 1.69060687453774e-05, "loss": 0.5386, "step": 85050 }, { "epoch": 3.3127019346802133, "grad_norm": 2.015899658203125, "learning_rate": 1.688660516174238e-05, "loss": 0.5626, "step": 85100 }, { "epoch": 3.314648293043715, "grad_norm": 6.160524845123291, "learning_rate": 1.686714157810736e-05, "loss": 0.4424, "step": 85150 }, { "epoch": 3.3165946514072173, "grad_norm": 18.630582809448242, "learning_rate": 1.6847677994472345e-05, "loss": 0.4808, "step": 85200 }, { "epoch": 3.318541009770719, "grad_norm": 3.7779552936553955, "learning_rate": 1.6828214410837323e-05, "loss": 0.5015, "step": 85250 }, { "epoch": 3.320487368134221, "grad_norm": 33.970977783203125, "learning_rate": 1.6808750827202304e-05, "loss": 0.5051, "step": 85300 }, { "epoch": 3.3224337264977226, "grad_norm": 12.07990837097168, "learning_rate": 1.6789287243567285e-05, "loss": 0.4079, "step": 85350 }, { "epoch": 3.324380084861225, "grad_norm": 11.3780517578125, "learning_rate": 1.676982365993227e-05, "loss": 0.4501, "step": 85400 }, { "epoch": 3.3263264432247266, "grad_norm": 15.240259170532227, "learning_rate": 1.6750360076297247e-05, "loss": 0.4642, "step": 85450 }, { "epoch": 3.3282728015882284, "grad_norm": 29.753185272216797, "learning_rate": 1.6730896492662228e-05, "loss": 0.4899, "step": 85500 }, { "epoch": 3.33021915995173, "grad_norm": 4.215719699859619, "learning_rate": 1.6711432909027212e-05, "loss": 0.4114, "step": 85550 }, { "epoch": 3.3321655183152323, "grad_norm": 31.039506912231445, "learning_rate": 1.6691969325392193e-05, "loss": 0.4427, "step": 85600 }, { "epoch": 3.334111876678734, "grad_norm": 27.714685440063477, "learning_rate": 1.6672505741757174e-05, "loss": 0.4681, "step": 85650 }, { "epoch": 3.336058235042236, "grad_norm": 7.944158554077148, 
"learning_rate": 1.6653042158122155e-05, "loss": 0.4851, "step": 85700 }, { "epoch": 3.3380045934057376, "grad_norm": 47.552955627441406, "learning_rate": 1.6633578574487136e-05, "loss": 0.5317, "step": 85750 }, { "epoch": 3.33995095176924, "grad_norm": 3.475970983505249, "learning_rate": 1.6614114990852117e-05, "loss": 0.4612, "step": 85800 }, { "epoch": 3.3418973101327416, "grad_norm": 12.919591903686523, "learning_rate": 1.6594651407217098e-05, "loss": 0.4941, "step": 85850 }, { "epoch": 3.3438436684962434, "grad_norm": 12.282877922058105, "learning_rate": 1.657557709525478e-05, "loss": 0.4889, "step": 85900 }, { "epoch": 3.3457900268597456, "grad_norm": 26.290491104125977, "learning_rate": 1.655611351161976e-05, "loss": 0.4528, "step": 85950 }, { "epoch": 3.3477363852232473, "grad_norm": 43.4367790222168, "learning_rate": 1.653664992798474e-05, "loss": 0.5138, "step": 86000 }, { "epoch": 3.349682743586749, "grad_norm": 9.075749397277832, "learning_rate": 1.6517186344349722e-05, "loss": 0.465, "step": 86050 }, { "epoch": 3.351629101950251, "grad_norm": 8.857653617858887, "learning_rate": 1.6497722760714703e-05, "loss": 0.4983, "step": 86100 }, { "epoch": 3.353575460313753, "grad_norm": 17.020437240600586, "learning_rate": 1.6478259177079687e-05, "loss": 0.5087, "step": 86150 }, { "epoch": 3.355521818677255, "grad_norm": 13.328280448913574, "learning_rate": 1.6458795593444665e-05, "loss": 0.5132, "step": 86200 }, { "epoch": 3.3574681770407566, "grad_norm": 14.303596496582031, "learning_rate": 1.6439332009809646e-05, "loss": 0.5731, "step": 86250 }, { "epoch": 3.359414535404259, "grad_norm": 36.909515380859375, "learning_rate": 1.641986842617463e-05, "loss": 0.5047, "step": 86300 }, { "epoch": 3.3613608937677606, "grad_norm": 34.945343017578125, "learning_rate": 1.640040484253961e-05, "loss": 0.4906, "step": 86350 }, { "epoch": 3.3633072521312624, "grad_norm": 3.4088380336761475, "learning_rate": 1.638094125890459e-05, "loss": 0.5812, "step": 86400 }, { "epoch": 
3.365253610494764, "grad_norm": 23.7816219329834, "learning_rate": 1.636147767526957e-05, "loss": 0.5335, "step": 86450 }, { "epoch": 3.3671999688582663, "grad_norm": 20.68571662902832, "learning_rate": 1.6342014091634554e-05, "loss": 0.492, "step": 86500 }, { "epoch": 3.369146327221768, "grad_norm": 13.103964805603027, "learning_rate": 1.6322550507999535e-05, "loss": 0.4099, "step": 86550 }, { "epoch": 3.37109268558527, "grad_norm": 40.67021560668945, "learning_rate": 1.6303086924364512e-05, "loss": 0.5327, "step": 86600 }, { "epoch": 3.373039043948772, "grad_norm": 37.0681037902832, "learning_rate": 1.6283623340729497e-05, "loss": 0.573, "step": 86650 }, { "epoch": 3.374985402312274, "grad_norm": 32.48668670654297, "learning_rate": 1.6264159757094478e-05, "loss": 0.5043, "step": 86700 }, { "epoch": 3.3769317606757756, "grad_norm": 1.1996515989303589, "learning_rate": 1.624469617345946e-05, "loss": 0.4821, "step": 86750 }, { "epoch": 3.3788781190392774, "grad_norm": 30.48410987854004, "learning_rate": 1.622523258982444e-05, "loss": 0.4641, "step": 86800 }, { "epoch": 3.3808244774027796, "grad_norm": 11.57769775390625, "learning_rate": 1.620576900618942e-05, "loss": 0.5345, "step": 86850 }, { "epoch": 3.3827708357662813, "grad_norm": 25.45656394958496, "learning_rate": 1.61863054225544e-05, "loss": 0.3734, "step": 86900 }, { "epoch": 3.384717194129783, "grad_norm": 1.3770748376846313, "learning_rate": 1.6166841838919383e-05, "loss": 0.53, "step": 86950 }, { "epoch": 3.386663552493285, "grad_norm": 7.654837131500244, "learning_rate": 1.6147378255284364e-05, "loss": 0.48, "step": 87000 }, { "epoch": 3.388609910856787, "grad_norm": 25.943571090698242, "learning_rate": 1.6127914671649345e-05, "loss": 0.5363, "step": 87050 }, { "epoch": 3.390556269220289, "grad_norm": 9.88652515411377, "learning_rate": 1.6108451088014325e-05, "loss": 0.4813, "step": 87100 }, { "epoch": 3.3925026275837906, "grad_norm": 4.1711835861206055, "learning_rate": 1.6088987504379306e-05, "loss": 
0.4701, "step": 87150 }, { "epoch": 3.3944489859472924, "grad_norm": 12.36887264251709, "learning_rate": 1.606952392074429e-05, "loss": 0.5586, "step": 87200 }, { "epoch": 3.3963953443107946, "grad_norm": 14.898747444152832, "learning_rate": 1.605006033710927e-05, "loss": 0.5525, "step": 87250 }, { "epoch": 3.3983417026742964, "grad_norm": 5.836781978607178, "learning_rate": 1.603059675347425e-05, "loss": 0.5673, "step": 87300 }, { "epoch": 3.400288061037798, "grad_norm": 16.837377548217773, "learning_rate": 1.601113316983923e-05, "loss": 0.4679, "step": 87350 }, { "epoch": 3.4022344194013003, "grad_norm": 2.747189521789551, "learning_rate": 1.5991669586204215e-05, "loss": 0.5251, "step": 87400 }, { "epoch": 3.404180777764802, "grad_norm": 0.37861940264701843, "learning_rate": 1.5972206002569192e-05, "loss": 0.5431, "step": 87450 }, { "epoch": 3.406127136128304, "grad_norm": 32.865234375, "learning_rate": 1.5952742418934173e-05, "loss": 0.5094, "step": 87500 }, { "epoch": 3.4080734944918056, "grad_norm": 8.586151123046875, "learning_rate": 1.5933278835299158e-05, "loss": 0.5417, "step": 87550 }, { "epoch": 3.410019852855308, "grad_norm": 4.736692428588867, "learning_rate": 1.591381525166414e-05, "loss": 0.4584, "step": 87600 }, { "epoch": 3.4119662112188096, "grad_norm": 26.9857234954834, "learning_rate": 1.5894351668029116e-05, "loss": 0.4873, "step": 87650 }, { "epoch": 3.4139125695823114, "grad_norm": 9.838881492614746, "learning_rate": 1.58748880843941e-05, "loss": 0.461, "step": 87700 }, { "epoch": 3.4158589279458136, "grad_norm": 33.93864440917969, "learning_rate": 1.585542450075908e-05, "loss": 0.4782, "step": 87750 }, { "epoch": 3.4178052863093153, "grad_norm": 8.799504280090332, "learning_rate": 1.5835960917124062e-05, "loss": 0.4452, "step": 87800 }, { "epoch": 3.419751644672817, "grad_norm": 46.79837417602539, "learning_rate": 1.581649733348904e-05, "loss": 0.496, "step": 87850 }, { "epoch": 3.421698003036319, "grad_norm": 15.259099960327148, 
"learning_rate": 1.5797033749854024e-05, "loss": 0.4851, "step": 87900 }, { "epoch": 3.423644361399821, "grad_norm": 0.6476923227310181, "learning_rate": 1.5777570166219005e-05, "loss": 0.4841, "step": 87950 }, { "epoch": 3.425590719763323, "grad_norm": 7.186517238616943, "learning_rate": 1.5758106582583986e-05, "loss": 0.5161, "step": 88000 }, { "epoch": 3.4275370781268246, "grad_norm": 21.87000274658203, "learning_rate": 1.5738642998948967e-05, "loss": 0.5522, "step": 88050 }, { "epoch": 3.429483436490327, "grad_norm": 22.505739212036133, "learning_rate": 1.5719179415313948e-05, "loss": 0.519, "step": 88100 }, { "epoch": 3.4314297948538286, "grad_norm": 28.601577758789062, "learning_rate": 1.569971583167893e-05, "loss": 0.5156, "step": 88150 }, { "epoch": 3.4333761532173304, "grad_norm": 32.24738693237305, "learning_rate": 1.568025224804391e-05, "loss": 0.4771, "step": 88200 }, { "epoch": 3.435322511580832, "grad_norm": 288.96923828125, "learning_rate": 1.5660788664408895e-05, "loss": 0.6392, "step": 88250 }, { "epoch": 3.4372688699443343, "grad_norm": 5.704242706298828, "learning_rate": 1.5641325080773872e-05, "loss": 0.4793, "step": 88300 }, { "epoch": 3.439215228307836, "grad_norm": 27.538801193237305, "learning_rate": 1.5621861497138853e-05, "loss": 0.4719, "step": 88350 }, { "epoch": 3.441161586671338, "grad_norm": 45.14986038208008, "learning_rate": 1.5602397913503834e-05, "loss": 0.4189, "step": 88400 }, { "epoch": 3.4431079450348396, "grad_norm": 5.952028274536133, "learning_rate": 1.558293432986882e-05, "loss": 0.5095, "step": 88450 }, { "epoch": 3.445054303398342, "grad_norm": 16.07669448852539, "learning_rate": 1.5563470746233796e-05, "loss": 0.4118, "step": 88500 }, { "epoch": 3.4470006617618436, "grad_norm": 26.172380447387695, "learning_rate": 1.5544007162598777e-05, "loss": 0.4611, "step": 88550 }, { "epoch": 3.4489470201253454, "grad_norm": 28.655195236206055, "learning_rate": 1.552454357896376e-05, "loss": 0.5309, "step": 88600 }, { "epoch": 
3.450893378488847, "grad_norm": 32.19723129272461, "learning_rate": 1.5505079995328742e-05, "loss": 0.5321, "step": 88650 }, { "epoch": 3.4528397368523494, "grad_norm": 48.828697204589844, "learning_rate": 1.548561641169372e-05, "loss": 0.4031, "step": 88700 }, { "epoch": 3.454786095215851, "grad_norm": 42.7237548828125, "learning_rate": 1.54661528280587e-05, "loss": 0.5419, "step": 88750 }, { "epoch": 3.456732453579353, "grad_norm": 1.8510116338729858, "learning_rate": 1.5446689244423685e-05, "loss": 0.5223, "step": 88800 }, { "epoch": 3.458678811942855, "grad_norm": 17.58136558532715, "learning_rate": 1.5427225660788666e-05, "loss": 0.4746, "step": 88850 }, { "epoch": 3.460625170306357, "grad_norm": 9.887907028198242, "learning_rate": 1.5407762077153644e-05, "loss": 0.4697, "step": 88900 }, { "epoch": 3.4625715286698586, "grad_norm": 5.4429450035095215, "learning_rate": 1.5388298493518628e-05, "loss": 0.519, "step": 88950 }, { "epoch": 3.4645178870333604, "grad_norm": 8.413978576660156, "learning_rate": 1.536883490988361e-05, "loss": 0.502, "step": 89000 }, { "epoch": 3.4664642453968626, "grad_norm": 32.86538314819336, "learning_rate": 1.534937132624859e-05, "loss": 0.533, "step": 89050 }, { "epoch": 3.4684106037603644, "grad_norm": 17.34349250793457, "learning_rate": 1.532990774261357e-05, "loss": 0.5324, "step": 89100 }, { "epoch": 3.470356962123866, "grad_norm": 5.926978588104248, "learning_rate": 1.5310444158978552e-05, "loss": 0.5008, "step": 89150 }, { "epoch": 3.4723033204873683, "grad_norm": 10.122005462646484, "learning_rate": 1.5290980575343533e-05, "loss": 0.3408, "step": 89200 }, { "epoch": 3.47424967885087, "grad_norm": 22.54458999633789, "learning_rate": 1.5271516991708514e-05, "loss": 0.435, "step": 89250 }, { "epoch": 3.476196037214372, "grad_norm": 35.85359191894531, "learning_rate": 1.5252053408073493e-05, "loss": 0.5494, "step": 89300 }, { "epoch": 3.4781423955778736, "grad_norm": 11.549179077148438, "learning_rate": 1.5232589824438476e-05, 
"loss": 0.5125, "step": 89350 }, { "epoch": 3.480088753941376, "grad_norm": 22.915964126586914, "learning_rate": 1.5213126240803457e-05, "loss": 0.5488, "step": 89400 }, { "epoch": 3.4820351123048776, "grad_norm": 1.340731143951416, "learning_rate": 1.519366265716844e-05, "loss": 0.4749, "step": 89450 }, { "epoch": 3.4839814706683794, "grad_norm": 10.522486686706543, "learning_rate": 1.517419907353342e-05, "loss": 0.3396, "step": 89500 }, { "epoch": 3.4859278290318816, "grad_norm": 20.816572189331055, "learning_rate": 1.51547354898984e-05, "loss": 0.5747, "step": 89550 }, { "epoch": 3.4878741873953834, "grad_norm": 22.202091217041016, "learning_rate": 1.5135271906263382e-05, "loss": 0.471, "step": 89600 }, { "epoch": 3.489820545758885, "grad_norm": 12.18246078491211, "learning_rate": 1.5115808322628363e-05, "loss": 0.4482, "step": 89650 }, { "epoch": 3.491766904122387, "grad_norm": 5.34224271774292, "learning_rate": 1.5096344738993346e-05, "loss": 0.5475, "step": 89700 }, { "epoch": 3.493713262485889, "grad_norm": 15.139314651489258, "learning_rate": 1.5076881155358324e-05, "loss": 0.5935, "step": 89750 }, { "epoch": 3.495659620849391, "grad_norm": 95.16622161865234, "learning_rate": 1.5057417571723306e-05, "loss": 0.478, "step": 89800 }, { "epoch": 3.4976059792128926, "grad_norm": 23.95236587524414, "learning_rate": 1.5037953988088287e-05, "loss": 0.4105, "step": 89850 }, { "epoch": 3.4995523375763944, "grad_norm": 12.172629356384277, "learning_rate": 1.501849040445327e-05, "loss": 0.5522, "step": 89900 }, { "epoch": 3.5014986959398966, "grad_norm": 13.83673095703125, "learning_rate": 1.499902682081825e-05, "loss": 0.4705, "step": 89950 }, { "epoch": 3.5034450543033984, "grad_norm": 39.275943756103516, "learning_rate": 1.497956323718323e-05, "loss": 0.473, "step": 90000 }, { "epoch": 3.5053914126669, "grad_norm": 33.65157699584961, "learning_rate": 1.4960099653548213e-05, "loss": 0.6632, "step": 90050 }, { "epoch": 3.507337771030402, "grad_norm": 
13.897461891174316, "learning_rate": 1.4940636069913194e-05, "loss": 0.3737, "step": 90100 }, { "epoch": 3.509284129393904, "grad_norm": 6.392612457275391, "learning_rate": 1.4921172486278173e-05, "loss": 0.4467, "step": 90150 }, { "epoch": 3.511230487757406, "grad_norm": 8.583169937133789, "learning_rate": 1.4901708902643156e-05, "loss": 0.5445, "step": 90200 }, { "epoch": 3.5131768461209076, "grad_norm": 8.920665740966797, "learning_rate": 1.4882245319008137e-05, "loss": 0.5243, "step": 90250 }, { "epoch": 3.51512320448441, "grad_norm": 19.46900177001953, "learning_rate": 1.4862781735373118e-05, "loss": 0.491, "step": 90300 }, { "epoch": 3.5170695628479116, "grad_norm": 9.86558723449707, "learning_rate": 1.4843318151738097e-05, "loss": 0.4946, "step": 90350 }, { "epoch": 3.5190159212114134, "grad_norm": 18.194711685180664, "learning_rate": 1.482385456810308e-05, "loss": 0.5451, "step": 90400 }, { "epoch": 3.520962279574915, "grad_norm": 5.253759860992432, "learning_rate": 1.480439098446806e-05, "loss": 0.4403, "step": 90450 }, { "epoch": 3.5229086379384174, "grad_norm": 3.901214599609375, "learning_rate": 1.4784927400833043e-05, "loss": 0.5732, "step": 90500 }, { "epoch": 3.524854996301919, "grad_norm": 78.9429931640625, "learning_rate": 1.4765463817198023e-05, "loss": 0.4609, "step": 90550 }, { "epoch": 3.526801354665421, "grad_norm": 29.523847579956055, "learning_rate": 1.4746000233563004e-05, "loss": 0.5783, "step": 90600 }, { "epoch": 3.528747713028923, "grad_norm": 48.042320251464844, "learning_rate": 1.4726536649927986e-05, "loss": 0.4147, "step": 90650 }, { "epoch": 3.530694071392425, "grad_norm": 28.287132263183594, "learning_rate": 1.4707073066292967e-05, "loss": 0.482, "step": 90700 }, { "epoch": 3.5326404297559266, "grad_norm": 32.135311126708984, "learning_rate": 1.4687609482657946e-05, "loss": 0.4716, "step": 90750 }, { "epoch": 3.5345867881194284, "grad_norm": 28.061336517333984, "learning_rate": 1.4668145899022927e-05, "loss": 0.5021, "step": 90800 
}, { "epoch": 3.5365331464829306, "grad_norm": 11.512069702148438, "learning_rate": 1.464868231538791e-05, "loss": 0.4165, "step": 90850 }, { "epoch": 3.5384795048464324, "grad_norm": 25.752897262573242, "learning_rate": 1.4629218731752891e-05, "loss": 0.4833, "step": 90900 }, { "epoch": 3.540425863209934, "grad_norm": 36.52203369140625, "learning_rate": 1.4609755148117874e-05, "loss": 0.4761, "step": 90950 }, { "epoch": 3.5423722215734363, "grad_norm": 13.523998260498047, "learning_rate": 1.4590291564482853e-05, "loss": 0.4117, "step": 91000 }, { "epoch": 3.544318579936938, "grad_norm": 22.3037052154541, "learning_rate": 1.4571217252520535e-05, "loss": 0.4394, "step": 91050 }, { "epoch": 3.54626493830044, "grad_norm": 54.5261116027832, "learning_rate": 1.4551753668885515e-05, "loss": 0.5141, "step": 91100 }, { "epoch": 3.5482112966639416, "grad_norm": 24.394609451293945, "learning_rate": 1.4532290085250497e-05, "loss": 0.5048, "step": 91150 }, { "epoch": 3.5501576550274434, "grad_norm": 62.89752960205078, "learning_rate": 1.4512826501615478e-05, "loss": 0.4633, "step": 91200 }, { "epoch": 3.5521040133909456, "grad_norm": 24.53225326538086, "learning_rate": 1.4493362917980461e-05, "loss": 0.5014, "step": 91250 }, { "epoch": 3.5540503717544474, "grad_norm": 18.73011589050293, "learning_rate": 1.4473899334345439e-05, "loss": 0.6049, "step": 91300 }, { "epoch": 3.5559967301179496, "grad_norm": 13.482606887817383, "learning_rate": 1.4454435750710421e-05, "loss": 0.5347, "step": 91350 }, { "epoch": 3.5579430884814514, "grad_norm": 62.24803161621094, "learning_rate": 1.4434972167075402e-05, "loss": 0.5317, "step": 91400 }, { "epoch": 3.559889446844953, "grad_norm": 16.059558868408203, "learning_rate": 1.4415508583440385e-05, "loss": 0.5435, "step": 91450 }, { "epoch": 3.561835805208455, "grad_norm": 7.5748090744018555, "learning_rate": 1.4396044999805364e-05, "loss": 0.5222, "step": 91500 }, { "epoch": 3.5637821635719567, "grad_norm": 9.6727933883667, "learning_rate": 
1.4376970687843047e-05, "loss": 0.4676, "step": 91550 }, { "epoch": 3.565728521935459, "grad_norm": 10.531933784484863, "learning_rate": 1.4357507104208026e-05, "loss": 0.4865, "step": 91600 }, { "epoch": 3.5676748802989606, "grad_norm": 22.91495704650879, "learning_rate": 1.4338043520573009e-05, "loss": 0.429, "step": 91650 }, { "epoch": 3.5696212386624624, "grad_norm": 10.091038703918457, "learning_rate": 1.431857993693799e-05, "loss": 0.4799, "step": 91700 }, { "epoch": 3.5715675970259646, "grad_norm": 24.129825592041016, "learning_rate": 1.4299116353302972e-05, "loss": 0.5167, "step": 91750 }, { "epoch": 3.5735139553894664, "grad_norm": 26.014503479003906, "learning_rate": 1.4279652769667953e-05, "loss": 0.5975, "step": 91800 }, { "epoch": 3.575460313752968, "grad_norm": 28.673248291015625, "learning_rate": 1.4260189186032932e-05, "loss": 0.493, "step": 91850 }, { "epoch": 3.57740667211647, "grad_norm": 27.226417541503906, "learning_rate": 1.4240725602397913e-05, "loss": 0.5114, "step": 91900 }, { "epoch": 3.579353030479972, "grad_norm": 9.464468955993652, "learning_rate": 1.4221262018762896e-05, "loss": 0.4741, "step": 91950 }, { "epoch": 3.581299388843474, "grad_norm": 11.690084457397461, "learning_rate": 1.4201798435127877e-05, "loss": 0.581, "step": 92000 }, { "epoch": 3.5832457472069756, "grad_norm": 9.96972370147705, "learning_rate": 1.4182334851492856e-05, "loss": 0.4765, "step": 92050 }, { "epoch": 3.585192105570478, "grad_norm": 11.867464065551758, "learning_rate": 1.4162871267857839e-05, "loss": 0.5288, "step": 92100 }, { "epoch": 3.5871384639339796, "grad_norm": 20.472408294677734, "learning_rate": 1.414340768422282e-05, "loss": 0.4389, "step": 92150 }, { "epoch": 3.5890848222974814, "grad_norm": 20.991254806518555, "learning_rate": 1.4123944100587803e-05, "loss": 0.5546, "step": 92200 }, { "epoch": 3.591031180660983, "grad_norm": 50.74270248413086, "learning_rate": 1.410448051695278e-05, "loss": 0.4233, "step": 92250 }, { "epoch": 3.592977539024485, 
"grad_norm": 14.215154647827148, "learning_rate": 1.4085016933317763e-05, "loss": 0.524, "step": 92300 }, { "epoch": 3.594923897387987, "grad_norm": 39.83400344848633, "learning_rate": 1.4065553349682744e-05, "loss": 0.4872, "step": 92350 }, { "epoch": 3.596870255751489, "grad_norm": 30.82715606689453, "learning_rate": 1.4046089766047727e-05, "loss": 0.4418, "step": 92400 }, { "epoch": 3.598816614114991, "grad_norm": 11.579216003417969, "learning_rate": 1.4026626182412706e-05, "loss": 0.4745, "step": 92450 }, { "epoch": 3.600762972478493, "grad_norm": 9.16733455657959, "learning_rate": 1.4007162598777687e-05, "loss": 0.3857, "step": 92500 }, { "epoch": 3.6027093308419946, "grad_norm": 23.922304153442383, "learning_rate": 1.398769901514267e-05, "loss": 0.469, "step": 92550 }, { "epoch": 3.6046556892054964, "grad_norm": 10.857650756835938, "learning_rate": 1.396823543150765e-05, "loss": 0.4252, "step": 92600 }, { "epoch": 3.606602047568998, "grad_norm": 28.46628761291504, "learning_rate": 1.394877184787263e-05, "loss": 0.5119, "step": 92650 }, { "epoch": 3.6085484059325004, "grad_norm": 47.944297790527344, "learning_rate": 1.3929308264237612e-05, "loss": 0.5446, "step": 92700 }, { "epoch": 3.610494764296002, "grad_norm": 8.161083221435547, "learning_rate": 1.3909844680602593e-05, "loss": 0.4203, "step": 92750 }, { "epoch": 3.6124411226595043, "grad_norm": 6.596136093139648, "learning_rate": 1.3890381096967576e-05, "loss": 0.3563, "step": 92800 }, { "epoch": 3.614387481023006, "grad_norm": 18.027639389038086, "learning_rate": 1.3870917513332554e-05, "loss": 0.5082, "step": 92850 }, { "epoch": 3.616333839386508, "grad_norm": 26.916122436523438, "learning_rate": 1.3851453929697536e-05, "loss": 0.4798, "step": 92900 }, { "epoch": 3.6182801977500096, "grad_norm": 7.571145057678223, "learning_rate": 1.3831990346062517e-05, "loss": 0.5693, "step": 92950 }, { "epoch": 3.6202265561135114, "grad_norm": 37.17942810058594, "learning_rate": 1.38125267624275e-05, "loss": 0.4617, 
"step": 93000 }, { "epoch": 3.6221729144770136, "grad_norm": 22.981801986694336, "learning_rate": 1.3793063178792479e-05, "loss": 0.4948, "step": 93050 }, { "epoch": 3.6241192728405154, "grad_norm": 30.23388671875, "learning_rate": 1.377359959515746e-05, "loss": 0.4637, "step": 93100 }, { "epoch": 3.626065631204017, "grad_norm": 12.285123825073242, "learning_rate": 1.3754136011522443e-05, "loss": 0.4252, "step": 93150 }, { "epoch": 3.6280119895675194, "grad_norm": 22.598491668701172, "learning_rate": 1.3734672427887424e-05, "loss": 0.4828, "step": 93200 }, { "epoch": 3.629958347931021, "grad_norm": 66.8594970703125, "learning_rate": 1.3715208844252406e-05, "loss": 0.5256, "step": 93250 }, { "epoch": 3.631904706294523, "grad_norm": 2.265226125717163, "learning_rate": 1.3695745260617384e-05, "loss": 0.5141, "step": 93300 }, { "epoch": 3.6338510646580247, "grad_norm": 34.78238296508789, "learning_rate": 1.3676281676982367e-05, "loss": 0.4749, "step": 93350 }, { "epoch": 3.635797423021527, "grad_norm": 6.391922473907471, "learning_rate": 1.3656818093347348e-05, "loss": 0.3752, "step": 93400 }, { "epoch": 3.6377437813850286, "grad_norm": 4.146912097930908, "learning_rate": 1.363735450971233e-05, "loss": 0.5426, "step": 93450 }, { "epoch": 3.6396901397485304, "grad_norm": 34.57646942138672, "learning_rate": 1.361789092607731e-05, "loss": 0.4718, "step": 93500 }, { "epoch": 3.6416364981120326, "grad_norm": 15.51948070526123, "learning_rate": 1.359842734244229e-05, "loss": 0.4945, "step": 93550 }, { "epoch": 3.6435828564755344, "grad_norm": 14.132922172546387, "learning_rate": 1.3578963758807273e-05, "loss": 0.5055, "step": 93600 }, { "epoch": 3.645529214839036, "grad_norm": 12.45804500579834, "learning_rate": 1.3559500175172254e-05, "loss": 0.5068, "step": 93650 }, { "epoch": 3.647475573202538, "grad_norm": 60.909210205078125, "learning_rate": 1.3540036591537233e-05, "loss": 0.5146, "step": 93700 }, { "epoch": 3.6494219315660397, "grad_norm": 40.95624923706055, 
"learning_rate": 1.3520573007902216e-05, "loss": 0.5149, "step": 93750 }, { "epoch": 3.651368289929542, "grad_norm": 2.9656600952148438, "learning_rate": 1.3501109424267197e-05, "loss": 0.4238, "step": 93800 }, { "epoch": 3.6533146482930436, "grad_norm": 16.301713943481445, "learning_rate": 1.3481645840632178e-05, "loss": 0.5189, "step": 93850 }, { "epoch": 3.655261006656546, "grad_norm": 56.160308837890625, "learning_rate": 1.3462182256997157e-05, "loss": 0.5968, "step": 93900 }, { "epoch": 3.6572073650200476, "grad_norm": 15.405927658081055, "learning_rate": 1.344271867336214e-05, "loss": 0.4968, "step": 93950 }, { "epoch": 3.6591537233835494, "grad_norm": 13.14477825164795, "learning_rate": 1.3423255089727121e-05, "loss": 0.4628, "step": 94000 }, { "epoch": 3.661100081747051, "grad_norm": 42.588932037353516, "learning_rate": 1.3403791506092104e-05, "loss": 0.4738, "step": 94050 }, { "epoch": 3.663046440110553, "grad_norm": 42.08701705932617, "learning_rate": 1.3384327922457083e-05, "loss": 0.4802, "step": 94100 }, { "epoch": 3.664992798474055, "grad_norm": 3.9519824981689453, "learning_rate": 1.3364864338822064e-05, "loss": 0.5629, "step": 94150 }, { "epoch": 3.666939156837557, "grad_norm": 32.21622848510742, "learning_rate": 1.3345400755187047e-05, "loss": 0.5021, "step": 94200 }, { "epoch": 3.6688855152010587, "grad_norm": 14.495068550109863, "learning_rate": 1.3325937171552027e-05, "loss": 0.5429, "step": 94250 }, { "epoch": 3.670831873564561, "grad_norm": 43.30869674682617, "learning_rate": 1.3306473587917007e-05, "loss": 0.5029, "step": 94300 }, { "epoch": 3.6727782319280626, "grad_norm": 5.001723766326904, "learning_rate": 1.3287010004281988e-05, "loss": 0.4809, "step": 94350 }, { "epoch": 3.6747245902915644, "grad_norm": 31.4925594329834, "learning_rate": 1.326754642064697e-05, "loss": 0.4793, "step": 94400 }, { "epoch": 3.676670948655066, "grad_norm": 10.966466903686523, "learning_rate": 1.3248082837011951e-05, "loss": 0.4561, "step": 94450 }, { "epoch": 
3.6786173070185684, "grad_norm": 3.398143768310547, "learning_rate": 1.3228619253376934e-05, "loss": 0.4167, "step": 94500 }, { "epoch": 3.68056366538207, "grad_norm": 10.39241886138916, "learning_rate": 1.3209155669741913e-05, "loss": 0.5284, "step": 94550 }, { "epoch": 3.682510023745572, "grad_norm": 15.34775161743164, "learning_rate": 1.3189692086106894e-05, "loss": 0.4645, "step": 94600 }, { "epoch": 3.684456382109074, "grad_norm": 8.861892700195312, "learning_rate": 1.3170228502471877e-05, "loss": 0.5138, "step": 94650 }, { "epoch": 3.686402740472576, "grad_norm": 15.240808486938477, "learning_rate": 1.3150764918836858e-05, "loss": 0.4764, "step": 94700 }, { "epoch": 3.6883490988360776, "grad_norm": 13.715555191040039, "learning_rate": 1.3131301335201837e-05, "loss": 0.5861, "step": 94750 }, { "epoch": 3.6902954571995794, "grad_norm": 11.875293731689453, "learning_rate": 1.3111837751566818e-05, "loss": 0.422, "step": 94800 }, { "epoch": 3.6922418155630816, "grad_norm": 20.126441955566406, "learning_rate": 1.30923741679318e-05, "loss": 0.5063, "step": 94850 }, { "epoch": 3.6941881739265834, "grad_norm": 24.678112030029297, "learning_rate": 1.3072910584296782e-05, "loss": 0.4492, "step": 94900 }, { "epoch": 3.696134532290085, "grad_norm": 14.7833890914917, "learning_rate": 1.3053447000661761e-05, "loss": 0.4478, "step": 94950 }, { "epoch": 3.6980808906535874, "grad_norm": 32.45911407470703, "learning_rate": 1.3033983417026744e-05, "loss": 0.5147, "step": 95000 }, { "epoch": 3.700027249017089, "grad_norm": 55.9444580078125, "learning_rate": 1.3014519833391725e-05, "loss": 0.3949, "step": 95050 }, { "epoch": 3.701973607380591, "grad_norm": 50.48737716674805, "learning_rate": 1.2995056249756707e-05, "loss": 0.4746, "step": 95100 }, { "epoch": 3.7039199657440927, "grad_norm": 6.9138593673706055, "learning_rate": 1.2975981937794388e-05, "loss": 0.5053, "step": 95150 }, { "epoch": 3.7058663241075944, "grad_norm": 16.19359588623047, "learning_rate": 
1.2956518354159369e-05, "loss": 0.5154, "step": 95200 }, { "epoch": 3.7078126824710966, "grad_norm": 14.800187110900879, "learning_rate": 1.2937054770524348e-05, "loss": 0.5246, "step": 95250 }, { "epoch": 3.7097590408345984, "grad_norm": 19.955738067626953, "learning_rate": 1.291759118688933e-05, "loss": 0.4504, "step": 95300 }, { "epoch": 3.7117053991981006, "grad_norm": 8.567776679992676, "learning_rate": 1.2898127603254312e-05, "loss": 0.504, "step": 95350 }, { "epoch": 3.7136517575616024, "grad_norm": 61.39707565307617, "learning_rate": 1.2878664019619293e-05, "loss": 0.4417, "step": 95400 }, { "epoch": 3.715598115925104, "grad_norm": 32.80521774291992, "learning_rate": 1.2859200435984272e-05, "loss": 0.5313, "step": 95450 }, { "epoch": 3.717544474288606, "grad_norm": 4.738945484161377, "learning_rate": 1.2839736852349255e-05, "loss": 0.406, "step": 95500 }, { "epoch": 3.7194908326521077, "grad_norm": 62.732521057128906, "learning_rate": 1.2820273268714236e-05, "loss": 0.4065, "step": 95550 }, { "epoch": 3.72143719101561, "grad_norm": 20.88898468017578, "learning_rate": 1.2800809685079219e-05, "loss": 0.5716, "step": 95600 }, { "epoch": 3.7233835493791116, "grad_norm": 14.734282493591309, "learning_rate": 1.27813461014442e-05, "loss": 0.4442, "step": 95650 }, { "epoch": 3.7253299077426134, "grad_norm": 14.827335357666016, "learning_rate": 1.2761882517809179e-05, "loss": 0.5079, "step": 95700 }, { "epoch": 3.7272762661061156, "grad_norm": 11.47487735748291, "learning_rate": 1.2742418934174161e-05, "loss": 0.5094, "step": 95750 }, { "epoch": 3.7292226244696174, "grad_norm": 59.32285690307617, "learning_rate": 1.2722955350539142e-05, "loss": 0.4749, "step": 95800 }, { "epoch": 3.731168982833119, "grad_norm": 6.793711185455322, "learning_rate": 1.2703491766904125e-05, "loss": 0.4665, "step": 95850 }, { "epoch": 3.733115341196621, "grad_norm": 17.26508903503418, "learning_rate": 1.2684028183269103e-05, "loss": 0.4072, "step": 95900 }, { "epoch": 3.735061699560123, 
"grad_norm": 24.41179847717285, "learning_rate": 1.2664564599634085e-05, "loss": 0.5024, "step": 95950 }, { "epoch": 3.737008057923625, "grad_norm": 11.740961074829102, "learning_rate": 1.2645101015999066e-05, "loss": 0.4953, "step": 96000 }, { "epoch": 3.7389544162871267, "grad_norm": 10.22021484375, "learning_rate": 1.2625637432364049e-05, "loss": 0.3786, "step": 96050 }, { "epoch": 3.740900774650629, "grad_norm": 5.5205864906311035, "learning_rate": 1.2606173848729028e-05, "loss": 0.6109, "step": 96100 }, { "epoch": 3.7428471330141306, "grad_norm": 111.16633605957031, "learning_rate": 1.258671026509401e-05, "loss": 0.53, "step": 96150 }, { "epoch": 3.7447934913776324, "grad_norm": 11.668266296386719, "learning_rate": 1.2567246681458992e-05, "loss": 0.4683, "step": 96200 }, { "epoch": 3.746739849741134, "grad_norm": 44.07362365722656, "learning_rate": 1.2547783097823973e-05, "loss": 0.4509, "step": 96250 }, { "epoch": 3.7486862081046364, "grad_norm": 9.8851318359375, "learning_rate": 1.2528319514188952e-05, "loss": 0.4997, "step": 96300 }, { "epoch": 3.750632566468138, "grad_norm": 46.61189270019531, "learning_rate": 1.2508855930553933e-05, "loss": 0.5495, "step": 96350 }, { "epoch": 3.75257892483164, "grad_norm": 14.960881233215332, "learning_rate": 1.2489392346918916e-05, "loss": 0.4825, "step": 96400 }, { "epoch": 3.754525283195142, "grad_norm": 4.826798915863037, "learning_rate": 1.2469928763283897e-05, "loss": 0.5276, "step": 96450 }, { "epoch": 3.756471641558644, "grad_norm": 17.853574752807617, "learning_rate": 1.2450465179648878e-05, "loss": 0.428, "step": 96500 }, { "epoch": 3.7584179999221456, "grad_norm": 5.692882061004639, "learning_rate": 1.2431001596013859e-05, "loss": 0.4781, "step": 96550 }, { "epoch": 3.7603643582856474, "grad_norm": 6.580623149871826, "learning_rate": 1.241153801237884e-05, "loss": 0.5579, "step": 96600 }, { "epoch": 3.762310716649149, "grad_norm": 51.4645881652832, "learning_rate": 1.239207442874382e-05, "loss": 0.4962, "step": 
96650 }, { "epoch": 3.7642570750126514, "grad_norm": 2.0929229259490967, "learning_rate": 1.2372610845108802e-05, "loss": 0.4409, "step": 96700 }, { "epoch": 3.766203433376153, "grad_norm": 66.61515045166016, "learning_rate": 1.2353147261473783e-05, "loss": 0.523, "step": 96750 }, { "epoch": 3.7681497917396554, "grad_norm": 72.4551010131836, "learning_rate": 1.2333683677838764e-05, "loss": 0.4396, "step": 96800 }, { "epoch": 3.770096150103157, "grad_norm": 17.541759490966797, "learning_rate": 1.2314220094203744e-05, "loss": 0.4659, "step": 96850 }, { "epoch": 3.772042508466659, "grad_norm": 12.937578201293945, "learning_rate": 1.2294756510568727e-05, "loss": 0.4316, "step": 96900 }, { "epoch": 3.7739888668301607, "grad_norm": 50.66515350341797, "learning_rate": 1.2275292926933708e-05, "loss": 0.4376, "step": 96950 }, { "epoch": 3.7759352251936624, "grad_norm": 12.141281127929688, "learning_rate": 1.2255829343298689e-05, "loss": 0.3665, "step": 97000 }, { "epoch": 3.7778815835571646, "grad_norm": 33.793094635009766, "learning_rate": 1.223636575966367e-05, "loss": 0.5437, "step": 97050 }, { "epoch": 3.7798279419206664, "grad_norm": 69.05925750732422, "learning_rate": 1.2216902176028651e-05, "loss": 0.489, "step": 97100 }, { "epoch": 3.781774300284168, "grad_norm": 5.19613790512085, "learning_rate": 1.2197438592393632e-05, "loss": 0.5084, "step": 97150 }, { "epoch": 3.7837206586476704, "grad_norm": 35.01382827758789, "learning_rate": 1.2177975008758613e-05, "loss": 0.4418, "step": 97200 }, { "epoch": 3.785667017011172, "grad_norm": 11.789973258972168, "learning_rate": 1.2158511425123596e-05, "loss": 0.5311, "step": 97250 }, { "epoch": 3.787613375374674, "grad_norm": 6.712963104248047, "learning_rate": 1.2139047841488575e-05, "loss": 0.4338, "step": 97300 }, { "epoch": 3.7895597337381757, "grad_norm": 28.77261734008789, "learning_rate": 1.2119584257853558e-05, "loss": 0.4443, "step": 97350 }, { "epoch": 3.791506092101678, "grad_norm": 5.675388813018799, 
"learning_rate": 1.2100120674218537e-05, "loss": 0.4036, "step": 97400 }, { "epoch": 3.7934524504651796, "grad_norm": 12.893020629882812, "learning_rate": 1.208065709058352e-05, "loss": 0.3651, "step": 97450 }, { "epoch": 3.7953988088286814, "grad_norm": 11.636592864990234, "learning_rate": 1.20611935069485e-05, "loss": 0.5236, "step": 97500 }, { "epoch": 3.7973451671921836, "grad_norm": 3.0348329544067383, "learning_rate": 1.2041729923313481e-05, "loss": 0.5308, "step": 97550 }, { "epoch": 3.7992915255556854, "grad_norm": 8.230152130126953, "learning_rate": 1.2022655611351162e-05, "loss": 0.4776, "step": 97600 }, { "epoch": 3.801237883919187, "grad_norm": 33.90690994262695, "learning_rate": 1.2003192027716143e-05, "loss": 0.4044, "step": 97650 }, { "epoch": 3.803184242282689, "grad_norm": 10.275712966918945, "learning_rate": 1.1983728444081124e-05, "loss": 0.4575, "step": 97700 }, { "epoch": 3.805130600646191, "grad_norm": 9.984426498413086, "learning_rate": 1.1964264860446107e-05, "loss": 0.533, "step": 97750 }, { "epoch": 3.807076959009693, "grad_norm": 77.2607650756836, "learning_rate": 1.1944801276811086e-05, "loss": 0.548, "step": 97800 }, { "epoch": 3.8090233173731947, "grad_norm": 13.065899848937988, "learning_rate": 1.1925337693176069e-05, "loss": 0.5135, "step": 97850 }, { "epoch": 3.810969675736697, "grad_norm": 7.783257484436035, "learning_rate": 1.190626338121375e-05, "loss": 0.536, "step": 97900 }, { "epoch": 3.8129160341001986, "grad_norm": 61.37982940673828, "learning_rate": 1.188679979757873e-05, "loss": 0.4704, "step": 97950 }, { "epoch": 3.8148623924637004, "grad_norm": 13.493196487426758, "learning_rate": 1.1867336213943713e-05, "loss": 0.4343, "step": 98000 }, { "epoch": 3.816808750827202, "grad_norm": 0.23307937383651733, "learning_rate": 1.1847872630308692e-05, "loss": 0.4056, "step": 98050 }, { "epoch": 3.818755109190704, "grad_norm": 28.918397903442383, "learning_rate": 1.1828409046673675e-05, "loss": 0.4727, "step": 98100 }, { "epoch": 
3.820701467554206, "grad_norm": 1.3483188152313232, "learning_rate": 1.1808945463038654e-05, "loss": 0.3688, "step": 98150 }, { "epoch": 3.822647825917708, "grad_norm": 3.8271408081054688, "learning_rate": 1.1789481879403637e-05, "loss": 0.5076, "step": 98200 }, { "epoch": 3.82459418428121, "grad_norm": 31.471162796020508, "learning_rate": 1.1770018295768618e-05, "loss": 0.5004, "step": 98250 }, { "epoch": 3.826540542644712, "grad_norm": 6.024818420410156, "learning_rate": 1.1750554712133599e-05, "loss": 0.475, "step": 98300 }, { "epoch": 3.8284869010082136, "grad_norm": 13.542709350585938, "learning_rate": 1.173109112849858e-05, "loss": 0.477, "step": 98350 }, { "epoch": 3.8304332593717154, "grad_norm": 31.4610652923584, "learning_rate": 1.1711627544863561e-05, "loss": 0.4257, "step": 98400 }, { "epoch": 3.832379617735217, "grad_norm": 36.26358413696289, "learning_rate": 1.1692163961228542e-05, "loss": 0.524, "step": 98450 }, { "epoch": 3.8343259760987194, "grad_norm": 11.142889976501465, "learning_rate": 1.1672700377593523e-05, "loss": 0.5445, "step": 98500 }, { "epoch": 3.836272334462221, "grad_norm": 18.65030860900879, "learning_rate": 1.1653236793958504e-05, "loss": 0.5533, "step": 98550 }, { "epoch": 3.838218692825723, "grad_norm": 7.9255571365356445, "learning_rate": 1.1633773210323485e-05, "loss": 0.4232, "step": 98600 }, { "epoch": 3.840165051189225, "grad_norm": 29.32022476196289, "learning_rate": 1.1614309626688466e-05, "loss": 0.5372, "step": 98650 }, { "epoch": 3.842111409552727, "grad_norm": 67.20787048339844, "learning_rate": 1.1594846043053448e-05, "loss": 0.3996, "step": 98700 }, { "epoch": 3.8440577679162287, "grad_norm": 16.977563858032227, "learning_rate": 1.1575382459418428e-05, "loss": 0.4872, "step": 98750 }, { "epoch": 3.8460041262797304, "grad_norm": 10.015093803405762, "learning_rate": 1.155591887578341e-05, "loss": 0.4751, "step": 98800 }, { "epoch": 3.8479504846432326, "grad_norm": 3.539030075073242, "learning_rate": 
1.153645529214839e-05, "loss": 0.4058, "step": 98850 }, { "epoch": 3.8498968430067344, "grad_norm": 23.75165557861328, "learning_rate": 1.1516991708513372e-05, "loss": 0.5301, "step": 98900 }, { "epoch": 3.851843201370236, "grad_norm": 13.989348411560059, "learning_rate": 1.1497528124878353e-05, "loss": 0.4874, "step": 98950 }, { "epoch": 3.8537895597337384, "grad_norm": 56.38072967529297, "learning_rate": 1.1478064541243334e-05, "loss": 0.5147, "step": 99000 }, { "epoch": 3.85573591809724, "grad_norm": 9.250102996826172, "learning_rate": 1.1458600957608315e-05, "loss": 0.4992, "step": 99050 }, { "epoch": 3.857682276460742, "grad_norm": 53.614566802978516, "learning_rate": 1.1439137373973296e-05, "loss": 0.3845, "step": 99100 }, { "epoch": 3.8596286348242437, "grad_norm": 7.328714370727539, "learning_rate": 1.1419673790338277e-05, "loss": 0.5257, "step": 99150 }, { "epoch": 3.861574993187746, "grad_norm": 8.70997142791748, "learning_rate": 1.1400210206703258e-05, "loss": 0.4472, "step": 99200 }, { "epoch": 3.8635213515512477, "grad_norm": 142.8336181640625, "learning_rate": 1.138074662306824e-05, "loss": 0.5274, "step": 99250 }, { "epoch": 3.8654677099147494, "grad_norm": 23.792314529418945, "learning_rate": 1.1361283039433222e-05, "loss": 0.4264, "step": 99300 }, { "epoch": 3.8674140682782516, "grad_norm": 20.57599639892578, "learning_rate": 1.1341819455798203e-05, "loss": 0.5228, "step": 99350 }, { "epoch": 3.8693604266417534, "grad_norm": 8.927663803100586, "learning_rate": 1.1322355872163184e-05, "loss": 0.4905, "step": 99400 }, { "epoch": 3.871306785005255, "grad_norm": 139.88259887695312, "learning_rate": 1.1302892288528165e-05, "loss": 0.5147, "step": 99450 }, { "epoch": 3.873253143368757, "grad_norm": 23.996654510498047, "learning_rate": 1.1283428704893146e-05, "loss": 0.5036, "step": 99500 }, { "epoch": 3.8751995017322587, "grad_norm": 21.701122283935547, "learning_rate": 1.1263965121258127e-05, "loss": 0.4556, "step": 99550 }, { "epoch": 
3.877145860095761, "grad_norm": 23.8729305267334, "learning_rate": 1.1244501537623108e-05, "loss": 0.5037, "step": 99600 }, { "epoch": 3.8790922184592627, "grad_norm": 31.638456344604492, "learning_rate": 1.1225037953988089e-05, "loss": 0.5597, "step": 99650 }, { "epoch": 3.881038576822765, "grad_norm": 20.839107513427734, "learning_rate": 1.120557437035307e-05, "loss": 0.6107, "step": 99700 }, { "epoch": 3.8829849351862666, "grad_norm": 15.23254108428955, "learning_rate": 1.1186110786718052e-05, "loss": 0.5056, "step": 99750 }, { "epoch": 3.8849312935497684, "grad_norm": 18.76145362854004, "learning_rate": 1.1166647203083031e-05, "loss": 0.5534, "step": 99800 }, { "epoch": 3.88687765191327, "grad_norm": 30.425466537475586, "learning_rate": 1.1147183619448014e-05, "loss": 0.4554, "step": 99850 }, { "epoch": 3.888824010276772, "grad_norm": 4.753281116485596, "learning_rate": 1.1127720035812993e-05, "loss": 0.4125, "step": 99900 }, { "epoch": 3.890770368640274, "grad_norm": 33.54433059692383, "learning_rate": 1.1108256452177976e-05, "loss": 0.4925, "step": 99950 }, { "epoch": 3.892716727003776, "grad_norm": 20.05385971069336, "learning_rate": 1.1088792868542957e-05, "loss": 0.481, "step": 100000 }, { "epoch": 3.8946630853672777, "grad_norm": 6.096341609954834, "learning_rate": 1.1069329284907938e-05, "loss": 0.4392, "step": 100050 }, { "epoch": 3.89660944373078, "grad_norm": 10.044628143310547, "learning_rate": 1.1049865701272919e-05, "loss": 0.4644, "step": 100100 }, { "epoch": 3.8985558020942817, "grad_norm": 16.836627960205078, "learning_rate": 1.10304021176379e-05, "loss": 0.502, "step": 100150 }, { "epoch": 3.9005021604577834, "grad_norm": 6.3187432289123535, "learning_rate": 1.1010938534002881e-05, "loss": 0.5084, "step": 100200 }, { "epoch": 3.902448518821285, "grad_norm": 8.947513580322266, "learning_rate": 1.0991474950367862e-05, "loss": 0.4866, "step": 100250 }, { "epoch": 3.9043948771847874, "grad_norm": 19.744043350219727, "learning_rate": 
1.0972011366732843e-05, "loss": 0.4878, "step": 100300 }, { "epoch": 3.906341235548289, "grad_norm": 1.113893747329712, "learning_rate": 1.0952547783097824e-05, "loss": 0.4996, "step": 100350 }, { "epoch": 3.908287593911791, "grad_norm": 3.2790753841400146, "learning_rate": 1.0933084199462805e-05, "loss": 0.4608, "step": 100400 }, { "epoch": 3.910233952275293, "grad_norm": 42.3125, "learning_rate": 1.0913620615827787e-05, "loss": 0.4822, "step": 100450 }, { "epoch": 3.912180310638795, "grad_norm": 9.043429374694824, "learning_rate": 1.0894157032192767e-05, "loss": 0.457, "step": 100500 }, { "epoch": 3.9141266690022967, "grad_norm": 20.143747329711914, "learning_rate": 1.087469344855775e-05, "loss": 0.5606, "step": 100550 }, { "epoch": 3.9160730273657984, "grad_norm": 56.024078369140625, "learning_rate": 1.085522986492273e-05, "loss": 0.488, "step": 100600 }, { "epoch": 3.9180193857293006, "grad_norm": 19.55257797241211, "learning_rate": 1.0835766281287711e-05, "loss": 0.4306, "step": 100650 }, { "epoch": 3.9199657440928024, "grad_norm": 1.79707670211792, "learning_rate": 1.0816302697652692e-05, "loss": 0.4858, "step": 100700 }, { "epoch": 3.921912102456304, "grad_norm": 0.5951626300811768, "learning_rate": 1.0796839114017673e-05, "loss": 0.5017, "step": 100750 }, { "epoch": 3.9238584608198064, "grad_norm": 4.452052593231201, "learning_rate": 1.0777375530382656e-05, "loss": 0.478, "step": 100800 }, { "epoch": 3.925804819183308, "grad_norm": 5.774175643920898, "learning_rate": 1.0757911946747635e-05, "loss": 0.517, "step": 100850 }, { "epoch": 3.92775117754681, "grad_norm": 15.177854537963867, "learning_rate": 1.0738448363112618e-05, "loss": 0.5528, "step": 100900 }, { "epoch": 3.9296975359103117, "grad_norm": 53.01628112792969, "learning_rate": 1.0718984779477597e-05, "loss": 0.537, "step": 100950 }, { "epoch": 3.9316438942738134, "grad_norm": 13.616596221923828, "learning_rate": 1.069952119584258e-05, "loss": 0.4803, "step": 101000 }, { "epoch": 3.9335902526373157, 
"grad_norm": 33.48299026489258, "learning_rate": 1.0680057612207559e-05, "loss": 0.439, "step": 101050 }, { "epoch": 3.9355366110008174, "grad_norm": 3.132103681564331, "learning_rate": 1.0660594028572542e-05, "loss": 0.4963, "step": 101100 }, { "epoch": 3.9374829693643196, "grad_norm": 47.67720413208008, "learning_rate": 1.0641130444937523e-05, "loss": 0.5422, "step": 101150 }, { "epoch": 3.9394293277278214, "grad_norm": 29.241731643676758, "learning_rate": 1.0621666861302504e-05, "loss": 0.473, "step": 101200 }, { "epoch": 3.941375686091323, "grad_norm": 0.9811972379684448, "learning_rate": 1.0602203277667485e-05, "loss": 0.477, "step": 101250 }, { "epoch": 3.943322044454825, "grad_norm": 10.928448677062988, "learning_rate": 1.0582739694032466e-05, "loss": 0.4633, "step": 101300 }, { "epoch": 3.9452684028183267, "grad_norm": 32.6392707824707, "learning_rate": 1.0563276110397447e-05, "loss": 0.4919, "step": 101350 }, { "epoch": 3.947214761181829, "grad_norm": 24.208375930786133, "learning_rate": 1.0543812526762428e-05, "loss": 0.4716, "step": 101400 }, { "epoch": 3.9491611195453307, "grad_norm": 2.6482245922088623, "learning_rate": 1.0524348943127409e-05, "loss": 0.4387, "step": 101450 }, { "epoch": 3.9511074779088324, "grad_norm": 5.199710845947266, "learning_rate": 1.0504885359492391e-05, "loss": 0.4419, "step": 101500 }, { "epoch": 3.9530538362723346, "grad_norm": 3.8799002170562744, "learning_rate": 1.048542177585737e-05, "loss": 0.436, "step": 101550 }, { "epoch": 3.9550001946358364, "grad_norm": 11.9754056930542, "learning_rate": 1.0465958192222353e-05, "loss": 0.4856, "step": 101600 }, { "epoch": 3.956946552999338, "grad_norm": 11.415141105651855, "learning_rate": 1.0446494608587332e-05, "loss": 0.5134, "step": 101650 }, { "epoch": 3.95889291136284, "grad_norm": 6.901851177215576, "learning_rate": 1.0427031024952315e-05, "loss": 0.4273, "step": 101700 }, { "epoch": 3.960839269726342, "grad_norm": 14.8219575881958, "learning_rate": 1.0407567441317296e-05, 
"loss": 0.5342, "step": 101750 }, { "epoch": 3.962785628089844, "grad_norm": 15.100242614746094, "learning_rate": 1.0388103857682277e-05, "loss": 0.4981, "step": 101800 }, { "epoch": 3.9647319864533457, "grad_norm": 27.43487548828125, "learning_rate": 1.0368640274047258e-05, "loss": 0.507, "step": 101850 }, { "epoch": 3.966678344816848, "grad_norm": 10.71286678314209, "learning_rate": 1.0349176690412239e-05, "loss": 0.4798, "step": 101900 }, { "epoch": 3.9686247031803497, "grad_norm": 15.568731307983398, "learning_rate": 1.0329713106777222e-05, "loss": 0.4997, "step": 101950 }, { "epoch": 3.9705710615438514, "grad_norm": 11.173429489135742, "learning_rate": 1.0310249523142201e-05, "loss": 0.4738, "step": 102000 }, { "epoch": 3.972517419907353, "grad_norm": 2.8975887298583984, "learning_rate": 1.0290785939507184e-05, "loss": 0.3989, "step": 102050 }, { "epoch": 3.9744637782708554, "grad_norm": 21.92950439453125, "learning_rate": 1.0271322355872163e-05, "loss": 0.4102, "step": 102100 }, { "epoch": 3.976410136634357, "grad_norm": 16.288848876953125, "learning_rate": 1.0251858772237146e-05, "loss": 0.4897, "step": 102150 }, { "epoch": 3.978356494997859, "grad_norm": 29.982603073120117, "learning_rate": 1.0232395188602126e-05, "loss": 0.5594, "step": 102200 }, { "epoch": 3.980302853361361, "grad_norm": 22.643003463745117, "learning_rate": 1.0212931604967107e-05, "loss": 0.5381, "step": 102250 }, { "epoch": 3.982249211724863, "grad_norm": 20.163101196289062, "learning_rate": 1.0193468021332088e-05, "loss": 0.454, "step": 102300 }, { "epoch": 3.9841955700883647, "grad_norm": 1.161385416984558, "learning_rate": 1.017400443769707e-05, "loss": 0.468, "step": 102350 }, { "epoch": 3.9861419284518664, "grad_norm": 3.418659210205078, "learning_rate": 1.015454085406205e-05, "loss": 0.5049, "step": 102400 }, { "epoch": 3.988088286815368, "grad_norm": 15.401394844055176, "learning_rate": 1.0135077270427031e-05, "loss": 0.5198, "step": 102450 }, { "epoch": 3.9900346451788704, 
"grad_norm": 13.182781219482422, "learning_rate": 1.0115613686792012e-05, "loss": 0.4272, "step": 102500 }, { "epoch": 3.991981003542372, "grad_norm": 25.696456909179688, "learning_rate": 1.0096150103156993e-05, "loss": 0.5113, "step": 102550 }, { "epoch": 3.9939273619058744, "grad_norm": 29.485492706298828, "learning_rate": 1.0076686519521974e-05, "loss": 0.5112, "step": 102600 }, { "epoch": 3.995873720269376, "grad_norm": 87.45968627929688, "learning_rate": 1.0057222935886957e-05, "loss": 0.4683, "step": 102650 }, { "epoch": 3.997820078632878, "grad_norm": 12.466547966003418, "learning_rate": 1.0037759352251936e-05, "loss": 0.4581, "step": 102700 }, { "epoch": 3.9997664369963797, "grad_norm": 180.298828125, "learning_rate": 1.0018295768616919e-05, "loss": 0.4848, "step": 102750 }, { "epoch": 4.0, "eval_accuracy": 0.8003425590719764, "eval_f1_macro": 0.7537688597736917, "eval_f1_weighted": 0.7977349439491848, "eval_loss": 0.6663820743560791, "eval_roc_auc": 0.9539792074909716, "eval_runtime": 27.8896, "eval_samples_per_second": 921.094, "eval_steps_per_second": 115.168, "step": 102756 }, { "epoch": 4.0017127953598814, "grad_norm": 20.755870819091797, "learning_rate": 9.998832184981898e-06, "loss": 0.4426, "step": 102800 }, { "epoch": 4.003659153723383, "grad_norm": 23.645389556884766, "learning_rate": 9.97936860134688e-06, "loss": 0.4702, "step": 102850 }, { "epoch": 4.005605512086886, "grad_norm": 26.50768280029297, "learning_rate": 9.959905017711862e-06, "loss": 0.3186, "step": 102900 }, { "epoch": 4.007551870450388, "grad_norm": 1.0571913719177246, "learning_rate": 9.940441434076843e-06, "loss": 0.5014, "step": 102950 }, { "epoch": 4.009498228813889, "grad_norm": 27.104028701782227, "learning_rate": 9.920977850441824e-06, "loss": 0.4667, "step": 103000 }, { "epoch": 4.011444587177391, "grad_norm": 26.997652053833008, "learning_rate": 9.901514266806805e-06, "loss": 0.3415, "step": 103050 }, { "epoch": 4.013390945540893, "grad_norm": 43.06028747558594, 
"learning_rate": 9.882050683171786e-06, "loss": 0.4485, "step": 103100 }, { "epoch": 4.015337303904395, "grad_norm": 14.695178031921387, "learning_rate": 9.862587099536767e-06, "loss": 0.4372, "step": 103150 }, { "epoch": 4.0172836622678965, "grad_norm": 21.269031524658203, "learning_rate": 9.843123515901748e-06, "loss": 0.5625, "step": 103200 }, { "epoch": 4.019230020631398, "grad_norm": 14.56990909576416, "learning_rate": 9.82365993226673e-06, "loss": 0.3939, "step": 103250 }, { "epoch": 4.021176378994901, "grad_norm": 43.492671966552734, "learning_rate": 9.804196348631711e-06, "loss": 0.3574, "step": 103300 }, { "epoch": 4.023122737358403, "grad_norm": 1.8793960809707642, "learning_rate": 9.784732764996692e-06, "loss": 0.4048, "step": 103350 }, { "epoch": 4.025069095721904, "grad_norm": 29.1122989654541, "learning_rate": 9.765269181361673e-06, "loss": 0.4506, "step": 103400 }, { "epoch": 4.027015454085406, "grad_norm": 59.64417266845703, "learning_rate": 9.745805597726654e-06, "loss": 0.4672, "step": 103450 }, { "epoch": 4.028961812448908, "grad_norm": 9.14636516571045, "learning_rate": 9.726342014091635e-06, "loss": 0.4196, "step": 103500 }, { "epoch": 4.03090817081241, "grad_norm": 4.682184219360352, "learning_rate": 9.706878430456616e-06, "loss": 0.3209, "step": 103550 }, { "epoch": 4.0328545291759115, "grad_norm": 16.743680953979492, "learning_rate": 9.687414846821597e-06, "loss": 0.4239, "step": 103600 }, { "epoch": 4.034800887539414, "grad_norm": 4.315186977386475, "learning_rate": 9.667951263186578e-06, "loss": 0.4422, "step": 103650 }, { "epoch": 4.036747245902916, "grad_norm": 12.99632453918457, "learning_rate": 9.64848767955156e-06, "loss": 0.4568, "step": 103700 }, { "epoch": 4.038693604266418, "grad_norm": 10.604944229125977, "learning_rate": 9.62902409591654e-06, "loss": 0.4314, "step": 103750 }, { "epoch": 4.040639962629919, "grad_norm": 20.25140953063965, "learning_rate": 9.609560512281523e-06, "loss": 0.403, "step": 103800 }, { "epoch": 
4.042586320993421, "grad_norm": 30.329187393188477, "learning_rate": 9.590096928646502e-06, "loss": 0.4681, "step": 103850 }, { "epoch": 4.044532679356923, "grad_norm": 4.474374294281006, "learning_rate": 9.570633345011485e-06, "loss": 0.4127, "step": 103900 }, { "epoch": 4.046479037720425, "grad_norm": 8.685498237609863, "learning_rate": 9.551169761376466e-06, "loss": 0.3666, "step": 103950 }, { "epoch": 4.048425396083927, "grad_norm": 7.808584213256836, "learning_rate": 9.531706177741446e-06, "loss": 0.3744, "step": 104000 }, { "epoch": 4.050371754447429, "grad_norm": 10.561367988586426, "learning_rate": 9.512631865779127e-06, "loss": 0.4624, "step": 104050 }, { "epoch": 4.052318112810931, "grad_norm": 25.242935180664062, "learning_rate": 9.493168282144108e-06, "loss": 0.4459, "step": 104100 }, { "epoch": 4.054264471174433, "grad_norm": 18.416349411010742, "learning_rate": 9.47370469850909e-06, "loss": 0.3938, "step": 104150 }, { "epoch": 4.056210829537934, "grad_norm": 32.79038619995117, "learning_rate": 9.454241114874072e-06, "loss": 0.4409, "step": 104200 }, { "epoch": 4.058157187901436, "grad_norm": 18.97569465637207, "learning_rate": 9.434777531239051e-06, "loss": 0.3623, "step": 104250 }, { "epoch": 4.060103546264938, "grad_norm": 3.828615188598633, "learning_rate": 9.415313947604034e-06, "loss": 0.3718, "step": 104300 }, { "epoch": 4.062049904628441, "grad_norm": 8.737808227539062, "learning_rate": 9.395850363969013e-06, "loss": 0.436, "step": 104350 }, { "epoch": 4.063996262991942, "grad_norm": 21.98409652709961, "learning_rate": 9.376386780333996e-06, "loss": 0.5086, "step": 104400 }, { "epoch": 4.065942621355444, "grad_norm": 10.254690170288086, "learning_rate": 9.356923196698977e-06, "loss": 0.4132, "step": 104450 }, { "epoch": 4.067888979718946, "grad_norm": 41.430747985839844, "learning_rate": 9.337459613063958e-06, "loss": 0.3803, "step": 104500 }, { "epoch": 4.069835338082448, "grad_norm": 7.401790142059326, "learning_rate": 9.31799602942894e-06, 
"loss": 0.405, "step": 104550 }, { "epoch": 4.0717816964459494, "grad_norm": 27.976360321044922, "learning_rate": 9.29853244579392e-06, "loss": 0.3535, "step": 104600 }, { "epoch": 4.073728054809451, "grad_norm": 2.568925142288208, "learning_rate": 9.279068862158902e-06, "loss": 0.4332, "step": 104650 }, { "epoch": 4.075674413172953, "grad_norm": 24.87511444091797, "learning_rate": 9.259605278523882e-06, "loss": 0.3856, "step": 104700 }, { "epoch": 4.077620771536456, "grad_norm": 8.153419494628906, "learning_rate": 9.240530966561564e-06, "loss": 0.494, "step": 104750 }, { "epoch": 4.079567129899957, "grad_norm": 0.3370526134967804, "learning_rate": 9.221067382926545e-06, "loss": 0.405, "step": 104800 }, { "epoch": 4.081513488263459, "grad_norm": 0.9789894223213196, "learning_rate": 9.201603799291526e-06, "loss": 0.433, "step": 104850 }, { "epoch": 4.083459846626961, "grad_norm": 61.08134460449219, "learning_rate": 9.182140215656507e-06, "loss": 0.377, "step": 104900 }, { "epoch": 4.085406204990463, "grad_norm": 58.36869812011719, "learning_rate": 9.162676632021488e-06, "loss": 0.388, "step": 104950 }, { "epoch": 4.0873525633539645, "grad_norm": 9.322640419006348, "learning_rate": 9.143602320059169e-06, "loss": 0.4838, "step": 105000 }, { "epoch": 4.089298921717466, "grad_norm": 12.840750694274902, "learning_rate": 9.124138736424151e-06, "loss": 0.4753, "step": 105050 }, { "epoch": 4.091245280080969, "grad_norm": 8.076056480407715, "learning_rate": 9.10467515278913e-06, "loss": 0.4795, "step": 105100 }, { "epoch": 4.093191638444471, "grad_norm": 21.290081024169922, "learning_rate": 9.085211569154113e-06, "loss": 0.3817, "step": 105150 }, { "epoch": 4.095137996807972, "grad_norm": 23.44434928894043, "learning_rate": 9.065747985519094e-06, "loss": 0.4667, "step": 105200 }, { "epoch": 4.097084355171474, "grad_norm": 13.494409561157227, "learning_rate": 9.046284401884075e-06, "loss": 0.3006, "step": 105250 }, { "epoch": 4.099030713534976, "grad_norm": 
25.044389724731445, "learning_rate": 9.026820818249058e-06, "loss": 0.4299, "step": 105300 }, { "epoch": 4.100977071898478, "grad_norm": 17.606698989868164, "learning_rate": 9.007357234614037e-06, "loss": 0.3806, "step": 105350 }, { "epoch": 4.1029234302619795, "grad_norm": 0.7165778875350952, "learning_rate": 8.98828292265172e-06, "loss": 0.5606, "step": 105400 }, { "epoch": 4.104869788625482, "grad_norm": 17.20554542541504, "learning_rate": 8.9688193390167e-06, "loss": 0.4298, "step": 105450 }, { "epoch": 4.106816146988984, "grad_norm": 23.83445167541504, "learning_rate": 8.949355755381681e-06, "loss": 0.3826, "step": 105500 }, { "epoch": 4.108762505352486, "grad_norm": 4.614549160003662, "learning_rate": 8.929892171746662e-06, "loss": 0.4704, "step": 105550 }, { "epoch": 4.110708863715987, "grad_norm": 15.383674621582031, "learning_rate": 8.910428588111643e-06, "loss": 0.4253, "step": 105600 }, { "epoch": 4.112655222079489, "grad_norm": 1.4554544687271118, "learning_rate": 8.890965004476624e-06, "loss": 0.3968, "step": 105650 }, { "epoch": 4.114601580442991, "grad_norm": 4.252751350402832, "learning_rate": 8.871501420841605e-06, "loss": 0.3894, "step": 105700 }, { "epoch": 4.116547938806493, "grad_norm": 7.073698997497559, "learning_rate": 8.852037837206586e-06, "loss": 0.4119, "step": 105750 }, { "epoch": 4.118494297169995, "grad_norm": 9.286458969116211, "learning_rate": 8.832574253571569e-06, "loss": 0.502, "step": 105800 }, { "epoch": 4.120440655533497, "grad_norm": 20.09278678894043, "learning_rate": 8.813110669936548e-06, "loss": 0.5042, "step": 105850 }, { "epoch": 4.122387013896999, "grad_norm": 20.160751342773438, "learning_rate": 8.793647086301531e-06, "loss": 0.4335, "step": 105900 }, { "epoch": 4.124333372260501, "grad_norm": 1.5743417739868164, "learning_rate": 8.77418350266651e-06, "loss": 0.4541, "step": 105950 }, { "epoch": 4.126279730624002, "grad_norm": 13.613061904907227, "learning_rate": 8.754719919031493e-06, "loss": 0.3163, "step": 106000 
}, { "epoch": 4.128226088987504, "grad_norm": 24.892860412597656, "learning_rate": 8.735256335396474e-06, "loss": 0.4943, "step": 106050 }, { "epoch": 4.130172447351006, "grad_norm": 44.085365295410156, "learning_rate": 8.715792751761455e-06, "loss": 0.4762, "step": 106100 }, { "epoch": 4.132118805714508, "grad_norm": 24.189546585083008, "learning_rate": 8.696329168126436e-06, "loss": 0.5369, "step": 106150 }, { "epoch": 4.13406516407801, "grad_norm": 24.033960342407227, "learning_rate": 8.676865584491417e-06, "loss": 0.4847, "step": 106200 }, { "epoch": 4.136011522441512, "grad_norm": 38.062400817871094, "learning_rate": 8.657402000856398e-06, "loss": 0.433, "step": 106250 }, { "epoch": 4.137957880805014, "grad_norm": 2.0557141304016113, "learning_rate": 8.637938417221379e-06, "loss": 0.5258, "step": 106300 }, { "epoch": 4.139904239168516, "grad_norm": 5.926662921905518, "learning_rate": 8.61847483358636e-06, "loss": 0.4103, "step": 106350 }, { "epoch": 4.1418505975320175, "grad_norm": 46.40695571899414, "learning_rate": 8.59901124995134e-06, "loss": 0.3251, "step": 106400 }, { "epoch": 4.143796955895519, "grad_norm": 14.998189926147461, "learning_rate": 8.579547666316323e-06, "loss": 0.4429, "step": 106450 }, { "epoch": 4.145743314259021, "grad_norm": 1.5200467109680176, "learning_rate": 8.560084082681304e-06, "loss": 0.4444, "step": 106500 }, { "epoch": 4.147689672622524, "grad_norm": 20.426790237426758, "learning_rate": 8.540620499046285e-06, "loss": 0.3759, "step": 106550 }, { "epoch": 4.149636030986025, "grad_norm": 47.801151275634766, "learning_rate": 8.521156915411266e-06, "loss": 0.5121, "step": 106600 }, { "epoch": 4.151582389349527, "grad_norm": 34.219818115234375, "learning_rate": 8.501693331776247e-06, "loss": 0.4975, "step": 106650 }, { "epoch": 4.153528747713029, "grad_norm": 28.54174041748047, "learning_rate": 8.482229748141228e-06, "loss": 0.4137, "step": 106700 }, { "epoch": 4.155475106076531, "grad_norm": 10.206478118896484, "learning_rate": 
8.462766164506209e-06, "loss": 0.4893, "step": 106750 }, { "epoch": 4.1574214644400325, "grad_norm": 31.077072143554688, "learning_rate": 8.44330258087119e-06, "loss": 0.3332, "step": 106800 }, { "epoch": 4.159367822803534, "grad_norm": 22.19476890563965, "learning_rate": 8.423838997236173e-06, "loss": 0.3983, "step": 106850 }, { "epoch": 4.161314181167037, "grad_norm": 24.78487205505371, "learning_rate": 8.404375413601152e-06, "loss": 0.4432, "step": 106900 }, { "epoch": 4.163260539530539, "grad_norm": 10.083465576171875, "learning_rate": 8.384911829966135e-06, "loss": 0.4757, "step": 106950 }, { "epoch": 4.16520689789404, "grad_norm": 23.581884384155273, "learning_rate": 8.365448246331114e-06, "loss": 0.465, "step": 107000 }, { "epoch": 4.167153256257542, "grad_norm": 13.917218208312988, "learning_rate": 8.345984662696097e-06, "loss": 0.385, "step": 107050 }, { "epoch": 4.169099614621044, "grad_norm": 15.955931663513184, "learning_rate": 8.326521079061078e-06, "loss": 0.4126, "step": 107100 }, { "epoch": 4.171045972984546, "grad_norm": 20.489660263061523, "learning_rate": 8.307057495426059e-06, "loss": 0.4554, "step": 107150 }, { "epoch": 4.1729923313480475, "grad_norm": 24.683095932006836, "learning_rate": 8.28759391179104e-06, "loss": 0.4231, "step": 107200 }, { "epoch": 4.174938689711549, "grad_norm": 43.40347671508789, "learning_rate": 8.26813032815602e-06, "loss": 0.4948, "step": 107250 }, { "epoch": 4.176885048075052, "grad_norm": 11.832257270812988, "learning_rate": 8.248666744521001e-06, "loss": 0.4136, "step": 107300 }, { "epoch": 4.178831406438554, "grad_norm": 8.322760581970215, "learning_rate": 8.229203160885982e-06, "loss": 0.4579, "step": 107350 }, { "epoch": 4.180777764802055, "grad_norm": 30.39175033569336, "learning_rate": 8.209739577250963e-06, "loss": 0.4107, "step": 107400 }, { "epoch": 4.182724123165557, "grad_norm": 3.286320924758911, "learning_rate": 8.190275993615944e-06, "loss": 0.4198, "step": 107450 }, { "epoch": 4.184670481529059, 
"grad_norm": 8.516502380371094, "learning_rate": 8.170812409980925e-06, "loss": 0.3904, "step": 107500 }, { "epoch": 4.186616839892561, "grad_norm": 16.162593841552734, "learning_rate": 8.151348826345908e-06, "loss": 0.4625, "step": 107550 }, { "epoch": 4.1885631982560625, "grad_norm": 40.759307861328125, "learning_rate": 8.131885242710887e-06, "loss": 0.3073, "step": 107600 }, { "epoch": 4.190509556619565, "grad_norm": 5.956638813018799, "learning_rate": 8.11242165907587e-06, "loss": 0.4073, "step": 107650 }, { "epoch": 4.192455914983067, "grad_norm": 22.47282600402832, "learning_rate": 8.09295807544085e-06, "loss": 0.4602, "step": 107700 }, { "epoch": 4.194402273346569, "grad_norm": 19.2171573638916, "learning_rate": 8.073494491805832e-06, "loss": 0.3882, "step": 107750 }, { "epoch": 4.19634863171007, "grad_norm": 10.946407318115234, "learning_rate": 8.054030908170813e-06, "loss": 0.4955, "step": 107800 }, { "epoch": 4.198294990073572, "grad_norm": 17.383869171142578, "learning_rate": 8.034567324535794e-06, "loss": 0.4235, "step": 107850 }, { "epoch": 4.200241348437074, "grad_norm": 1.9787017107009888, "learning_rate": 8.015103740900776e-06, "loss": 0.4146, "step": 107900 }, { "epoch": 4.202187706800576, "grad_norm": 44.14896011352539, "learning_rate": 7.995640157265756e-06, "loss": 0.5155, "step": 107950 }, { "epoch": 4.204134065164078, "grad_norm": 24.1195125579834, "learning_rate": 7.976176573630738e-06, "loss": 0.4599, "step": 108000 }, { "epoch": 4.20608042352758, "grad_norm": 9.527972221374512, "learning_rate": 7.956712989995718e-06, "loss": 0.4027, "step": 108050 }, { "epoch": 4.208026781891082, "grad_norm": 4.115023612976074, "learning_rate": 7.9372494063607e-06, "loss": 0.3914, "step": 108100 }, { "epoch": 4.209973140254584, "grad_norm": 38.40031051635742, "learning_rate": 7.91778582272568e-06, "loss": 0.3872, "step": 108150 }, { "epoch": 4.2119194986180855, "grad_norm": 249.76622009277344, "learning_rate": 7.898322239090662e-06, "loss": 0.4371, "step": 
108200 }, { "epoch": 4.213865856981587, "grad_norm": 3.01899790763855, "learning_rate": 7.878858655455643e-06, "loss": 0.4541, "step": 108250 }, { "epoch": 4.215812215345089, "grad_norm": 28.21133041381836, "learning_rate": 7.859395071820624e-06, "loss": 0.4648, "step": 108300 }, { "epoch": 4.217758573708592, "grad_norm": 86.71720123291016, "learning_rate": 7.839931488185605e-06, "loss": 0.421, "step": 108350 }, { "epoch": 4.219704932072093, "grad_norm": 35.0771598815918, "learning_rate": 7.820467904550586e-06, "loss": 0.4544, "step": 108400 }, { "epoch": 4.221651290435595, "grad_norm": 16.53337860107422, "learning_rate": 7.801004320915567e-06, "loss": 0.4248, "step": 108450 }, { "epoch": 4.223597648799097, "grad_norm": 27.879592895507812, "learning_rate": 7.781540737280548e-06, "loss": 0.4446, "step": 108500 }, { "epoch": 4.225544007162599, "grad_norm": 16.420503616333008, "learning_rate": 7.762077153645529e-06, "loss": 0.3697, "step": 108550 }, { "epoch": 4.2274903655261005, "grad_norm": 33.303863525390625, "learning_rate": 7.742613570010512e-06, "loss": 0.4227, "step": 108600 }, { "epoch": 4.229436723889602, "grad_norm": 19.18815040588379, "learning_rate": 7.723149986375491e-06, "loss": 0.4109, "step": 108650 }, { "epoch": 4.231383082253105, "grad_norm": 23.49066734313965, "learning_rate": 7.703686402740474e-06, "loss": 0.4123, "step": 108700 }, { "epoch": 4.233329440616607, "grad_norm": 6.120394706726074, "learning_rate": 7.684222819105453e-06, "loss": 0.4369, "step": 108750 }, { "epoch": 4.235275798980108, "grad_norm": 13.172213554382324, "learning_rate": 7.664759235470436e-06, "loss": 0.4431, "step": 108800 }, { "epoch": 4.23722215734361, "grad_norm": 24.45387077331543, "learning_rate": 7.645295651835415e-06, "loss": 0.4363, "step": 108850 }, { "epoch": 4.239168515707112, "grad_norm": 4.77895450592041, "learning_rate": 7.625832068200398e-06, "loss": 0.4428, "step": 108900 }, { "epoch": 4.241114874070614, "grad_norm": 16.594873428344727, "learning_rate": 
7.606368484565378e-06, "loss": 0.5234, "step": 108950 }, { "epoch": 4.2430612324341155, "grad_norm": 0.44446495175361633, "learning_rate": 7.5869049009303595e-06, "loss": 0.4077, "step": 109000 }, { "epoch": 4.245007590797617, "grad_norm": 2.1164255142211914, "learning_rate": 7.5674413172953405e-06, "loss": 0.4695, "step": 109050 }, { "epoch": 4.24695394916112, "grad_norm": 17.201433181762695, "learning_rate": 7.547977733660322e-06, "loss": 0.3925, "step": 109100 }, { "epoch": 4.248900307524622, "grad_norm": 2.7445130348205566, "learning_rate": 7.528514150025303e-06, "loss": 0.3644, "step": 109150 }, { "epoch": 4.250846665888123, "grad_norm": 32.21333694458008, "learning_rate": 7.509050566390284e-06, "loss": 0.4454, "step": 109200 }, { "epoch": 4.252793024251625, "grad_norm": 8.988659858703613, "learning_rate": 7.489586982755266e-06, "loss": 0.3649, "step": 109250 }, { "epoch": 4.254739382615127, "grad_norm": 30.789226531982422, "learning_rate": 7.470123399120246e-06, "loss": 0.4288, "step": 109300 }, { "epoch": 4.256685740978629, "grad_norm": 14.978452682495117, "learning_rate": 7.450659815485228e-06, "loss": 0.5094, "step": 109350 }, { "epoch": 4.2586320993421305, "grad_norm": 12.741453170776367, "learning_rate": 7.431196231850208e-06, "loss": 0.4373, "step": 109400 }, { "epoch": 4.260578457705633, "grad_norm": 4.852758884429932, "learning_rate": 7.41173264821519e-06, "loss": 0.3846, "step": 109450 }, { "epoch": 4.262524816069135, "grad_norm": 9.664127349853516, "learning_rate": 7.392269064580171e-06, "loss": 0.3831, "step": 109500 }, { "epoch": 4.264471174432637, "grad_norm": 6.058692455291748, "learning_rate": 7.372805480945153e-06, "loss": 0.4317, "step": 109550 }, { "epoch": 4.2664175327961384, "grad_norm": 28.10664176940918, "learning_rate": 7.353341897310133e-06, "loss": 0.396, "step": 109600 }, { "epoch": 4.26836389115964, "grad_norm": 6.292114734649658, "learning_rate": 7.333878313675115e-06, "loss": 0.4167, "step": 109650 }, { "epoch": 4.270310249523142, 
"grad_norm": 26.827301025390625, "learning_rate": 7.314414730040095e-06, "loss": 0.4775, "step": 109700 }, { "epoch": 4.272256607886644, "grad_norm": 2.8198108673095703, "learning_rate": 7.294951146405077e-06, "loss": 0.4826, "step": 109750 }, { "epoch": 4.2742029662501455, "grad_norm": 0.49175089597702026, "learning_rate": 7.275487562770058e-06, "loss": 0.4398, "step": 109800 }, { "epoch": 4.276149324613648, "grad_norm": 18.1221923828125, "learning_rate": 7.256023979135039e-06, "loss": 0.3891, "step": 109850 }, { "epoch": 4.27809568297715, "grad_norm": 23.49587631225586, "learning_rate": 7.2365603955000195e-06, "loss": 0.3684, "step": 109900 }, { "epoch": 4.280042041340652, "grad_norm": 125.69355010986328, "learning_rate": 7.217096811865001e-06, "loss": 0.4189, "step": 109950 }, { "epoch": 4.2819883997041535, "grad_norm": 5.460152626037598, "learning_rate": 7.1976332282299815e-06, "loss": 0.4261, "step": 110000 }, { "epoch": 4.283934758067655, "grad_norm": 3.960573673248291, "learning_rate": 7.178169644594963e-06, "loss": 0.3952, "step": 110050 }, { "epoch": 4.285881116431157, "grad_norm": 8.082985877990723, "learning_rate": 7.158706060959943e-06, "loss": 0.4578, "step": 110100 }, { "epoch": 4.287827474794659, "grad_norm": 18.085479736328125, "learning_rate": 7.139242477324925e-06, "loss": 0.3779, "step": 110150 }, { "epoch": 4.289773833158161, "grad_norm": 30.289541244506836, "learning_rate": 7.119778893689906e-06, "loss": 0.46, "step": 110200 }, { "epoch": 4.291720191521663, "grad_norm": 16.948190689086914, "learning_rate": 7.100315310054888e-06, "loss": 0.3504, "step": 110250 }, { "epoch": 4.293666549885165, "grad_norm": 0.8280588388442993, "learning_rate": 7.080851726419868e-06, "loss": 0.3317, "step": 110300 }, { "epoch": 4.295612908248667, "grad_norm": 25.485891342163086, "learning_rate": 7.06138814278485e-06, "loss": 0.4377, "step": 110350 }, { "epoch": 4.2975592666121685, "grad_norm": 8.10477066040039, "learning_rate": 7.04192455914983e-06, "loss": 0.3985, 
"step": 110400 }, { "epoch": 4.29950562497567, "grad_norm": 0.7604402303695679, "learning_rate": 7.022460975514812e-06, "loss": 0.4096, "step": 110450 }, { "epoch": 4.301451983339172, "grad_norm": 17.03667640686035, "learning_rate": 7.002997391879794e-06, "loss": 0.5145, "step": 110500 }, { "epoch": 4.303398341702675, "grad_norm": 4.9451751708984375, "learning_rate": 6.983533808244775e-06, "loss": 0.3872, "step": 110550 }, { "epoch": 4.305344700066176, "grad_norm": 43.63365173339844, "learning_rate": 6.9640702246097565e-06, "loss": 0.4255, "step": 110600 }, { "epoch": 4.307291058429678, "grad_norm": 30.57857894897461, "learning_rate": 6.944606640974737e-06, "loss": 0.4333, "step": 110650 }, { "epoch": 4.30923741679318, "grad_norm": 18.108638763427734, "learning_rate": 6.9251430573397184e-06, "loss": 0.3829, "step": 110700 }, { "epoch": 4.311183775156682, "grad_norm": 68.28421783447266, "learning_rate": 6.9056794737046986e-06, "loss": 0.4897, "step": 110750 }, { "epoch": 4.3131301335201835, "grad_norm": 18.5264892578125, "learning_rate": 6.88621589006968e-06, "loss": 0.439, "step": 110800 }, { "epoch": 4.315076491883685, "grad_norm": 5.763652801513672, "learning_rate": 6.8667523064346605e-06, "loss": 0.4603, "step": 110850 }, { "epoch": 4.317022850247188, "grad_norm": 25.027135848999023, "learning_rate": 6.847288722799642e-06, "loss": 0.3501, "step": 110900 }, { "epoch": 4.31896920861069, "grad_norm": 6.773870468139648, "learning_rate": 6.827825139164623e-06, "loss": 0.4132, "step": 110950 }, { "epoch": 4.320915566974191, "grad_norm": 25.025108337402344, "learning_rate": 6.808361555529605e-06, "loss": 0.4266, "step": 111000 }, { "epoch": 4.322861925337693, "grad_norm": 86.1151351928711, "learning_rate": 6.788897971894585e-06, "loss": 0.3728, "step": 111050 }, { "epoch": 4.324808283701195, "grad_norm": 0.4060121476650238, "learning_rate": 6.769434388259567e-06, "loss": 0.3944, "step": 111100 }, { "epoch": 4.326754642064697, "grad_norm": 2.714709758758545, 
"learning_rate": 6.749970804624547e-06, "loss": 0.3598, "step": 111150 }, { "epoch": 4.3287010004281985, "grad_norm": 20.680282592773438, "learning_rate": 6.730507220989529e-06, "loss": 0.4554, "step": 111200 }, { "epoch": 4.330647358791701, "grad_norm": 52.742652893066406, "learning_rate": 6.71104363735451e-06, "loss": 0.4369, "step": 111250 }, { "epoch": 4.332593717155203, "grad_norm": 35.26878356933594, "learning_rate": 6.691580053719492e-06, "loss": 0.4165, "step": 111300 }, { "epoch": 4.334540075518705, "grad_norm": 14.105500221252441, "learning_rate": 6.672116470084472e-06, "loss": 0.434, "step": 111350 }, { "epoch": 4.3364864338822064, "grad_norm": 22.81505012512207, "learning_rate": 6.652652886449454e-06, "loss": 0.4814, "step": 111400 }, { "epoch": 4.338432792245708, "grad_norm": 17.64825439453125, "learning_rate": 6.633189302814434e-06, "loss": 0.3661, "step": 111450 }, { "epoch": 4.34037915060921, "grad_norm": 23.406513214111328, "learning_rate": 6.613725719179416e-06, "loss": 0.4957, "step": 111500 }, { "epoch": 4.342325508972712, "grad_norm": 14.736431121826172, "learning_rate": 6.594262135544396e-06, "loss": 0.3991, "step": 111550 }, { "epoch": 4.344271867336214, "grad_norm": 15.385977745056152, "learning_rate": 6.574798551909378e-06, "loss": 0.5282, "step": 111600 }, { "epoch": 4.346218225699716, "grad_norm": 6.582085609436035, "learning_rate": 6.5553349682743586e-06, "loss": 0.5146, "step": 111650 }, { "epoch": 4.348164584063218, "grad_norm": 50.26306915283203, "learning_rate": 6.53587138463934e-06, "loss": 0.4549, "step": 111700 }, { "epoch": 4.35011094242672, "grad_norm": 1.3596677780151367, "learning_rate": 6.5164078010043205e-06, "loss": 0.466, "step": 111750 }, { "epoch": 4.3520573007902215, "grad_norm": 10.304804801940918, "learning_rate": 6.496944217369302e-06, "loss": 0.3248, "step": 111800 }, { "epoch": 4.354003659153723, "grad_norm": 46.837913513183594, "learning_rate": 6.477480633734284e-06, "loss": 0.336, "step": 111850 }, { "epoch": 
4.355950017517225, "grad_norm": 0.9622809290885925, "learning_rate": 6.458017050099264e-06, "loss": 0.4056, "step": 111900 }, { "epoch": 4.357896375880727, "grad_norm": 12.035711288452148, "learning_rate": 6.438553466464246e-06, "loss": 0.4726, "step": 111950 }, { "epoch": 4.359842734244229, "grad_norm": 18.431489944458008, "learning_rate": 6.419089882829227e-06, "loss": 0.4118, "step": 112000 }, { "epoch": 4.361789092607731, "grad_norm": 36.6499137878418, "learning_rate": 6.399626299194209e-06, "loss": 0.3843, "step": 112050 }, { "epoch": 4.363735450971233, "grad_norm": 12.454045295715332, "learning_rate": 6.380162715559189e-06, "loss": 0.3936, "step": 112100 }, { "epoch": 4.365681809334735, "grad_norm": 26.415618896484375, "learning_rate": 6.360699131924171e-06, "loss": 0.327, "step": 112150 }, { "epoch": 4.3676281676982365, "grad_norm": 33.70527267456055, "learning_rate": 6.341235548289151e-06, "loss": 0.5096, "step": 112200 }, { "epoch": 4.369574526061738, "grad_norm": 52.020721435546875, "learning_rate": 6.321771964654133e-06, "loss": 0.3908, "step": 112250 }, { "epoch": 4.37152088442524, "grad_norm": 50.564064025878906, "learning_rate": 6.302308381019113e-06, "loss": 0.5276, "step": 112300 }, { "epoch": 4.373467242788743, "grad_norm": 10.555481910705566, "learning_rate": 6.282844797384095e-06, "loss": 0.4492, "step": 112350 }, { "epoch": 4.375413601152244, "grad_norm": 8.72326374053955, "learning_rate": 6.263381213749076e-06, "loss": 0.5415, "step": 112400 }, { "epoch": 4.377359959515746, "grad_norm": 29.085344314575195, "learning_rate": 6.243917630114057e-06, "loss": 0.4837, "step": 112450 }, { "epoch": 4.379306317879248, "grad_norm": 16.8189640045166, "learning_rate": 6.224454046479038e-06, "loss": 0.4196, "step": 112500 }, { "epoch": 4.38125267624275, "grad_norm": 32.00846862792969, "learning_rate": 6.2049904628440194e-06, "loss": 0.4615, "step": 112550 }, { "epoch": 4.3831990346062515, "grad_norm": 10.524999618530273, "learning_rate": 6.185526879209e-06, 
"loss": 0.3998, "step": 112600 }, { "epoch": 4.385145392969753, "grad_norm": 42.00443649291992, "learning_rate": 6.166063295573981e-06, "loss": 0.3663, "step": 112650 }, { "epoch": 4.387091751333255, "grad_norm": 7.032225608825684, "learning_rate": 6.146988983611663e-06, "loss": 0.3833, "step": 112700 }, { "epoch": 4.389038109696758, "grad_norm": 10.268936157226562, "learning_rate": 6.127525399976644e-06, "loss": 0.4533, "step": 112750 }, { "epoch": 4.390984468060259, "grad_norm": 17.854209899902344, "learning_rate": 6.108061816341625e-06, "loss": 0.4222, "step": 112800 }, { "epoch": 4.392930826423761, "grad_norm": 28.0588321685791, "learning_rate": 6.088598232706606e-06, "loss": 0.3978, "step": 112850 }, { "epoch": 4.394877184787263, "grad_norm": 5.5096845626831055, "learning_rate": 6.069134649071588e-06, "loss": 0.3574, "step": 112900 }, { "epoch": 4.396823543150765, "grad_norm": 7.411211013793945, "learning_rate": 6.049671065436569e-06, "loss": 0.4751, "step": 112950 }, { "epoch": 4.3987699015142665, "grad_norm": 28.325252532958984, "learning_rate": 6.03020748180155e-06, "loss": 0.3774, "step": 113000 }, { "epoch": 4.400716259877768, "grad_norm": 10.132955551147461, "learning_rate": 6.010743898166531e-06, "loss": 0.3667, "step": 113050 }, { "epoch": 4.402662618241271, "grad_norm": 14.65689468383789, "learning_rate": 5.9912803145315116e-06, "loss": 0.4417, "step": 113100 }, { "epoch": 4.404608976604773, "grad_norm": 42.277252197265625, "learning_rate": 5.9718167308964925e-06, "loss": 0.4857, "step": 113150 }, { "epoch": 4.4065553349682745, "grad_norm": 31.92972183227539, "learning_rate": 5.952353147261474e-06, "loss": 0.4292, "step": 113200 }, { "epoch": 4.408501693331776, "grad_norm": 19.756620407104492, "learning_rate": 5.932889563626455e-06, "loss": 0.4434, "step": 113250 }, { "epoch": 4.410448051695278, "grad_norm": 70.49607849121094, "learning_rate": 5.913425979991436e-06, "loss": 0.4047, "step": 113300 }, { "epoch": 4.41239441005878, "grad_norm": 
11.815357208251953, "learning_rate": 5.893962396356417e-06, "loss": 0.3981, "step": 113350 }, { "epoch": 4.4143407684222815, "grad_norm": 40.02499771118164, "learning_rate": 5.874498812721398e-06, "loss": 0.4053, "step": 113400 }, { "epoch": 4.416287126785784, "grad_norm": 65.36296844482422, "learning_rate": 5.855035229086379e-06, "loss": 0.4301, "step": 113450 }, { "epoch": 4.418233485149286, "grad_norm": 0.9466508030891418, "learning_rate": 5.83557164545136e-06, "loss": 0.3606, "step": 113500 }, { "epoch": 4.420179843512788, "grad_norm": 47.74922180175781, "learning_rate": 5.816108061816342e-06, "loss": 0.38, "step": 113550 }, { "epoch": 4.4221262018762895, "grad_norm": 18.892345428466797, "learning_rate": 5.796644478181323e-06, "loss": 0.4374, "step": 113600 }, { "epoch": 4.424072560239791, "grad_norm": 28.107072830200195, "learning_rate": 5.777180894546305e-06, "loss": 0.4011, "step": 113650 }, { "epoch": 4.426018918603293, "grad_norm": 16.634885787963867, "learning_rate": 5.757717310911286e-06, "loss": 0.3771, "step": 113700 }, { "epoch": 4.427965276966795, "grad_norm": 26.510217666625977, "learning_rate": 5.738253727276267e-06, "loss": 0.3783, "step": 113750 }, { "epoch": 4.429911635330297, "grad_norm": 26.9971981048584, "learning_rate": 5.718790143641248e-06, "loss": 0.4851, "step": 113800 }, { "epoch": 4.431857993693799, "grad_norm": 0.5964798927307129, "learning_rate": 5.699326560006229e-06, "loss": 0.5043, "step": 113850 }, { "epoch": 4.433804352057301, "grad_norm": 1.5452959537506104, "learning_rate": 5.67986297637121e-06, "loss": 0.3468, "step": 113900 }, { "epoch": 4.435750710420803, "grad_norm": 27.820106506347656, "learning_rate": 5.6603993927361914e-06, "loss": 0.4527, "step": 113950 }, { "epoch": 4.4376970687843045, "grad_norm": 38.800411224365234, "learning_rate": 5.640935809101172e-06, "loss": 0.4446, "step": 114000 }, { "epoch": 4.439643427147806, "grad_norm": 3.902805805206299, "learning_rate": 5.621472225466153e-06, "loss": 0.4341, "step": 
114050 }, { "epoch": 4.441589785511308, "grad_norm": 57.785675048828125, "learning_rate": 5.602008641831134e-06, "loss": 0.4858, "step": 114100 }, { "epoch": 4.443536143874811, "grad_norm": 22.999574661254883, "learning_rate": 5.582545058196115e-06, "loss": 0.387, "step": 114150 }, { "epoch": 4.445482502238312, "grad_norm": 14.884350776672363, "learning_rate": 5.563081474561096e-06, "loss": 0.3976, "step": 114200 }, { "epoch": 4.447428860601814, "grad_norm": 19.565195083618164, "learning_rate": 5.543617890926077e-06, "loss": 0.3773, "step": 114250 }, { "epoch": 4.449375218965316, "grad_norm": 20.821231842041016, "learning_rate": 5.524154307291059e-06, "loss": 0.4311, "step": 114300 }, { "epoch": 4.451321577328818, "grad_norm": 1.7796218395233154, "learning_rate": 5.50469072365604e-06, "loss": 0.5058, "step": 114350 }, { "epoch": 4.4532679356923195, "grad_norm": 15.797576904296875, "learning_rate": 5.485227140021021e-06, "loss": 0.4098, "step": 114400 }, { "epoch": 4.455214294055821, "grad_norm": 20.050750732421875, "learning_rate": 5.465763556386002e-06, "loss": 0.3921, "step": 114450 }, { "epoch": 4.457160652419324, "grad_norm": 20.930496215820312, "learning_rate": 5.446299972750983e-06, "loss": 0.4204, "step": 114500 }, { "epoch": 4.459107010782826, "grad_norm": 7.812229156494141, "learning_rate": 5.426836389115964e-06, "loss": 0.3597, "step": 114550 }, { "epoch": 4.461053369146327, "grad_norm": 23.20423698425293, "learning_rate": 5.407372805480945e-06, "loss": 0.4663, "step": 114600 }, { "epoch": 4.462999727509829, "grad_norm": 22.477508544921875, "learning_rate": 5.387909221845927e-06, "loss": 0.3661, "step": 114650 }, { "epoch": 4.464946085873331, "grad_norm": 21.700437545776367, "learning_rate": 5.368445638210908e-06, "loss": 0.3822, "step": 114700 }, { "epoch": 4.466892444236833, "grad_norm": 1.7359672784805298, "learning_rate": 5.348982054575889e-06, "loss": 0.4444, "step": 114750 }, { "epoch": 4.4688388026003345, "grad_norm": 1.0407538414001465, 
"learning_rate": 5.32951847094087e-06, "loss": 0.3788, "step": 114800 }, { "epoch": 4.470785160963836, "grad_norm": 56.78816223144531, "learning_rate": 5.310054887305851e-06, "loss": 0.4105, "step": 114850 }, { "epoch": 4.472731519327339, "grad_norm": 25.077980041503906, "learning_rate": 5.2905913036708316e-06, "loss": 0.4892, "step": 114900 }, { "epoch": 4.474677877690841, "grad_norm": 11.666507720947266, "learning_rate": 5.2711277200358125e-06, "loss": 0.5308, "step": 114950 }, { "epoch": 4.4766242360543425, "grad_norm": 15.243803977966309, "learning_rate": 5.251664136400794e-06, "loss": 0.3041, "step": 115000 }, { "epoch": 4.478570594417844, "grad_norm": 13.038241386413574, "learning_rate": 5.232200552765776e-06, "loss": 0.4088, "step": 115050 }, { "epoch": 4.480516952781346, "grad_norm": 23.510257720947266, "learning_rate": 5.212736969130757e-06, "loss": 0.3256, "step": 115100 }, { "epoch": 4.482463311144848, "grad_norm": 12.887557029724121, "learning_rate": 5.193273385495738e-06, "loss": 0.409, "step": 115150 }, { "epoch": 4.4844096695083495, "grad_norm": 11.299025535583496, "learning_rate": 5.173809801860719e-06, "loss": 0.4645, "step": 115200 }, { "epoch": 4.486356027871852, "grad_norm": 148.60275268554688, "learning_rate": 5.1543462182257e-06, "loss": 0.5657, "step": 115250 }, { "epoch": 4.488302386235354, "grad_norm": 35.81682205200195, "learning_rate": 5.134882634590681e-06, "loss": 0.4719, "step": 115300 }, { "epoch": 4.490248744598856, "grad_norm": 46.195457458496094, "learning_rate": 5.115419050955662e-06, "loss": 0.443, "step": 115350 }, { "epoch": 4.4921951029623575, "grad_norm": 20.33262062072754, "learning_rate": 5.095955467320644e-06, "loss": 0.3888, "step": 115400 }, { "epoch": 4.494141461325859, "grad_norm": 31.672149658203125, "learning_rate": 5.076491883685625e-06, "loss": 0.4978, "step": 115450 }, { "epoch": 4.496087819689361, "grad_norm": 71.4660415649414, "learning_rate": 5.057028300050606e-06, "loss": 0.431, "step": 115500 }, { "epoch": 
4.498034178052863, "grad_norm": 71.10552215576172, "learning_rate": 5.037564716415587e-06, "loss": 0.4111, "step": 115550 }, { "epoch": 4.4999805364163645, "grad_norm": 12.178357124328613, "learning_rate": 5.018101132780568e-06, "loss": 0.4364, "step": 115600 }, { "epoch": 4.501926894779867, "grad_norm": 27.38094139099121, "learning_rate": 4.998637549145549e-06, "loss": 0.4543, "step": 115650 }, { "epoch": 4.503873253143369, "grad_norm": 29.70096206665039, "learning_rate": 4.97917396551053e-06, "loss": 0.4353, "step": 115700 }, { "epoch": 4.505819611506871, "grad_norm": 44.831539154052734, "learning_rate": 4.9597103818755115e-06, "loss": 0.2846, "step": 115750 }, { "epoch": 4.5077659698703725, "grad_norm": 12.026204109191895, "learning_rate": 4.9402467982404924e-06, "loss": 0.4125, "step": 115800 }, { "epoch": 4.509712328233874, "grad_norm": 51.11648941040039, "learning_rate": 4.920783214605473e-06, "loss": 0.533, "step": 115850 }, { "epoch": 4.511658686597376, "grad_norm": 12.273826599121094, "learning_rate": 4.901319630970454e-06, "loss": 0.4603, "step": 115900 }, { "epoch": 4.513605044960878, "grad_norm": 22.480710983276367, "learning_rate": 4.881856047335435e-06, "loss": 0.4285, "step": 115950 }, { "epoch": 4.51555140332438, "grad_norm": 17.653736114501953, "learning_rate": 4.862392463700416e-06, "loss": 0.4493, "step": 116000 }, { "epoch": 4.517497761687882, "grad_norm": 16.536144256591797, "learning_rate": 4.842928880065397e-06, "loss": 0.482, "step": 116050 }, { "epoch": 4.519444120051384, "grad_norm": 42.31017303466797, "learning_rate": 4.823465296430379e-06, "loss": 0.4212, "step": 116100 }, { "epoch": 4.521390478414886, "grad_norm": 43.79999923706055, "learning_rate": 4.80400171279536e-06, "loss": 0.4323, "step": 116150 }, { "epoch": 4.5233368367783875, "grad_norm": 0.5862814784049988, "learning_rate": 4.784538129160341e-06, "loss": 0.3262, "step": 116200 }, { "epoch": 4.525283195141889, "grad_norm": 20.25597381591797, "learning_rate": 
4.765074545525322e-06, "loss": 0.44, "step": 116250 }, { "epoch": 4.527229553505391, "grad_norm": 56.35312271118164, "learning_rate": 4.745610961890303e-06, "loss": 0.4289, "step": 116300 }, { "epoch": 4.529175911868894, "grad_norm": 102.08218383789062, "learning_rate": 4.726147378255284e-06, "loss": 0.4536, "step": 116350 }, { "epoch": 4.531122270232395, "grad_norm": 71.20698547363281, "learning_rate": 4.706683794620266e-06, "loss": 0.4023, "step": 116400 }, { "epoch": 4.533068628595897, "grad_norm": 8.054243087768555, "learning_rate": 4.687220210985247e-06, "loss": 0.4836, "step": 116450 }, { "epoch": 4.535014986959399, "grad_norm": 29.04806900024414, "learning_rate": 4.6677566273502286e-06, "loss": 0.4712, "step": 116500 }, { "epoch": 4.536961345322901, "grad_norm": 21.187711715698242, "learning_rate": 4.6482930437152095e-06, "loss": 0.3216, "step": 116550 }, { "epoch": 4.5389077036864025, "grad_norm": 31.43624496459961, "learning_rate": 4.6288294600801905e-06, "loss": 0.4095, "step": 116600 }, { "epoch": 4.540854062049904, "grad_norm": 1.4774718284606934, "learning_rate": 4.6093658764451715e-06, "loss": 0.3795, "step": 116650 }, { "epoch": 4.542800420413407, "grad_norm": 11.922041893005371, "learning_rate": 4.5899022928101524e-06, "loss": 0.4138, "step": 116700 }, { "epoch": 4.544746778776909, "grad_norm": 39.0098991394043, "learning_rate": 4.570438709175133e-06, "loss": 0.3579, "step": 116750 }, { "epoch": 4.5466931371404105, "grad_norm": 38.26618957519531, "learning_rate": 4.550975125540114e-06, "loss": 0.3992, "step": 116800 }, { "epoch": 4.548639495503912, "grad_norm": 69.36569213867188, "learning_rate": 4.531511541905096e-06, "loss": 0.3805, "step": 116850 }, { "epoch": 4.550585853867414, "grad_norm": 42.37199783325195, "learning_rate": 4.512047958270077e-06, "loss": 0.4506, "step": 116900 }, { "epoch": 4.552532212230916, "grad_norm": 36.9631462097168, "learning_rate": 4.492584374635058e-06, "loss": 0.3387, "step": 116950 }, { "epoch": 4.5544785705944175, 
"grad_norm": 22.541261672973633, "learning_rate": 4.47351006267274e-06, "loss": 0.384, "step": 117000 }, { "epoch": 4.55642492895792, "grad_norm": 6.34126091003418, "learning_rate": 4.454046479037721e-06, "loss": 0.4457, "step": 117050 }, { "epoch": 4.558371287321422, "grad_norm": 23.18858528137207, "learning_rate": 4.434582895402702e-06, "loss": 0.4161, "step": 117100 }, { "epoch": 4.560317645684924, "grad_norm": 5.646255016326904, "learning_rate": 4.415119311767683e-06, "loss": 0.501, "step": 117150 }, { "epoch": 4.5622640040484255, "grad_norm": 17.820695877075195, "learning_rate": 4.395655728132664e-06, "loss": 0.3816, "step": 117200 }, { "epoch": 4.564210362411927, "grad_norm": 22.31595802307129, "learning_rate": 4.3761921444976446e-06, "loss": 0.39, "step": 117250 }, { "epoch": 4.566156720775429, "grad_norm": 35.5130500793457, "learning_rate": 4.356728560862626e-06, "loss": 0.4437, "step": 117300 }, { "epoch": 4.568103079138931, "grad_norm": 32.202701568603516, "learning_rate": 4.337264977227607e-06, "loss": 0.3956, "step": 117350 }, { "epoch": 4.570049437502433, "grad_norm": 20.33125114440918, "learning_rate": 4.317801393592588e-06, "loss": 0.3967, "step": 117400 }, { "epoch": 4.571995795865935, "grad_norm": 29.741682052612305, "learning_rate": 4.298337809957569e-06, "loss": 0.3602, "step": 117450 }, { "epoch": 4.573942154229437, "grad_norm": 6.447887897491455, "learning_rate": 4.278874226322551e-06, "loss": 0.4373, "step": 117500 }, { "epoch": 4.575888512592939, "grad_norm": 6.838235378265381, "learning_rate": 4.259410642687532e-06, "loss": 0.3545, "step": 117550 }, { "epoch": 4.5778348709564405, "grad_norm": 17.365354537963867, "learning_rate": 4.239947059052513e-06, "loss": 0.3262, "step": 117600 }, { "epoch": 4.579781229319942, "grad_norm": 35.75154495239258, "learning_rate": 4.220483475417494e-06, "loss": 0.427, "step": 117650 }, { "epoch": 4.581727587683444, "grad_norm": 6.490931987762451, "learning_rate": 4.201019891782476e-06, "loss": 0.4753, "step": 
117700 }, { "epoch": 4.583673946046946, "grad_norm": 9.92873764038086, "learning_rate": 4.181556308147457e-06, "loss": 0.4294, "step": 117750 }, { "epoch": 4.585620304410448, "grad_norm": 9.08901596069336, "learning_rate": 4.162092724512438e-06, "loss": 0.467, "step": 117800 }, { "epoch": 4.58756666277395, "grad_norm": 23.237667083740234, "learning_rate": 4.142629140877419e-06, "loss": 0.4548, "step": 117850 }, { "epoch": 4.589513021137452, "grad_norm": 1.774194359779358, "learning_rate": 4.1231655572424e-06, "loss": 0.5246, "step": 117900 }, { "epoch": 4.591459379500954, "grad_norm": 6.753283977508545, "learning_rate": 4.103701973607381e-06, "loss": 0.3901, "step": 117950 }, { "epoch": 4.5934057378644555, "grad_norm": 23.61138153076172, "learning_rate": 4.084238389972362e-06, "loss": 0.4126, "step": 118000 }, { "epoch": 4.595352096227957, "grad_norm": 6.452661991119385, "learning_rate": 4.0647748063373435e-06, "loss": 0.4067, "step": 118050 }, { "epoch": 4.597298454591459, "grad_norm": 12.707996368408203, "learning_rate": 4.0453112227023245e-06, "loss": 0.4399, "step": 118100 }, { "epoch": 4.599244812954961, "grad_norm": 19.022279739379883, "learning_rate": 4.025847639067305e-06, "loss": 0.4268, "step": 118150 }, { "epoch": 4.6011911713184634, "grad_norm": 4.423501491546631, "learning_rate": 4.006384055432286e-06, "loss": 0.4324, "step": 118200 }, { "epoch": 4.603137529681965, "grad_norm": 179.89208984375, "learning_rate": 3.986920471797267e-06, "loss": 0.3771, "step": 118250 }, { "epoch": 4.605083888045467, "grad_norm": 17.877492904663086, "learning_rate": 3.967456888162248e-06, "loss": 0.4382, "step": 118300 }, { "epoch": 4.607030246408969, "grad_norm": 7.5927205085754395, "learning_rate": 3.947993304527229e-06, "loss": 0.5098, "step": 118350 }, { "epoch": 4.6089766047724705, "grad_norm": 3.624549627304077, "learning_rate": 3.928529720892211e-06, "loss": 0.4488, "step": 118400 }, { "epoch": 4.610922963135972, "grad_norm": 1.3822295665740967, "learning_rate": 
3.909066137257192e-06, "loss": 0.4197, "step": 118450 }, { "epoch": 4.612869321499474, "grad_norm": 38.52121353149414, "learning_rate": 3.889602553622173e-06, "loss": 0.4152, "step": 118500 }, { "epoch": 4.614815679862977, "grad_norm": 17.21124839782715, "learning_rate": 3.870138969987154e-06, "loss": 0.3981, "step": 118550 }, { "epoch": 4.6167620382264785, "grad_norm": 7.82228946685791, "learning_rate": 3.850675386352135e-06, "loss": 0.4299, "step": 118600 }, { "epoch": 4.61870839658998, "grad_norm": 24.61615753173828, "learning_rate": 3.831211802717116e-06, "loss": 0.3446, "step": 118650 }, { "epoch": 4.620654754953482, "grad_norm": 18.59534454345703, "learning_rate": 3.8117482190820974e-06, "loss": 0.459, "step": 118700 }, { "epoch": 4.622601113316984, "grad_norm": 23.03034210205078, "learning_rate": 3.7922846354470783e-06, "loss": 0.3638, "step": 118750 }, { "epoch": 4.6245474716804855, "grad_norm": 0.7488228678703308, "learning_rate": 3.7728210518120593e-06, "loss": 0.3563, "step": 118800 }, { "epoch": 4.626493830043987, "grad_norm": 9.359607696533203, "learning_rate": 3.7533574681770407e-06, "loss": 0.4949, "step": 118850 }, { "epoch": 4.62844018840749, "grad_norm": 2.569417953491211, "learning_rate": 3.7338938845420225e-06, "loss": 0.4521, "step": 118900 }, { "epoch": 4.630386546770992, "grad_norm": 21.738922119140625, "learning_rate": 3.7144303009070035e-06, "loss": 0.4079, "step": 118950 }, { "epoch": 4.6323329051344935, "grad_norm": 52.428218841552734, "learning_rate": 3.6953559889446847e-06, "loss": 0.3931, "step": 119000 }, { "epoch": 4.634279263497995, "grad_norm": 12.82276439666748, "learning_rate": 3.675892405309666e-06, "loss": 0.3753, "step": 119050 }, { "epoch": 4.636225621861497, "grad_norm": 7.653590679168701, "learning_rate": 3.656428821674647e-06, "loss": 0.3843, "step": 119100 }, { "epoch": 4.638171980224999, "grad_norm": 30.276317596435547, "learning_rate": 3.636965238039628e-06, "loss": 0.4113, "step": 119150 }, { "epoch": 
4.6401183385885005, "grad_norm": 5.313016891479492, "learning_rate": 3.6175016544046094e-06, "loss": 0.4503, "step": 119200 }, { "epoch": 4.642064696952003, "grad_norm": 9.61724853515625, "learning_rate": 3.5980380707695903e-06, "loss": 0.3734, "step": 119250 }, { "epoch": 4.644011055315505, "grad_norm": 26.873245239257812, "learning_rate": 3.5785744871345713e-06, "loss": 0.4478, "step": 119300 }, { "epoch": 4.645957413679007, "grad_norm": 10.735197067260742, "learning_rate": 3.5591109034995523e-06, "loss": 0.3765, "step": 119350 }, { "epoch": 4.6479037720425085, "grad_norm": 0.16046574711799622, "learning_rate": 3.5396473198645337e-06, "loss": 0.3806, "step": 119400 }, { "epoch": 4.64985013040601, "grad_norm": 10.170193672180176, "learning_rate": 3.5201837362295147e-06, "loss": 0.393, "step": 119450 }, { "epoch": 4.651796488769512, "grad_norm": 0.5223858952522278, "learning_rate": 3.5007201525944956e-06, "loss": 0.4742, "step": 119500 }, { "epoch": 4.653742847133014, "grad_norm": 16.60091781616211, "learning_rate": 3.481256568959477e-06, "loss": 0.4143, "step": 119550 }, { "epoch": 4.655689205496516, "grad_norm": 9.898569107055664, "learning_rate": 3.461792985324458e-06, "loss": 0.4086, "step": 119600 }, { "epoch": 4.657635563860018, "grad_norm": 13.71148681640625, "learning_rate": 3.442329401689439e-06, "loss": 0.4881, "step": 119650 }, { "epoch": 4.65958192222352, "grad_norm": 1.7067314386367798, "learning_rate": 3.42286581805442e-06, "loss": 0.3723, "step": 119700 }, { "epoch": 4.661528280587022, "grad_norm": 28.761686325073242, "learning_rate": 3.4034022344194013e-06, "loss": 0.38, "step": 119750 }, { "epoch": 4.6634746389505235, "grad_norm": 9.073128700256348, "learning_rate": 3.3839386507843823e-06, "loss": 0.3938, "step": 119800 }, { "epoch": 4.665420997314025, "grad_norm": 32.85749435424805, "learning_rate": 3.3644750671493633e-06, "loss": 0.4299, "step": 119850 }, { "epoch": 4.667367355677527, "grad_norm": 29.937353134155273, "learning_rate": 
3.3450114835143447e-06, "loss": 0.5118, "step": 119900 }, { "epoch": 4.66931371404103, "grad_norm": 0.9227768778800964, "learning_rate": 3.3255478998793256e-06, "loss": 0.4191, "step": 119950 }, { "epoch": 4.6712600724045314, "grad_norm": 0.8721495270729065, "learning_rate": 3.3060843162443074e-06, "loss": 0.3652, "step": 120000 }, { "epoch": 4.673206430768033, "grad_norm": 18.87059211730957, "learning_rate": 3.2866207326092884e-06, "loss": 0.4879, "step": 120050 }, { "epoch": 4.675152789131535, "grad_norm": 8.195213317871094, "learning_rate": 3.2671571489742694e-06, "loss": 0.404, "step": 120100 }, { "epoch": 4.677099147495037, "grad_norm": 2.6305134296417236, "learning_rate": 3.2476935653392508e-06, "loss": 0.3783, "step": 120150 }, { "epoch": 4.6790455058585385, "grad_norm": 25.327335357666016, "learning_rate": 3.2282299817042317e-06, "loss": 0.4552, "step": 120200 }, { "epoch": 4.68099186422204, "grad_norm": 5.988562107086182, "learning_rate": 3.2087663980692127e-06, "loss": 0.3677, "step": 120250 }, { "epoch": 4.682938222585543, "grad_norm": 11.284552574157715, "learning_rate": 3.189302814434194e-06, "loss": 0.6246, "step": 120300 }, { "epoch": 4.684884580949045, "grad_norm": 23.658069610595703, "learning_rate": 3.169839230799175e-06, "loss": 0.3972, "step": 120350 }, { "epoch": 4.6868309393125465, "grad_norm": 2.0601258277893066, "learning_rate": 3.150375647164156e-06, "loss": 0.4059, "step": 120400 }, { "epoch": 4.688777297676048, "grad_norm": 11.087312698364258, "learning_rate": 3.1309120635291374e-06, "loss": 0.4145, "step": 120450 }, { "epoch": 4.69072365603955, "grad_norm": 1.2296793460845947, "learning_rate": 3.1114484798941184e-06, "loss": 0.3895, "step": 120500 }, { "epoch": 4.692670014403052, "grad_norm": 4.52140998840332, "learning_rate": 3.0919848962590994e-06, "loss": 0.3718, "step": 120550 }, { "epoch": 4.6946163727665535, "grad_norm": 18.292016983032227, "learning_rate": 3.0725213126240804e-06, "loss": 0.4063, "step": 120600 }, { "epoch": 
4.696562731130055, "grad_norm": 14.411880493164062, "learning_rate": 3.0530577289890618e-06, "loss": 0.4186, "step": 120650 }, { "epoch": 4.698509089493557, "grad_norm": 1.5450948476791382, "learning_rate": 3.0335941453540427e-06, "loss": 0.3819, "step": 120700 }, { "epoch": 4.70045544785706, "grad_norm": 9.647147178649902, "learning_rate": 3.0141305617190237e-06, "loss": 0.4362, "step": 120750 }, { "epoch": 4.7024018062205615, "grad_norm": 53.960262298583984, "learning_rate": 2.994666978084005e-06, "loss": 0.3811, "step": 120800 }, { "epoch": 4.704348164584063, "grad_norm": 0.82645183801651, "learning_rate": 2.975203394448986e-06, "loss": 0.4104, "step": 120850 }, { "epoch": 4.706294522947565, "grad_norm": 17.5273380279541, "learning_rate": 2.955739810813967e-06, "loss": 0.4482, "step": 120900 }, { "epoch": 4.708240881311067, "grad_norm": 0.6812463402748108, "learning_rate": 2.936276227178948e-06, "loss": 0.3398, "step": 120950 }, { "epoch": 4.7101872396745685, "grad_norm": 6.719147205352783, "learning_rate": 2.91681264354393e-06, "loss": 0.3363, "step": 121000 }, { "epoch": 4.71213359803807, "grad_norm": 8.5432710647583, "learning_rate": 2.8973490599089108e-06, "loss": 0.3357, "step": 121050 }, { "epoch": 4.714079956401573, "grad_norm": 42.23102951049805, "learning_rate": 2.8778854762738918e-06, "loss": 0.3745, "step": 121100 }, { "epoch": 4.716026314765075, "grad_norm": 18.874868392944336, "learning_rate": 2.8584218926388727e-06, "loss": 0.3776, "step": 121150 }, { "epoch": 4.7179726731285765, "grad_norm": 1.941254734992981, "learning_rate": 2.838958309003854e-06, "loss": 0.4716, "step": 121200 }, { "epoch": 4.719919031492078, "grad_norm": 56.92995071411133, "learning_rate": 2.819494725368835e-06, "loss": 0.4594, "step": 121250 }, { "epoch": 4.72186538985558, "grad_norm": 18.44036865234375, "learning_rate": 2.800031141733816e-06, "loss": 0.4256, "step": 121300 }, { "epoch": 4.723811748219082, "grad_norm": 1.7283982038497925, "learning_rate": 
2.7805675580987975e-06, "loss": 0.4666, "step": 121350 }, { "epoch": 4.7257581065825836, "grad_norm": 18.259185791015625, "learning_rate": 2.7611039744637784e-06, "loss": 0.366, "step": 121400 }, { "epoch": 4.727704464946086, "grad_norm": 6.602967739105225, "learning_rate": 2.74202966250146e-06, "loss": 0.3284, "step": 121450 }, { "epoch": 4.729650823309588, "grad_norm": 43.031394958496094, "learning_rate": 2.722566078866441e-06, "loss": 0.4705, "step": 121500 }, { "epoch": 4.73159718167309, "grad_norm": 12.522574424743652, "learning_rate": 2.7031024952314224e-06, "loss": 0.4036, "step": 121550 }, { "epoch": 4.7335435400365915, "grad_norm": 13.766326904296875, "learning_rate": 2.6836389115964033e-06, "loss": 0.423, "step": 121600 }, { "epoch": 4.735489898400093, "grad_norm": 23.994712829589844, "learning_rate": 2.6641753279613843e-06, "loss": 0.4674, "step": 121650 }, { "epoch": 4.737436256763595, "grad_norm": 22.61267852783203, "learning_rate": 2.6447117443263657e-06, "loss": 0.4006, "step": 121700 }, { "epoch": 4.739382615127097, "grad_norm": 8.996373176574707, "learning_rate": 2.6252481606913467e-06, "loss": 0.4004, "step": 121750 }, { "epoch": 4.7413289734905995, "grad_norm": 5.271254062652588, "learning_rate": 2.6057845770563276e-06, "loss": 0.3796, "step": 121800 }, { "epoch": 4.743275331854101, "grad_norm": 1.0191978216171265, "learning_rate": 2.5863209934213086e-06, "loss": 0.4578, "step": 121850 }, { "epoch": 4.745221690217603, "grad_norm": 37.78928756713867, "learning_rate": 2.56685740978629e-06, "loss": 0.3983, "step": 121900 }, { "epoch": 4.747168048581105, "grad_norm": 73.3762435913086, "learning_rate": 2.547393826151271e-06, "loss": 0.4097, "step": 121950 }, { "epoch": 4.7491144069446065, "grad_norm": 0.5706442594528198, "learning_rate": 2.527930242516252e-06, "loss": 0.4292, "step": 122000 }, { "epoch": 4.751060765308108, "grad_norm": 5.134289264678955, "learning_rate": 2.5084666588812333e-06, "loss": 0.4516, "step": 122050 }, { "epoch": 
4.75300712367161, "grad_norm": 23.824195861816406, "learning_rate": 2.4890030752462143e-06, "loss": 0.4999, "step": 122100 }, { "epoch": 4.754953482035113, "grad_norm": 3.4469504356384277, "learning_rate": 2.4695394916111957e-06, "loss": 0.3169, "step": 122150 }, { "epoch": 4.7568998403986145, "grad_norm": 46.90275192260742, "learning_rate": 2.4500759079761767e-06, "loss": 0.446, "step": 122200 }, { "epoch": 4.758846198762116, "grad_norm": 12.387267112731934, "learning_rate": 2.430612324341158e-06, "loss": 0.4484, "step": 122250 }, { "epoch": 4.760792557125618, "grad_norm": 9.679031372070312, "learning_rate": 2.411148740706139e-06, "loss": 0.4704, "step": 122300 }, { "epoch": 4.76273891548912, "grad_norm": 67.42961120605469, "learning_rate": 2.39168515707112e-06, "loss": 0.3599, "step": 122350 }, { "epoch": 4.7646852738526215, "grad_norm": 18.48516845703125, "learning_rate": 2.3722215734361014e-06, "loss": 0.3368, "step": 122400 }, { "epoch": 4.766631632216123, "grad_norm": 50.06645202636719, "learning_rate": 2.3527579898010824e-06, "loss": 0.4329, "step": 122450 }, { "epoch": 4.768577990579626, "grad_norm": 9.684593200683594, "learning_rate": 2.3332944061660633e-06, "loss": 0.369, "step": 122500 }, { "epoch": 4.770524348943128, "grad_norm": 1.2856169939041138, "learning_rate": 2.3138308225310443e-06, "loss": 0.4169, "step": 122550 }, { "epoch": 4.7724707073066295, "grad_norm": 31.687021255493164, "learning_rate": 2.2943672388960257e-06, "loss": 0.3934, "step": 122600 }, { "epoch": 4.774417065670131, "grad_norm": 65.8917236328125, "learning_rate": 2.2749036552610067e-06, "loss": 0.3881, "step": 122650 }, { "epoch": 4.776363424033633, "grad_norm": 26.982084274291992, "learning_rate": 2.2554400716259877e-06, "loss": 0.4383, "step": 122700 }, { "epoch": 4.778309782397135, "grad_norm": 9.884613037109375, "learning_rate": 2.235976487990969e-06, "loss": 0.5085, "step": 122750 }, { "epoch": 4.7802561407606365, "grad_norm": 0.43567150831222534, "learning_rate": 
2.21651290435595e-06, "loss": 0.4179, "step": 122800 }, { "epoch": 4.782202499124139, "grad_norm": 17.87385368347168, "learning_rate": 2.1970493207209314e-06, "loss": 0.3287, "step": 122850 }, { "epoch": 4.784148857487641, "grad_norm": 5.489084720611572, "learning_rate": 2.1775857370859124e-06, "loss": 0.4217, "step": 122900 }, { "epoch": 4.786095215851143, "grad_norm": 2.707693576812744, "learning_rate": 2.1581221534508938e-06, "loss": 0.4263, "step": 122950 }, { "epoch": 4.7880415742146445, "grad_norm": 4.057452201843262, "learning_rate": 2.1386585698158747e-06, "loss": 0.4925, "step": 123000 }, { "epoch": 4.789987932578146, "grad_norm": 16.334802627563477, "learning_rate": 2.1191949861808557e-06, "loss": 0.3229, "step": 123050 }, { "epoch": 4.791934290941648, "grad_norm": 67.90560150146484, "learning_rate": 2.0997314025458367e-06, "loss": 0.5818, "step": 123100 }, { "epoch": 4.79388064930515, "grad_norm": 78.99760437011719, "learning_rate": 2.080267818910818e-06, "loss": 0.452, "step": 123150 }, { "epoch": 4.795827007668652, "grad_norm": 8.23687744140625, "learning_rate": 2.060804235275799e-06, "loss": 0.3815, "step": 123200 }, { "epoch": 4.797773366032154, "grad_norm": 59.356632232666016, "learning_rate": 2.04134065164078e-06, "loss": 0.4444, "step": 123250 }, { "epoch": 4.799719724395656, "grad_norm": 19.103675842285156, "learning_rate": 2.0218770680057614e-06, "loss": 0.3642, "step": 123300 }, { "epoch": 4.801666082759158, "grad_norm": 12.400468826293945, "learning_rate": 2.002802756043443e-06, "loss": 0.4555, "step": 123350 }, { "epoch": 4.8036124411226595, "grad_norm": 20.987693786621094, "learning_rate": 1.983339172408424e-06, "loss": 0.4714, "step": 123400 }, { "epoch": 4.805558799486161, "grad_norm": 20.860063552856445, "learning_rate": 1.963875588773405e-06, "loss": 0.5035, "step": 123450 }, { "epoch": 4.807505157849663, "grad_norm": 28.804580688476562, "learning_rate": 1.9444120051383863e-06, "loss": 0.3514, "step": 123500 }, { "epoch": 
4.809451516213165, "grad_norm": 14.524519920349121, "learning_rate": 1.9249484215033673e-06, "loss": 0.3851, "step": 123550 }, { "epoch": 4.811397874576667, "grad_norm": 2.81661319732666, "learning_rate": 1.9054848378683485e-06, "loss": 0.4587, "step": 123600 }, { "epoch": 4.813344232940169, "grad_norm": 28.764202117919922, "learning_rate": 1.8860212542333295e-06, "loss": 0.3919, "step": 123650 }, { "epoch": 4.815290591303671, "grad_norm": 3.962606906890869, "learning_rate": 1.8665576705983106e-06, "loss": 0.4651, "step": 123700 }, { "epoch": 4.817236949667173, "grad_norm": 20.503673553466797, "learning_rate": 1.8470940869632916e-06, "loss": 0.312, "step": 123750 }, { "epoch": 4.8191833080306745, "grad_norm": 35.60253143310547, "learning_rate": 1.8276305033282728e-06, "loss": 0.4826, "step": 123800 }, { "epoch": 4.821129666394176, "grad_norm": 16.72877311706543, "learning_rate": 1.808166919693254e-06, "loss": 0.3536, "step": 123850 }, { "epoch": 4.823076024757678, "grad_norm": 7.126533031463623, "learning_rate": 1.788703336058235e-06, "loss": 0.3926, "step": 123900 }, { "epoch": 4.82502238312118, "grad_norm": 20.84531593322754, "learning_rate": 1.7692397524232163e-06, "loss": 0.3449, "step": 123950 }, { "epoch": 4.8269687414846825, "grad_norm": 2.4635865688323975, "learning_rate": 1.7497761687881975e-06, "loss": 0.4366, "step": 124000 }, { "epoch": 4.828915099848184, "grad_norm": 0.760166585445404, "learning_rate": 1.7303125851531785e-06, "loss": 0.4729, "step": 124050 }, { "epoch": 4.830861458211686, "grad_norm": 7.754093647003174, "learning_rate": 1.7108490015181597e-06, "loss": 0.4109, "step": 124100 }, { "epoch": 4.832807816575188, "grad_norm": 35.86790466308594, "learning_rate": 1.6913854178831409e-06, "loss": 0.5015, "step": 124150 }, { "epoch": 4.8347541749386895, "grad_norm": 11.755407333374023, "learning_rate": 1.6719218342481218e-06, "loss": 0.3236, "step": 124200 }, { "epoch": 4.836700533302191, "grad_norm": 5.735936641693115, "learning_rate": 
1.652458250613103e-06, "loss": 0.3722, "step": 124250 }, { "epoch": 4.838646891665693, "grad_norm": 26.884963989257812, "learning_rate": 1.632994666978084e-06, "loss": 0.3144, "step": 124300 }, { "epoch": 4.840593250029196, "grad_norm": 3.759874105453491, "learning_rate": 1.6135310833430652e-06, "loss": 0.3842, "step": 124350 }, { "epoch": 4.8425396083926975, "grad_norm": 20.413610458374023, "learning_rate": 1.5940674997080463e-06, "loss": 0.3349, "step": 124400 }, { "epoch": 4.844485966756199, "grad_norm": 0.10938812047243118, "learning_rate": 1.5746039160730273e-06, "loss": 0.3399, "step": 124450 }, { "epoch": 4.846432325119701, "grad_norm": 21.444753646850586, "learning_rate": 1.5551403324380087e-06, "loss": 0.4438, "step": 124500 }, { "epoch": 4.848378683483203, "grad_norm": 38.393917083740234, "learning_rate": 1.5356767488029897e-06, "loss": 0.4305, "step": 124550 }, { "epoch": 4.8503250418467045, "grad_norm": 17.5528507232666, "learning_rate": 1.5162131651679709e-06, "loss": 0.3989, "step": 124600 }, { "epoch": 4.852271400210206, "grad_norm": 14.685369491577148, "learning_rate": 1.4967495815329518e-06, "loss": 0.3699, "step": 124650 }, { "epoch": 4.854217758573709, "grad_norm": 32.542476654052734, "learning_rate": 1.477285997897933e-06, "loss": 0.4292, "step": 124700 }, { "epoch": 4.856164116937211, "grad_norm": 66.22562408447266, "learning_rate": 1.457822414262914e-06, "loss": 0.3516, "step": 124750 }, { "epoch": 4.8581104753007125, "grad_norm": 9.783712387084961, "learning_rate": 1.4383588306278954e-06, "loss": 0.4423, "step": 124800 }, { "epoch": 4.860056833664214, "grad_norm": 0.3905586302280426, "learning_rate": 1.4188952469928763e-06, "loss": 0.3364, "step": 124850 }, { "epoch": 4.862003192027716, "grad_norm": 22.273876190185547, "learning_rate": 1.3994316633578575e-06, "loss": 0.424, "step": 124900 }, { "epoch": 4.863949550391218, "grad_norm": 10.863186836242676, "learning_rate": 1.3799680797228387e-06, "loss": 0.4089, "step": 124950 }, { "epoch": 
4.86589590875472, "grad_norm": 13.685665130615234, "learning_rate": 1.3605044960878197e-06, "loss": 0.4586, "step": 125000 }, { "epoch": 4.867842267118222, "grad_norm": 0.8886768221855164, "learning_rate": 1.3410409124528009e-06, "loss": 0.3672, "step": 125050 }, { "epoch": 4.869788625481724, "grad_norm": 13.353598594665527, "learning_rate": 1.3215773288177818e-06, "loss": 0.4942, "step": 125100 }, { "epoch": 4.871734983845226, "grad_norm": 11.301355361938477, "learning_rate": 1.3021137451827632e-06, "loss": 0.4489, "step": 125150 }, { "epoch": 4.8736813422087275, "grad_norm": 17.975528717041016, "learning_rate": 1.2826501615477442e-06, "loss": 0.4552, "step": 125200 }, { "epoch": 4.875627700572229, "grad_norm": 45.66489028930664, "learning_rate": 1.2631865779127254e-06, "loss": 0.414, "step": 125250 }, { "epoch": 4.877574058935731, "grad_norm": 17.926633834838867, "learning_rate": 1.2437229942777066e-06, "loss": 0.3967, "step": 125300 }, { "epoch": 4.879520417299233, "grad_norm": 8.491384506225586, "learning_rate": 1.2242594106426875e-06, "loss": 0.5302, "step": 125350 }, { "epoch": 4.8814667756627355, "grad_norm": 4.757686614990234, "learning_rate": 1.2047958270076687e-06, "loss": 0.4784, "step": 125400 }, { "epoch": 4.883413134026237, "grad_norm": 7.40605354309082, "learning_rate": 1.1853322433726497e-06, "loss": 0.3797, "step": 125450 }, { "epoch": 4.885359492389739, "grad_norm": 7.894580841064453, "learning_rate": 1.165868659737631e-06, "loss": 0.4923, "step": 125500 }, { "epoch": 4.887305850753241, "grad_norm": 128.11376953125, "learning_rate": 1.146405076102612e-06, "loss": 0.4197, "step": 125550 }, { "epoch": 4.8892522091167425, "grad_norm": 45.86446762084961, "learning_rate": 1.1269414924675932e-06, "loss": 0.3638, "step": 125600 }, { "epoch": 4.891198567480244, "grad_norm": 5.545990943908691, "learning_rate": 1.1074779088325742e-06, "loss": 0.3546, "step": 125650 }, { "epoch": 4.893144925843746, "grad_norm": 7.620852947235107, "learning_rate": 
1.0880143251975554e-06, "loss": 0.4295, "step": 125700 }, { "epoch": 4.895091284207249, "grad_norm": 16.6533145904541, "learning_rate": 1.0685507415625366e-06, "loss": 0.3875, "step": 125750 }, { "epoch": 4.8970376425707505, "grad_norm": 4.389069080352783, "learning_rate": 1.0490871579275177e-06, "loss": 0.3895, "step": 125800 }, { "epoch": 4.898984000934252, "grad_norm": 22.765064239501953, "learning_rate": 1.029623574292499e-06, "loss": 0.4055, "step": 125850 }, { "epoch": 4.900930359297754, "grad_norm": 13.823589324951172, "learning_rate": 1.01015999065748e-06, "loss": 0.3472, "step": 125900 }, { "epoch": 4.902876717661256, "grad_norm": 1.197481632232666, "learning_rate": 9.90696407022461e-07, "loss": 0.4323, "step": 125950 }, { "epoch": 4.9048230760247575, "grad_norm": 26.19270896911621, "learning_rate": 9.71232823387442e-07, "loss": 0.3731, "step": 126000 }, { "epoch": 4.906769434388259, "grad_norm": 19.2567081451416, "learning_rate": 9.517692397524232e-07, "loss": 0.3963, "step": 126050 }, { "epoch": 4.908715792751762, "grad_norm": 31.530736923217773, "learning_rate": 9.323056561174043e-07, "loss": 0.3679, "step": 126100 }, { "epoch": 4.910662151115264, "grad_norm": 11.973395347595215, "learning_rate": 9.128420724823856e-07, "loss": 0.4387, "step": 126150 }, { "epoch": 4.9126085094787655, "grad_norm": 69.86974334716797, "learning_rate": 8.933784888473667e-07, "loss": 0.3698, "step": 126200 }, { "epoch": 4.914554867842267, "grad_norm": 0.3999830186367035, "learning_rate": 8.739149052123477e-07, "loss": 0.3145, "step": 126250 }, { "epoch": 4.916501226205769, "grad_norm": 10.723855018615723, "learning_rate": 8.548405932500293e-07, "loss": 0.4395, "step": 126300 }, { "epoch": 4.918447584569271, "grad_norm": 13.284255981445312, "learning_rate": 8.353770096150104e-07, "loss": 0.4125, "step": 126350 }, { "epoch": 4.9203939429327725, "grad_norm": 34.34794616699219, "learning_rate": 8.159134259799915e-07, "loss": 0.4041, "step": 126400 }, { "epoch": 4.922340301296274, 
"grad_norm": 1.6894426345825195, "learning_rate": 7.964498423449726e-07, "loss": 0.4726, "step": 126450 }, { "epoch": 4.924286659659776, "grad_norm": 13.036922454833984, "learning_rate": 7.769862587099537e-07, "loss": 0.4324, "step": 126500 }, { "epoch": 4.926233018023279, "grad_norm": 9.402647972106934, "learning_rate": 7.575226750749348e-07, "loss": 0.4815, "step": 126550 }, { "epoch": 4.9281793763867805, "grad_norm": 0.96004718542099, "learning_rate": 7.380590914399159e-07, "loss": 0.4619, "step": 126600 }, { "epoch": 4.930125734750282, "grad_norm": 9.593693733215332, "learning_rate": 7.185955078048971e-07, "loss": 0.4637, "step": 126650 }, { "epoch": 4.932072093113784, "grad_norm": 65.5577163696289, "learning_rate": 6.991319241698783e-07, "loss": 0.4701, "step": 126700 }, { "epoch": 4.934018451477286, "grad_norm": 12.65235710144043, "learning_rate": 6.796683405348593e-07, "loss": 0.4035, "step": 126750 }, { "epoch": 4.935964809840788, "grad_norm": 4.11557674407959, "learning_rate": 6.602047568998404e-07, "loss": 0.4196, "step": 126800 }, { "epoch": 4.937911168204289, "grad_norm": 76.83570861816406, "learning_rate": 6.407411732648216e-07, "loss": 0.3559, "step": 126850 }, { "epoch": 4.939857526567792, "grad_norm": 1.1314365863800049, "learning_rate": 6.212775896298027e-07, "loss": 0.3658, "step": 126900 }, { "epoch": 4.941803884931294, "grad_norm": 19.99557113647461, "learning_rate": 6.018140059947837e-07, "loss": 0.4334, "step": 126950 }, { "epoch": 4.9437502432947955, "grad_norm": 13.78470516204834, "learning_rate": 5.823504223597648e-07, "loss": 0.4825, "step": 127000 }, { "epoch": 4.945696601658297, "grad_norm": 21.23024559020996, "learning_rate": 5.62886838724746e-07, "loss": 0.3686, "step": 127050 }, { "epoch": 4.947642960021799, "grad_norm": 8.86468505859375, "learning_rate": 5.434232550897272e-07, "loss": 0.3479, "step": 127100 }, { "epoch": 4.949589318385301, "grad_norm": 56.35804748535156, "learning_rate": 5.239596714547083e-07, "loss": 0.2984, "step": 
127150 }, { "epoch": 4.951535676748803, "grad_norm": 25.208290100097656, "learning_rate": 5.044960878196894e-07, "loss": 0.5106, "step": 127200 }, { "epoch": 4.953482035112305, "grad_norm": 0.950842022895813, "learning_rate": 4.850325041846705e-07, "loss": 0.3828, "step": 127250 }, { "epoch": 4.955428393475807, "grad_norm": 37.89137649536133, "learning_rate": 4.655689205496516e-07, "loss": 0.4537, "step": 127300 }, { "epoch": 4.957374751839309, "grad_norm": 36.720664978027344, "learning_rate": 4.461053369146328e-07, "loss": 0.5337, "step": 127350 }, { "epoch": 4.9593211102028105, "grad_norm": 18.910829544067383, "learning_rate": 4.2664175327961385e-07, "loss": 0.4493, "step": 127400 }, { "epoch": 4.961267468566312, "grad_norm": 11.041605949401855, "learning_rate": 4.07178169644595e-07, "loss": 0.4165, "step": 127450 }, { "epoch": 4.963213826929814, "grad_norm": 13.640743255615234, "learning_rate": 3.877145860095761e-07, "loss": 0.4026, "step": 127500 }, { "epoch": 4.965160185293316, "grad_norm": 0.4687836468219757, "learning_rate": 3.682510023745572e-07, "loss": 0.4568, "step": 127550 }, { "epoch": 4.9671065436568185, "grad_norm": 17.057714462280273, "learning_rate": 3.487874187395383e-07, "loss": 0.3976, "step": 127600 }, { "epoch": 4.96905290202032, "grad_norm": 48.19541549682617, "learning_rate": 3.293238351045195e-07, "loss": 0.3755, "step": 127650 }, { "epoch": 4.970999260383822, "grad_norm": 31.50641441345215, "learning_rate": 3.098602514695006e-07, "loss": 0.3472, "step": 127700 }, { "epoch": 4.972945618747324, "grad_norm": 64.74296569824219, "learning_rate": 2.903966678344817e-07, "loss": 0.4049, "step": 127750 }, { "epoch": 4.9748919771108255, "grad_norm": 24.63729476928711, "learning_rate": 2.709330841994628e-07, "loss": 0.5423, "step": 127800 }, { "epoch": 4.976838335474327, "grad_norm": 13.280863761901855, "learning_rate": 2.5146950056444396e-07, "loss": 0.4979, "step": 127850 }, { "epoch": 4.978784693837829, "grad_norm": 43.51626205444336, 
"learning_rate": 2.3200591692942504e-07, "loss": 0.3503, "step": 127900 }, { "epoch": 4.980731052201332, "grad_norm": 9.726184844970703, "learning_rate": 2.1254233329440617e-07, "loss": 0.4169, "step": 127950 }, { "epoch": 4.9826774105648335, "grad_norm": 22.772802352905273, "learning_rate": 1.930787496593873e-07, "loss": 0.3871, "step": 128000 }, { "epoch": 4.984623768928335, "grad_norm": 26.103466033935547, "learning_rate": 1.7361516602436843e-07, "loss": 0.3679, "step": 128050 }, { "epoch": 4.986570127291837, "grad_norm": 7.0824503898620605, "learning_rate": 1.5415158238934953e-07, "loss": 0.4003, "step": 128100 }, { "epoch": 4.988516485655339, "grad_norm": 0.5014774203300476, "learning_rate": 1.3468799875433066e-07, "loss": 0.4234, "step": 128150 }, { "epoch": 4.9904628440188405, "grad_norm": 41.7393913269043, "learning_rate": 1.1522441511931177e-07, "loss": 0.4426, "step": 128200 }, { "epoch": 4.992409202382342, "grad_norm": 29.443391799926758, "learning_rate": 9.576083148429289e-08, "loss": 0.4363, "step": 128250 }, { "epoch": 4.994355560745845, "grad_norm": 26.583566665649414, "learning_rate": 7.629724784927402e-08, "loss": 0.3978, "step": 128300 }, { "epoch": 4.996301919109347, "grad_norm": 3.6772215366363525, "learning_rate": 5.683366421425513e-08, "loss": 0.3433, "step": 128350 }, { "epoch": 4.9982482774728485, "grad_norm": 52.81440734863281, "learning_rate": 3.737008057923625e-08, "loss": 0.308, "step": 128400 }, { "epoch": 5.0, "eval_accuracy": 0.80415742146444, "eval_f1_macro": 0.7620613064857661, "eval_f1_weighted": 0.8029689974239149, "eval_loss": 0.700664222240448, "eval_roc_auc": 0.9564141840257689, "eval_runtime": 30.1314, "eval_samples_per_second": 852.566, "eval_steps_per_second": 106.6, "step": 128445 }, { "epoch": 5.00019463583635, "grad_norm": 32.244842529296875, "learning_rate": 2.5009147884308458e-05, "loss": 0.2515, "step": 128450 }, { "epoch": 5.002140994199852, "grad_norm": 2.572877883911133, "learning_rate": 2.4999610728327298e-05, 
"loss": 0.2866, "step": 128500 }, { "epoch": 5.004087352563354, "grad_norm": 43.19818115234375, "learning_rate": 2.4990073572346142e-05, "loss": 0.5145, "step": 128550 }, { "epoch": 5.006033710926856, "grad_norm": 14.287704467773438, "learning_rate": 2.498034178052863e-05, "loss": 0.4418, "step": 128600 }, { "epoch": 5.007980069290357, "grad_norm": 2.7757139205932617, "learning_rate": 2.4970609988711123e-05, "loss": 0.433, "step": 128650 }, { "epoch": 5.00992642765386, "grad_norm": 5.892545223236084, "learning_rate": 2.4960878196893615e-05, "loss": 0.4341, "step": 128700 }, { "epoch": 5.011872786017362, "grad_norm": 3.485265016555786, "learning_rate": 2.4951146405076104e-05, "loss": 0.5117, "step": 128750 }, { "epoch": 5.0138191443808635, "grad_norm": 26.276748657226562, "learning_rate": 2.4941414613258596e-05, "loss": 0.3938, "step": 128800 }, { "epoch": 5.015765502744365, "grad_norm": 9.817642211914062, "learning_rate": 2.4931682821441085e-05, "loss": 0.342, "step": 128850 }, { "epoch": 5.017711861107867, "grad_norm": 15.044334411621094, "learning_rate": 2.4922145665459925e-05, "loss": 0.4418, "step": 128900 }, { "epoch": 5.019658219471369, "grad_norm": 10.00013542175293, "learning_rate": 2.4912413873642414e-05, "loss": 0.4046, "step": 128950 }, { "epoch": 5.021604577834871, "grad_norm": 8.49260425567627, "learning_rate": 2.4902682081824906e-05, "loss": 0.4184, "step": 129000 }, { "epoch": 5.023550936198373, "grad_norm": 5.64994478225708, "learning_rate": 2.4892950290007398e-05, "loss": 0.4514, "step": 129050 }, { "epoch": 5.025497294561875, "grad_norm": 4.6553568840026855, "learning_rate": 2.488321849818989e-05, "loss": 0.4143, "step": 129100 }, { "epoch": 5.027443652925377, "grad_norm": 13.674705505371094, "learning_rate": 2.487348670637238e-05, "loss": 0.4264, "step": 129150 }, { "epoch": 5.0293900112888785, "grad_norm": 6.169399261474609, "learning_rate": 2.4863754914554868e-05, "loss": 0.4152, "step": 129200 }, { "epoch": 5.03133636965238, "grad_norm": 
24.208412170410156, "learning_rate": 2.485402312273736e-05, "loss": 0.3801, "step": 129250 }, { "epoch": 5.033282728015882, "grad_norm": 40.63991165161133, "learning_rate": 2.484429133091985e-05, "loss": 0.3952, "step": 129300 }, { "epoch": 5.035229086379384, "grad_norm": 10.848690032958984, "learning_rate": 2.483455953910234e-05, "loss": 0.3615, "step": 129350 }, { "epoch": 5.0371754447428865, "grad_norm": 6.471902847290039, "learning_rate": 2.4824827747284833e-05, "loss": 0.4372, "step": 129400 }, { "epoch": 5.039121803106388, "grad_norm": 38.90018844604492, "learning_rate": 2.4815095955467322e-05, "loss": 0.4653, "step": 129450 }, { "epoch": 5.04106816146989, "grad_norm": 47.79963684082031, "learning_rate": 2.4805364163649814e-05, "loss": 0.4331, "step": 129500 }, { "epoch": 5.043014519833392, "grad_norm": 5.177999973297119, "learning_rate": 2.4795632371832303e-05, "loss": 0.3614, "step": 129550 }, { "epoch": 5.0449608781968935, "grad_norm": 3.5848968029022217, "learning_rate": 2.4785900580014792e-05, "loss": 0.3849, "step": 129600 }, { "epoch": 5.046907236560395, "grad_norm": 15.575108528137207, "learning_rate": 2.4776363424033636e-05, "loss": 0.4312, "step": 129650 }, { "epoch": 5.048853594923897, "grad_norm": 0.590552806854248, "learning_rate": 2.4766631632216124e-05, "loss": 0.3875, "step": 129700 }, { "epoch": 5.050799953287399, "grad_norm": 11.805408477783203, "learning_rate": 2.4756899840398613e-05, "loss": 0.3942, "step": 129750 }, { "epoch": 5.0527463116509015, "grad_norm": 19.866626739501953, "learning_rate": 2.4747168048581105e-05, "loss": 0.43, "step": 129800 }, { "epoch": 5.054692670014403, "grad_norm": 9.254434585571289, "learning_rate": 2.4737436256763598e-05, "loss": 0.4287, "step": 129850 }, { "epoch": 5.056639028377905, "grad_norm": 12.446402549743652, "learning_rate": 2.4727704464946086e-05, "loss": 0.5212, "step": 129900 }, { "epoch": 5.058585386741407, "grad_norm": 13.825610160827637, "learning_rate": 2.471797267312858e-05, "loss": 0.4398, 
"step": 129950 }, { "epoch": 5.0605317451049086, "grad_norm": 12.471844673156738, "learning_rate": 2.4708240881311067e-05, "loss": 0.3972, "step": 130000 }, { "epoch": 5.06247810346841, "grad_norm": 17.916723251342773, "learning_rate": 2.469850908949356e-05, "loss": 0.4279, "step": 130050 }, { "epoch": 5.064424461831912, "grad_norm": 20.63932228088379, "learning_rate": 2.4688777297676048e-05, "loss": 0.3861, "step": 130100 }, { "epoch": 5.066370820195415, "grad_norm": 6.9153571128845215, "learning_rate": 2.467904550585854e-05, "loss": 0.4805, "step": 130150 }, { "epoch": 5.0683171785589165, "grad_norm": 11.21633243560791, "learning_rate": 2.4669313714041033e-05, "loss": 0.4752, "step": 130200 }, { "epoch": 5.070263536922418, "grad_norm": 45.860294342041016, "learning_rate": 2.465958192222352e-05, "loss": 0.4194, "step": 130250 }, { "epoch": 5.07220989528592, "grad_norm": 66.19770812988281, "learning_rate": 2.464985013040601e-05, "loss": 0.4718, "step": 130300 }, { "epoch": 5.074156253649422, "grad_norm": 12.467779159545898, "learning_rate": 2.4640118338588502e-05, "loss": 0.4552, "step": 130350 }, { "epoch": 5.076102612012924, "grad_norm": 12.51685619354248, "learning_rate": 2.463038654677099e-05, "loss": 0.4414, "step": 130400 }, { "epoch": 5.078048970376425, "grad_norm": 26.430803298950195, "learning_rate": 2.4620654754953483e-05, "loss": 0.4034, "step": 130450 }, { "epoch": 5.079995328739928, "grad_norm": 1.0972925424575806, "learning_rate": 2.4610922963135976e-05, "loss": 0.4567, "step": 130500 }, { "epoch": 5.08194168710343, "grad_norm": 7.489884376525879, "learning_rate": 2.4601191171318464e-05, "loss": 0.4273, "step": 130550 }, { "epoch": 5.0838880454669315, "grad_norm": 11.996527671813965, "learning_rate": 2.4591459379500957e-05, "loss": 0.4079, "step": 130600 }, { "epoch": 5.085834403830433, "grad_norm": 41.55928039550781, "learning_rate": 2.4581727587683445e-05, "loss": 0.4362, "step": 130650 }, { "epoch": 5.087780762193935, "grad_norm": 
5.6895880699157715, "learning_rate": 2.4571995795865934e-05, "loss": 0.4845, "step": 130700 }, { "epoch": 5.089727120557437, "grad_norm": 10.405617713928223, "learning_rate": 2.4562264004048426e-05, "loss": 0.5876, "step": 130750 }, { "epoch": 5.091673478920939, "grad_norm": 11.361498832702637, "learning_rate": 2.4552532212230915e-05, "loss": 0.4003, "step": 130800 }, { "epoch": 5.093619837284441, "grad_norm": 11.009017944335938, "learning_rate": 2.4542800420413407e-05, "loss": 0.4977, "step": 130850 }, { "epoch": 5.095566195647943, "grad_norm": 8.10717487335205, "learning_rate": 2.45330686285959e-05, "loss": 0.4702, "step": 130900 }, { "epoch": 5.097512554011445, "grad_norm": 12.66999626159668, "learning_rate": 2.4523336836778388e-05, "loss": 0.4799, "step": 130950 }, { "epoch": 5.0994589123749465, "grad_norm": 26.93848991394043, "learning_rate": 2.451360504496088e-05, "loss": 0.4037, "step": 131000 }, { "epoch": 5.101405270738448, "grad_norm": 31.64338493347168, "learning_rate": 2.450387325314337e-05, "loss": 0.3796, "step": 131050 }, { "epoch": 5.10335162910195, "grad_norm": 22.900121688842773, "learning_rate": 2.449414146132586e-05, "loss": 0.6115, "step": 131100 }, { "epoch": 5.105297987465452, "grad_norm": 15.333900451660156, "learning_rate": 2.448440966950835e-05, "loss": 0.4337, "step": 131150 }, { "epoch": 5.107244345828954, "grad_norm": 11.789368629455566, "learning_rate": 2.4474677877690842e-05, "loss": 0.4765, "step": 131200 }, { "epoch": 5.109190704192456, "grad_norm": 56.45655822753906, "learning_rate": 2.4464946085873335e-05, "loss": 0.5889, "step": 131250 }, { "epoch": 5.111137062555958, "grad_norm": 20.530410766601562, "learning_rate": 2.4455214294055823e-05, "loss": 0.4598, "step": 131300 }, { "epoch": 5.11308342091946, "grad_norm": 22.6951961517334, "learning_rate": 2.4445482502238312e-05, "loss": 0.4356, "step": 131350 }, { "epoch": 5.1150297792829615, "grad_norm": 11.24929141998291, "learning_rate": 2.4435750710420804e-05, "loss": 0.3685, 
"step": 131400 }, { "epoch": 5.116976137646463, "grad_norm": 9.874818801879883, "learning_rate": 2.4426018918603293e-05, "loss": 0.4781, "step": 131450 }, { "epoch": 5.118922496009965, "grad_norm": 8.340282440185547, "learning_rate": 2.4416287126785785e-05, "loss": 0.5203, "step": 131500 }, { "epoch": 5.120868854373467, "grad_norm": 28.66065216064453, "learning_rate": 2.4406555334968274e-05, "loss": 0.4409, "step": 131550 }, { "epoch": 5.1228152127369695, "grad_norm": 15.768098831176758, "learning_rate": 2.4396823543150766e-05, "loss": 0.4432, "step": 131600 }, { "epoch": 5.124761571100471, "grad_norm": 20.37438201904297, "learning_rate": 2.438709175133326e-05, "loss": 0.4813, "step": 131650 }, { "epoch": 5.126707929463973, "grad_norm": 4.350988864898682, "learning_rate": 2.43775545953521e-05, "loss": 0.4641, "step": 131700 }, { "epoch": 5.128654287827475, "grad_norm": 25.691415786743164, "learning_rate": 2.4367822803534588e-05, "loss": 0.5009, "step": 131750 }, { "epoch": 5.1306006461909766, "grad_norm": 20.466461181640625, "learning_rate": 2.435809101171708e-05, "loss": 0.4261, "step": 131800 }, { "epoch": 5.132547004554478, "grad_norm": 23.383852005004883, "learning_rate": 2.434835921989957e-05, "loss": 0.5488, "step": 131850 }, { "epoch": 5.13449336291798, "grad_norm": 4.308971881866455, "learning_rate": 2.4338627428082057e-05, "loss": 0.4681, "step": 131900 }, { "epoch": 5.136439721281483, "grad_norm": 13.661361694335938, "learning_rate": 2.432889563626455e-05, "loss": 0.4045, "step": 131950 }, { "epoch": 5.1383860796449845, "grad_norm": 34.34772491455078, "learning_rate": 2.4319163844447042e-05, "loss": 0.4265, "step": 132000 }, { "epoch": 5.140332438008486, "grad_norm": 7.264039039611816, "learning_rate": 2.430943205262953e-05, "loss": 0.5024, "step": 132050 }, { "epoch": 5.142278796371988, "grad_norm": 7.489038944244385, "learning_rate": 2.4299700260812023e-05, "loss": 0.4202, "step": 132100 }, { "epoch": 5.14422515473549, "grad_norm": 32.83377456665039, 
"learning_rate": 2.428996846899451e-05, "loss": 0.5013, "step": 132150 }, { "epoch": 5.146171513098992, "grad_norm": 8.578951835632324, "learning_rate": 2.4280236677177004e-05, "loss": 0.518, "step": 132200 }, { "epoch": 5.148117871462493, "grad_norm": 10.828824043273926, "learning_rate": 2.4270504885359492e-05, "loss": 0.4436, "step": 132250 }, { "epoch": 5.150064229825996, "grad_norm": 7.433048725128174, "learning_rate": 2.4260773093541985e-05, "loss": 0.3955, "step": 132300 }, { "epoch": 5.152010588189498, "grad_norm": 1.0247400999069214, "learning_rate": 2.4251041301724477e-05, "loss": 0.4668, "step": 132350 }, { "epoch": 5.1539569465529995, "grad_norm": 44.80519485473633, "learning_rate": 2.4241309509906966e-05, "loss": 0.5517, "step": 132400 }, { "epoch": 5.155903304916501, "grad_norm": 1.9640061855316162, "learning_rate": 2.4231577718089458e-05, "loss": 0.4831, "step": 132450 }, { "epoch": 5.157849663280003, "grad_norm": 5.587687015533447, "learning_rate": 2.4221845926271947e-05, "loss": 0.3573, "step": 132500 }, { "epoch": 5.159796021643505, "grad_norm": 86.47053527832031, "learning_rate": 2.4212114134454435e-05, "loss": 0.5362, "step": 132550 }, { "epoch": 5.161742380007007, "grad_norm": 33.415199279785156, "learning_rate": 2.4202382342636928e-05, "loss": 0.3466, "step": 132600 }, { "epoch": 5.163688738370508, "grad_norm": 4.576045513153076, "learning_rate": 2.4192650550819416e-05, "loss": 0.39, "step": 132650 }, { "epoch": 5.165635096734011, "grad_norm": 51.19283676147461, "learning_rate": 2.418291875900191e-05, "loss": 0.5067, "step": 132700 }, { "epoch": 5.167581455097513, "grad_norm": 112.53578186035156, "learning_rate": 2.41731869671844e-05, "loss": 0.4088, "step": 132750 }, { "epoch": 5.1695278134610145, "grad_norm": 33.01721954345703, "learning_rate": 2.416345517536689e-05, "loss": 0.5846, "step": 132800 }, { "epoch": 5.171474171824516, "grad_norm": 13.479867935180664, "learning_rate": 2.415372338354938e-05, "loss": 0.4009, "step": 132850 }, { 
"epoch": 5.173420530188018, "grad_norm": 68.3298568725586, "learning_rate": 2.414399159173187e-05, "loss": 0.4126, "step": 132900 }, { "epoch": 5.17536688855152, "grad_norm": 13.323561668395996, "learning_rate": 2.413425979991436e-05, "loss": 0.3624, "step": 132950 }, { "epoch": 5.177313246915022, "grad_norm": 6.477640628814697, "learning_rate": 2.412452800809685e-05, "loss": 0.4574, "step": 133000 }, { "epoch": 5.179259605278524, "grad_norm": 21.69005012512207, "learning_rate": 2.4114796216279344e-05, "loss": 0.4712, "step": 133050 }, { "epoch": 5.181205963642026, "grad_norm": 99.2454833984375, "learning_rate": 2.4105064424461832e-05, "loss": 0.4745, "step": 133100 }, { "epoch": 5.183152322005528, "grad_norm": 9.05982780456543, "learning_rate": 2.4095332632644325e-05, "loss": 0.5154, "step": 133150 }, { "epoch": 5.1850986803690295, "grad_norm": 9.585458755493164, "learning_rate": 2.4085600840826813e-05, "loss": 0.4689, "step": 133200 }, { "epoch": 5.187045038732531, "grad_norm": 57.596473693847656, "learning_rate": 2.4075869049009306e-05, "loss": 0.4993, "step": 133250 }, { "epoch": 5.188991397096033, "grad_norm": 6.516857147216797, "learning_rate": 2.4066137257191794e-05, "loss": 0.475, "step": 133300 }, { "epoch": 5.190937755459535, "grad_norm": 24.806440353393555, "learning_rate": 2.4056405465374286e-05, "loss": 0.5873, "step": 133350 }, { "epoch": 5.1928841138230375, "grad_norm": 21.148094177246094, "learning_rate": 2.404667367355678e-05, "loss": 0.4235, "step": 133400 }, { "epoch": 5.194830472186539, "grad_norm": 19.58844566345215, "learning_rate": 2.403713651757562e-05, "loss": 0.4336, "step": 133450 }, { "epoch": 5.196776830550041, "grad_norm": 18.382402420043945, "learning_rate": 2.402759936159446e-05, "loss": 0.5446, "step": 133500 }, { "epoch": 5.198723188913543, "grad_norm": 16.026710510253906, "learning_rate": 2.4017867569776948e-05, "loss": 0.4223, "step": 133550 }, { "epoch": 5.200669547277045, "grad_norm": 21.583370208740234, "learning_rate": 
2.400813577795944e-05, "loss": 0.518, "step": 133600 }, { "epoch": 5.202615905640546, "grad_norm": 27.97723388671875, "learning_rate": 2.399840398614193e-05, "loss": 0.4674, "step": 133650 }, { "epoch": 5.204562264004048, "grad_norm": 31.915922164916992, "learning_rate": 2.398867219432442e-05, "loss": 0.47, "step": 133700 }, { "epoch": 5.206508622367551, "grad_norm": 47.71385192871094, "learning_rate": 2.397894040250691e-05, "loss": 0.496, "step": 133750 }, { "epoch": 5.2084549807310525, "grad_norm": 21.37966537475586, "learning_rate": 2.396940324652575e-05, "loss": 0.4416, "step": 133800 }, { "epoch": 5.210401339094554, "grad_norm": 11.356681823730469, "learning_rate": 2.395967145470824e-05, "loss": 0.4408, "step": 133850 }, { "epoch": 5.212347697458056, "grad_norm": 32.94057083129883, "learning_rate": 2.394993966289073e-05, "loss": 0.4753, "step": 133900 }, { "epoch": 5.214294055821558, "grad_norm": 15.341704368591309, "learning_rate": 2.3940207871073224e-05, "loss": 0.46, "step": 133950 }, { "epoch": 5.21624041418506, "grad_norm": 10.36394214630127, "learning_rate": 2.3930476079255716e-05, "loss": 0.616, "step": 134000 }, { "epoch": 5.218186772548561, "grad_norm": 9.221391677856445, "learning_rate": 2.3920744287438205e-05, "loss": 0.4193, "step": 134050 }, { "epoch": 5.220133130912063, "grad_norm": 16.520732879638672, "learning_rate": 2.3911012495620693e-05, "loss": 0.5879, "step": 134100 }, { "epoch": 5.222079489275566, "grad_norm": 2.7991585731506348, "learning_rate": 2.3901280703803186e-05, "loss": 0.5615, "step": 134150 }, { "epoch": 5.2240258476390675, "grad_norm": 15.630522727966309, "learning_rate": 2.3891548911985674e-05, "loss": 0.5624, "step": 134200 }, { "epoch": 5.225972206002569, "grad_norm": 33.98981857299805, "learning_rate": 2.3881817120168167e-05, "loss": 0.4777, "step": 134250 }, { "epoch": 5.227918564366071, "grad_norm": 6.848458766937256, "learning_rate": 2.387208532835066e-05, "loss": 0.4273, "step": 134300 }, { "epoch": 5.229864922729573, 
"grad_norm": 167.7740936279297, "learning_rate": 2.3862353536533148e-05, "loss": 0.4679, "step": 134350 }, { "epoch": 5.231811281093075, "grad_norm": 54.83897399902344, "learning_rate": 2.385262174471564e-05, "loss": 0.5014, "step": 134400 }, { "epoch": 5.233757639456576, "grad_norm": 15.81943130493164, "learning_rate": 2.384288995289813e-05, "loss": 0.4402, "step": 134450 }, { "epoch": 5.235703997820079, "grad_norm": 53.38258743286133, "learning_rate": 2.3833158161080617e-05, "loss": 0.4298, "step": 134500 }, { "epoch": 5.237650356183581, "grad_norm": 38.46998596191406, "learning_rate": 2.382342636926311e-05, "loss": 0.52, "step": 134550 }, { "epoch": 5.2395967145470825, "grad_norm": 8.483528137207031, "learning_rate": 2.3813694577445598e-05, "loss": 0.3949, "step": 134600 }, { "epoch": 5.241543072910584, "grad_norm": 28.725940704345703, "learning_rate": 2.3803962785628094e-05, "loss": 0.4046, "step": 134650 }, { "epoch": 5.243489431274086, "grad_norm": 19.22784423828125, "learning_rate": 2.3794230993810583e-05, "loss": 0.557, "step": 134700 }, { "epoch": 5.245435789637588, "grad_norm": 22.76089096069336, "learning_rate": 2.378449920199307e-05, "loss": 0.4475, "step": 134750 }, { "epoch": 5.24738214800109, "grad_norm": 18.847484588623047, "learning_rate": 2.3774767410175564e-05, "loss": 0.4149, "step": 134800 }, { "epoch": 5.249328506364592, "grad_norm": 10.882554054260254, "learning_rate": 2.3765035618358052e-05, "loss": 0.4709, "step": 134850 }, { "epoch": 5.251274864728094, "grad_norm": 15.885342597961426, "learning_rate": 2.375530382654054e-05, "loss": 0.5111, "step": 134900 }, { "epoch": 5.253221223091596, "grad_norm": 13.503155708312988, "learning_rate": 2.3745572034723033e-05, "loss": 0.4046, "step": 134950 }, { "epoch": 5.2551675814550975, "grad_norm": 29.062397003173828, "learning_rate": 2.3735840242905526e-05, "loss": 0.4874, "step": 135000 }, { "epoch": 5.257113939818599, "grad_norm": 13.92009449005127, "learning_rate": 2.3726108451088018e-05, "loss": 
0.4507, "step": 135050 }, { "epoch": 5.259060298182101, "grad_norm": 21.107391357421875, "learning_rate": 2.3716376659270507e-05, "loss": 0.536, "step": 135100 }, { "epoch": 5.261006656545603, "grad_norm": 9.6906156539917, "learning_rate": 2.3706644867452995e-05, "loss": 0.4318, "step": 135150 }, { "epoch": 5.262953014909105, "grad_norm": 16.5903263092041, "learning_rate": 2.3696913075635487e-05, "loss": 0.5219, "step": 135200 }, { "epoch": 5.264899373272607, "grad_norm": 52.737117767333984, "learning_rate": 2.3687181283817976e-05, "loss": 0.5142, "step": 135250 }, { "epoch": 5.266845731636109, "grad_norm": 9.189010620117188, "learning_rate": 2.367744949200047e-05, "loss": 0.426, "step": 135300 }, { "epoch": 5.268792089999611, "grad_norm": 27.08891487121582, "learning_rate": 2.366771770018296e-05, "loss": 0.5204, "step": 135350 }, { "epoch": 5.270738448363113, "grad_norm": 73.86074829101562, "learning_rate": 2.365798590836545e-05, "loss": 0.5214, "step": 135400 }, { "epoch": 5.272684806726614, "grad_norm": 4.7774529457092285, "learning_rate": 2.364825411654794e-05, "loss": 0.4934, "step": 135450 }, { "epoch": 5.274631165090116, "grad_norm": 8.61899471282959, "learning_rate": 2.363852232473043e-05, "loss": 0.4413, "step": 135500 }, { "epoch": 5.276577523453618, "grad_norm": 9.450860977172852, "learning_rate": 2.362879053291292e-05, "loss": 0.4148, "step": 135550 }, { "epoch": 5.2785238818171205, "grad_norm": 49.48708724975586, "learning_rate": 2.361905874109541e-05, "loss": 0.5493, "step": 135600 }, { "epoch": 5.280470240180622, "grad_norm": 33.04400634765625, "learning_rate": 2.36093269492779e-05, "loss": 0.4875, "step": 135650 }, { "epoch": 5.282416598544124, "grad_norm": 1.4621058702468872, "learning_rate": 2.3599595157460396e-05, "loss": 0.5434, "step": 135700 }, { "epoch": 5.284362956907626, "grad_norm": 49.517234802246094, "learning_rate": 2.3589863365642885e-05, "loss": 0.4503, "step": 135750 }, { "epoch": 5.286309315271128, "grad_norm": 11.301891326904297, 
"learning_rate": 2.3580131573825373e-05, "loss": 0.4885, "step": 135800 }, { "epoch": 5.288255673634629, "grad_norm": 16.952756881713867, "learning_rate": 2.3570399782007865e-05, "loss": 0.548, "step": 135850 }, { "epoch": 5.290202031998131, "grad_norm": 70.54737091064453, "learning_rate": 2.3560667990190354e-05, "loss": 0.616, "step": 135900 }, { "epoch": 5.292148390361634, "grad_norm": 105.03553009033203, "learning_rate": 2.3550936198372843e-05, "loss": 0.4302, "step": 135950 }, { "epoch": 5.2940947487251355, "grad_norm": 12.73370361328125, "learning_rate": 2.3541204406555335e-05, "loss": 0.4888, "step": 136000 }, { "epoch": 5.296041107088637, "grad_norm": 9.52304458618164, "learning_rate": 2.3531472614737827e-05, "loss": 0.5169, "step": 136050 }, { "epoch": 5.297987465452139, "grad_norm": 138.459716796875, "learning_rate": 2.352174082292032e-05, "loss": 0.6568, "step": 136100 }, { "epoch": 5.299933823815641, "grad_norm": 24.51439666748047, "learning_rate": 2.351220366693916e-05, "loss": 0.488, "step": 136150 }, { "epoch": 5.301880182179143, "grad_norm": 25.098203659057617, "learning_rate": 2.350247187512165e-05, "loss": 0.4905, "step": 136200 }, { "epoch": 5.303826540542644, "grad_norm": 18.924976348876953, "learning_rate": 2.3492740083304138e-05, "loss": 0.5047, "step": 136250 }, { "epoch": 5.305772898906147, "grad_norm": 19.40249252319336, "learning_rate": 2.348300829148663e-05, "loss": 0.5231, "step": 136300 }, { "epoch": 5.307719257269649, "grad_norm": 6.543251991271973, "learning_rate": 2.347327649966912e-05, "loss": 0.4183, "step": 136350 }, { "epoch": 5.3096656156331505, "grad_norm": 19.86079978942871, "learning_rate": 2.346354470785161e-05, "loss": 0.4699, "step": 136400 }, { "epoch": 5.311611973996652, "grad_norm": 21.9833927154541, "learning_rate": 2.3453812916034103e-05, "loss": 0.4796, "step": 136450 }, { "epoch": 5.313558332360154, "grad_norm": 25.242565155029297, "learning_rate": 2.344408112421659e-05, "loss": 0.5039, "step": 136500 }, { "epoch": 
5.315504690723656, "grad_norm": 58.573402404785156, "learning_rate": 2.3434349332399084e-05, "loss": 0.5079, "step": 136550 }, { "epoch": 5.317451049087158, "grad_norm": 1.689626932144165, "learning_rate": 2.3424617540581573e-05, "loss": 0.4701, "step": 136600 }, { "epoch": 5.31939740745066, "grad_norm": 12.73671817779541, "learning_rate": 2.3414885748764065e-05, "loss": 0.5206, "step": 136650 }, { "epoch": 5.321343765814162, "grad_norm": 50.25852966308594, "learning_rate": 2.3405153956946554e-05, "loss": 0.4375, "step": 136700 }, { "epoch": 5.323290124177664, "grad_norm": 14.195947647094727, "learning_rate": 2.3395422165129042e-05, "loss": 0.5275, "step": 136750 }, { "epoch": 5.3252364825411656, "grad_norm": 23.931568145751953, "learning_rate": 2.3385690373311535e-05, "loss": 0.4538, "step": 136800 }, { "epoch": 5.327182840904667, "grad_norm": 26.153202056884766, "learning_rate": 2.3375958581494027e-05, "loss": 0.4969, "step": 136850 }, { "epoch": 5.329129199268169, "grad_norm": 10.462492942810059, "learning_rate": 2.3366226789676516e-05, "loss": 0.4535, "step": 136900 }, { "epoch": 5.331075557631671, "grad_norm": 60.40643310546875, "learning_rate": 2.3356494997859008e-05, "loss": 0.4887, "step": 136950 }, { "epoch": 5.3330219159951735, "grad_norm": 121.0730209350586, "learning_rate": 2.3346763206041497e-05, "loss": 0.4777, "step": 137000 }, { "epoch": 5.334968274358675, "grad_norm": 26.502639770507812, "learning_rate": 2.333703141422399e-05, "loss": 0.4997, "step": 137050 }, { "epoch": 5.336914632722177, "grad_norm": 24.860809326171875, "learning_rate": 2.3327299622406478e-05, "loss": 0.4216, "step": 137100 }, { "epoch": 5.338860991085679, "grad_norm": 13.166356086730957, "learning_rate": 2.331756783058897e-05, "loss": 0.5292, "step": 137150 }, { "epoch": 5.340807349449181, "grad_norm": 17.932907104492188, "learning_rate": 2.3307836038771462e-05, "loss": 0.5268, "step": 137200 }, { "epoch": 5.342753707812682, "grad_norm": 7.913403511047363, "learning_rate": 
2.329810424695395e-05, "loss": 0.4709, "step": 137250 }, { "epoch": 5.344700066176184, "grad_norm": 0.7377246022224426, "learning_rate": 2.328837245513644e-05, "loss": 0.4584, "step": 137300 }, { "epoch": 5.346646424539686, "grad_norm": 188.3112030029297, "learning_rate": 2.327864066331893e-05, "loss": 0.5107, "step": 137350 }, { "epoch": 5.3485927829031885, "grad_norm": 21.33624267578125, "learning_rate": 2.326890887150142e-05, "loss": 0.5378, "step": 137400 }, { "epoch": 5.35053914126669, "grad_norm": 4.419210910797119, "learning_rate": 2.3259177079683913e-05, "loss": 0.4901, "step": 137450 }, { "epoch": 5.352485499630192, "grad_norm": 0.5607961416244507, "learning_rate": 2.3249445287866405e-05, "loss": 0.5411, "step": 137500 }, { "epoch": 5.354431857993694, "grad_norm": 25.84388542175293, "learning_rate": 2.3239713496048894e-05, "loss": 0.4635, "step": 137550 }, { "epoch": 5.356378216357196, "grad_norm": 4.841454029083252, "learning_rate": 2.3229981704231386e-05, "loss": 0.4749, "step": 137600 }, { "epoch": 5.358324574720697, "grad_norm": 113.70428466796875, "learning_rate": 2.3220249912413875e-05, "loss": 0.5688, "step": 137650 }, { "epoch": 5.360270933084199, "grad_norm": 1.913034439086914, "learning_rate": 2.3210518120596363e-05, "loss": 0.4988, "step": 137700 }, { "epoch": 5.362217291447702, "grad_norm": 5.337471961975098, "learning_rate": 2.3200786328778856e-05, "loss": 0.5322, "step": 137750 }, { "epoch": 5.3641636498112035, "grad_norm": 6.988361358642578, "learning_rate": 2.3191054536961344e-05, "loss": 0.4708, "step": 137800 }, { "epoch": 5.366110008174705, "grad_norm": 16.58577537536621, "learning_rate": 2.3181322745143836e-05, "loss": 0.4767, "step": 137850 }, { "epoch": 5.368056366538207, "grad_norm": 119.39418029785156, "learning_rate": 2.317159095332633e-05, "loss": 0.4831, "step": 137900 }, { "epoch": 5.370002724901709, "grad_norm": 14.559215545654297, "learning_rate": 2.3161859161508817e-05, "loss": 0.4583, "step": 137950 }, { "epoch": 
5.371949083265211, "grad_norm": 7.479008674621582, "learning_rate": 2.315212736969131e-05, "loss": 0.4424, "step": 138000 }, { "epoch": 5.373895441628712, "grad_norm": 17.40250015258789, "learning_rate": 2.31423955778738e-05, "loss": 0.5349, "step": 138050 }, { "epoch": 5.375841799992214, "grad_norm": 31.0782470703125, "learning_rate": 2.313266378605629e-05, "loss": 0.4888, "step": 138100 }, { "epoch": 5.377788158355717, "grad_norm": 49.43034362792969, "learning_rate": 2.312293199423878e-05, "loss": 0.503, "step": 138150 }, { "epoch": 5.3797345167192185, "grad_norm": 19.146635055541992, "learning_rate": 2.311320020242127e-05, "loss": 0.4514, "step": 138200 }, { "epoch": 5.38168087508272, "grad_norm": 21.382286071777344, "learning_rate": 2.3103468410603764e-05, "loss": 0.4786, "step": 138250 }, { "epoch": 5.383627233446222, "grad_norm": 58.44831466674805, "learning_rate": 2.3093736618786253e-05, "loss": 0.434, "step": 138300 }, { "epoch": 5.385573591809724, "grad_norm": 5.201425552368164, "learning_rate": 2.308400482696874e-05, "loss": 0.4884, "step": 138350 }, { "epoch": 5.387519950173226, "grad_norm": 9.40095043182373, "learning_rate": 2.3074273035151234e-05, "loss": 0.4463, "step": 138400 }, { "epoch": 5.389466308536727, "grad_norm": 31.44985580444336, "learning_rate": 2.3064541243333722e-05, "loss": 0.4469, "step": 138450 }, { "epoch": 5.39141266690023, "grad_norm": 17.353946685791016, "learning_rate": 2.3054809451516214e-05, "loss": 0.435, "step": 138500 }, { "epoch": 5.393359025263732, "grad_norm": 15.19691276550293, "learning_rate": 2.3045077659698707e-05, "loss": 0.4257, "step": 138550 }, { "epoch": 5.3953053836272336, "grad_norm": 17.196475982666016, "learning_rate": 2.3035345867881195e-05, "loss": 0.5052, "step": 138600 }, { "epoch": 5.397251741990735, "grad_norm": 7.192227363586426, "learning_rate": 2.3025614076063688e-05, "loss": 0.4422, "step": 138650 }, { "epoch": 5.399198100354237, "grad_norm": 4.805534839630127, "learning_rate": 
2.3015882284246176e-05, "loss": 0.4111, "step": 138700 }, { "epoch": 5.401144458717739, "grad_norm": 9.723201751708984, "learning_rate": 2.3006150492428665e-05, "loss": 0.6235, "step": 138750 }, { "epoch": 5.403090817081241, "grad_norm": 4.88572359085083, "learning_rate": 2.2996418700611157e-05, "loss": 0.489, "step": 138800 }, { "epoch": 5.405037175444743, "grad_norm": 0.36469006538391113, "learning_rate": 2.2986686908793646e-05, "loss": 0.3931, "step": 138850 }, { "epoch": 5.406983533808245, "grad_norm": 70.02059173583984, "learning_rate": 2.297695511697614e-05, "loss": 0.4677, "step": 138900 }, { "epoch": 5.408929892171747, "grad_norm": 14.909634590148926, "learning_rate": 2.296722332515863e-05, "loss": 0.4508, "step": 138950 }, { "epoch": 5.410876250535249, "grad_norm": 0.7483935952186584, "learning_rate": 2.295749153334112e-05, "loss": 0.4625, "step": 139000 }, { "epoch": 5.41282260889875, "grad_norm": 62.61392593383789, "learning_rate": 2.294775974152361e-05, "loss": 0.4319, "step": 139050 }, { "epoch": 5.414768967262252, "grad_norm": 4.581214427947998, "learning_rate": 2.29380279497061e-05, "loss": 0.4992, "step": 139100 }, { "epoch": 5.416715325625754, "grad_norm": 6.97123384475708, "learning_rate": 2.292829615788859e-05, "loss": 0.5045, "step": 139150 }, { "epoch": 5.4186616839892565, "grad_norm": 10.48127555847168, "learning_rate": 2.291856436607108e-05, "loss": 0.5066, "step": 139200 }, { "epoch": 5.420608042352758, "grad_norm": 5.345021724700928, "learning_rate": 2.2908832574253573e-05, "loss": 0.4894, "step": 139250 }, { "epoch": 5.42255440071626, "grad_norm": 19.660125732421875, "learning_rate": 2.2899100782436066e-05, "loss": 0.4875, "step": 139300 }, { "epoch": 5.424500759079762, "grad_norm": 14.084202766418457, "learning_rate": 2.2889368990618554e-05, "loss": 0.3717, "step": 139350 }, { "epoch": 5.426447117443264, "grad_norm": 8.978292465209961, "learning_rate": 2.2879637198801043e-05, "loss": 0.398, "step": 139400 }, { "epoch": 5.428393475806765, 
"grad_norm": 7.443064212799072, "learning_rate": 2.2869905406983535e-05, "loss": 0.4414, "step": 139450 }, { "epoch": 5.430339834170267, "grad_norm": 97.16381072998047, "learning_rate": 2.2860173615166024e-05, "loss": 0.5839, "step": 139500 }, { "epoch": 5.43228619253377, "grad_norm": 21.743072509765625, "learning_rate": 2.2850441823348516e-05, "loss": 0.491, "step": 139550 }, { "epoch": 5.4342325508972715, "grad_norm": 2.9091548919677734, "learning_rate": 2.284071003153101e-05, "loss": 0.5303, "step": 139600 }, { "epoch": 5.436178909260773, "grad_norm": 65.35017395019531, "learning_rate": 2.2830978239713497e-05, "loss": 0.5444, "step": 139650 }, { "epoch": 5.438125267624275, "grad_norm": 0.7253431677818298, "learning_rate": 2.282124644789599e-05, "loss": 0.5232, "step": 139700 }, { "epoch": 5.440071625987777, "grad_norm": 33.15109634399414, "learning_rate": 2.2811514656078478e-05, "loss": 0.4538, "step": 139750 }, { "epoch": 5.442017984351279, "grad_norm": 29.251678466796875, "learning_rate": 2.2801782864260967e-05, "loss": 0.6158, "step": 139800 }, { "epoch": 5.44396434271478, "grad_norm": 24.854665756225586, "learning_rate": 2.279205107244346e-05, "loss": 0.4697, "step": 139850 }, { "epoch": 5.445910701078282, "grad_norm": 8.34721851348877, "learning_rate": 2.2782319280625948e-05, "loss": 0.3861, "step": 139900 }, { "epoch": 5.447857059441785, "grad_norm": 2.473447799682617, "learning_rate": 2.277258748880844e-05, "loss": 0.4845, "step": 139950 }, { "epoch": 5.4498034178052865, "grad_norm": 5.0479583740234375, "learning_rate": 2.2762855696990932e-05, "loss": 0.4678, "step": 140000 }, { "epoch": 5.451749776168788, "grad_norm": 10.540440559387207, "learning_rate": 2.275312390517342e-05, "loss": 0.4516, "step": 140050 }, { "epoch": 5.45369613453229, "grad_norm": 11.011929512023926, "learning_rate": 2.2743392113355913e-05, "loss": 0.5037, "step": 140100 }, { "epoch": 5.455642492895792, "grad_norm": 8.31088638305664, "learning_rate": 2.2733660321538402e-05, "loss": 
0.4789, "step": 140150 }, { "epoch": 5.457588851259294, "grad_norm": 3.969402551651001, "learning_rate": 2.272392852972089e-05, "loss": 0.4431, "step": 140200 }, { "epoch": 5.459535209622795, "grad_norm": 24.417375564575195, "learning_rate": 2.2714196737903383e-05, "loss": 0.4114, "step": 140250 }, { "epoch": 5.461481567986298, "grad_norm": 2.9269864559173584, "learning_rate": 2.2704464946085875e-05, "loss": 0.3675, "step": 140300 }, { "epoch": 5.4634279263498, "grad_norm": 25.95055389404297, "learning_rate": 2.2694733154268368e-05, "loss": 0.6314, "step": 140350 }, { "epoch": 5.4653742847133016, "grad_norm": 5.8151469230651855, "learning_rate": 2.2685001362450856e-05, "loss": 0.4773, "step": 140400 }, { "epoch": 5.467320643076803, "grad_norm": 31.987106323242188, "learning_rate": 2.2675269570633345e-05, "loss": 0.4792, "step": 140450 }, { "epoch": 5.469267001440305, "grad_norm": 27.423948287963867, "learning_rate": 2.2665537778815837e-05, "loss": 0.5148, "step": 140500 }, { "epoch": 5.471213359803807, "grad_norm": 22.05579376220703, "learning_rate": 2.2655805986998326e-05, "loss": 0.5058, "step": 140550 }, { "epoch": 5.473159718167309, "grad_norm": 25.787580490112305, "learning_rate": 2.2646074195180815e-05, "loss": 0.4779, "step": 140600 }, { "epoch": 5.47510607653081, "grad_norm": 42.786338806152344, "learning_rate": 2.2636342403363307e-05, "loss": 0.4216, "step": 140650 }, { "epoch": 5.477052434894313, "grad_norm": 7.5085906982421875, "learning_rate": 2.26266106115458e-05, "loss": 0.4493, "step": 140700 }, { "epoch": 5.478998793257815, "grad_norm": 12.147705078125, "learning_rate": 2.261687881972829e-05, "loss": 0.4432, "step": 140750 }, { "epoch": 5.480945151621317, "grad_norm": 16.47398567199707, "learning_rate": 2.260714702791078e-05, "loss": 0.5382, "step": 140800 }, { "epoch": 5.482891509984818, "grad_norm": 24.04012680053711, "learning_rate": 2.259741523609327e-05, "loss": 0.4632, "step": 140850 }, { "epoch": 5.48483786834832, "grad_norm": 
21.654024124145508, "learning_rate": 2.258768344427576e-05, "loss": 0.4851, "step": 140900 }, { "epoch": 5.486784226711822, "grad_norm": 7.615512371063232, "learning_rate": 2.257795165245825e-05, "loss": 0.517, "step": 140950 }, { "epoch": 5.488730585075324, "grad_norm": 38.602317810058594, "learning_rate": 2.2568219860640742e-05, "loss": 0.5825, "step": 141000 }, { "epoch": 5.490676943438826, "grad_norm": 16.893400192260742, "learning_rate": 2.2558488068823234e-05, "loss": 0.5806, "step": 141050 }, { "epoch": 5.492623301802328, "grad_norm": 19.668304443359375, "learning_rate": 2.2548756277005723e-05, "loss": 0.3632, "step": 141100 }, { "epoch": 5.49456966016583, "grad_norm": 27.08940315246582, "learning_rate": 2.2539024485188215e-05, "loss": 0.4127, "step": 141150 }, { "epoch": 5.496516018529332, "grad_norm": 6.265408039093018, "learning_rate": 2.2529292693370704e-05, "loss": 0.4779, "step": 141200 }, { "epoch": 5.498462376892833, "grad_norm": 23.51862907409668, "learning_rate": 2.2519560901553193e-05, "loss": 0.5045, "step": 141250 }, { "epoch": 5.500408735256335, "grad_norm": 28.534025192260742, "learning_rate": 2.2509829109735685e-05, "loss": 0.5215, "step": 141300 }, { "epoch": 5.502355093619837, "grad_norm": 25.67772102355957, "learning_rate": 2.2500097317918177e-05, "loss": 0.6268, "step": 141350 }, { "epoch": 5.5043014519833395, "grad_norm": 35.883575439453125, "learning_rate": 2.249036552610067e-05, "loss": 0.4937, "step": 141400 }, { "epoch": 5.506247810346841, "grad_norm": 20.927465438842773, "learning_rate": 2.2480633734283158e-05, "loss": 0.5311, "step": 141450 }, { "epoch": 5.508194168710343, "grad_norm": 15.133782386779785, "learning_rate": 2.2470901942465647e-05, "loss": 0.557, "step": 141500 }, { "epoch": 5.510140527073845, "grad_norm": 25.067594528198242, "learning_rate": 2.246117015064814e-05, "loss": 0.4667, "step": 141550 }, { "epoch": 5.512086885437347, "grad_norm": 17.89718246459961, "learning_rate": 2.2451438358830628e-05, "loss": 0.506, 
"step": 141600 }, { "epoch": 5.514033243800848, "grad_norm": 50.1882209777832, "learning_rate": 2.2441706567013117e-05, "loss": 0.4653, "step": 141650 }, { "epoch": 5.51597960216435, "grad_norm": 2.0506484508514404, "learning_rate": 2.243197477519561e-05, "loss": 0.3188, "step": 141700 }, { "epoch": 5.517925960527853, "grad_norm": 4.699078559875488, "learning_rate": 2.24222429833781e-05, "loss": 0.4374, "step": 141750 }, { "epoch": 5.5198723188913545, "grad_norm": 205.85720825195312, "learning_rate": 2.2412511191560593e-05, "loss": 0.5379, "step": 141800 }, { "epoch": 5.521818677254856, "grad_norm": 0.22048422694206238, "learning_rate": 2.2402779399743082e-05, "loss": 0.3956, "step": 141850 }, { "epoch": 5.523765035618358, "grad_norm": 21.243675231933594, "learning_rate": 2.239304760792557e-05, "loss": 0.4839, "step": 141900 }, { "epoch": 5.52571139398186, "grad_norm": 661.2019653320312, "learning_rate": 2.2383315816108063e-05, "loss": 0.4598, "step": 141950 }, { "epoch": 5.527657752345362, "grad_norm": 90.62779998779297, "learning_rate": 2.2373584024290552e-05, "loss": 0.5378, "step": 142000 }, { "epoch": 5.529604110708863, "grad_norm": 4.262439250946045, "learning_rate": 2.2363852232473044e-05, "loss": 0.4564, "step": 142050 }, { "epoch": 5.531550469072366, "grad_norm": 8.47938060760498, "learning_rate": 2.2354120440655536e-05, "loss": 0.4373, "step": 142100 }, { "epoch": 5.533496827435868, "grad_norm": 26.904829025268555, "learning_rate": 2.2344388648838025e-05, "loss": 0.4978, "step": 142150 }, { "epoch": 5.53544318579937, "grad_norm": 23.377729415893555, "learning_rate": 2.2334656857020517e-05, "loss": 0.556, "step": 142200 }, { "epoch": 5.537389544162871, "grad_norm": 19.180496215820312, "learning_rate": 2.2324925065203006e-05, "loss": 0.4118, "step": 142250 }, { "epoch": 5.539335902526373, "grad_norm": 10.439373016357422, "learning_rate": 2.2315193273385495e-05, "loss": 0.3995, "step": 142300 }, { "epoch": 5.541282260889875, "grad_norm": 2.0222413539886475, 
"learning_rate": 2.2305461481567987e-05, "loss": 0.4454, "step": 142350 }, { "epoch": 5.543228619253377, "grad_norm": 58.872169494628906, "learning_rate": 2.229572968975048e-05, "loss": 0.5988, "step": 142400 }, { "epoch": 5.545174977616879, "grad_norm": 7.4645514488220215, "learning_rate": 2.228599789793297e-05, "loss": 0.4759, "step": 142450 }, { "epoch": 5.547121335980381, "grad_norm": 5.107184886932373, "learning_rate": 2.227626610611546e-05, "loss": 0.4337, "step": 142500 }, { "epoch": 5.549067694343883, "grad_norm": 12.68246841430664, "learning_rate": 2.226653431429795e-05, "loss": 0.4982, "step": 142550 }, { "epoch": 5.551014052707385, "grad_norm": 14.070191383361816, "learning_rate": 2.225680252248044e-05, "loss": 0.4856, "step": 142600 }, { "epoch": 5.552960411070886, "grad_norm": 14.807028770446777, "learning_rate": 2.224707073066293e-05, "loss": 0.5003, "step": 142650 }, { "epoch": 5.554906769434388, "grad_norm": 4.305740833282471, "learning_rate": 2.223733893884542e-05, "loss": 0.4677, "step": 142700 }, { "epoch": 5.55685312779789, "grad_norm": 38.39807891845703, "learning_rate": 2.222760714702791e-05, "loss": 0.5608, "step": 142750 }, { "epoch": 5.5587994861613925, "grad_norm": 2.913740396499634, "learning_rate": 2.2217875355210403e-05, "loss": 0.4683, "step": 142800 }, { "epoch": 5.560745844524894, "grad_norm": 55.86124801635742, "learning_rate": 2.2208143563392895e-05, "loss": 0.4049, "step": 142850 }, { "epoch": 5.562692202888396, "grad_norm": 81.45223999023438, "learning_rate": 2.2198411771575384e-05, "loss": 0.4593, "step": 142900 }, { "epoch": 5.564638561251898, "grad_norm": 14.493507385253906, "learning_rate": 2.2188679979757873e-05, "loss": 0.5226, "step": 142950 }, { "epoch": 5.5665849196154, "grad_norm": 16.254417419433594, "learning_rate": 2.2178948187940365e-05, "loss": 0.4848, "step": 143000 }, { "epoch": 5.568531277978901, "grad_norm": 0.47006094455718994, "learning_rate": 2.2169216396122854e-05, "loss": 0.3918, "step": 143050 }, { 
"epoch": 5.570477636342403, "grad_norm": 54.54814529418945, "learning_rate": 2.2159484604305346e-05, "loss": 0.5148, "step": 143100 }, { "epoch": 5.572423994705905, "grad_norm": 1.5716171264648438, "learning_rate": 2.2149752812487838e-05, "loss": 0.4404, "step": 143150 }, { "epoch": 5.574370353069407, "grad_norm": 6.937029838562012, "learning_rate": 2.2140021020670327e-05, "loss": 0.4752, "step": 143200 }, { "epoch": 5.576316711432909, "grad_norm": 17.308713912963867, "learning_rate": 2.213028922885282e-05, "loss": 0.4869, "step": 143250 }, { "epoch": 5.578263069796411, "grad_norm": 21.589351654052734, "learning_rate": 2.2120557437035308e-05, "loss": 0.5291, "step": 143300 }, { "epoch": 5.580209428159913, "grad_norm": 30.810428619384766, "learning_rate": 2.2110825645217797e-05, "loss": 0.4489, "step": 143350 }, { "epoch": 5.582155786523415, "grad_norm": 1.8680299520492554, "learning_rate": 2.210109385340029e-05, "loss": 0.454, "step": 143400 }, { "epoch": 5.584102144886916, "grad_norm": 33.658687591552734, "learning_rate": 2.209136206158278e-05, "loss": 0.5077, "step": 143450 }, { "epoch": 5.586048503250418, "grad_norm": 11.225595474243164, "learning_rate": 2.2081630269765273e-05, "loss": 0.4933, "step": 143500 }, { "epoch": 5.58799486161392, "grad_norm": 2.978534460067749, "learning_rate": 2.2071898477947762e-05, "loss": 0.3266, "step": 143550 }, { "epoch": 5.5899412199774225, "grad_norm": 16.478660583496094, "learning_rate": 2.206216668613025e-05, "loss": 0.4889, "step": 143600 }, { "epoch": 5.591887578340924, "grad_norm": 9.229466438293457, "learning_rate": 2.2052434894312743e-05, "loss": 0.506, "step": 143650 }, { "epoch": 5.593833936704426, "grad_norm": 5.321776390075684, "learning_rate": 2.2042703102495232e-05, "loss": 0.5302, "step": 143700 }, { "epoch": 5.595780295067928, "grad_norm": 19.91082763671875, "learning_rate": 2.203297131067772e-05, "loss": 0.5299, "step": 143750 }, { "epoch": 5.59772665343143, "grad_norm": 33.32722854614258, "learning_rate": 
2.2023239518860213e-05, "loss": 0.4226, "step": 143800 }, { "epoch": 5.599673011794931, "grad_norm": 9.714330673217773, "learning_rate": 2.2013507727042705e-05, "loss": 0.5065, "step": 143850 }, { "epoch": 5.601619370158433, "grad_norm": 20.57841682434082, "learning_rate": 2.2003775935225197e-05, "loss": 0.46, "step": 143900 }, { "epoch": 5.603565728521936, "grad_norm": 15.237015724182129, "learning_rate": 2.1994044143407686e-05, "loss": 0.4946, "step": 143950 }, { "epoch": 5.605512086885438, "grad_norm": 65.70580291748047, "learning_rate": 2.1984312351590175e-05, "loss": 0.4868, "step": 144000 }, { "epoch": 5.607458445248939, "grad_norm": 7.076148509979248, "learning_rate": 2.1974775195609015e-05, "loss": 0.5588, "step": 144050 }, { "epoch": 5.609404803612441, "grad_norm": 27.55814552307129, "learning_rate": 2.1965043403791507e-05, "loss": 0.4494, "step": 144100 }, { "epoch": 5.611351161975943, "grad_norm": 18.32331085205078, "learning_rate": 2.1955311611973996e-05, "loss": 0.4906, "step": 144150 }, { "epoch": 5.613297520339445, "grad_norm": 5.705947399139404, "learning_rate": 2.1945579820156488e-05, "loss": 0.5425, "step": 144200 }, { "epoch": 5.615243878702946, "grad_norm": 3.2096259593963623, "learning_rate": 2.193584802833898e-05, "loss": 0.5308, "step": 144250 }, { "epoch": 5.617190237066449, "grad_norm": 35.87384796142578, "learning_rate": 2.192611623652147e-05, "loss": 0.5505, "step": 144300 }, { "epoch": 5.619136595429951, "grad_norm": 21.15291404724121, "learning_rate": 2.191638444470396e-05, "loss": 0.5549, "step": 144350 }, { "epoch": 5.621082953793453, "grad_norm": 21.640710830688477, "learning_rate": 2.190665265288645e-05, "loss": 0.4122, "step": 144400 }, { "epoch": 5.623029312156954, "grad_norm": 19.42620086669922, "learning_rate": 2.189692086106894e-05, "loss": 0.4944, "step": 144450 }, { "epoch": 5.624975670520456, "grad_norm": 3.4957187175750732, "learning_rate": 2.188718906925143e-05, "loss": 0.4683, "step": 144500 }, { "epoch": 
5.626922028883958, "grad_norm": 26.069080352783203, "learning_rate": 2.187745727743392e-05, "loss": 0.5278, "step": 144550 }, { "epoch": 5.62886838724746, "grad_norm": 10.825551986694336, "learning_rate": 2.1867725485616415e-05, "loss": 0.3944, "step": 144600 }, { "epoch": 5.630814745610962, "grad_norm": 22.900117874145508, "learning_rate": 2.1857993693798904e-05, "loss": 0.5278, "step": 144650 }, { "epoch": 5.632761103974464, "grad_norm": 31.81256675720215, "learning_rate": 2.1848261901981393e-05, "loss": 0.4772, "step": 144700 }, { "epoch": 5.634707462337966, "grad_norm": 14.143387794494629, "learning_rate": 2.1838530110163885e-05, "loss": 0.5259, "step": 144750 }, { "epoch": 5.636653820701468, "grad_norm": 3.359781265258789, "learning_rate": 2.1828798318346374e-05, "loss": 0.4736, "step": 144800 }, { "epoch": 5.638600179064969, "grad_norm": 4.243276596069336, "learning_rate": 2.1819066526528866e-05, "loss": 0.5466, "step": 144850 }, { "epoch": 5.640546537428471, "grad_norm": 65.97655487060547, "learning_rate": 2.1809334734711355e-05, "loss": 0.5089, "step": 144900 }, { "epoch": 5.642492895791973, "grad_norm": 26.80994415283203, "learning_rate": 2.1799602942893847e-05, "loss": 0.4787, "step": 144950 }, { "epoch": 5.6444392541554755, "grad_norm": 77.48614501953125, "learning_rate": 2.178987115107634e-05, "loss": 0.425, "step": 145000 }, { "epoch": 5.646385612518977, "grad_norm": 43.99659729003906, "learning_rate": 2.1780139359258828e-05, "loss": 0.3752, "step": 145050 }, { "epoch": 5.648331970882479, "grad_norm": 79.5300064086914, "learning_rate": 2.1770407567441317e-05, "loss": 0.5461, "step": 145100 }, { "epoch": 5.650278329245981, "grad_norm": 9.227584838867188, "learning_rate": 2.176067577562381e-05, "loss": 0.5453, "step": 145150 }, { "epoch": 5.652224687609483, "grad_norm": 35.68589401245117, "learning_rate": 2.1750943983806298e-05, "loss": 0.4229, "step": 145200 }, { "epoch": 5.654171045972984, "grad_norm": 3.0414645671844482, "learning_rate": 
2.174121219198879e-05, "loss": 0.5282, "step": 145250 }, { "epoch": 5.656117404336486, "grad_norm": 19.29361915588379, "learning_rate": 2.1731480400171282e-05, "loss": 0.4186, "step": 145300 }, { "epoch": 5.658063762699989, "grad_norm": 20.59784698486328, "learning_rate": 2.172174860835377e-05, "loss": 0.4428, "step": 145350 }, { "epoch": 5.6600101210634906, "grad_norm": 9.201354026794434, "learning_rate": 2.1712016816536263e-05, "loss": 0.5313, "step": 145400 }, { "epoch": 5.661956479426992, "grad_norm": 10.13439655303955, "learning_rate": 2.1702285024718752e-05, "loss": 0.4357, "step": 145450 }, { "epoch": 5.663902837790494, "grad_norm": 6.450570583343506, "learning_rate": 2.169255323290124e-05, "loss": 0.5019, "step": 145500 }, { "epoch": 5.665849196153996, "grad_norm": 110.96885681152344, "learning_rate": 2.1682821441083733e-05, "loss": 0.3784, "step": 145550 }, { "epoch": 5.667795554517498, "grad_norm": 27.496273040771484, "learning_rate": 2.1673089649266222e-05, "loss": 0.5828, "step": 145600 }, { "epoch": 5.669741912880999, "grad_norm": 28.17935562133789, "learning_rate": 2.1663357857448717e-05, "loss": 0.5214, "step": 145650 }, { "epoch": 5.671688271244502, "grad_norm": 26.768798828125, "learning_rate": 2.1653626065631206e-05, "loss": 0.5176, "step": 145700 }, { "epoch": 5.673634629608004, "grad_norm": 230.8640899658203, "learning_rate": 2.1643894273813695e-05, "loss": 0.4203, "step": 145750 }, { "epoch": 5.675580987971506, "grad_norm": 3.8385086059570312, "learning_rate": 2.1634162481996187e-05, "loss": 0.4922, "step": 145800 }, { "epoch": 5.677527346335007, "grad_norm": 28.53762435913086, "learning_rate": 2.1624430690178676e-05, "loss": 0.4558, "step": 145850 }, { "epoch": 5.679473704698509, "grad_norm": 16.768970489501953, "learning_rate": 2.1614698898361168e-05, "loss": 0.437, "step": 145900 }, { "epoch": 5.681420063062011, "grad_norm": 40.530574798583984, "learning_rate": 2.1604967106543657e-05, "loss": 0.4156, "step": 145950 }, { "epoch": 
5.683366421425513, "grad_norm": 2.358097553253174, "learning_rate": 2.159523531472615e-05, "loss": 0.4544, "step": 146000 }, { "epoch": 5.685312779789014, "grad_norm": 30.83602523803711, "learning_rate": 2.158550352290864e-05, "loss": 0.4321, "step": 146050 }, { "epoch": 5.687259138152516, "grad_norm": 15.630293846130371, "learning_rate": 2.157577173109113e-05, "loss": 0.4745, "step": 146100 }, { "epoch": 5.689205496516019, "grad_norm": 10.675311088562012, "learning_rate": 2.156603993927362e-05, "loss": 0.4883, "step": 146150 }, { "epoch": 5.691151854879521, "grad_norm": 35.700706481933594, "learning_rate": 2.1556502783292463e-05, "loss": 0.4488, "step": 146200 }, { "epoch": 5.693098213243022, "grad_norm": 3.4618043899536133, "learning_rate": 2.154677099147495e-05, "loss": 0.3995, "step": 146250 }, { "epoch": 5.695044571606524, "grad_norm": 39.35212326049805, "learning_rate": 2.153703919965744e-05, "loss": 0.4832, "step": 146300 }, { "epoch": 5.696990929970026, "grad_norm": 18.510805130004883, "learning_rate": 2.1527307407839932e-05, "loss": 0.5239, "step": 146350 }, { "epoch": 5.698937288333528, "grad_norm": 15.100415229797363, "learning_rate": 2.1517575616022424e-05, "loss": 0.4178, "step": 146400 }, { "epoch": 5.700883646697029, "grad_norm": 3.013082981109619, "learning_rate": 2.1507843824204913e-05, "loss": 0.4194, "step": 146450 }, { "epoch": 5.702830005060532, "grad_norm": 2.6142711639404297, "learning_rate": 2.1498112032387405e-05, "loss": 0.492, "step": 146500 }, { "epoch": 5.704776363424034, "grad_norm": 26.63750648498535, "learning_rate": 2.1488380240569894e-05, "loss": 0.5548, "step": 146550 }, { "epoch": 5.706722721787536, "grad_norm": 6.892706871032715, "learning_rate": 2.1478648448752386e-05, "loss": 0.5041, "step": 146600 }, { "epoch": 5.708669080151037, "grad_norm": 7.009606838226318, "learning_rate": 2.1468916656934875e-05, "loss": 0.5534, "step": 146650 }, { "epoch": 5.710615438514539, "grad_norm": 33.845394134521484, "learning_rate": 
2.1459184865117364e-05, "loss": 0.5186, "step": 146700 }, { "epoch": 5.712561796878041, "grad_norm": 9.704541206359863, "learning_rate": 2.1449453073299856e-05, "loss": 0.4156, "step": 146750 }, { "epoch": 5.714508155241543, "grad_norm": 9.700241088867188, "learning_rate": 2.143972128148235e-05, "loss": 0.4522, "step": 146800 }, { "epoch": 5.716454513605045, "grad_norm": 52.7364501953125, "learning_rate": 2.1429989489664837e-05, "loss": 0.4854, "step": 146850 }, { "epoch": 5.718400871968547, "grad_norm": 41.20835494995117, "learning_rate": 2.142045233368368e-05, "loss": 0.4253, "step": 146900 }, { "epoch": 5.720347230332049, "grad_norm": 1.6110278367996216, "learning_rate": 2.141072054186617e-05, "loss": 0.4767, "step": 146950 }, { "epoch": 5.722293588695551, "grad_norm": 22.531103134155273, "learning_rate": 2.140098875004866e-05, "loss": 0.4321, "step": 147000 }, { "epoch": 5.724239947059052, "grad_norm": 49.92802047729492, "learning_rate": 2.139125695823115e-05, "loss": 0.4868, "step": 147050 }, { "epoch": 5.726186305422554, "grad_norm": 18.764982223510742, "learning_rate": 2.138152516641364e-05, "loss": 0.5367, "step": 147100 }, { "epoch": 5.728132663786056, "grad_norm": 32.99585723876953, "learning_rate": 2.137179337459613e-05, "loss": 0.5013, "step": 147150 }, { "epoch": 5.7300790221495586, "grad_norm": 9.351590156555176, "learning_rate": 2.1362061582778624e-05, "loss": 0.4643, "step": 147200 }, { "epoch": 5.73202538051306, "grad_norm": 38.2765998840332, "learning_rate": 2.1352329790961113e-05, "loss": 0.4836, "step": 147250 }, { "epoch": 5.733971738876562, "grad_norm": 143.04417419433594, "learning_rate": 2.1342597999143605e-05, "loss": 0.4256, "step": 147300 }, { "epoch": 5.735918097240064, "grad_norm": 36.64295196533203, "learning_rate": 2.1332866207326094e-05, "loss": 0.46, "step": 147350 }, { "epoch": 5.737864455603566, "grad_norm": 43.446231842041016, "learning_rate": 2.1323134415508582e-05, "loss": 0.4459, "step": 147400 }, { "epoch": 5.739810813967067, 
"grad_norm": 17.09413719177246, "learning_rate": 2.1313402623691075e-05, "loss": 0.3484, "step": 147450 }, { "epoch": 5.741757172330569, "grad_norm": 25.26546287536621, "learning_rate": 2.1303670831873567e-05, "loss": 0.4479, "step": 147500 }, { "epoch": 5.743703530694072, "grad_norm": 21.048912048339844, "learning_rate": 2.129393904005606e-05, "loss": 0.5579, "step": 147550 }, { "epoch": 5.745649889057574, "grad_norm": 34.24161148071289, "learning_rate": 2.1284207248238548e-05, "loss": 0.409, "step": 147600 }, { "epoch": 5.747596247421075, "grad_norm": 10.358455657958984, "learning_rate": 2.1274475456421036e-05, "loss": 0.581, "step": 147650 }, { "epoch": 5.749542605784577, "grad_norm": 6.936595439910889, "learning_rate": 2.126474366460353e-05, "loss": 0.4579, "step": 147700 }, { "epoch": 5.751488964148079, "grad_norm": 25.206392288208008, "learning_rate": 2.1255011872786017e-05, "loss": 0.5252, "step": 147750 }, { "epoch": 5.753435322511581, "grad_norm": 5.626473426818848, "learning_rate": 2.1245280080968506e-05, "loss": 0.5449, "step": 147800 }, { "epoch": 5.755381680875082, "grad_norm": 7.164819717407227, "learning_rate": 2.1235548289151e-05, "loss": 0.4498, "step": 147850 }, { "epoch": 5.757328039238585, "grad_norm": 10.400779724121094, "learning_rate": 2.122581649733349e-05, "loss": 0.4469, "step": 147900 }, { "epoch": 5.759274397602087, "grad_norm": 34.63956069946289, "learning_rate": 2.1216084705515983e-05, "loss": 0.5269, "step": 147950 }, { "epoch": 5.761220755965589, "grad_norm": 50.11520767211914, "learning_rate": 2.120635291369847e-05, "loss": 0.4505, "step": 148000 }, { "epoch": 5.76316711432909, "grad_norm": 52.6129035949707, "learning_rate": 2.119662112188096e-05, "loss": 0.5876, "step": 148050 }, { "epoch": 5.765113472692592, "grad_norm": 27.648406982421875, "learning_rate": 2.1186889330063453e-05, "loss": 0.5444, "step": 148100 }, { "epoch": 5.767059831056094, "grad_norm": 40.91321563720703, "learning_rate": 2.117715753824594e-05, "loss": 0.4795, 
"step": 148150 }, { "epoch": 5.769006189419596, "grad_norm": 31.96234130859375, "learning_rate": 2.1167425746428433e-05, "loss": 0.5122, "step": 148200 }, { "epoch": 5.770952547783098, "grad_norm": 29.546472549438477, "learning_rate": 2.1157693954610926e-05, "loss": 0.492, "step": 148250 }, { "epoch": 5.7728989061466, "grad_norm": 18.00714111328125, "learning_rate": 2.1147962162793414e-05, "loss": 0.4028, "step": 148300 }, { "epoch": 5.774845264510102, "grad_norm": 6.667193412780762, "learning_rate": 2.1138230370975907e-05, "loss": 0.3893, "step": 148350 }, { "epoch": 5.776791622873604, "grad_norm": 15.449394226074219, "learning_rate": 2.1128498579158395e-05, "loss": 0.5104, "step": 148400 }, { "epoch": 5.778737981237105, "grad_norm": 22.22861099243164, "learning_rate": 2.1118766787340884e-05, "loss": 0.4748, "step": 148450 }, { "epoch": 5.780684339600607, "grad_norm": 87.00386047363281, "learning_rate": 2.1109034995523376e-05, "loss": 0.458, "step": 148500 }, { "epoch": 5.782630697964109, "grad_norm": 2.6363420486450195, "learning_rate": 2.109930320370587e-05, "loss": 0.5729, "step": 148550 }, { "epoch": 5.784577056327611, "grad_norm": 12.727959632873535, "learning_rate": 2.108957141188836e-05, "loss": 0.5319, "step": 148600 }, { "epoch": 5.786523414691113, "grad_norm": 13.028393745422363, "learning_rate": 2.107983962007085e-05, "loss": 0.55, "step": 148650 }, { "epoch": 5.788469773054615, "grad_norm": 22.92647933959961, "learning_rate": 2.107010782825334e-05, "loss": 0.4, "step": 148700 }, { "epoch": 5.790416131418117, "grad_norm": 13.907632827758789, "learning_rate": 2.106037603643583e-05, "loss": 0.4625, "step": 148750 }, { "epoch": 5.792362489781619, "grad_norm": 22.58814239501953, "learning_rate": 2.105064424461832e-05, "loss": 0.5798, "step": 148800 }, { "epoch": 5.79430884814512, "grad_norm": 33.648719787597656, "learning_rate": 2.1040912452800808e-05, "loss": 0.5242, "step": 148850 }, { "epoch": 5.796255206508622, "grad_norm": 64.69039154052734, 
"learning_rate": 2.10311806609833e-05, "loss": 0.569, "step": 148900 }, { "epoch": 5.798201564872124, "grad_norm": 20.522449493408203, "learning_rate": 2.1021448869165792e-05, "loss": 0.5806, "step": 148950 }, { "epoch": 5.800147923235626, "grad_norm": 11.917279243469238, "learning_rate": 2.1011717077348285e-05, "loss": 0.4754, "step": 149000 }, { "epoch": 5.802094281599128, "grad_norm": 10.689806938171387, "learning_rate": 2.1001985285530773e-05, "loss": 0.4819, "step": 149050 }, { "epoch": 5.80404063996263, "grad_norm": 12.543091773986816, "learning_rate": 2.0992253493713262e-05, "loss": 0.4502, "step": 149100 }, { "epoch": 5.805986998326132, "grad_norm": 49.59513854980469, "learning_rate": 2.0982521701895754e-05, "loss": 0.4515, "step": 149150 }, { "epoch": 5.807933356689634, "grad_norm": 27.326045989990234, "learning_rate": 2.0972789910078243e-05, "loss": 0.5357, "step": 149200 }, { "epoch": 5.809879715053135, "grad_norm": 52.378639221191406, "learning_rate": 2.0963058118260735e-05, "loss": 0.496, "step": 149250 }, { "epoch": 5.811826073416637, "grad_norm": 2.5411744117736816, "learning_rate": 2.0953326326443228e-05, "loss": 0.4324, "step": 149300 }, { "epoch": 5.813772431780139, "grad_norm": 2.0252749919891357, "learning_rate": 2.0943594534625716e-05, "loss": 0.4737, "step": 149350 }, { "epoch": 5.815718790143642, "grad_norm": 20.123262405395508, "learning_rate": 2.093386274280821e-05, "loss": 0.4293, "step": 149400 }, { "epoch": 5.817665148507143, "grad_norm": 2.375354051589966, "learning_rate": 2.0924130950990697e-05, "loss": 0.4684, "step": 149450 }, { "epoch": 5.819611506870645, "grad_norm": 2.508639335632324, "learning_rate": 2.0914399159173186e-05, "loss": 0.4961, "step": 149500 }, { "epoch": 5.821557865234147, "grad_norm": 30.05324363708496, "learning_rate": 2.0904667367355678e-05, "loss": 0.4167, "step": 149550 }, { "epoch": 5.823504223597649, "grad_norm": 19.884994506835938, "learning_rate": 2.0894935575538167e-05, "loss": 0.4293, "step": 149600 }, { 
"epoch": 5.82545058196115, "grad_norm": 57.117088317871094, "learning_rate": 2.0885203783720663e-05, "loss": 0.5615, "step": 149650 }, { "epoch": 5.827396940324652, "grad_norm": 19.315885543823242, "learning_rate": 2.087547199190315e-05, "loss": 0.5412, "step": 149700 }, { "epoch": 5.829343298688155, "grad_norm": 32.28496551513672, "learning_rate": 2.086574020008564e-05, "loss": 0.4855, "step": 149750 }, { "epoch": 5.831289657051657, "grad_norm": 23.856151580810547, "learning_rate": 2.0856008408268132e-05, "loss": 0.5135, "step": 149800 }, { "epoch": 5.833236015415158, "grad_norm": 9.105504035949707, "learning_rate": 2.084627661645062e-05, "loss": 0.4762, "step": 149850 }, { "epoch": 5.83518237377866, "grad_norm": 12.42764663696289, "learning_rate": 2.083654482463311e-05, "loss": 0.5232, "step": 149900 }, { "epoch": 5.837128732142162, "grad_norm": 30.808252334594727, "learning_rate": 2.0826813032815602e-05, "loss": 0.4429, "step": 149950 }, { "epoch": 5.839075090505664, "grad_norm": 22.769996643066406, "learning_rate": 2.0817081240998094e-05, "loss": 0.5015, "step": 150000 }, { "epoch": 5.841021448869165, "grad_norm": 6.832203388214111, "learning_rate": 2.0807349449180587e-05, "loss": 0.5003, "step": 150050 }, { "epoch": 5.842967807232668, "grad_norm": 58.393314361572266, "learning_rate": 2.0797617657363075e-05, "loss": 0.4334, "step": 150100 }, { "epoch": 5.84491416559617, "grad_norm": 13.426053047180176, "learning_rate": 2.0787885865545564e-05, "loss": 0.3697, "step": 150150 }, { "epoch": 5.846860523959672, "grad_norm": 57.27088928222656, "learning_rate": 2.0778154073728056e-05, "loss": 0.6124, "step": 150200 }, { "epoch": 5.848806882323173, "grad_norm": 27.30216407775879, "learning_rate": 2.0768422281910545e-05, "loss": 0.4101, "step": 150250 }, { "epoch": 5.850753240686675, "grad_norm": 9.873483657836914, "learning_rate": 2.0758690490093037e-05, "loss": 0.5141, "step": 150300 }, { "epoch": 5.852699599050177, "grad_norm": 18.660297393798828, "learning_rate": 
2.074895869827553e-05, "loss": 0.5309, "step": 150350 }, { "epoch": 5.854645957413679, "grad_norm": 15.02762222290039, "learning_rate": 2.0739226906458018e-05, "loss": 0.4347, "step": 150400 }, { "epoch": 5.856592315777181, "grad_norm": 15.578240394592285, "learning_rate": 2.072949511464051e-05, "loss": 0.4462, "step": 150450 }, { "epoch": 5.858538674140683, "grad_norm": 16.750364303588867, "learning_rate": 2.0719763322823e-05, "loss": 0.5005, "step": 150500 }, { "epoch": 5.860485032504185, "grad_norm": 21.7381649017334, "learning_rate": 2.0710031531005488e-05, "loss": 0.4574, "step": 150550 }, { "epoch": 5.862431390867687, "grad_norm": 8.603922843933105, "learning_rate": 2.070029973918798e-05, "loss": 0.3975, "step": 150600 }, { "epoch": 5.864377749231188, "grad_norm": 9.554339408874512, "learning_rate": 2.069056794737047e-05, "loss": 0.5527, "step": 150650 }, { "epoch": 5.86632410759469, "grad_norm": 31.730798721313477, "learning_rate": 2.068083615555296e-05, "loss": 0.5167, "step": 150700 }, { "epoch": 5.868270465958192, "grad_norm": 53.88674545288086, "learning_rate": 2.0671104363735453e-05, "loss": 0.4863, "step": 150750 }, { "epoch": 5.870216824321695, "grad_norm": 7.8756866455078125, "learning_rate": 2.0661372571917942e-05, "loss": 0.5131, "step": 150800 }, { "epoch": 5.872163182685196, "grad_norm": 35.03476333618164, "learning_rate": 2.0651640780100434e-05, "loss": 0.4802, "step": 150850 }, { "epoch": 5.874109541048698, "grad_norm": 38.42994689941406, "learning_rate": 2.0641908988282923e-05, "loss": 0.5149, "step": 150900 }, { "epoch": 5.8760558994122, "grad_norm": 12.800299644470215, "learning_rate": 2.0632177196465412e-05, "loss": 0.4458, "step": 150950 }, { "epoch": 5.878002257775702, "grad_norm": 14.403288841247559, "learning_rate": 2.0622445404647904e-05, "loss": 0.4891, "step": 151000 }, { "epoch": 5.879948616139203, "grad_norm": 7.121120929718018, "learning_rate": 2.0612713612830396e-05, "loss": 0.4686, "step": 151050 }, { "epoch": 5.881894974502705, 
"grad_norm": 14.893117904663086, "learning_rate": 2.060298182101289e-05, "loss": 0.4252, "step": 151100 }, { "epoch": 5.883841332866208, "grad_norm": 27.542522430419922, "learning_rate": 2.0593250029195377e-05, "loss": 0.4264, "step": 151150 }, { "epoch": 5.88578769122971, "grad_norm": 23.38057518005371, "learning_rate": 2.0583518237377866e-05, "loss": 0.4266, "step": 151200 }, { "epoch": 5.887734049593211, "grad_norm": 1.5496599674224854, "learning_rate": 2.0573786445560358e-05, "loss": 0.5295, "step": 151250 }, { "epoch": 5.889680407956713, "grad_norm": 11.892908096313477, "learning_rate": 2.05642492895792e-05, "loss": 0.4914, "step": 151300 }, { "epoch": 5.891626766320215, "grad_norm": 8.352860450744629, "learning_rate": 2.0554517497761687e-05, "loss": 0.4962, "step": 151350 }, { "epoch": 5.893573124683717, "grad_norm": 5.320369243621826, "learning_rate": 2.054478570594418e-05, "loss": 0.4618, "step": 151400 }, { "epoch": 5.895519483047218, "grad_norm": 18.53015899658203, "learning_rate": 2.053505391412667e-05, "loss": 0.4173, "step": 151450 }, { "epoch": 5.89746584141072, "grad_norm": 10.321112632751465, "learning_rate": 2.052532212230916e-05, "loss": 0.5469, "step": 151500 }, { "epoch": 5.899412199774222, "grad_norm": 13.529706954956055, "learning_rate": 2.0515590330491653e-05, "loss": 0.5524, "step": 151550 }, { "epoch": 5.901358558137725, "grad_norm": 15.767763137817383, "learning_rate": 2.050585853867414e-05, "loss": 0.472, "step": 151600 }, { "epoch": 5.903304916501226, "grad_norm": 17.068130493164062, "learning_rate": 2.049612674685663e-05, "loss": 0.4859, "step": 151650 }, { "epoch": 5.905251274864728, "grad_norm": 8.694064140319824, "learning_rate": 2.0486394955039122e-05, "loss": 0.4754, "step": 151700 }, { "epoch": 5.90719763322823, "grad_norm": 8.695759773254395, "learning_rate": 2.047666316322161e-05, "loss": 0.5398, "step": 151750 }, { "epoch": 5.909143991591732, "grad_norm": 3.4272806644439697, "learning_rate": 2.0466931371404103e-05, "loss": 
0.357, "step": 151800 }, { "epoch": 5.911090349955233, "grad_norm": 20.61173439025879, "learning_rate": 2.0457199579586596e-05, "loss": 0.4371, "step": 151850 }, { "epoch": 5.913036708318735, "grad_norm": 1.038123369216919, "learning_rate": 2.0447467787769084e-05, "loss": 0.479, "step": 151900 }, { "epoch": 5.914983066682238, "grad_norm": 39.45209884643555, "learning_rate": 2.0437735995951577e-05, "loss": 0.5399, "step": 151950 }, { "epoch": 5.91692942504574, "grad_norm": 16.484363555908203, "learning_rate": 2.0428004204134065e-05, "loss": 0.4641, "step": 152000 }, { "epoch": 5.918875783409241, "grad_norm": 16.853939056396484, "learning_rate": 2.0418467048152906e-05, "loss": 0.4511, "step": 152050 }, { "epoch": 5.920822141772743, "grad_norm": 0.9782990217208862, "learning_rate": 2.0408735256335398e-05, "loss": 0.5191, "step": 152100 }, { "epoch": 5.922768500136245, "grad_norm": 13.055575370788574, "learning_rate": 2.0399003464517887e-05, "loss": 0.5844, "step": 152150 }, { "epoch": 5.924714858499747, "grad_norm": 8.821320533752441, "learning_rate": 2.038927167270038e-05, "loss": 0.4963, "step": 152200 }, { "epoch": 5.926661216863248, "grad_norm": 32.99136734008789, "learning_rate": 2.037953988088287e-05, "loss": 0.5171, "step": 152250 }, { "epoch": 5.928607575226751, "grad_norm": 48.70899200439453, "learning_rate": 2.036980808906536e-05, "loss": 0.4584, "step": 152300 }, { "epoch": 5.930553933590253, "grad_norm": 6.978852272033691, "learning_rate": 2.0360076297247852e-05, "loss": 0.4954, "step": 152350 }, { "epoch": 5.932500291953755, "grad_norm": 2.8952019214630127, "learning_rate": 2.035034450543034e-05, "loss": 0.488, "step": 152400 }, { "epoch": 5.934446650317256, "grad_norm": 88.61508178710938, "learning_rate": 2.034061271361283e-05, "loss": 0.4406, "step": 152450 }, { "epoch": 5.936393008680758, "grad_norm": 10.144498825073242, "learning_rate": 2.0330880921795322e-05, "loss": 0.4014, "step": 152500 }, { "epoch": 5.93833936704426, "grad_norm": 
32.65559005737305, "learning_rate": 2.0321149129977814e-05, "loss": 0.5318, "step": 152550 }, { "epoch": 5.940285725407762, "grad_norm": 1.1318234205245972, "learning_rate": 2.0311417338160303e-05, "loss": 0.4876, "step": 152600 }, { "epoch": 5.942232083771264, "grad_norm": 25.15874671936035, "learning_rate": 2.0301685546342795e-05, "loss": 0.5391, "step": 152650 }, { "epoch": 5.944178442134766, "grad_norm": 24.66851806640625, "learning_rate": 2.0291953754525284e-05, "loss": 0.3568, "step": 152700 }, { "epoch": 5.946124800498268, "grad_norm": 66.10899353027344, "learning_rate": 2.0282221962707776e-05, "loss": 0.5704, "step": 152750 }, { "epoch": 5.94807115886177, "grad_norm": 1.2414644956588745, "learning_rate": 2.0272490170890265e-05, "loss": 0.5291, "step": 152800 }, { "epoch": 5.950017517225271, "grad_norm": 1.036410927772522, "learning_rate": 2.0262758379072753e-05, "loss": 0.4364, "step": 152850 }, { "epoch": 5.951963875588773, "grad_norm": 4.711224555969238, "learning_rate": 2.0253026587255246e-05, "loss": 0.4839, "step": 152900 }, { "epoch": 5.953910233952275, "grad_norm": 9.774585723876953, "learning_rate": 2.0243294795437738e-05, "loss": 0.4522, "step": 152950 }, { "epoch": 5.955856592315778, "grad_norm": 55.37896728515625, "learning_rate": 2.0233563003620227e-05, "loss": 0.5298, "step": 153000 }, { "epoch": 5.957802950679279, "grad_norm": 8.964422225952148, "learning_rate": 2.022383121180272e-05, "loss": 0.4292, "step": 153050 }, { "epoch": 5.959749309042781, "grad_norm": 3.2365469932556152, "learning_rate": 2.0214099419985208e-05, "loss": 0.4631, "step": 153100 }, { "epoch": 5.961695667406283, "grad_norm": 18.938894271850586, "learning_rate": 2.02043676281677e-05, "loss": 0.4682, "step": 153150 }, { "epoch": 5.963642025769785, "grad_norm": 31.883237838745117, "learning_rate": 2.019463583635019e-05, "loss": 0.5014, "step": 153200 }, { "epoch": 5.965588384133286, "grad_norm": 6.5992112159729, "learning_rate": 2.018490404453268e-05, "loss": 0.5443, "step": 
153250 }, { "epoch": 5.967534742496788, "grad_norm": 10.283914566040039, "learning_rate": 2.0175172252715173e-05, "loss": 0.4979, "step": 153300 }, { "epoch": 5.969481100860291, "grad_norm": 19.170806884765625, "learning_rate": 2.0165440460897662e-05, "loss": 0.4255, "step": 153350 }, { "epoch": 5.971427459223793, "grad_norm": 1.1737772226333618, "learning_rate": 2.0155708669080154e-05, "loss": 0.4863, "step": 153400 }, { "epoch": 5.973373817587294, "grad_norm": 38.96372985839844, "learning_rate": 2.0145976877262643e-05, "loss": 0.5106, "step": 153450 }, { "epoch": 5.975320175950796, "grad_norm": 74.73748016357422, "learning_rate": 2.013624508544513e-05, "loss": 0.5108, "step": 153500 }, { "epoch": 5.977266534314298, "grad_norm": 9.684989929199219, "learning_rate": 2.0126513293627624e-05, "loss": 0.4619, "step": 153550 }, { "epoch": 5.9792128926778, "grad_norm": 17.42524528503418, "learning_rate": 2.0116781501810112e-05, "loss": 0.4552, "step": 153600 }, { "epoch": 5.981159251041301, "grad_norm": 31.885581970214844, "learning_rate": 2.0107049709992605e-05, "loss": 0.4655, "step": 153650 }, { "epoch": 5.983105609404804, "grad_norm": 6.066231727600098, "learning_rate": 2.0097317918175097e-05, "loss": 0.4191, "step": 153700 }, { "epoch": 5.985051967768306, "grad_norm": 21.954912185668945, "learning_rate": 2.0087586126357586e-05, "loss": 0.4373, "step": 153750 }, { "epoch": 5.986998326131808, "grad_norm": 23.308425903320312, "learning_rate": 2.0077854334540078e-05, "loss": 0.4538, "step": 153800 }, { "epoch": 5.988944684495309, "grad_norm": 24.721473693847656, "learning_rate": 2.0068122542722567e-05, "loss": 0.3729, "step": 153850 }, { "epoch": 5.990891042858811, "grad_norm": 56.41871643066406, "learning_rate": 2.0058390750905055e-05, "loss": 0.4819, "step": 153900 }, { "epoch": 5.992837401222313, "grad_norm": 13.821495056152344, "learning_rate": 2.0048658959087548e-05, "loss": 0.5007, "step": 153950 }, { "epoch": 5.994783759585815, "grad_norm": 8.664423942565918, 
"learning_rate": 2.003892716727004e-05, "loss": 0.4394, "step": 154000 }, { "epoch": 5.996730117949317, "grad_norm": 15.08137321472168, "learning_rate": 2.002919537545253e-05, "loss": 0.4629, "step": 154050 }, { "epoch": 5.998676476312819, "grad_norm": 6.79280424118042, "learning_rate": 2.001946358363502e-05, "loss": 0.4779, "step": 154100 }, { "epoch": 6.0, "eval_accuracy": 0.7653859628634824, "eval_f1_macro": 0.6655107932016976, "eval_f1_weighted": 0.7526503013330789, "eval_loss": 0.7793481945991516, "eval_roc_auc": 0.9481714851866412, "eval_runtime": 26.6358, "eval_samples_per_second": 964.455, "eval_steps_per_second": 120.59, "step": 154134 }, { "epoch": 6.000622834676321, "grad_norm": 26.01578140258789, "learning_rate": 2.000973179181751e-05, "loss": 0.4721, "step": 154150 }, { "epoch": 6.002569193039823, "grad_norm": 12.468201637268066, "learning_rate": 2e-05, "loss": 0.4373, "step": 154200 }, { "epoch": 6.004515551403324, "grad_norm": 3.0325231552124023, "learning_rate": 1.999026820818249e-05, "loss": 0.4103, "step": 154250 }, { "epoch": 6.006461909766826, "grad_norm": 0.9780355095863342, "learning_rate": 1.9980536416364983e-05, "loss": 0.486, "step": 154300 }, { "epoch": 6.008408268130328, "grad_norm": 48.99637222290039, "learning_rate": 1.9970804624547475e-05, "loss": 0.4419, "step": 154350 }, { "epoch": 6.01035462649383, "grad_norm": 12.753762245178223, "learning_rate": 1.9961072832729964e-05, "loss": 0.4015, "step": 154400 }, { "epoch": 6.012300984857332, "grad_norm": 32.74552536010742, "learning_rate": 1.9951341040912452e-05, "loss": 0.368, "step": 154450 }, { "epoch": 6.014247343220834, "grad_norm": 12.258967399597168, "learning_rate": 1.9941609249094945e-05, "loss": 0.4581, "step": 154500 }, { "epoch": 6.016193701584336, "grad_norm": 13.169581413269043, "learning_rate": 1.9931877457277433e-05, "loss": 0.4218, "step": 154550 }, { "epoch": 6.018140059947838, "grad_norm": 26.043437957763672, "learning_rate": 1.9922145665459926e-05, "loss": 0.4282, 
"step": 154600 }, { "epoch": 6.020086418311339, "grad_norm": 44.09939193725586, "learning_rate": 1.9912413873642414e-05, "loss": 0.4004, "step": 154650 }, { "epoch": 6.022032776674841, "grad_norm": 114.01387786865234, "learning_rate": 1.9902682081824906e-05, "loss": 0.465, "step": 154700 }, { "epoch": 6.023979135038343, "grad_norm": 10.816126823425293, "learning_rate": 1.98929502900074e-05, "loss": 0.4799, "step": 154750 }, { "epoch": 6.025925493401846, "grad_norm": 13.570274353027344, "learning_rate": 1.9883218498189887e-05, "loss": 0.3892, "step": 154800 }, { "epoch": 6.027871851765347, "grad_norm": 27.59929084777832, "learning_rate": 1.987348670637238e-05, "loss": 0.4418, "step": 154850 }, { "epoch": 6.029818210128849, "grad_norm": 96.50619506835938, "learning_rate": 1.986375491455487e-05, "loss": 0.4656, "step": 154900 }, { "epoch": 6.031764568492351, "grad_norm": 4.234004020690918, "learning_rate": 1.9854023122737357e-05, "loss": 0.418, "step": 154950 }, { "epoch": 6.033710926855853, "grad_norm": 24.70825958251953, "learning_rate": 1.984429133091985e-05, "loss": 0.3972, "step": 155000 }, { "epoch": 6.035657285219354, "grad_norm": 153.59967041015625, "learning_rate": 1.983455953910234e-05, "loss": 0.4231, "step": 155050 }, { "epoch": 6.037603643582856, "grad_norm": 26.965532302856445, "learning_rate": 1.982482774728483e-05, "loss": 0.4879, "step": 155100 }, { "epoch": 6.039550001946358, "grad_norm": 126.84495544433594, "learning_rate": 1.9815095955467323e-05, "loss": 0.3791, "step": 155150 }, { "epoch": 6.041496360309861, "grad_norm": 13.878125190734863, "learning_rate": 1.980536416364981e-05, "loss": 0.4272, "step": 155200 }, { "epoch": 6.043442718673362, "grad_norm": 57.5096435546875, "learning_rate": 1.9795632371832304e-05, "loss": 0.4507, "step": 155250 }, { "epoch": 6.045389077036864, "grad_norm": 9.265453338623047, "learning_rate": 1.9785900580014792e-05, "loss": 0.3738, "step": 155300 }, { "epoch": 6.047335435400366, "grad_norm": 0.18589606881141663, 
"learning_rate": 1.9776168788197284e-05, "loss": 0.2632, "step": 155350 }, { "epoch": 6.049281793763868, "grad_norm": 44.53890609741211, "learning_rate": 1.9766436996379777e-05, "loss": 0.5413, "step": 155400 }, { "epoch": 6.051228152127369, "grad_norm": 21.09198570251465, "learning_rate": 1.9756705204562265e-05, "loss": 0.4402, "step": 155450 }, { "epoch": 6.053174510490871, "grad_norm": 0.22208233177661896, "learning_rate": 1.9746973412744754e-05, "loss": 0.516, "step": 155500 }, { "epoch": 6.055120868854374, "grad_norm": 5.1662516593933105, "learning_rate": 1.9737241620927246e-05, "loss": 0.5119, "step": 155550 }, { "epoch": 6.057067227217876, "grad_norm": 12.44427490234375, "learning_rate": 1.9727509829109735e-05, "loss": 0.4288, "step": 155600 }, { "epoch": 6.059013585581377, "grad_norm": 135.75253295898438, "learning_rate": 1.9717778037292227e-05, "loss": 0.4628, "step": 155650 }, { "epoch": 6.060959943944879, "grad_norm": 34.116233825683594, "learning_rate": 1.9708046245474716e-05, "loss": 0.4619, "step": 155700 }, { "epoch": 6.062906302308381, "grad_norm": 57.39120864868164, "learning_rate": 1.969831445365721e-05, "loss": 0.403, "step": 155750 }, { "epoch": 6.064852660671883, "grad_norm": 1.842354416847229, "learning_rate": 1.96885826618397e-05, "loss": 0.418, "step": 155800 }, { "epoch": 6.066799019035384, "grad_norm": 73.33029174804688, "learning_rate": 1.967885087002219e-05, "loss": 0.4141, "step": 155850 }, { "epoch": 6.068745377398887, "grad_norm": 25.58429718017578, "learning_rate": 1.966911907820468e-05, "loss": 0.3867, "step": 155900 }, { "epoch": 6.070691735762389, "grad_norm": 102.4791488647461, "learning_rate": 1.965938728638717e-05, "loss": 0.4567, "step": 155950 }, { "epoch": 6.072638094125891, "grad_norm": 143.00341796875, "learning_rate": 1.964965549456966e-05, "loss": 0.5514, "step": 156000 }, { "epoch": 6.074584452489392, "grad_norm": 39.01069259643555, "learning_rate": 1.963992370275215e-05, "loss": 0.4019, "step": 156050 }, { "epoch": 
6.076530810852894, "grad_norm": 17.038347244262695, "learning_rate": 1.9630191910934643e-05, "loss": 0.3562, "step": 156100 }, { "epoch": 6.078477169216396, "grad_norm": 28.36899757385254, "learning_rate": 1.9620460119117132e-05, "loss": 0.424, "step": 156150 }, { "epoch": 6.080423527579898, "grad_norm": 13.023849487304688, "learning_rate": 1.9610922963135976e-05, "loss": 0.3368, "step": 156200 }, { "epoch": 6.0823698859434, "grad_norm": 2.479005813598633, "learning_rate": 1.9601191171318465e-05, "loss": 0.5307, "step": 156250 }, { "epoch": 6.084316244306902, "grad_norm": 37.577919006347656, "learning_rate": 1.9591459379500954e-05, "loss": 0.4762, "step": 156300 }, { "epoch": 6.086262602670404, "grad_norm": 0.2546081840991974, "learning_rate": 1.9581727587683446e-05, "loss": 0.3793, "step": 156350 }, { "epoch": 6.088208961033906, "grad_norm": 27.892820358276367, "learning_rate": 1.9571995795865935e-05, "loss": 0.5666, "step": 156400 }, { "epoch": 6.090155319397407, "grad_norm": 1.6399470567703247, "learning_rate": 1.9562264004048427e-05, "loss": 0.4598, "step": 156450 }, { "epoch": 6.092101677760909, "grad_norm": 33.33218765258789, "learning_rate": 1.955253221223092e-05, "loss": 0.4005, "step": 156500 }, { "epoch": 6.094048036124411, "grad_norm": 34.42781448364258, "learning_rate": 1.9542800420413408e-05, "loss": 0.4751, "step": 156550 }, { "epoch": 6.095994394487913, "grad_norm": 19.277366638183594, "learning_rate": 1.95330686285959e-05, "loss": 0.3646, "step": 156600 }, { "epoch": 6.097940752851415, "grad_norm": 8.559799194335938, "learning_rate": 1.952333683677839e-05, "loss": 0.4691, "step": 156650 }, { "epoch": 6.099887111214917, "grad_norm": 207.42413330078125, "learning_rate": 1.9513605044960877e-05, "loss": 0.4375, "step": 156700 }, { "epoch": 6.101833469578419, "grad_norm": 1.6912468671798706, "learning_rate": 1.950387325314337e-05, "loss": 0.3118, "step": 156750 }, { "epoch": 6.103779827941921, "grad_norm": 48.396663665771484, "learning_rate": 
1.949414146132586e-05, "loss": 0.3447, "step": 156800 }, { "epoch": 6.105726186305422, "grad_norm": 0.4083416759967804, "learning_rate": 1.948440966950835e-05, "loss": 0.5882, "step": 156850 }, { "epoch": 6.107672544668924, "grad_norm": 34.54789733886719, "learning_rate": 1.9474677877690843e-05, "loss": 0.5442, "step": 156900 }, { "epoch": 6.109618903032426, "grad_norm": 9.686118125915527, "learning_rate": 1.946494608587333e-05, "loss": 0.3831, "step": 156950 }, { "epoch": 6.111565261395929, "grad_norm": 32.14944839477539, "learning_rate": 1.9455214294055824e-05, "loss": 0.4627, "step": 157000 }, { "epoch": 6.11351161975943, "grad_norm": 10.397801399230957, "learning_rate": 1.9445482502238313e-05, "loss": 0.4819, "step": 157050 }, { "epoch": 6.115457978122932, "grad_norm": 6.615303039550781, "learning_rate": 1.94357507104208e-05, "loss": 0.3555, "step": 157100 }, { "epoch": 6.117404336486434, "grad_norm": 11.995848655700684, "learning_rate": 1.9426018918603294e-05, "loss": 0.4904, "step": 157150 }, { "epoch": 6.119350694849936, "grad_norm": 2.466294050216675, "learning_rate": 1.9416287126785786e-05, "loss": 0.36, "step": 157200 }, { "epoch": 6.121297053213437, "grad_norm": 39.90447998046875, "learning_rate": 1.9406555334968278e-05, "loss": 0.428, "step": 157250 }, { "epoch": 6.123243411576939, "grad_norm": 19.361326217651367, "learning_rate": 1.9396823543150767e-05, "loss": 0.4938, "step": 157300 }, { "epoch": 6.125189769940442, "grad_norm": 51.88689041137695, "learning_rate": 1.9387091751333255e-05, "loss": 0.4594, "step": 157350 }, { "epoch": 6.127136128303944, "grad_norm": 7.311392784118652, "learning_rate": 1.9377359959515748e-05, "loss": 0.4859, "step": 157400 }, { "epoch": 6.129082486667445, "grad_norm": 40.08598709106445, "learning_rate": 1.9367628167698236e-05, "loss": 0.4326, "step": 157450 }, { "epoch": 6.131028845030947, "grad_norm": 72.93892669677734, "learning_rate": 1.9357896375880725e-05, "loss": 0.444, "step": 157500 }, { "epoch": 6.132975203394449, 
"grad_norm": 12.450722694396973, "learning_rate": 1.934816458406322e-05, "loss": 0.4037, "step": 157550 }, { "epoch": 6.134921561757951, "grad_norm": 14.572117805480957, "learning_rate": 1.933843279224571e-05, "loss": 0.4177, "step": 157600 }, { "epoch": 6.136867920121452, "grad_norm": 51.43737030029297, "learning_rate": 1.9328701000428202e-05, "loss": 0.4525, "step": 157650 }, { "epoch": 6.138814278484954, "grad_norm": 13.203474044799805, "learning_rate": 1.931896920861069e-05, "loss": 0.4543, "step": 157700 }, { "epoch": 6.140760636848457, "grad_norm": 9.115474700927734, "learning_rate": 1.930923741679318e-05, "loss": 0.4844, "step": 157750 }, { "epoch": 6.142706995211959, "grad_norm": 7.616307258605957, "learning_rate": 1.929950562497567e-05, "loss": 0.4648, "step": 157800 }, { "epoch": 6.14465335357546, "grad_norm": 37.609500885009766, "learning_rate": 1.928977383315816e-05, "loss": 0.3962, "step": 157850 }, { "epoch": 6.146599711938962, "grad_norm": 15.38205623626709, "learning_rate": 1.9280042041340653e-05, "loss": 0.3877, "step": 157900 }, { "epoch": 6.148546070302464, "grad_norm": 31.797788619995117, "learning_rate": 1.9270310249523145e-05, "loss": 0.5446, "step": 157950 }, { "epoch": 6.150492428665966, "grad_norm": 0.35846465826034546, "learning_rate": 1.9260578457705633e-05, "loss": 0.4121, "step": 158000 }, { "epoch": 6.1524387870294674, "grad_norm": 73.22759246826172, "learning_rate": 1.9250846665888126e-05, "loss": 0.3957, "step": 158050 }, { "epoch": 6.15438514539297, "grad_norm": 10.111128807067871, "learning_rate": 1.9241114874070614e-05, "loss": 0.3485, "step": 158100 }, { "epoch": 6.156331503756472, "grad_norm": 8.28075122833252, "learning_rate": 1.9231383082253103e-05, "loss": 0.4143, "step": 158150 }, { "epoch": 6.158277862119974, "grad_norm": 2.5658810138702393, "learning_rate": 1.9221651290435595e-05, "loss": 0.4904, "step": 158200 }, { "epoch": 6.160224220483475, "grad_norm": 2.767765760421753, "learning_rate": 1.9211919498618088e-05, "loss": 
0.4308, "step": 158250 }, { "epoch": 6.162170578846977, "grad_norm": 11.51589298248291, "learning_rate": 1.9202187706800576e-05, "loss": 0.5037, "step": 158300 }, { "epoch": 6.164116937210479, "grad_norm": 18.034000396728516, "learning_rate": 1.919245591498307e-05, "loss": 0.4237, "step": 158350 }, { "epoch": 6.166063295573981, "grad_norm": 13.62302017211914, "learning_rate": 1.9182724123165557e-05, "loss": 0.4174, "step": 158400 }, { "epoch": 6.168009653937483, "grad_norm": 73.90406036376953, "learning_rate": 1.917299233134805e-05, "loss": 0.4261, "step": 158450 }, { "epoch": 6.169956012300985, "grad_norm": 39.62423324584961, "learning_rate": 1.916345517536689e-05, "loss": 0.4021, "step": 158500 }, { "epoch": 6.171902370664487, "grad_norm": 0.2424212396144867, "learning_rate": 1.915372338354938e-05, "loss": 0.3736, "step": 158550 }, { "epoch": 6.173848729027989, "grad_norm": 19.04499626159668, "learning_rate": 1.914399159173187e-05, "loss": 0.4586, "step": 158600 }, { "epoch": 6.17579508739149, "grad_norm": 64.3941879272461, "learning_rate": 1.913425979991436e-05, "loss": 0.4206, "step": 158650 }, { "epoch": 6.177741445754992, "grad_norm": 6.526639938354492, "learning_rate": 1.91247226439332e-05, "loss": 0.5269, "step": 158700 }, { "epoch": 6.179687804118494, "grad_norm": 11.641952514648438, "learning_rate": 1.9114990852115692e-05, "loss": 0.4591, "step": 158750 }, { "epoch": 6.181634162481997, "grad_norm": 29.792831420898438, "learning_rate": 1.9105259060298184e-05, "loss": 0.4856, "step": 158800 }, { "epoch": 6.183580520845498, "grad_norm": 3.98466420173645, "learning_rate": 1.9095527268480673e-05, "loss": 0.4459, "step": 158850 }, { "epoch": 6.185526879209, "grad_norm": 0.175152987241745, "learning_rate": 1.9085795476663165e-05, "loss": 0.3292, "step": 158900 }, { "epoch": 6.187473237572502, "grad_norm": 5.449732303619385, "learning_rate": 1.9076063684845654e-05, "loss": 0.3419, "step": 158950 }, { "epoch": 6.189419595936004, "grad_norm": 17.26066780090332, 
"learning_rate": 1.9066331893028143e-05, "loss": 0.5604, "step": 159000 }, { "epoch": 6.191365954299505, "grad_norm": 30.819442749023438, "learning_rate": 1.9056600101210635e-05, "loss": 0.4147, "step": 159050 }, { "epoch": 6.193312312663007, "grad_norm": 13.20460033416748, "learning_rate": 1.9046868309393127e-05, "loss": 0.4388, "step": 159100 }, { "epoch": 6.19525867102651, "grad_norm": 50.725704193115234, "learning_rate": 1.9037136517575616e-05, "loss": 0.4151, "step": 159150 }, { "epoch": 6.197205029390012, "grad_norm": 55.710304260253906, "learning_rate": 1.9027404725758108e-05, "loss": 0.4247, "step": 159200 }, { "epoch": 6.199151387753513, "grad_norm": 7.072338104248047, "learning_rate": 1.9017672933940597e-05, "loss": 0.4814, "step": 159250 }, { "epoch": 6.201097746117015, "grad_norm": 16.21757698059082, "learning_rate": 1.900794114212309e-05, "loss": 0.3555, "step": 159300 }, { "epoch": 6.203044104480517, "grad_norm": 119.4755859375, "learning_rate": 1.8998209350305578e-05, "loss": 0.4509, "step": 159350 }, { "epoch": 6.204990462844019, "grad_norm": 13.783527374267578, "learning_rate": 1.898847755848807e-05, "loss": 0.4367, "step": 159400 }, { "epoch": 6.20693682120752, "grad_norm": 241.25157165527344, "learning_rate": 1.8978745766670562e-05, "loss": 0.4458, "step": 159450 }, { "epoch": 6.208883179571022, "grad_norm": 18.61009407043457, "learning_rate": 1.896901397485305e-05, "loss": 0.3881, "step": 159500 }, { "epoch": 6.210829537934525, "grad_norm": 65.35408020019531, "learning_rate": 1.8959282183035543e-05, "loss": 0.5039, "step": 159550 }, { "epoch": 6.212775896298027, "grad_norm": 53.62315368652344, "learning_rate": 1.8949550391218032e-05, "loss": 0.4077, "step": 159600 }, { "epoch": 6.214722254661528, "grad_norm": 31.48565673828125, "learning_rate": 1.893981859940052e-05, "loss": 0.4006, "step": 159650 }, { "epoch": 6.21666861302503, "grad_norm": 16.8851318359375, "learning_rate": 1.8930086807583013e-05, "loss": 0.3988, "step": 159700 }, { "epoch": 
6.218614971388532, "grad_norm": 0.3442683219909668, "learning_rate": 1.8920355015765502e-05, "loss": 0.4347, "step": 159750 }, { "epoch": 6.220561329752034, "grad_norm": 28.33207893371582, "learning_rate": 1.8910623223947994e-05, "loss": 0.4905, "step": 159800 }, { "epoch": 6.2225076881155355, "grad_norm": 7.22058629989624, "learning_rate": 1.8900891432130486e-05, "loss": 0.4205, "step": 159850 }, { "epoch": 6.224454046479038, "grad_norm": 19.302865982055664, "learning_rate": 1.8891159640312975e-05, "loss": 0.464, "step": 159900 }, { "epoch": 6.22640040484254, "grad_norm": 12.403021812438965, "learning_rate": 1.8881427848495467e-05, "loss": 0.454, "step": 159950 }, { "epoch": 6.228346763206042, "grad_norm": 13.831979751586914, "learning_rate": 1.8871696056677956e-05, "loss": 0.4029, "step": 160000 }, { "epoch": 6.230293121569543, "grad_norm": 16.252643585205078, "learning_rate": 1.8861964264860445e-05, "loss": 0.5484, "step": 160050 }, { "epoch": 6.232239479933045, "grad_norm": 34.256160736083984, "learning_rate": 1.8852232473042937e-05, "loss": 0.3751, "step": 160100 }, { "epoch": 6.234185838296547, "grad_norm": 13.429237365722656, "learning_rate": 1.884250068122543e-05, "loss": 0.366, "step": 160150 }, { "epoch": 6.236132196660049, "grad_norm": 38.21743392944336, "learning_rate": 1.8832768889407918e-05, "loss": 0.5077, "step": 160200 }, { "epoch": 6.238078555023551, "grad_norm": 20.37754249572754, "learning_rate": 1.882303709759041e-05, "loss": 0.3965, "step": 160250 }, { "epoch": 6.240024913387053, "grad_norm": 31.65819549560547, "learning_rate": 1.88133053057729e-05, "loss": 0.4637, "step": 160300 }, { "epoch": 6.241971271750555, "grad_norm": 6.4414753913879395, "learning_rate": 1.880357351395539e-05, "loss": 0.3854, "step": 160350 }, { "epoch": 6.243917630114057, "grad_norm": 5.641358852386475, "learning_rate": 1.879384172213788e-05, "loss": 0.3934, "step": 160400 }, { "epoch": 6.245863988477558, "grad_norm": 17.410694122314453, "learning_rate": 
1.8784109930320372e-05, "loss": 0.4321, "step": 160450 }, { "epoch": 6.24781034684106, "grad_norm": 137.3054656982422, "learning_rate": 1.8774378138502864e-05, "loss": 0.446, "step": 160500 }, { "epoch": 6.249756705204562, "grad_norm": 16.930944442749023, "learning_rate": 1.8764646346685353e-05, "loss": 0.4962, "step": 160550 }, { "epoch": 6.251703063568064, "grad_norm": 5.440722942352295, "learning_rate": 1.8754914554867845e-05, "loss": 0.4135, "step": 160600 }, { "epoch": 6.253649421931566, "grad_norm": 35.647430419921875, "learning_rate": 1.8745182763050334e-05, "loss": 0.427, "step": 160650 }, { "epoch": 6.255595780295068, "grad_norm": 77.83668518066406, "learning_rate": 1.8735450971232823e-05, "loss": 0.435, "step": 160700 }, { "epoch": 6.25754213865857, "grad_norm": 46.391883850097656, "learning_rate": 1.8725719179415315e-05, "loss": 0.484, "step": 160750 }, { "epoch": 6.259488497022072, "grad_norm": 32.473121643066406, "learning_rate": 1.8715987387597804e-05, "loss": 0.3588, "step": 160800 }, { "epoch": 6.261434855385573, "grad_norm": 42.228912353515625, "learning_rate": 1.8706255595780296e-05, "loss": 0.4254, "step": 160850 }, { "epoch": 6.263381213749075, "grad_norm": 11.934520721435547, "learning_rate": 1.8696523803962788e-05, "loss": 0.4567, "step": 160900 }, { "epoch": 6.265327572112577, "grad_norm": 6.510226249694824, "learning_rate": 1.8686792012145277e-05, "loss": 0.4801, "step": 160950 }, { "epoch": 6.26727393047608, "grad_norm": 0.27869871258735657, "learning_rate": 1.867706022032777e-05, "loss": 0.325, "step": 161000 }, { "epoch": 6.269220288839581, "grad_norm": 4.568768501281738, "learning_rate": 1.8667328428510258e-05, "loss": 0.4558, "step": 161050 }, { "epoch": 6.271166647203083, "grad_norm": 22.888038635253906, "learning_rate": 1.8657596636692747e-05, "loss": 0.4558, "step": 161100 }, { "epoch": 6.273113005566585, "grad_norm": 6.0665459632873535, "learning_rate": 1.864786484487524e-05, "loss": 0.4251, "step": 161150 }, { "epoch": 
6.275059363930087, "grad_norm": 5.7257914543151855, "learning_rate": 1.863813305305773e-05, "loss": 0.4005, "step": 161200 }, { "epoch": 6.277005722293588, "grad_norm": 0.9024612903594971, "learning_rate": 1.862840126124022e-05, "loss": 0.4655, "step": 161250 }, { "epoch": 6.27895208065709, "grad_norm": 22.764698028564453, "learning_rate": 1.8618669469422712e-05, "loss": 0.4522, "step": 161300 }, { "epoch": 6.280898439020593, "grad_norm": 19.43137550354004, "learning_rate": 1.86089376776052e-05, "loss": 0.4967, "step": 161350 }, { "epoch": 6.282844797384095, "grad_norm": 29.61271095275879, "learning_rate": 1.8599205885787693e-05, "loss": 0.4467, "step": 161400 }, { "epoch": 6.284791155747596, "grad_norm": 32.52839660644531, "learning_rate": 1.8589474093970182e-05, "loss": 0.4012, "step": 161450 }, { "epoch": 6.286737514111098, "grad_norm": 97.26527404785156, "learning_rate": 1.8579742302152674e-05, "loss": 0.325, "step": 161500 }, { "epoch": 6.2886838724746, "grad_norm": 152.7152862548828, "learning_rate": 1.8570010510335166e-05, "loss": 0.4106, "step": 161550 }, { "epoch": 6.290630230838102, "grad_norm": 3.3697128295898438, "learning_rate": 1.8560278718517655e-05, "loss": 0.4448, "step": 161600 }, { "epoch": 6.2925765892016035, "grad_norm": 61.639549255371094, "learning_rate": 1.8550546926700144e-05, "loss": 0.4198, "step": 161650 }, { "epoch": 6.294522947565106, "grad_norm": 0.8601705431938171, "learning_rate": 1.8540815134882636e-05, "loss": 0.4657, "step": 161700 }, { "epoch": 6.296469305928608, "grad_norm": 37.344913482666016, "learning_rate": 1.8531083343065125e-05, "loss": 0.483, "step": 161750 }, { "epoch": 6.29841566429211, "grad_norm": 119.39787292480469, "learning_rate": 1.8521351551247617e-05, "loss": 0.4317, "step": 161800 }, { "epoch": 6.300362022655611, "grad_norm": 29.999801635742188, "learning_rate": 1.8511619759430106e-05, "loss": 0.3736, "step": 161850 }, { "epoch": 6.302308381019113, "grad_norm": 23.579345703125, "learning_rate": 
1.8501887967612598e-05, "loss": 0.4225, "step": 161900 }, { "epoch": 6.304254739382615, "grad_norm": 11.237874984741211, "learning_rate": 1.849215617579509e-05, "loss": 0.4149, "step": 161950 }, { "epoch": 6.306201097746117, "grad_norm": 21.445850372314453, "learning_rate": 1.848242438397758e-05, "loss": 0.3945, "step": 162000 }, { "epoch": 6.308147456109619, "grad_norm": 2.539973258972168, "learning_rate": 1.847269259216007e-05, "loss": 0.3686, "step": 162050 }, { "epoch": 6.310093814473121, "grad_norm": 10.708599090576172, "learning_rate": 1.846296080034256e-05, "loss": 0.3781, "step": 162100 }, { "epoch": 6.312040172836623, "grad_norm": 27.037824630737305, "learning_rate": 1.845322900852505e-05, "loss": 0.3943, "step": 162150 }, { "epoch": 6.313986531200125, "grad_norm": 16.29541778564453, "learning_rate": 1.844349721670754e-05, "loss": 0.5384, "step": 162200 }, { "epoch": 6.315932889563626, "grad_norm": 11.450916290283203, "learning_rate": 1.8433765424890033e-05, "loss": 0.3779, "step": 162250 }, { "epoch": 6.317879247927128, "grad_norm": 12.848800659179688, "learning_rate": 1.8424033633072522e-05, "loss": 0.5011, "step": 162300 }, { "epoch": 6.31982560629063, "grad_norm": 109.42160034179688, "learning_rate": 1.8414301841255014e-05, "loss": 0.3823, "step": 162350 }, { "epoch": 6.321771964654132, "grad_norm": 5.256141185760498, "learning_rate": 1.8404570049437503e-05, "loss": 0.4575, "step": 162400 }, { "epoch": 6.323718323017634, "grad_norm": 0.5374679565429688, "learning_rate": 1.8394838257619995e-05, "loss": 0.4985, "step": 162450 }, { "epoch": 6.325664681381136, "grad_norm": 73.47692108154297, "learning_rate": 1.8385106465802484e-05, "loss": 0.4532, "step": 162500 }, { "epoch": 6.327611039744638, "grad_norm": 10.123400688171387, "learning_rate": 1.8375374673984972e-05, "loss": 0.4333, "step": 162550 }, { "epoch": 6.32955739810814, "grad_norm": 4.759811878204346, "learning_rate": 1.8365642882167468e-05, "loss": 0.3661, "step": 162600 }, { "epoch": 
6.331503756471641, "grad_norm": 19.795320510864258, "learning_rate": 1.8355911090349957e-05, "loss": 0.3865, "step": 162650 }, { "epoch": 6.333450114835143, "grad_norm": 11.715143203735352, "learning_rate": 1.8346179298532446e-05, "loss": 0.4667, "step": 162700 }, { "epoch": 6.335396473198645, "grad_norm": 20.30187225341797, "learning_rate": 1.8336447506714938e-05, "loss": 0.3973, "step": 162750 }, { "epoch": 6.337342831562148, "grad_norm": 1.8292460441589355, "learning_rate": 1.8326715714897427e-05, "loss": 0.4634, "step": 162800 }, { "epoch": 6.339289189925649, "grad_norm": 37.13668441772461, "learning_rate": 1.8317178558916267e-05, "loss": 0.4146, "step": 162850 }, { "epoch": 6.341235548289151, "grad_norm": 23.011384963989258, "learning_rate": 1.830744676709876e-05, "loss": 0.3539, "step": 162900 }, { "epoch": 6.343181906652653, "grad_norm": 11.635228157043457, "learning_rate": 1.8297714975281248e-05, "loss": 0.5004, "step": 162950 }, { "epoch": 6.345128265016155, "grad_norm": 32.78508377075195, "learning_rate": 1.828798318346374e-05, "loss": 0.5205, "step": 163000 }, { "epoch": 6.347074623379656, "grad_norm": 24.06715965270996, "learning_rate": 1.8278251391646232e-05, "loss": 0.4853, "step": 163050 }, { "epoch": 6.349020981743158, "grad_norm": 39.78276443481445, "learning_rate": 1.826851959982872e-05, "loss": 0.4612, "step": 163100 }, { "epoch": 6.35096734010666, "grad_norm": 48.03463363647461, "learning_rate": 1.8258787808011213e-05, "loss": 0.4451, "step": 163150 }, { "epoch": 6.352913698470163, "grad_norm": 23.799102783203125, "learning_rate": 1.8249056016193702e-05, "loss": 0.4796, "step": 163200 }, { "epoch": 6.354860056833664, "grad_norm": 42.144386291503906, "learning_rate": 1.823932422437619e-05, "loss": 0.487, "step": 163250 }, { "epoch": 6.356806415197166, "grad_norm": 36.88383102416992, "learning_rate": 1.8229592432558683e-05, "loss": 0.3947, "step": 163300 }, { "epoch": 6.358752773560668, "grad_norm": 36.441829681396484, "learning_rate": 
1.8219860640741175e-05, "loss": 0.5132, "step": 163350 }, { "epoch": 6.36069913192417, "grad_norm": 8.627400398254395, "learning_rate": 1.8210128848923667e-05, "loss": 0.451, "step": 163400 }, { "epoch": 6.3626454902876715, "grad_norm": 21.450843811035156, "learning_rate": 1.8200397057106156e-05, "loss": 0.56, "step": 163450 }, { "epoch": 6.364591848651173, "grad_norm": 6.085022449493408, "learning_rate": 1.8190665265288645e-05, "loss": 0.4514, "step": 163500 }, { "epoch": 6.366538207014676, "grad_norm": 28.007122039794922, "learning_rate": 1.8180933473471137e-05, "loss": 0.4223, "step": 163550 }, { "epoch": 6.368484565378178, "grad_norm": 93.67680358886719, "learning_rate": 1.8171201681653626e-05, "loss": 0.4643, "step": 163600 }, { "epoch": 6.370430923741679, "grad_norm": 24.45863151550293, "learning_rate": 1.8161469889836115e-05, "loss": 0.4672, "step": 163650 }, { "epoch": 6.372377282105181, "grad_norm": 28.838844299316406, "learning_rate": 1.8151738098018607e-05, "loss": 0.4889, "step": 163700 }, { "epoch": 6.374323640468683, "grad_norm": 44.29559326171875, "learning_rate": 1.81420063062011e-05, "loss": 0.3925, "step": 163750 }, { "epoch": 6.376269998832185, "grad_norm": 10.107538223266602, "learning_rate": 1.813227451438359e-05, "loss": 0.5713, "step": 163800 }, { "epoch": 6.3782163571956865, "grad_norm": 47.3173713684082, "learning_rate": 1.812254272256608e-05, "loss": 0.5073, "step": 163850 }, { "epoch": 6.380162715559189, "grad_norm": 34.003658294677734, "learning_rate": 1.811281093074857e-05, "loss": 0.4163, "step": 163900 }, { "epoch": 6.382109073922691, "grad_norm": 8.202786445617676, "learning_rate": 1.810307913893106e-05, "loss": 0.4838, "step": 163950 }, { "epoch": 6.384055432286193, "grad_norm": 25.210220336914062, "learning_rate": 1.809334734711355e-05, "loss": 0.4826, "step": 164000 }, { "epoch": 6.386001790649694, "grad_norm": 3.6907715797424316, "learning_rate": 1.8083615555296042e-05, "loss": 0.4457, "step": 164050 }, { "epoch": 
6.387948149013196, "grad_norm": 0.8716779351234436, "learning_rate": 1.8073883763478534e-05, "loss": 0.5195, "step": 164100 }, { "epoch": 6.389894507376698, "grad_norm": 33.83924102783203, "learning_rate": 1.8064151971661023e-05, "loss": 0.4398, "step": 164150 }, { "epoch": 6.3918408657402, "grad_norm": 19.970422744750977, "learning_rate": 1.8054420179843515e-05, "loss": 0.564, "step": 164200 }, { "epoch": 6.393787224103702, "grad_norm": 5.726925373077393, "learning_rate": 1.8044688388026004e-05, "loss": 0.5103, "step": 164250 }, { "epoch": 6.395733582467204, "grad_norm": 10.435811996459961, "learning_rate": 1.8034956596208493e-05, "loss": 0.5144, "step": 164300 }, { "epoch": 6.397679940830706, "grad_norm": 22.380281448364258, "learning_rate": 1.8025224804390985e-05, "loss": 0.3932, "step": 164350 }, { "epoch": 6.399626299194208, "grad_norm": 15.051977157592773, "learning_rate": 1.8015493012573477e-05, "loss": 0.3967, "step": 164400 }, { "epoch": 6.401572657557709, "grad_norm": 7.8569841384887695, "learning_rate": 1.800576122075597e-05, "loss": 0.4086, "step": 164450 }, { "epoch": 6.403519015921211, "grad_norm": 33.51527786254883, "learning_rate": 1.7996029428938458e-05, "loss": 0.5256, "step": 164500 }, { "epoch": 6.405465374284713, "grad_norm": 7.364991188049316, "learning_rate": 1.7986297637120947e-05, "loss": 0.4526, "step": 164550 }, { "epoch": 6.407411732648216, "grad_norm": 33.84960174560547, "learning_rate": 1.797656584530344e-05, "loss": 0.4796, "step": 164600 }, { "epoch": 6.409358091011717, "grad_norm": 67.4484634399414, "learning_rate": 1.7966834053485928e-05, "loss": 0.4387, "step": 164650 }, { "epoch": 6.411304449375219, "grad_norm": 14.073902130126953, "learning_rate": 1.7957102261668417e-05, "loss": 0.5252, "step": 164700 }, { "epoch": 6.413250807738721, "grad_norm": 8.136956214904785, "learning_rate": 1.794737046985091e-05, "loss": 0.4871, "step": 164750 }, { "epoch": 6.415197166102223, "grad_norm": 112.5655517578125, "learning_rate": 
1.79376386780334e-05, "loss": 0.4196, "step": 164800 }, { "epoch": 6.4171435244657244, "grad_norm": 50.12166976928711, "learning_rate": 1.7927906886215893e-05, "loss": 0.5581, "step": 164850 }, { "epoch": 6.419089882829226, "grad_norm": 14.34860897064209, "learning_rate": 1.7918175094398382e-05, "loss": 0.539, "step": 164900 }, { "epoch": 6.421036241192729, "grad_norm": 19.631179809570312, "learning_rate": 1.790844330258087e-05, "loss": 0.4728, "step": 164950 }, { "epoch": 6.422982599556231, "grad_norm": 49.741519927978516, "learning_rate": 1.7898711510763363e-05, "loss": 0.5075, "step": 165000 }, { "epoch": 6.424928957919732, "grad_norm": 15.354835510253906, "learning_rate": 1.788897971894585e-05, "loss": 0.4054, "step": 165050 }, { "epoch": 6.426875316283234, "grad_norm": 67.64155578613281, "learning_rate": 1.7879247927128344e-05, "loss": 0.4094, "step": 165100 }, { "epoch": 6.428821674646736, "grad_norm": 7.105681419372559, "learning_rate": 1.7869516135310836e-05, "loss": 0.4541, "step": 165150 }, { "epoch": 6.430768033010238, "grad_norm": 33.354671478271484, "learning_rate": 1.7859784343493325e-05, "loss": 0.4049, "step": 165200 }, { "epoch": 6.4327143913737395, "grad_norm": 30.970449447631836, "learning_rate": 1.7850052551675817e-05, "loss": 0.4464, "step": 165250 }, { "epoch": 6.434660749737241, "grad_norm": 97.85169982910156, "learning_rate": 1.7840320759858306e-05, "loss": 0.5332, "step": 165300 }, { "epoch": 6.436607108100744, "grad_norm": 38.69292068481445, "learning_rate": 1.7830588968040795e-05, "loss": 0.4437, "step": 165350 }, { "epoch": 6.438553466464246, "grad_norm": 12.331722259521484, "learning_rate": 1.7820857176223287e-05, "loss": 0.3875, "step": 165400 }, { "epoch": 6.440499824827747, "grad_norm": 35.633731842041016, "learning_rate": 1.781112538440578e-05, "loss": 0.5129, "step": 165450 }, { "epoch": 6.442446183191249, "grad_norm": 20.54450225830078, "learning_rate": 1.7801393592588268e-05, "loss": 0.3828, "step": 165500 }, { "epoch": 
6.444392541554751, "grad_norm": 11.044628143310547, "learning_rate": 1.779166180077076e-05, "loss": 0.5205, "step": 165550 }, { "epoch": 6.446338899918253, "grad_norm": 9.249312400817871, "learning_rate": 1.778193000895325e-05, "loss": 0.5452, "step": 165600 }, { "epoch": 6.4482852582817545, "grad_norm": 19.215213775634766, "learning_rate": 1.777219821713574e-05, "loss": 0.4338, "step": 165650 }, { "epoch": 6.450231616645257, "grad_norm": 0.5105314254760742, "learning_rate": 1.776246642531823e-05, "loss": 0.3551, "step": 165700 }, { "epoch": 6.452177975008759, "grad_norm": 33.68521499633789, "learning_rate": 1.775273463350072e-05, "loss": 0.4558, "step": 165750 }, { "epoch": 6.454124333372261, "grad_norm": 68.068603515625, "learning_rate": 1.774300284168321e-05, "loss": 0.4899, "step": 165800 }, { "epoch": 6.456070691735762, "grad_norm": 127.41122436523438, "learning_rate": 1.7733271049865703e-05, "loss": 0.4998, "step": 165850 }, { "epoch": 6.458017050099264, "grad_norm": 15.116394996643066, "learning_rate": 1.7723539258048195e-05, "loss": 0.3822, "step": 165900 }, { "epoch": 6.459963408462766, "grad_norm": 22.472278594970703, "learning_rate": 1.7713807466230684e-05, "loss": 0.4376, "step": 165950 }, { "epoch": 6.461909766826268, "grad_norm": 13.101561546325684, "learning_rate": 1.7704075674413173e-05, "loss": 0.463, "step": 166000 }, { "epoch": 6.4638561251897695, "grad_norm": 1.9539737701416016, "learning_rate": 1.7694343882595665e-05, "loss": 0.487, "step": 166050 }, { "epoch": 6.465802483553272, "grad_norm": 6.149293422698975, "learning_rate": 1.7684612090778154e-05, "loss": 0.4222, "step": 166100 }, { "epoch": 6.467748841916774, "grad_norm": 23.06251335144043, "learning_rate": 1.7674880298960646e-05, "loss": 0.3833, "step": 166150 }, { "epoch": 6.469695200280276, "grad_norm": 3.5130128860473633, "learning_rate": 1.7665148507143138e-05, "loss": 0.4838, "step": 166200 }, { "epoch": 6.471641558643777, "grad_norm": 0.19443346560001373, "learning_rate": 
1.7655416715325627e-05, "loss": 0.3956, "step": 166250 }, { "epoch": 6.473587917007279, "grad_norm": 38.29069519042969, "learning_rate": 1.764568492350812e-05, "loss": 0.3994, "step": 166300 }, { "epoch": 6.475534275370781, "grad_norm": 51.387107849121094, "learning_rate": 1.7635953131690608e-05, "loss": 0.4514, "step": 166350 }, { "epoch": 6.477480633734283, "grad_norm": 80.25624084472656, "learning_rate": 1.7626221339873096e-05, "loss": 0.3383, "step": 166400 }, { "epoch": 6.479426992097785, "grad_norm": 2.4838273525238037, "learning_rate": 1.761648954805559e-05, "loss": 0.4287, "step": 166450 }, { "epoch": 6.481373350461287, "grad_norm": 144.6028594970703, "learning_rate": 1.760675775623808e-05, "loss": 0.5075, "step": 166500 }, { "epoch": 6.483319708824789, "grad_norm": 69.26531219482422, "learning_rate": 1.759702596442057e-05, "loss": 0.4596, "step": 166550 }, { "epoch": 6.485266067188291, "grad_norm": 8.002422332763672, "learning_rate": 1.7587294172603062e-05, "loss": 0.4063, "step": 166600 }, { "epoch": 6.4872124255517924, "grad_norm": 101.27769470214844, "learning_rate": 1.757756238078555e-05, "loss": 0.4411, "step": 166650 }, { "epoch": 6.489158783915294, "grad_norm": 66.18395233154297, "learning_rate": 1.7567830588968043e-05, "loss": 0.3988, "step": 166700 }, { "epoch": 6.491105142278796, "grad_norm": 27.5322322845459, "learning_rate": 1.7558293432986883e-05, "loss": 0.4759, "step": 166750 }, { "epoch": 6.493051500642299, "grad_norm": 4.20414400100708, "learning_rate": 1.7548561641169372e-05, "loss": 0.4957, "step": 166800 }, { "epoch": 6.4949978590058, "grad_norm": 24.160593032836914, "learning_rate": 1.753882984935186e-05, "loss": 0.4469, "step": 166850 }, { "epoch": 6.496944217369302, "grad_norm": 7.635683059692383, "learning_rate": 1.7529098057534353e-05, "loss": 0.3965, "step": 166900 }, { "epoch": 6.498890575732804, "grad_norm": 34.403011322021484, "learning_rate": 1.7519366265716845e-05, "loss": 0.3793, "step": 166950 }, { "epoch": 
6.500836934096306, "grad_norm": 32.63420104980469, "learning_rate": 1.7509634473899337e-05, "loss": 0.396, "step": 167000 }, { "epoch": 6.5027832924598075, "grad_norm": 1.6379088163375854, "learning_rate": 1.7499902682081826e-05, "loss": 0.3962, "step": 167050 }, { "epoch": 6.504729650823309, "grad_norm": 39.58858108520508, "learning_rate": 1.7490170890264315e-05, "loss": 0.4293, "step": 167100 }, { "epoch": 6.506676009186812, "grad_norm": 0.21514013409614563, "learning_rate": 1.7480439098446807e-05, "loss": 0.5195, "step": 167150 }, { "epoch": 6.508622367550314, "grad_norm": 4.564968109130859, "learning_rate": 1.7470707306629296e-05, "loss": 0.4707, "step": 167200 }, { "epoch": 6.510568725913815, "grad_norm": 15.301307678222656, "learning_rate": 1.7460975514811788e-05, "loss": 0.3979, "step": 167250 }, { "epoch": 6.512515084277317, "grad_norm": 3.09021258354187, "learning_rate": 1.745124372299428e-05, "loss": 0.4157, "step": 167300 }, { "epoch": 6.514461442640819, "grad_norm": 123.56671905517578, "learning_rate": 1.744151193117677e-05, "loss": 0.4439, "step": 167350 }, { "epoch": 6.516407801004321, "grad_norm": 3.495093822479248, "learning_rate": 1.743178013935926e-05, "loss": 0.5308, "step": 167400 }, { "epoch": 6.5183541593678225, "grad_norm": 1.8619613647460938, "learning_rate": 1.742204834754175e-05, "loss": 0.4872, "step": 167450 }, { "epoch": 6.520300517731325, "grad_norm": 2.8039398193359375, "learning_rate": 1.741231655572424e-05, "loss": 0.4792, "step": 167500 }, { "epoch": 6.522246876094827, "grad_norm": 44.60377883911133, "learning_rate": 1.740258476390673e-05, "loss": 0.409, "step": 167550 }, { "epoch": 6.524193234458329, "grad_norm": 22.685863494873047, "learning_rate": 1.739285297208922e-05, "loss": 0.4477, "step": 167600 }, { "epoch": 6.52613959282183, "grad_norm": 30.780317306518555, "learning_rate": 1.7383121180271715e-05, "loss": 0.4951, "step": 167650 }, { "epoch": 6.528085951185332, "grad_norm": 21.907108306884766, "learning_rate": 
1.7373389388454204e-05, "loss": 0.3782, "step": 167700 }, { "epoch": 6.530032309548834, "grad_norm": 22.628963470458984, "learning_rate": 1.7363657596636693e-05, "loss": 0.4514, "step": 167750 }, { "epoch": 6.531978667912336, "grad_norm": 18.88793182373047, "learning_rate": 1.7353925804819185e-05, "loss": 0.3776, "step": 167800 }, { "epoch": 6.533925026275838, "grad_norm": 19.25441551208496, "learning_rate": 1.7344194013001674e-05, "loss": 0.4444, "step": 167850 }, { "epoch": 6.53587138463934, "grad_norm": 6.278541564941406, "learning_rate": 1.7334462221184163e-05, "loss": 0.4397, "step": 167900 }, { "epoch": 6.537817743002842, "grad_norm": 10.813148498535156, "learning_rate": 1.7324730429366655e-05, "loss": 0.4902, "step": 167950 }, { "epoch": 6.539764101366344, "grad_norm": 1.8225017786026, "learning_rate": 1.7314998637549147e-05, "loss": 0.4855, "step": 168000 }, { "epoch": 6.541710459729845, "grad_norm": 14.661171913146973, "learning_rate": 1.730526684573164e-05, "loss": 0.4139, "step": 168050 }, { "epoch": 6.543656818093347, "grad_norm": 51.11988830566406, "learning_rate": 1.7295535053914128e-05, "loss": 0.396, "step": 168100 }, { "epoch": 6.545603176456849, "grad_norm": null, "learning_rate": 1.728599789793297e-05, "loss": 0.4702, "step": 168150 }, { "epoch": 6.547549534820351, "grad_norm": 11.350605010986328, "learning_rate": 1.727626610611546e-05, "loss": 0.4534, "step": 168200 }, { "epoch": 6.549495893183853, "grad_norm": 18.933795928955078, "learning_rate": 1.726653431429795e-05, "loss": 0.4438, "step": 168250 }, { "epoch": 6.551442251547355, "grad_norm": 0.5532538294792175, "learning_rate": 1.7256802522480438e-05, "loss": 0.3604, "step": 168300 }, { "epoch": 6.553388609910857, "grad_norm": 269.0112609863281, "learning_rate": 1.724707073066293e-05, "loss": 0.4358, "step": 168350 }, { "epoch": 6.555334968274359, "grad_norm": 4.343705654144287, "learning_rate": 1.7237338938845422e-05, "loss": 0.411, "step": 168400 }, { "epoch": 6.5572813266378605, 
"grad_norm": 13.431562423706055, "learning_rate": 1.722760714702791e-05, "loss": 0.4616, "step": 168450 }, { "epoch": 6.559227685001362, "grad_norm": 0.5652437210083008, "learning_rate": 1.7217875355210403e-05, "loss": 0.5287, "step": 168500 }, { "epoch": 6.561174043364864, "grad_norm": 41.882225036621094, "learning_rate": 1.7208143563392892e-05, "loss": 0.5048, "step": 168550 }, { "epoch": 6.563120401728366, "grad_norm": 110.62078094482422, "learning_rate": 1.7198411771575384e-05, "loss": 0.4007, "step": 168600 }, { "epoch": 6.565066760091868, "grad_norm": 0.5447094440460205, "learning_rate": 1.7188679979757873e-05, "loss": 0.3892, "step": 168650 }, { "epoch": 6.56701311845537, "grad_norm": 0.82981276512146, "learning_rate": 1.7178948187940362e-05, "loss": 0.4663, "step": 168700 }, { "epoch": 6.568959476818872, "grad_norm": 28.209880828857422, "learning_rate": 1.7169216396122854e-05, "loss": 0.3678, "step": 168750 }, { "epoch": 6.570905835182374, "grad_norm": 1.1150568723678589, "learning_rate": 1.7159484604305346e-05, "loss": 0.4142, "step": 168800 }, { "epoch": 6.5728521935458755, "grad_norm": 33.58174133300781, "learning_rate": 1.7149752812487835e-05, "loss": 0.4737, "step": 168850 }, { "epoch": 6.574798551909377, "grad_norm": 7.121500015258789, "learning_rate": 1.7140021020670327e-05, "loss": 0.4309, "step": 168900 }, { "epoch": 6.576744910272879, "grad_norm": 28.292343139648438, "learning_rate": 1.7130289228852816e-05, "loss": 0.4633, "step": 168950 }, { "epoch": 6.578691268636382, "grad_norm": 3.615755558013916, "learning_rate": 1.7120557437035308e-05, "loss": 0.3922, "step": 169000 }, { "epoch": 6.580637626999883, "grad_norm": 26.641544342041016, "learning_rate": 1.7110825645217797e-05, "loss": 0.4421, "step": 169050 }, { "epoch": 6.582583985363385, "grad_norm": 61.53840255737305, "learning_rate": 1.710109385340029e-05, "loss": 0.4029, "step": 169100 }, { "epoch": 6.584530343726887, "grad_norm": 60.58290100097656, "learning_rate": 1.709136206158278e-05, 
"loss": 0.4701, "step": 169150 }, { "epoch": 6.586476702090389, "grad_norm": 0.7324537634849548, "learning_rate": 1.708163026976527e-05, "loss": 0.3948, "step": 169200 }, { "epoch": 6.5884230604538905, "grad_norm": 5.871123790740967, "learning_rate": 1.707189847794776e-05, "loss": 0.426, "step": 169250 }, { "epoch": 6.590369418817392, "grad_norm": 3.628249406814575, "learning_rate": 1.706216668613025e-05, "loss": 0.3528, "step": 169300 }, { "epoch": 6.592315777180895, "grad_norm": 63.30718994140625, "learning_rate": 1.705243489431274e-05, "loss": 0.4941, "step": 169350 }, { "epoch": 6.594262135544397, "grad_norm": 28.927669525146484, "learning_rate": 1.7042703102495232e-05, "loss": 0.4034, "step": 169400 }, { "epoch": 6.596208493907898, "grad_norm": 110.71542358398438, "learning_rate": 1.7032971310677724e-05, "loss": 0.4462, "step": 169450 }, { "epoch": 6.5981548522714, "grad_norm": 5.976147651672363, "learning_rate": 1.7023239518860213e-05, "loss": 0.5044, "step": 169500 }, { "epoch": 6.600101210634902, "grad_norm": 497.32781982421875, "learning_rate": 1.7013507727042705e-05, "loss": 0.3666, "step": 169550 }, { "epoch": 6.602047568998404, "grad_norm": 17.364734649658203, "learning_rate": 1.7003775935225194e-05, "loss": 0.4531, "step": 169600 }, { "epoch": 6.6039939273619055, "grad_norm": 0.8668875694274902, "learning_rate": 1.6994044143407686e-05, "loss": 0.4635, "step": 169650 }, { "epoch": 6.605940285725408, "grad_norm": 16.016597747802734, "learning_rate": 1.6984312351590175e-05, "loss": 0.3438, "step": 169700 }, { "epoch": 6.60788664408891, "grad_norm": 19.273040771484375, "learning_rate": 1.6974580559772664e-05, "loss": 0.3719, "step": 169750 }, { "epoch": 6.609833002452412, "grad_norm": 27.25801658630371, "learning_rate": 1.6964848767955156e-05, "loss": 0.3639, "step": 169800 }, { "epoch": 6.611779360815913, "grad_norm": 29.864049911499023, "learning_rate": 1.6955116976137648e-05, "loss": 0.5545, "step": 169850 }, { "epoch": 6.613725719179415, "grad_norm": 
81.00566864013672, "learning_rate": 1.6945385184320137e-05, "loss": 0.3032, "step": 169900 }, { "epoch": 6.615672077542917, "grad_norm": 16.994285583496094, "learning_rate": 1.693565339250263e-05, "loss": 0.5125, "step": 169950 }, { "epoch": 6.617618435906419, "grad_norm": 73.93651580810547, "learning_rate": 1.6925921600685118e-05, "loss": 0.4799, "step": 170000 }, { "epoch": 6.619564794269921, "grad_norm": 69.49459075927734, "learning_rate": 1.691618980886761e-05, "loss": 0.4788, "step": 170050 }, { "epoch": 6.621511152633423, "grad_norm": 35.26637268066406, "learning_rate": 1.69064580170501e-05, "loss": 0.4547, "step": 170100 }, { "epoch": 6.623457510996925, "grad_norm": 11.494912147521973, "learning_rate": 1.689672622523259e-05, "loss": 0.4528, "step": 170150 }, { "epoch": 6.625403869360427, "grad_norm": 0.7823959589004517, "learning_rate": 1.6886994433415083e-05, "loss": 0.431, "step": 170200 }, { "epoch": 6.6273502277239285, "grad_norm": 1.2535557746887207, "learning_rate": 1.6877262641597572e-05, "loss": 0.4663, "step": 170250 }, { "epoch": 6.62929658608743, "grad_norm": 20.921730041503906, "learning_rate": 1.686753084978006e-05, "loss": 0.3896, "step": 170300 }, { "epoch": 6.631242944450932, "grad_norm": 3.8683385848999023, "learning_rate": 1.6857799057962553e-05, "loss": 0.4316, "step": 170350 }, { "epoch": 6.633189302814435, "grad_norm": 24.812400817871094, "learning_rate": 1.6848067266145042e-05, "loss": 0.4339, "step": 170400 }, { "epoch": 6.635135661177936, "grad_norm": 28.19902229309082, "learning_rate": 1.6838335474327534e-05, "loss": 0.4019, "step": 170450 }, { "epoch": 6.637082019541438, "grad_norm": 10.423519134521484, "learning_rate": 1.6828603682510026e-05, "loss": 0.3909, "step": 170500 }, { "epoch": 6.63902837790494, "grad_norm": 68.85060119628906, "learning_rate": 1.6818871890692515e-05, "loss": 0.5066, "step": 170550 }, { "epoch": 6.640974736268442, "grad_norm": 0.14533741772174835, "learning_rate": 1.6809140098875007e-05, "loss": 0.4009, 
"step": 170600 }, { "epoch": 6.6429210946319435, "grad_norm": 14.585738182067871, "learning_rate": 1.6799408307057496e-05, "loss": 0.4941, "step": 170650 }, { "epoch": 6.644867452995445, "grad_norm": 10.630616188049316, "learning_rate": 1.6789676515239985e-05, "loss": 0.3578, "step": 170700 }, { "epoch": 6.646813811358948, "grad_norm": 87.86992645263672, "learning_rate": 1.6779944723422477e-05, "loss": 0.4039, "step": 170750 }, { "epoch": 6.64876016972245, "grad_norm": 7.54316520690918, "learning_rate": 1.6770212931604966e-05, "loss": 0.498, "step": 170800 }, { "epoch": 6.650706528085951, "grad_norm": 41.721473693847656, "learning_rate": 1.6760481139787458e-05, "loss": 0.4191, "step": 170850 }, { "epoch": 6.652652886449453, "grad_norm": 63.801448822021484, "learning_rate": 1.675074934796995e-05, "loss": 0.4441, "step": 170900 }, { "epoch": 6.654599244812955, "grad_norm": 47.36151885986328, "learning_rate": 1.674101755615244e-05, "loss": 0.3513, "step": 170950 }, { "epoch": 6.656545603176457, "grad_norm": 30.822750091552734, "learning_rate": 1.673128576433493e-05, "loss": 0.3446, "step": 171000 }, { "epoch": 6.6584919615399585, "grad_norm": 20.216995239257812, "learning_rate": 1.672155397251742e-05, "loss": 0.5555, "step": 171050 }, { "epoch": 6.66043831990346, "grad_norm": 38.93817901611328, "learning_rate": 1.6711822180699912e-05, "loss": 0.4792, "step": 171100 }, { "epoch": 6.662384678266963, "grad_norm": 17.984939575195312, "learning_rate": 1.67020903888824e-05, "loss": 0.5292, "step": 171150 }, { "epoch": 6.664331036630465, "grad_norm": 6.522416114807129, "learning_rate": 1.6692358597064893e-05, "loss": 0.3394, "step": 171200 }, { "epoch": 6.666277394993966, "grad_norm": 2.8281068801879883, "learning_rate": 1.6682626805247385e-05, "loss": 0.4719, "step": 171250 }, { "epoch": 6.668223753357468, "grad_norm": 50.66307830810547, "learning_rate": 1.6672895013429874e-05, "loss": 0.3724, "step": 171300 }, { "epoch": 6.67017011172097, "grad_norm": 144.461669921875, 
"learning_rate": 1.6663163221612363e-05, "loss": 0.4004, "step": 171350 }, { "epoch": 6.672116470084472, "grad_norm": 36.30421829223633, "learning_rate": 1.6653431429794855e-05, "loss": 0.4618, "step": 171400 }, { "epoch": 6.6740628284479735, "grad_norm": 6.674366474151611, "learning_rate": 1.6643699637977344e-05, "loss": 0.3555, "step": 171450 }, { "epoch": 6.676009186811475, "grad_norm": 10.946220397949219, "learning_rate": 1.6633967846159836e-05, "loss": 0.4952, "step": 171500 }, { "epoch": 6.677955545174978, "grad_norm": 3.956291675567627, "learning_rate": 1.6624236054342328e-05, "loss": 0.4602, "step": 171550 }, { "epoch": 6.67990190353848, "grad_norm": 9.007140159606934, "learning_rate": 1.6614504262524817e-05, "loss": 0.4144, "step": 171600 }, { "epoch": 6.6818482619019814, "grad_norm": 8.70380687713623, "learning_rate": 1.660477247070731e-05, "loss": 0.3674, "step": 171650 }, { "epoch": 6.683794620265483, "grad_norm": 17.44054412841797, "learning_rate": 1.6595040678889798e-05, "loss": 0.4589, "step": 171700 }, { "epoch": 6.685740978628985, "grad_norm": 85.075439453125, "learning_rate": 1.6585308887072287e-05, "loss": 0.4352, "step": 171750 }, { "epoch": 6.687687336992487, "grad_norm": 5.803760051727295, "learning_rate": 1.657557709525478e-05, "loss": 0.4776, "step": 171800 }, { "epoch": 6.6896336953559885, "grad_norm": 90.54117584228516, "learning_rate": 1.6565845303437268e-05, "loss": 0.4478, "step": 171850 }, { "epoch": 6.691580053719491, "grad_norm": 36.33639907836914, "learning_rate": 1.655611351161976e-05, "loss": 0.403, "step": 171900 }, { "epoch": 6.693526412082993, "grad_norm": 193.7175750732422, "learning_rate": 1.6546381719802252e-05, "loss": 0.397, "step": 171950 }, { "epoch": 6.695472770446495, "grad_norm": 55.96407699584961, "learning_rate": 1.653664992798474e-05, "loss": 0.4223, "step": 172000 }, { "epoch": 6.6974191288099965, "grad_norm": 87.12764739990234, "learning_rate": 1.6526918136167233e-05, "loss": 0.4486, "step": 172050 }, { "epoch": 
6.699365487173498, "grad_norm": 81.18550872802734, "learning_rate": 1.6517186344349722e-05, "loss": 0.4537, "step": 172100 }, { "epoch": 6.701311845537, "grad_norm": 16.30360221862793, "learning_rate": 1.6507454552532214e-05, "loss": 0.4293, "step": 172150 }, { "epoch": 6.703258203900502, "grad_norm": 0.892579972743988, "learning_rate": 1.6497722760714703e-05, "loss": 0.4127, "step": 172200 }, { "epoch": 6.705204562264004, "grad_norm": 90.41192626953125, "learning_rate": 1.6487990968897195e-05, "loss": 0.5064, "step": 172250 }, { "epoch": 6.707150920627506, "grad_norm": 17.318647384643555, "learning_rate": 1.6478259177079687e-05, "loss": 0.4245, "step": 172300 }, { "epoch": 6.709097278991008, "grad_norm": 46.909584045410156, "learning_rate": 1.6468722021098527e-05, "loss": 0.4503, "step": 172350 }, { "epoch": 6.71104363735451, "grad_norm": 20.43644905090332, "learning_rate": 1.6458990229281016e-05, "loss": 0.4684, "step": 172400 }, { "epoch": 6.7129899957180115, "grad_norm": 2.1099941730499268, "learning_rate": 1.644925843746351e-05, "loss": 0.4039, "step": 172450 }, { "epoch": 6.714936354081513, "grad_norm": 0.8152977824211121, "learning_rate": 1.643972128148235e-05, "loss": 0.4812, "step": 172500 }, { "epoch": 6.716882712445015, "grad_norm": 1.2001897096633911, "learning_rate": 1.6429989489664838e-05, "loss": 0.3548, "step": 172550 }, { "epoch": 6.718829070808518, "grad_norm": 8.137870788574219, "learning_rate": 1.6420257697847326e-05, "loss": 0.5458, "step": 172600 }, { "epoch": 6.720775429172019, "grad_norm": 2.851652145385742, "learning_rate": 1.641052590602982e-05, "loss": 0.4557, "step": 172650 }, { "epoch": 6.722721787535521, "grad_norm": 5.98255729675293, "learning_rate": 1.6400794114212307e-05, "loss": 0.3957, "step": 172700 }, { "epoch": 6.724668145899023, "grad_norm": 74.15946197509766, "learning_rate": 1.6391256958231148e-05, "loss": 0.5755, "step": 172750 }, { "epoch": 6.726614504262525, "grad_norm": 11.896809577941895, "learning_rate": 
1.638152516641364e-05, "loss": 0.5065, "step": 172800 }, { "epoch": 6.7285608626260265, "grad_norm": 7.54029655456543, "learning_rate": 1.6371793374596132e-05, "loss": 0.404, "step": 172850 }, { "epoch": 6.730507220989528, "grad_norm": 47.70340347290039, "learning_rate": 1.6362061582778624e-05, "loss": 0.5577, "step": 172900 }, { "epoch": 6.732453579353031, "grad_norm": 29.179927825927734, "learning_rate": 1.6352329790961113e-05, "loss": 0.4032, "step": 172950 }, { "epoch": 6.734399937716533, "grad_norm": 43.555458068847656, "learning_rate": 1.6342597999143602e-05, "loss": 0.5284, "step": 173000 }, { "epoch": 6.736346296080034, "grad_norm": 2.3026933670043945, "learning_rate": 1.6332866207326094e-05, "loss": 0.5416, "step": 173050 }, { "epoch": 6.738292654443536, "grad_norm": 7.419118881225586, "learning_rate": 1.6323134415508583e-05, "loss": 0.4014, "step": 173100 }, { "epoch": 6.740239012807038, "grad_norm": 6.990537166595459, "learning_rate": 1.6313402623691075e-05, "loss": 0.4784, "step": 173150 }, { "epoch": 6.74218537117054, "grad_norm": 0.31381234526634216, "learning_rate": 1.6303670831873567e-05, "loss": 0.4983, "step": 173200 }, { "epoch": 6.7441317295340415, "grad_norm": 103.41786193847656, "learning_rate": 1.6293939040056056e-05, "loss": 0.4305, "step": 173250 }, { "epoch": 6.746078087897544, "grad_norm": 131.96315002441406, "learning_rate": 1.6284207248238548e-05, "loss": 0.5526, "step": 173300 }, { "epoch": 6.748024446261046, "grad_norm": 55.33472442626953, "learning_rate": 1.6274475456421037e-05, "loss": 0.5643, "step": 173350 }, { "epoch": 6.749970804624548, "grad_norm": 3.25771427154541, "learning_rate": 1.6264743664603526e-05, "loss": 0.426, "step": 173400 }, { "epoch": 6.7519171629880494, "grad_norm": 19.115589141845703, "learning_rate": 1.6255011872786018e-05, "loss": 0.4203, "step": 173450 }, { "epoch": 6.753863521351551, "grad_norm": 8.432541847229004, "learning_rate": 1.624528008096851e-05, "loss": 0.4443, "step": 173500 }, { "epoch": 
6.755809879715053, "grad_norm": 38.60895538330078, "learning_rate": 1.6235548289151e-05, "loss": 0.4463, "step": 173550 }, { "epoch": 6.757756238078555, "grad_norm": 14.178308486938477, "learning_rate": 1.622581649733349e-05, "loss": 0.3639, "step": 173600 }, { "epoch": 6.759702596442057, "grad_norm": 7.151598930358887, "learning_rate": 1.621608470551598e-05, "loss": 0.5109, "step": 173650 }, { "epoch": 6.761648954805559, "grad_norm": 4.330681324005127, "learning_rate": 1.6206352913698472e-05, "loss": 0.4377, "step": 173700 }, { "epoch": 6.763595313169061, "grad_norm": 99.73262786865234, "learning_rate": 1.619662112188096e-05, "loss": 0.4738, "step": 173750 }, { "epoch": 6.765541671532563, "grad_norm": 58.88716506958008, "learning_rate": 1.618688933006345e-05, "loss": 0.4178, "step": 173800 }, { "epoch": 6.7674880298960645, "grad_norm": 20.461563110351562, "learning_rate": 1.6177157538245942e-05, "loss": 0.421, "step": 173850 }, { "epoch": 6.769434388259566, "grad_norm": 5.343242645263672, "learning_rate": 1.6167425746428434e-05, "loss": 0.5026, "step": 173900 }, { "epoch": 6.771380746623068, "grad_norm": 79.31763458251953, "learning_rate": 1.6157693954610923e-05, "loss": 0.3947, "step": 173950 }, { "epoch": 6.77332710498657, "grad_norm": 38.26629638671875, "learning_rate": 1.6147962162793415e-05, "loss": 0.4565, "step": 174000 }, { "epoch": 6.7752734633500715, "grad_norm": 34.237125396728516, "learning_rate": 1.6138230370975904e-05, "loss": 0.3903, "step": 174050 }, { "epoch": 6.777219821713574, "grad_norm": 14.353682518005371, "learning_rate": 1.6128498579158396e-05, "loss": 0.4097, "step": 174100 }, { "epoch": 6.779166180077076, "grad_norm": 43.9599609375, "learning_rate": 1.6118766787340885e-05, "loss": 0.4123, "step": 174150 }, { "epoch": 6.781112538440578, "grad_norm": 0.737567126750946, "learning_rate": 1.6109034995523377e-05, "loss": 0.507, "step": 174200 }, { "epoch": 6.7830588968040795, "grad_norm": 58.32870101928711, "learning_rate": 
1.609930320370587e-05, "loss": 0.4334, "step": 174250 }, { "epoch": 6.785005255167581, "grad_norm": 12.36221694946289, "learning_rate": 1.6089571411888358e-05, "loss": 0.4775, "step": 174300 }, { "epoch": 6.786951613531083, "grad_norm": 19.081192016601562, "learning_rate": 1.607983962007085e-05, "loss": 0.4425, "step": 174350 }, { "epoch": 6.788897971894585, "grad_norm": 3.697744369506836, "learning_rate": 1.607010782825334e-05, "loss": 0.5261, "step": 174400 }, { "epoch": 6.790844330258087, "grad_norm": 9.81187915802002, "learning_rate": 1.6060376036435828e-05, "loss": 0.3813, "step": 174450 }, { "epoch": 6.792790688621589, "grad_norm": 4.128719806671143, "learning_rate": 1.605064424461832e-05, "loss": 0.4404, "step": 174500 }, { "epoch": 6.794737046985091, "grad_norm": 4.65837287902832, "learning_rate": 1.6040912452800812e-05, "loss": 0.5282, "step": 174550 }, { "epoch": 6.796683405348593, "grad_norm": 11.053204536437988, "learning_rate": 1.60311806609833e-05, "loss": 0.4531, "step": 174600 }, { "epoch": 6.7986297637120945, "grad_norm": 5.666337966918945, "learning_rate": 1.6021448869165793e-05, "loss": 0.3235, "step": 174650 }, { "epoch": 6.800576122075596, "grad_norm": 23.113445281982422, "learning_rate": 1.601171707734828e-05, "loss": 0.5475, "step": 174700 }, { "epoch": 6.802522480439098, "grad_norm": 24.457948684692383, "learning_rate": 1.6001985285530774e-05, "loss": 0.4446, "step": 174750 }, { "epoch": 6.804468838802601, "grad_norm": 36.229312896728516, "learning_rate": 1.5992253493713263e-05, "loss": 0.4678, "step": 174800 }, { "epoch": 6.806415197166102, "grad_norm": 59.58543395996094, "learning_rate": 1.598252170189575e-05, "loss": 0.4241, "step": 174850 }, { "epoch": 6.808361555529604, "grad_norm": 2.2173614501953125, "learning_rate": 1.5972789910078244e-05, "loss": 0.5051, "step": 174900 }, { "epoch": 6.810307913893106, "grad_norm": 0.8717502355575562, "learning_rate": 1.5963058118260736e-05, "loss": 0.4755, "step": 174950 }, { "epoch": 
6.812254272256608, "grad_norm": 7.971965789794922, "learning_rate": 1.5953326326443225e-05, "loss": 0.4224, "step": 175000 }, { "epoch": 6.8142006306201095, "grad_norm": 216.59898376464844, "learning_rate": 1.5943594534625717e-05, "loss": 0.4491, "step": 175050 }, { "epoch": 6.816146988983611, "grad_norm": 42.34726333618164, "learning_rate": 1.5933862742808206e-05, "loss": 0.3901, "step": 175100 }, { "epoch": 6.818093347347114, "grad_norm": 3.8182296752929688, "learning_rate": 1.5924130950990698e-05, "loss": 0.4609, "step": 175150 }, { "epoch": 6.820039705710616, "grad_norm": 89.68109893798828, "learning_rate": 1.5914399159173187e-05, "loss": 0.4341, "step": 175200 }, { "epoch": 6.8219860640741175, "grad_norm": 0.164622500538826, "learning_rate": 1.590466736735568e-05, "loss": 0.5291, "step": 175250 }, { "epoch": 6.823932422437619, "grad_norm": 100.59160614013672, "learning_rate": 1.589493557553817e-05, "loss": 0.4748, "step": 175300 }, { "epoch": 6.825878780801121, "grad_norm": 6.656195640563965, "learning_rate": 1.588520378372066e-05, "loss": 0.465, "step": 175350 }, { "epoch": 6.827825139164623, "grad_norm": 56.103965759277344, "learning_rate": 1.587547199190315e-05, "loss": 0.5049, "step": 175400 }, { "epoch": 6.8297714975281245, "grad_norm": 1.6164345741271973, "learning_rate": 1.586574020008564e-05, "loss": 0.4027, "step": 175450 }, { "epoch": 6.831717855891627, "grad_norm": 2.5568299293518066, "learning_rate": 1.585600840826813e-05, "loss": 0.3161, "step": 175500 }, { "epoch": 6.833664214255129, "grad_norm": 107.04721069335938, "learning_rate": 1.584627661645062e-05, "loss": 0.381, "step": 175550 }, { "epoch": 6.835610572618631, "grad_norm": 27.296253204345703, "learning_rate": 1.5836544824633114e-05, "loss": 0.447, "step": 175600 }, { "epoch": 6.8375569309821325, "grad_norm": 16.71565818786621, "learning_rate": 1.5826813032815603e-05, "loss": 0.5042, "step": 175650 }, { "epoch": 6.839503289345634, "grad_norm": 22.469036102294922, "learning_rate": 
1.5817081240998095e-05, "loss": 0.3793, "step": 175700 }, { "epoch": 6.841449647709136, "grad_norm": 26.498794555664062, "learning_rate": 1.5807349449180584e-05, "loss": 0.3701, "step": 175750 }, { "epoch": 6.843396006072638, "grad_norm": 3.5625040531158447, "learning_rate": 1.5797617657363076e-05, "loss": 0.3961, "step": 175800 }, { "epoch": 6.84534236443614, "grad_norm": 6.342215061187744, "learning_rate": 1.5787885865545565e-05, "loss": 0.4494, "step": 175850 }, { "epoch": 6.847288722799642, "grad_norm": 2.209322214126587, "learning_rate": 1.5778154073728053e-05, "loss": 0.5147, "step": 175900 }, { "epoch": 6.849235081163144, "grad_norm": 18.162052154541016, "learning_rate": 1.5768422281910546e-05, "loss": 0.4878, "step": 175950 }, { "epoch": 6.851181439526646, "grad_norm": 34.20637130737305, "learning_rate": 1.5758690490093038e-05, "loss": 0.4528, "step": 176000 }, { "epoch": 6.8531277978901475, "grad_norm": 76.78726959228516, "learning_rate": 1.5748958698275526e-05, "loss": 0.5245, "step": 176050 }, { "epoch": 6.855074156253649, "grad_norm": 16.091270446777344, "learning_rate": 1.573922690645802e-05, "loss": 0.4058, "step": 176100 }, { "epoch": 6.857020514617151, "grad_norm": 25.3853702545166, "learning_rate": 1.5729495114640507e-05, "loss": 0.4587, "step": 176150 }, { "epoch": 6.858966872980654, "grad_norm": 28.858661651611328, "learning_rate": 1.5719763322823e-05, "loss": 0.4355, "step": 176200 }, { "epoch": 6.860913231344155, "grad_norm": 16.213464736938477, "learning_rate": 1.571003153100549e-05, "loss": 0.4686, "step": 176250 }, { "epoch": 6.862859589707657, "grad_norm": 21.057289123535156, "learning_rate": 1.570029973918798e-05, "loss": 0.4666, "step": 176300 }, { "epoch": 6.864805948071159, "grad_norm": 34.83915328979492, "learning_rate": 1.5690567947370473e-05, "loss": 0.3924, "step": 176350 }, { "epoch": 6.866752306434661, "grad_norm": 45.161376953125, "learning_rate": 1.568083615555296e-05, "loss": 0.4301, "step": 176400 }, { "epoch": 
6.8686986647981625, "grad_norm": 19.5456485748291, "learning_rate": 1.567110436373545e-05, "loss": 0.4708, "step": 176450 }, { "epoch": 6.870645023161664, "grad_norm": 9.169517517089844, "learning_rate": 1.5661372571917943e-05, "loss": 0.3692, "step": 176500 }, { "epoch": 6.872591381525167, "grad_norm": 27.90553092956543, "learning_rate": 1.565164078010043e-05, "loss": 0.4442, "step": 176550 }, { "epoch": 6.874537739888669, "grad_norm": 6.996828556060791, "learning_rate": 1.5641908988282924e-05, "loss": 0.3687, "step": 176600 }, { "epoch": 6.87648409825217, "grad_norm": 7.604318141937256, "learning_rate": 1.5632177196465416e-05, "loss": 0.3625, "step": 176650 }, { "epoch": 6.878430456615672, "grad_norm": 1.9483892917633057, "learning_rate": 1.5622445404647904e-05, "loss": 0.5133, "step": 176700 }, { "epoch": 6.880376814979174, "grad_norm": 13.37495231628418, "learning_rate": 1.5612713612830397e-05, "loss": 0.4666, "step": 176750 }, { "epoch": 6.882323173342676, "grad_norm": 29.518442153930664, "learning_rate": 1.5602981821012885e-05, "loss": 0.4125, "step": 176800 }, { "epoch": 6.8842695317061775, "grad_norm": 19.60718536376953, "learning_rate": 1.5593250029195378e-05, "loss": 0.4762, "step": 176850 }, { "epoch": 6.886215890069679, "grad_norm": 58.63534164428711, "learning_rate": 1.5583518237377866e-05, "loss": 0.4558, "step": 176900 }, { "epoch": 6.888162248433181, "grad_norm": 52.750762939453125, "learning_rate": 1.5573786445560355e-05, "loss": 0.4035, "step": 176950 }, { "epoch": 6.890108606796684, "grad_norm": 14.705206871032715, "learning_rate": 1.5564054653742847e-05, "loss": 0.3832, "step": 177000 }, { "epoch": 6.8920549651601855, "grad_norm": 13.544828414916992, "learning_rate": 1.555432286192534e-05, "loss": 0.4492, "step": 177050 }, { "epoch": 6.894001323523687, "grad_norm": 17.25482749938965, "learning_rate": 1.554478570594418e-05, "loss": 0.4464, "step": 177100 }, { "epoch": 6.895947681887189, "grad_norm": 9.565489768981934, "learning_rate": 
1.5535053914126672e-05, "loss": 0.4777, "step": 177150 }, { "epoch": 6.897894040250691, "grad_norm": 12.322957992553711, "learning_rate": 1.552532212230916e-05, "loss": 0.4648, "step": 177200 }, { "epoch": 6.8998403986141925, "grad_norm": 24.077999114990234, "learning_rate": 1.551559033049165e-05, "loss": 0.4377, "step": 177250 }, { "epoch": 6.901786756977694, "grad_norm": 13.070788383483887, "learning_rate": 1.5505858538674142e-05, "loss": 0.5709, "step": 177300 }, { "epoch": 6.903733115341197, "grad_norm": 12.48793888092041, "learning_rate": 1.549612674685663e-05, "loss": 0.4725, "step": 177350 }, { "epoch": 6.905679473704699, "grad_norm": 4.744405269622803, "learning_rate": 1.5486394955039123e-05, "loss": 0.4766, "step": 177400 }, { "epoch": 6.9076258320682005, "grad_norm": 24.03437042236328, "learning_rate": 1.5476663163221615e-05, "loss": 0.3711, "step": 177450 }, { "epoch": 6.909572190431702, "grad_norm": 57.78565216064453, "learning_rate": 1.5466931371404104e-05, "loss": 0.4055, "step": 177500 }, { "epoch": 6.911518548795204, "grad_norm": 20.376176834106445, "learning_rate": 1.5457199579586596e-05, "loss": 0.5128, "step": 177550 }, { "epoch": 6.913464907158706, "grad_norm": 77.71524047851562, "learning_rate": 1.5447467787769085e-05, "loss": 0.3637, "step": 177600 }, { "epoch": 6.9154112655222075, "grad_norm": 11.565607070922852, "learning_rate": 1.5437735995951574e-05, "loss": 0.411, "step": 177650 }, { "epoch": 6.91735762388571, "grad_norm": 3.730390787124634, "learning_rate": 1.5428004204134066e-05, "loss": 0.4483, "step": 177700 }, { "epoch": 6.919303982249212, "grad_norm": 0.2226252406835556, "learning_rate": 1.5418272412316555e-05, "loss": 0.4548, "step": 177750 }, { "epoch": 6.921250340612714, "grad_norm": 9.682961463928223, "learning_rate": 1.5408540620499047e-05, "loss": 0.4694, "step": 177800 }, { "epoch": 6.9231966989762155, "grad_norm": 29.561433792114258, "learning_rate": 1.539880882868154e-05, "loss": 0.413, "step": 177850 }, { "epoch": 
6.925143057339717, "grad_norm": 1.1223008632659912, "learning_rate": 1.5389077036864028e-05, "loss": 0.4699, "step": 177900 }, { "epoch": 6.927089415703219, "grad_norm": 39.684200286865234, "learning_rate": 1.537934524504652e-05, "loss": 0.5002, "step": 177950 }, { "epoch": 6.929035774066721, "grad_norm": 20.282854080200195, "learning_rate": 1.536961345322901e-05, "loss": 0.4282, "step": 178000 }, { "epoch": 6.930982132430223, "grad_norm": 2.0944101810455322, "learning_rate": 1.5359881661411497e-05, "loss": 0.3972, "step": 178050 }, { "epoch": 6.932928490793725, "grad_norm": 23.140995025634766, "learning_rate": 1.535014986959399e-05, "loss": 0.3887, "step": 178100 }, { "epoch": 6.934874849157227, "grad_norm": 5.677530765533447, "learning_rate": 1.5340418077776482e-05, "loss": 0.3421, "step": 178150 }, { "epoch": 6.936821207520729, "grad_norm": 80.22833251953125, "learning_rate": 1.5330686285958974e-05, "loss": 0.3982, "step": 178200 }, { "epoch": 6.9387675658842305, "grad_norm": 20.94641876220703, "learning_rate": 1.5320954494141463e-05, "loss": 0.4058, "step": 178250 }, { "epoch": 6.940713924247732, "grad_norm": 10.123577117919922, "learning_rate": 1.531122270232395e-05, "loss": 0.524, "step": 178300 }, { "epoch": 6.942660282611234, "grad_norm": 74.12499237060547, "learning_rate": 1.5301490910506444e-05, "loss": 0.3525, "step": 178350 }, { "epoch": 6.944606640974737, "grad_norm": 72.55623626708984, "learning_rate": 1.5291759118688933e-05, "loss": 0.4571, "step": 178400 }, { "epoch": 6.946552999338238, "grad_norm": 32.10700225830078, "learning_rate": 1.5282027326871425e-05, "loss": 0.3453, "step": 178450 }, { "epoch": 6.94849935770174, "grad_norm": 14.812585830688477, "learning_rate": 1.5272295535053917e-05, "loss": 0.5218, "step": 178500 }, { "epoch": 6.950445716065242, "grad_norm": 43.18041229248047, "learning_rate": 1.5262563743236406e-05, "loss": 0.4537, "step": 178550 }, { "epoch": 6.952392074428744, "grad_norm": 45.910160064697266, "learning_rate": 
1.5252831951418898e-05, "loss": 0.4768, "step": 178600 }, { "epoch": 6.9543384327922455, "grad_norm": 0.9708271026611328, "learning_rate": 1.5243100159601387e-05, "loss": 0.4189, "step": 178650 }, { "epoch": 6.956284791155747, "grad_norm": 7.688144207000732, "learning_rate": 1.5233368367783875e-05, "loss": 0.4302, "step": 178700 }, { "epoch": 6.95823114951925, "grad_norm": 5.482926845550537, "learning_rate": 1.522363657596637e-05, "loss": 0.4483, "step": 178750 }, { "epoch": 6.960177507882752, "grad_norm": 10.415481567382812, "learning_rate": 1.5213904784148858e-05, "loss": 0.3071, "step": 178800 }, { "epoch": 6.9621238662462535, "grad_norm": 82.0234603881836, "learning_rate": 1.5204172992331347e-05, "loss": 0.4352, "step": 178850 }, { "epoch": 6.964070224609755, "grad_norm": 16.34450340270996, "learning_rate": 1.5194441200513839e-05, "loss": 0.5094, "step": 178900 }, { "epoch": 6.966016582973257, "grad_norm": 15.32991886138916, "learning_rate": 1.518470940869633e-05, "loss": 0.4888, "step": 178950 }, { "epoch": 6.967962941336759, "grad_norm": 0.612285315990448, "learning_rate": 1.5174977616878822e-05, "loss": 0.4575, "step": 179000 }, { "epoch": 6.9699092997002605, "grad_norm": 811.0374755859375, "learning_rate": 1.516524582506131e-05, "loss": 0.4793, "step": 179050 }, { "epoch": 6.971855658063763, "grad_norm": 62.36554718017578, "learning_rate": 1.5155514033243801e-05, "loss": 0.403, "step": 179100 }, { "epoch": 6.973802016427265, "grad_norm": 11.841583251953125, "learning_rate": 1.5145782241426293e-05, "loss": 0.4719, "step": 179150 }, { "epoch": 6.975748374790767, "grad_norm": 11.542821884155273, "learning_rate": 1.5136050449608782e-05, "loss": 0.3526, "step": 179200 }, { "epoch": 6.9776947331542685, "grad_norm": 56.73066329956055, "learning_rate": 1.5126318657791272e-05, "loss": 0.5104, "step": 179250 }, { "epoch": 6.97964109151777, "grad_norm": 20.087482452392578, "learning_rate": 1.5116586865973765e-05, "loss": 0.3628, "step": 179300 }, { "epoch": 
6.981587449881272, "grad_norm": 68.25955200195312, "learning_rate": 1.5106855074156253e-05, "loss": 0.3901, "step": 179350 }, { "epoch": 6.983533808244774, "grad_norm": 20.5374813079834, "learning_rate": 1.5097123282338746e-05, "loss": 0.4973, "step": 179400 }, { "epoch": 6.9854801666082755, "grad_norm": 0.9383215308189392, "learning_rate": 1.5087391490521236e-05, "loss": 0.5138, "step": 179450 }, { "epoch": 6.987426524971778, "grad_norm": 39.858116149902344, "learning_rate": 1.5077659698703725e-05, "loss": 0.4192, "step": 179500 }, { "epoch": 6.98937288333528, "grad_norm": 17.773500442504883, "learning_rate": 1.5067927906886217e-05, "loss": 0.4746, "step": 179550 }, { "epoch": 6.991319241698782, "grad_norm": 21.890804290771484, "learning_rate": 1.5058196115068708e-05, "loss": 0.4325, "step": 179600 }, { "epoch": 6.9932656000622835, "grad_norm": 15.152179718017578, "learning_rate": 1.50484643232512e-05, "loss": 0.5038, "step": 179650 }, { "epoch": 6.995211958425785, "grad_norm": 1.113182783126831, "learning_rate": 1.5038732531433689e-05, "loss": 0.4203, "step": 179700 }, { "epoch": 6.997158316789287, "grad_norm": 79.75494384765625, "learning_rate": 1.5029000739616177e-05, "loss": 0.3909, "step": 179750 }, { "epoch": 6.999104675152789, "grad_norm": 64.26984405517578, "learning_rate": 1.501926894779867e-05, "loss": 0.4078, "step": 179800 }, { "epoch": 7.0, "eval_accuracy": 0.7909221845926272, "eval_f1_macro": 0.7336279851621574, "eval_f1_weighted": 0.7867826649751204, "eval_loss": 0.838506817817688, "eval_roc_auc": 0.9392839874115949, "eval_runtime": 30.4513, "eval_samples_per_second": 843.609, "eval_steps_per_second": 105.48, "step": 179823 }, { "epoch": 7.001051033516291, "grad_norm": 8.177434921264648, "learning_rate": 1.500953715598116e-05, "loss": 0.3495, "step": 179850 }, { "epoch": 7.002997391879793, "grad_norm": 2.0902533531188965, "learning_rate": 1.4999805364163649e-05, "loss": 0.3818, "step": 179900 }, { "epoch": 7.004943750243295, "grad_norm": 
0.11049611121416092, "learning_rate": 1.4990073572346141e-05, "loss": 0.3803, "step": 179950 }, { "epoch": 7.006890108606797, "grad_norm": 23.048192977905273, "learning_rate": 1.4980341780528631e-05, "loss": 0.4413, "step": 180000 }, { "epoch": 7.0088364669702985, "grad_norm": 3.431915283203125, "learning_rate": 1.4970609988711124e-05, "loss": 0.3874, "step": 180050 }, { "epoch": 7.0107828253338, "grad_norm": 48.032997131347656, "learning_rate": 1.4960878196893612e-05, "loss": 0.3697, "step": 180100 }, { "epoch": 7.012729183697302, "grad_norm": 902.7916870117188, "learning_rate": 1.4951146405076103e-05, "loss": 0.4136, "step": 180150 }, { "epoch": 7.014675542060804, "grad_norm": 99.59768676757812, "learning_rate": 1.4941414613258595e-05, "loss": 0.4271, "step": 180200 }, { "epoch": 7.0166219004243064, "grad_norm": 10.888458251953125, "learning_rate": 1.4931682821441084e-05, "loss": 0.4641, "step": 180250 }, { "epoch": 7.018568258787808, "grad_norm": 16.093708038330078, "learning_rate": 1.4921951029623574e-05, "loss": 0.3677, "step": 180300 }, { "epoch": 7.02051461715131, "grad_norm": 40.47978973388672, "learning_rate": 1.4912219237806067e-05, "loss": 0.3844, "step": 180350 }, { "epoch": 7.022460975514812, "grad_norm": 6.511009216308594, "learning_rate": 1.4902487445988555e-05, "loss": 0.3314, "step": 180400 }, { "epoch": 7.0244073338783135, "grad_norm": 0.914084792137146, "learning_rate": 1.4892755654171048e-05, "loss": 0.4399, "step": 180450 }, { "epoch": 7.026353692241815, "grad_norm": 54.379539489746094, "learning_rate": 1.4883023862353538e-05, "loss": 0.4319, "step": 180500 }, { "epoch": 7.028300050605317, "grad_norm": 22.315540313720703, "learning_rate": 1.4873292070536027e-05, "loss": 0.3237, "step": 180550 }, { "epoch": 7.03024640896882, "grad_norm": 35.61616897583008, "learning_rate": 1.4863560278718519e-05, "loss": 0.3599, "step": 180600 }, { "epoch": 7.0321927673323215, "grad_norm": 4.870152950286865, "learning_rate": 1.4853828486901008e-05, "loss": 
0.4909, "step": 180650 }, { "epoch": 7.034139125695823, "grad_norm": 9.320188522338867, "learning_rate": 1.4844096695083502e-05, "loss": 0.3996, "step": 180700 }, { "epoch": 7.036085484059325, "grad_norm": 16.091428756713867, "learning_rate": 1.483436490326599e-05, "loss": 0.4107, "step": 180750 }, { "epoch": 7.038031842422827, "grad_norm": 21.28891372680664, "learning_rate": 1.482463311144848e-05, "loss": 0.2765, "step": 180800 }, { "epoch": 7.0399782007863285, "grad_norm": 30.57461166381836, "learning_rate": 1.4814901319630971e-05, "loss": 0.3346, "step": 180850 }, { "epoch": 7.04192455914983, "grad_norm": 132.38194274902344, "learning_rate": 1.4805169527813462e-05, "loss": 0.2986, "step": 180900 }, { "epoch": 7.043870917513333, "grad_norm": 48.24897384643555, "learning_rate": 1.479543773599595e-05, "loss": 0.3093, "step": 180950 }, { "epoch": 7.045817275876835, "grad_norm": 2.08687686920166, "learning_rate": 1.4785705944178443e-05, "loss": 0.4334, "step": 181000 }, { "epoch": 7.0477636342403365, "grad_norm": 0.14371348917484283, "learning_rate": 1.4775974152360933e-05, "loss": 0.4134, "step": 181050 }, { "epoch": 7.049709992603838, "grad_norm": 125.48160552978516, "learning_rate": 1.4766242360543426e-05, "loss": 0.3681, "step": 181100 }, { "epoch": 7.05165635096734, "grad_norm": 29.674253463745117, "learning_rate": 1.4756510568725914e-05, "loss": 0.3837, "step": 181150 }, { "epoch": 7.053602709330842, "grad_norm": 28.395517349243164, "learning_rate": 1.4746778776908405e-05, "loss": 0.383, "step": 181200 }, { "epoch": 7.0555490676943435, "grad_norm": 24.796735763549805, "learning_rate": 1.4737046985090897e-05, "loss": 0.4227, "step": 181250 }, { "epoch": 7.057495426057846, "grad_norm": 0.24337196350097656, "learning_rate": 1.4727315193273386e-05, "loss": 0.3883, "step": 181300 }, { "epoch": 7.059441784421348, "grad_norm": 0.3522365391254425, "learning_rate": 1.4717583401455876e-05, "loss": 0.4089, "step": 181350 }, { "epoch": 7.06138814278485, "grad_norm": 
12.888083457946777, "learning_rate": 1.4707851609638368e-05, "loss": 0.3068, "step": 181400 }, { "epoch": 7.0633345011483515, "grad_norm": 22.178659439086914, "learning_rate": 1.4698119817820857e-05, "loss": 0.3947, "step": 181450 }, { "epoch": 7.065280859511853, "grad_norm": 1.06340754032135, "learning_rate": 1.468838802600335e-05, "loss": 0.3348, "step": 181500 }, { "epoch": 7.067227217875355, "grad_norm": 2.6738626956939697, "learning_rate": 1.467865623418584e-05, "loss": 0.4281, "step": 181550 }, { "epoch": 7.069173576238857, "grad_norm": 181.56077575683594, "learning_rate": 1.4668924442368329e-05, "loss": 0.5483, "step": 181600 }, { "epoch": 7.071119934602359, "grad_norm": 3.7843399047851562, "learning_rate": 1.4659192650550821e-05, "loss": 0.4227, "step": 181650 }, { "epoch": 7.073066292965861, "grad_norm": 37.30977249145508, "learning_rate": 1.464946085873331e-05, "loss": 0.3173, "step": 181700 }, { "epoch": 7.075012651329363, "grad_norm": 7.4201178550720215, "learning_rate": 1.46397290669158e-05, "loss": 0.336, "step": 181750 }, { "epoch": 7.076959009692865, "grad_norm": 11.621960639953613, "learning_rate": 1.4629997275098292e-05, "loss": 0.3716, "step": 181800 }, { "epoch": 7.0789053680563665, "grad_norm": 4.277522563934326, "learning_rate": 1.4620265483280781e-05, "loss": 0.4267, "step": 181850 }, { "epoch": 7.080851726419868, "grad_norm": 0.14336463809013367, "learning_rate": 1.4610533691463273e-05, "loss": 0.3807, "step": 181900 }, { "epoch": 7.08279808478337, "grad_norm": 53.821067810058594, "learning_rate": 1.4600801899645764e-05, "loss": 0.3405, "step": 181950 }, { "epoch": 7.084744443146872, "grad_norm": 70.37870788574219, "learning_rate": 1.4591070107828253e-05, "loss": 0.3819, "step": 182000 }, { "epoch": 7.0866908015103744, "grad_norm": 0.044716548174619675, "learning_rate": 1.4581338316010745e-05, "loss": 0.3157, "step": 182050 }, { "epoch": 7.088637159873876, "grad_norm": 5.5557403564453125, "learning_rate": 1.4571606524193235e-05, "loss": 
0.367, "step": 182100 }, { "epoch": 7.090583518237378, "grad_norm": 2.494814157485962, "learning_rate": 1.4561874732375727e-05, "loss": 0.3994, "step": 182150 }, { "epoch": 7.09252987660088, "grad_norm": 1.2038137912750244, "learning_rate": 1.4552142940558216e-05, "loss": 0.3463, "step": 182200 }, { "epoch": 7.0944762349643815, "grad_norm": 46.449092864990234, "learning_rate": 1.4542411148740707e-05, "loss": 0.544, "step": 182250 }, { "epoch": 7.096422593327883, "grad_norm": 0.4709734320640564, "learning_rate": 1.4532679356923199e-05, "loss": 0.2793, "step": 182300 }, { "epoch": 7.098368951691385, "grad_norm": 4.249170303344727, "learning_rate": 1.4522947565105688e-05, "loss": 0.4087, "step": 182350 }, { "epoch": 7.100315310054888, "grad_norm": 40.74331283569336, "learning_rate": 1.4513215773288178e-05, "loss": 0.4375, "step": 182400 }, { "epoch": 7.1022616684183895, "grad_norm": 19.360843658447266, "learning_rate": 1.450367861730702e-05, "loss": 0.5285, "step": 182450 }, { "epoch": 7.104208026781891, "grad_norm": 17.026344299316406, "learning_rate": 1.449394682548951e-05, "loss": 0.4416, "step": 182500 }, { "epoch": 7.106154385145393, "grad_norm": 5.179976940155029, "learning_rate": 1.4484215033672e-05, "loss": 0.3809, "step": 182550 }, { "epoch": 7.108100743508895, "grad_norm": 0.1951877325773239, "learning_rate": 1.4474483241854492e-05, "loss": 0.3749, "step": 182600 }, { "epoch": 7.1100471018723965, "grad_norm": 4.823157787322998, "learning_rate": 1.446475145003698e-05, "loss": 0.5387, "step": 182650 }, { "epoch": 7.111993460235898, "grad_norm": 0.2376062273979187, "learning_rate": 1.4455019658219471e-05, "loss": 0.4133, "step": 182700 }, { "epoch": 7.113939818599401, "grad_norm": 7.599267482757568, "learning_rate": 1.4445287866401963e-05, "loss": 0.4479, "step": 182750 }, { "epoch": 7.115886176962903, "grad_norm": 76.02981567382812, "learning_rate": 1.4435556074584452e-05, "loss": 0.4842, "step": 182800 }, { "epoch": 7.1178325353264045, "grad_norm": 
17.477622985839844, "learning_rate": 1.4425824282766944e-05, "loss": 0.3508, "step": 182850 }, { "epoch": 7.119778893689906, "grad_norm": 38.127281188964844, "learning_rate": 1.4416092490949435e-05, "loss": 0.3583, "step": 182900 }, { "epoch": 7.121725252053408, "grad_norm": 62.956871032714844, "learning_rate": 1.4406360699131923e-05, "loss": 0.3589, "step": 182950 }, { "epoch": 7.12367161041691, "grad_norm": 23.116178512573242, "learning_rate": 1.4396628907314416e-05, "loss": 0.43, "step": 183000 }, { "epoch": 7.1256179687804115, "grad_norm": 16.618125915527344, "learning_rate": 1.4386897115496906e-05, "loss": 0.4499, "step": 183050 }, { "epoch": 7.127564327143913, "grad_norm": 13.36301326751709, "learning_rate": 1.4377165323679395e-05, "loss": 0.3887, "step": 183100 }, { "epoch": 7.129510685507416, "grad_norm": 0.1355084478855133, "learning_rate": 1.4367433531861887e-05, "loss": 0.3937, "step": 183150 }, { "epoch": 7.131457043870918, "grad_norm": 59.278438568115234, "learning_rate": 1.4357701740044377e-05, "loss": 0.4606, "step": 183200 }, { "epoch": 7.1334034022344195, "grad_norm": 6.652516841888428, "learning_rate": 1.434796994822687e-05, "loss": 0.378, "step": 183250 }, { "epoch": 7.135349760597921, "grad_norm": 22.238317489624023, "learning_rate": 1.4338238156409358e-05, "loss": 0.5083, "step": 183300 }, { "epoch": 7.137296118961423, "grad_norm": 0.6243910193443298, "learning_rate": 1.4328506364591849e-05, "loss": 0.3687, "step": 183350 }, { "epoch": 7.139242477324925, "grad_norm": 6.517775535583496, "learning_rate": 1.4318774572774341e-05, "loss": 0.3707, "step": 183400 }, { "epoch": 7.1411888356884266, "grad_norm": 10.591985702514648, "learning_rate": 1.430904278095683e-05, "loss": 0.4235, "step": 183450 }, { "epoch": 7.143135194051929, "grad_norm": 9.72916316986084, "learning_rate": 1.4299310989139322e-05, "loss": 0.3549, "step": 183500 }, { "epoch": 7.145081552415431, "grad_norm": 45.7458610534668, "learning_rate": 1.4289579197321813e-05, "loss": 0.4383, 
"step": 183550 }, { "epoch": 7.147027910778933, "grad_norm": 2.9110443592071533, "learning_rate": 1.4279847405504301e-05, "loss": 0.4608, "step": 183600 }, { "epoch": 7.1489742691424345, "grad_norm": 15.859969139099121, "learning_rate": 1.4270310249523142e-05, "loss": 0.3421, "step": 183650 }, { "epoch": 7.150920627505936, "grad_norm": 20.312519073486328, "learning_rate": 1.4260578457705634e-05, "loss": 0.3601, "step": 183700 }, { "epoch": 7.152866985869438, "grad_norm": 2.895301103591919, "learning_rate": 1.4250846665888123e-05, "loss": 0.4778, "step": 183750 }, { "epoch": 7.15481334423294, "grad_norm": 7.919884204864502, "learning_rate": 1.4241114874070617e-05, "loss": 0.3998, "step": 183800 }, { "epoch": 7.1567597025964425, "grad_norm": 9.843233108520508, "learning_rate": 1.4231383082253105e-05, "loss": 0.4142, "step": 183850 }, { "epoch": 7.158706060959944, "grad_norm": 12.053441047668457, "learning_rate": 1.4221845926271946e-05, "loss": 0.3912, "step": 183900 }, { "epoch": 7.160652419323446, "grad_norm": 49.61294174194336, "learning_rate": 1.4212114134454435e-05, "loss": 0.2921, "step": 183950 }, { "epoch": 7.162598777686948, "grad_norm": 23.205936431884766, "learning_rate": 1.4202382342636927e-05, "loss": 0.3347, "step": 184000 }, { "epoch": 7.1645451360504495, "grad_norm": 18.482946395874023, "learning_rate": 1.4192650550819417e-05, "loss": 0.3761, "step": 184050 }, { "epoch": 7.166491494413951, "grad_norm": 18.23341941833496, "learning_rate": 1.418291875900191e-05, "loss": 0.3051, "step": 184100 }, { "epoch": 7.168437852777453, "grad_norm": 4.103431701660156, "learning_rate": 1.4173186967184398e-05, "loss": 0.4069, "step": 184150 }, { "epoch": 7.170384211140956, "grad_norm": 0.6710313558578491, "learning_rate": 1.4163455175366889e-05, "loss": 0.4049, "step": 184200 }, { "epoch": 7.1723305695044575, "grad_norm": 23.082239151000977, "learning_rate": 1.415372338354938e-05, "loss": 0.3466, "step": 184250 }, { "epoch": 7.174276927867959, "grad_norm": 
30.7349853515625, "learning_rate": 1.414399159173187e-05, "loss": 0.3828, "step": 184300 }, { "epoch": 7.176223286231461, "grad_norm": 45.85377502441406, "learning_rate": 1.4134259799914362e-05, "loss": 0.5194, "step": 184350 }, { "epoch": 7.178169644594963, "grad_norm": 18.44282341003418, "learning_rate": 1.4124528008096852e-05, "loss": 0.32, "step": 184400 }, { "epoch": 7.1801160029584645, "grad_norm": 0.42556726932525635, "learning_rate": 1.4114796216279341e-05, "loss": 0.4945, "step": 184450 }, { "epoch": 7.182062361321966, "grad_norm": 12.982514381408691, "learning_rate": 1.4105064424461833e-05, "loss": 0.4693, "step": 184500 }, { "epoch": 7.184008719685468, "grad_norm": 3.535325527191162, "learning_rate": 1.4095332632644324e-05, "loss": 0.3853, "step": 184550 }, { "epoch": 7.185955078048971, "grad_norm": 48.84566116333008, "learning_rate": 1.4085600840826813e-05, "loss": 0.4326, "step": 184600 }, { "epoch": 7.1879014364124725, "grad_norm": 36.13020324707031, "learning_rate": 1.4075869049009305e-05, "loss": 0.374, "step": 184650 }, { "epoch": 7.189847794775974, "grad_norm": 15.227568626403809, "learning_rate": 1.4066137257191793e-05, "loss": 0.5266, "step": 184700 }, { "epoch": 7.191794153139476, "grad_norm": 19.566404342651367, "learning_rate": 1.4056405465374287e-05, "loss": 0.5344, "step": 184750 }, { "epoch": 7.193740511502978, "grad_norm": 162.442626953125, "learning_rate": 1.4046673673556776e-05, "loss": 0.4493, "step": 184800 }, { "epoch": 7.1956868698664795, "grad_norm": 26.74009132385254, "learning_rate": 1.4036941881739265e-05, "loss": 0.4376, "step": 184850 }, { "epoch": 7.197633228229981, "grad_norm": 61.33104705810547, "learning_rate": 1.4027210089921757e-05, "loss": 0.4343, "step": 184900 }, { "epoch": 7.199579586593484, "grad_norm": 0.587601900100708, "learning_rate": 1.4017478298104248e-05, "loss": 0.3738, "step": 184950 }, { "epoch": 7.201525944956986, "grad_norm": 35.41316604614258, "learning_rate": 1.4007746506286736e-05, "loss": 0.56, 
"step": 185000 }, { "epoch": 7.2034723033204875, "grad_norm": 26.02621078491211, "learning_rate": 1.3998014714469229e-05, "loss": 0.4544, "step": 185050 }, { "epoch": 7.205418661683989, "grad_norm": 18.80217933654785, "learning_rate": 1.3988282922651719e-05, "loss": 0.3727, "step": 185100 }, { "epoch": 7.207365020047491, "grad_norm": 34.16789627075195, "learning_rate": 1.3978551130834211e-05, "loss": 0.4231, "step": 185150 }, { "epoch": 7.209311378410993, "grad_norm": 8.130133628845215, "learning_rate": 1.39688193390167e-05, "loss": 0.4427, "step": 185200 }, { "epoch": 7.2112577367744946, "grad_norm": 24.828889846801758, "learning_rate": 1.395908754719919e-05, "loss": 0.3722, "step": 185250 }, { "epoch": 7.213204095137997, "grad_norm": 91.73816680908203, "learning_rate": 1.3949355755381683e-05, "loss": 0.3892, "step": 185300 }, { "epoch": 7.215150453501499, "grad_norm": 13.374418258666992, "learning_rate": 1.3939623963564171e-05, "loss": 0.4267, "step": 185350 }, { "epoch": 7.217096811865001, "grad_norm": 3.8777432441711426, "learning_rate": 1.3929892171746662e-05, "loss": 0.4183, "step": 185400 }, { "epoch": 7.2190431702285025, "grad_norm": 6.667410373687744, "learning_rate": 1.3920160379929154e-05, "loss": 0.3351, "step": 185450 }, { "epoch": 7.220989528592004, "grad_norm": 4.151566028594971, "learning_rate": 1.3910428588111643e-05, "loss": 0.3296, "step": 185500 }, { "epoch": 7.222935886955506, "grad_norm": 3.2558765411376953, "learning_rate": 1.3900696796294135e-05, "loss": 0.3366, "step": 185550 }, { "epoch": 7.224882245319008, "grad_norm": 17.657455444335938, "learning_rate": 1.3890965004476626e-05, "loss": 0.2976, "step": 185600 }, { "epoch": 7.22682860368251, "grad_norm": 11.439208984375, "learning_rate": 1.3881233212659114e-05, "loss": 0.3815, "step": 185650 }, { "epoch": 7.228774962046012, "grad_norm": 31.133695602416992, "learning_rate": 1.3871501420841607e-05, "loss": 0.409, "step": 185700 }, { "epoch": 7.230721320409514, "grad_norm": 
24.091655731201172, "learning_rate": 1.3861769629024095e-05, "loss": 0.4102, "step": 185750 }, { "epoch": 7.232667678773016, "grad_norm": 8.227604866027832, "learning_rate": 1.385203783720659e-05, "loss": 0.4214, "step": 185800 }, { "epoch": 7.2346140371365175, "grad_norm": 45.501251220703125, "learning_rate": 1.3842306045389078e-05, "loss": 0.4954, "step": 185850 }, { "epoch": 7.236560395500019, "grad_norm": 10.62215805053711, "learning_rate": 1.3832574253571567e-05, "loss": 0.4642, "step": 185900 }, { "epoch": 7.238506753863521, "grad_norm": 451.8420715332031, "learning_rate": 1.3822842461754059e-05, "loss": 0.3273, "step": 185950 }, { "epoch": 7.240453112227023, "grad_norm": 63.489803314208984, "learning_rate": 1.381311066993655e-05, "loss": 0.4808, "step": 186000 }, { "epoch": 7.2423994705905255, "grad_norm": 4.233457565307617, "learning_rate": 1.3803378878119038e-05, "loss": 0.3919, "step": 186050 }, { "epoch": 7.244345828954027, "grad_norm": 0.17588917911052704, "learning_rate": 1.379364708630153e-05, "loss": 0.3542, "step": 186100 }, { "epoch": 7.246292187317529, "grad_norm": 63.85796356201172, "learning_rate": 1.3783915294484021e-05, "loss": 0.4939, "step": 186150 }, { "epoch": 7.248238545681031, "grad_norm": 0.4229116141796112, "learning_rate": 1.3774183502666513e-05, "loss": 0.3236, "step": 186200 }, { "epoch": 7.2501849040445325, "grad_norm": 32.070281982421875, "learning_rate": 1.3764451710849002e-05, "loss": 0.4048, "step": 186250 }, { "epoch": 7.252131262408034, "grad_norm": 108.19818878173828, "learning_rate": 1.3754719919031492e-05, "loss": 0.4749, "step": 186300 }, { "epoch": 7.254077620771536, "grad_norm": 44.496925354003906, "learning_rate": 1.3744988127213985e-05, "loss": 0.3354, "step": 186350 }, { "epoch": 7.256023979135039, "grad_norm": 113.41729736328125, "learning_rate": 1.3735256335396473e-05, "loss": 0.5141, "step": 186400 }, { "epoch": 7.2579703374985405, "grad_norm": 68.62321472167969, "learning_rate": 1.3725524543578964e-05, "loss": 
0.3794, "step": 186450 }, { "epoch": 7.259916695862042, "grad_norm": 58.90036392211914, "learning_rate": 1.3715792751761456e-05, "loss": 0.409, "step": 186500 }, { "epoch": 7.261863054225544, "grad_norm": 163.01504516601562, "learning_rate": 1.3706060959943945e-05, "loss": 0.4064, "step": 186550 }, { "epoch": 7.263809412589046, "grad_norm": 4.317603588104248, "learning_rate": 1.3696329168126437e-05, "loss": 0.3856, "step": 186600 }, { "epoch": 7.2657557709525475, "grad_norm": 29.211990356445312, "learning_rate": 1.3686597376308927e-05, "loss": 0.4299, "step": 186650 }, { "epoch": 7.267702129316049, "grad_norm": 0.13573099672794342, "learning_rate": 1.3676865584491416e-05, "loss": 0.4106, "step": 186700 }, { "epoch": 7.269648487679552, "grad_norm": 1.9789230823516846, "learning_rate": 1.3667133792673908e-05, "loss": 0.4111, "step": 186750 }, { "epoch": 7.271594846043054, "grad_norm": 64.70892333984375, "learning_rate": 1.3657402000856397e-05, "loss": 0.3892, "step": 186800 }, { "epoch": 7.2735412044065555, "grad_norm": 13.880356788635254, "learning_rate": 1.364767020903889e-05, "loss": 0.4163, "step": 186850 }, { "epoch": 7.275487562770057, "grad_norm": 516.9116821289062, "learning_rate": 1.363793841722138e-05, "loss": 0.3891, "step": 186900 }, { "epoch": 7.277433921133559, "grad_norm": 8.60952091217041, "learning_rate": 1.3628206625403869e-05, "loss": 0.3967, "step": 186950 }, { "epoch": 7.279380279497061, "grad_norm": 27.409679412841797, "learning_rate": 1.3618474833586361e-05, "loss": 0.495, "step": 187000 }, { "epoch": 7.281326637860563, "grad_norm": 47.69477081298828, "learning_rate": 1.3608743041768851e-05, "loss": 0.3408, "step": 187050 }, { "epoch": 7.283272996224065, "grad_norm": 14.65899658203125, "learning_rate": 1.359901124995134e-05, "loss": 0.3722, "step": 187100 }, { "epoch": 7.285219354587567, "grad_norm": 0.654814600944519, "learning_rate": 1.3589279458133832e-05, "loss": 0.3786, "step": 187150 }, { "epoch": 7.287165712951069, "grad_norm": 
5.243232727050781, "learning_rate": 1.3579547666316323e-05, "loss": 0.4341, "step": 187200 }, { "epoch": 7.2891120713145705, "grad_norm": 53.85220718383789, "learning_rate": 1.3569815874498815e-05, "loss": 0.3894, "step": 187250 }, { "epoch": 7.291058429678072, "grad_norm": 25.96484375, "learning_rate": 1.3560084082681304e-05, "loss": 0.3391, "step": 187300 }, { "epoch": 7.293004788041574, "grad_norm": 46.68117141723633, "learning_rate": 1.3550352290863794e-05, "loss": 0.4722, "step": 187350 }, { "epoch": 7.294951146405076, "grad_norm": 5.5701093673706055, "learning_rate": 1.3540620499046286e-05, "loss": 0.3601, "step": 187400 }, { "epoch": 7.2968975047685785, "grad_norm": 0.16249068081378937, "learning_rate": 1.3530888707228775e-05, "loss": 0.5265, "step": 187450 }, { "epoch": 7.29884386313208, "grad_norm": 8.842360496520996, "learning_rate": 1.3521156915411266e-05, "loss": 0.4361, "step": 187500 }, { "epoch": 7.300790221495582, "grad_norm": 35.69076919555664, "learning_rate": 1.3511425123593758e-05, "loss": 0.3679, "step": 187550 }, { "epoch": 7.302736579859084, "grad_norm": 72.93497467041016, "learning_rate": 1.3501693331776247e-05, "loss": 0.3918, "step": 187600 }, { "epoch": 7.3046829382225855, "grad_norm": 42.22374725341797, "learning_rate": 1.3491961539958739e-05, "loss": 0.358, "step": 187650 }, { "epoch": 7.306629296586087, "grad_norm": 23.270000457763672, "learning_rate": 1.3482229748141228e-05, "loss": 0.3672, "step": 187700 }, { "epoch": 7.308575654949589, "grad_norm": 35.524166107177734, "learning_rate": 1.3472497956323718e-05, "loss": 0.4951, "step": 187750 }, { "epoch": 7.310522013313091, "grad_norm": 14.219884872436523, "learning_rate": 1.346276616450621e-05, "loss": 0.3762, "step": 187800 }, { "epoch": 7.3124683716765935, "grad_norm": 17.96320915222168, "learning_rate": 1.3453034372688699e-05, "loss": 0.4157, "step": 187850 }, { "epoch": 7.314414730040095, "grad_norm": 27.32589340209961, "learning_rate": 1.344330258087119e-05, "loss": 0.3096, 
"step": 187900 }, { "epoch": 7.316361088403597, "grad_norm": 26.07374382019043, "learning_rate": 1.3433570789053682e-05, "loss": 0.3638, "step": 187950 }, { "epoch": 7.318307446767099, "grad_norm": 52.43703079223633, "learning_rate": 1.342383899723617e-05, "loss": 0.4386, "step": 188000 }, { "epoch": 7.3202538051306005, "grad_norm": 31.952571868896484, "learning_rate": 1.3414107205418663e-05, "loss": 0.4034, "step": 188050 }, { "epoch": 7.322200163494102, "grad_norm": 41.925758361816406, "learning_rate": 1.3404375413601153e-05, "loss": 0.3665, "step": 188100 }, { "epoch": 7.324146521857604, "grad_norm": 0.4854191839694977, "learning_rate": 1.3394643621783642e-05, "loss": 0.356, "step": 188150 }, { "epoch": 7.326092880221107, "grad_norm": 1.0399171113967896, "learning_rate": 1.3385106465802486e-05, "loss": 0.3246, "step": 188200 }, { "epoch": 7.3280392385846085, "grad_norm": 51.59840393066406, "learning_rate": 1.3375374673984975e-05, "loss": 0.3795, "step": 188250 }, { "epoch": 7.32998559694811, "grad_norm": 37.2921028137207, "learning_rate": 1.3365642882167465e-05, "loss": 0.3363, "step": 188300 }, { "epoch": 7.331931955311612, "grad_norm": 220.36744689941406, "learning_rate": 1.3355911090349957e-05, "loss": 0.3923, "step": 188350 }, { "epoch": 7.333878313675114, "grad_norm": 22.828744888305664, "learning_rate": 1.3346179298532446e-05, "loss": 0.4526, "step": 188400 }, { "epoch": 7.3358246720386155, "grad_norm": 44.64876937866211, "learning_rate": 1.3336447506714937e-05, "loss": 0.4349, "step": 188450 }, { "epoch": 7.337771030402117, "grad_norm": 23.665224075317383, "learning_rate": 1.3326715714897429e-05, "loss": 0.3706, "step": 188500 }, { "epoch": 7.339717388765619, "grad_norm": 42.592952728271484, "learning_rate": 1.3316983923079918e-05, "loss": 0.4384, "step": 188550 }, { "epoch": 7.341663747129122, "grad_norm": 22.143966674804688, "learning_rate": 1.330725213126241e-05, "loss": 0.4177, "step": 188600 }, { "epoch": 7.3436101054926235, "grad_norm": 
10.287309646606445, "learning_rate": 1.32975203394449e-05, "loss": 0.3477, "step": 188650 }, { "epoch": 7.345556463856125, "grad_norm": 9.137386322021484, "learning_rate": 1.3287788547627389e-05, "loss": 0.3599, "step": 188700 }, { "epoch": 7.347502822219627, "grad_norm": 3.595979690551758, "learning_rate": 1.3278056755809881e-05, "loss": 0.372, "step": 188750 }, { "epoch": 7.349449180583129, "grad_norm": 9.450173377990723, "learning_rate": 1.326832496399237e-05, "loss": 0.3706, "step": 188800 }, { "epoch": 7.351395538946631, "grad_norm": 4.105078220367432, "learning_rate": 1.325859317217486e-05, "loss": 0.4998, "step": 188850 }, { "epoch": 7.353341897310132, "grad_norm": 56.695003509521484, "learning_rate": 1.3248861380357353e-05, "loss": 0.5504, "step": 188900 }, { "epoch": 7.355288255673635, "grad_norm": 0.32409924268722534, "learning_rate": 1.3239129588539841e-05, "loss": 0.3633, "step": 188950 }, { "epoch": 7.357234614037137, "grad_norm": 27.03724479675293, "learning_rate": 1.3229397796722334e-05, "loss": 0.3953, "step": 189000 }, { "epoch": 7.3591809724006385, "grad_norm": 87.56633758544922, "learning_rate": 1.3219666004904824e-05, "loss": 0.3576, "step": 189050 }, { "epoch": 7.36112733076414, "grad_norm": 13.967483520507812, "learning_rate": 1.3209934213087313e-05, "loss": 0.4401, "step": 189100 }, { "epoch": 7.363073689127642, "grad_norm": 0.08677978068590164, "learning_rate": 1.3200202421269805e-05, "loss": 0.4038, "step": 189150 }, { "epoch": 7.365020047491144, "grad_norm": 4.179988861083984, "learning_rate": 1.3190470629452296e-05, "loss": 0.3947, "step": 189200 }, { "epoch": 7.366966405854646, "grad_norm": 3.099726915359497, "learning_rate": 1.3180738837634784e-05, "loss": 0.3942, "step": 189250 }, { "epoch": 7.368912764218148, "grad_norm": 53.530914306640625, "learning_rate": 1.3171007045817276e-05, "loss": 0.4347, "step": 189300 }, { "epoch": 7.37085912258165, "grad_norm": 9.681085586547852, "learning_rate": 1.3161275253999767e-05, "loss": 0.4469, 
"step": 189350 }, { "epoch": 7.372805480945152, "grad_norm": 42.595977783203125, "learning_rate": 1.3151543462182259e-05, "loss": 0.3838, "step": 189400 }, { "epoch": 7.3747518393086535, "grad_norm": 4.156348705291748, "learning_rate": 1.3141811670364748e-05, "loss": 0.4602, "step": 189450 }, { "epoch": 7.376698197672155, "grad_norm": 19.90520668029785, "learning_rate": 1.3132079878547238e-05, "loss": 0.4037, "step": 189500 }, { "epoch": 7.378644556035657, "grad_norm": 114.3270492553711, "learning_rate": 1.312234808672973e-05, "loss": 0.4258, "step": 189550 }, { "epoch": 7.380590914399159, "grad_norm": 139.85855102539062, "learning_rate": 1.311261629491222e-05, "loss": 0.3763, "step": 189600 }, { "epoch": 7.3825372727626615, "grad_norm": 13.528799057006836, "learning_rate": 1.3102884503094712e-05, "loss": 0.3833, "step": 189650 }, { "epoch": 7.384483631126163, "grad_norm": 86.04864501953125, "learning_rate": 1.3093152711277202e-05, "loss": 0.4016, "step": 189700 }, { "epoch": 7.386429989489665, "grad_norm": 1.8877902030944824, "learning_rate": 1.3083420919459691e-05, "loss": 0.3839, "step": 189750 }, { "epoch": 7.388376347853167, "grad_norm": 16.25452423095703, "learning_rate": 1.3073689127642183e-05, "loss": 0.2781, "step": 189800 }, { "epoch": 7.3903227062166685, "grad_norm": 57.03726577758789, "learning_rate": 1.3063957335824672e-05, "loss": 0.4018, "step": 189850 }, { "epoch": 7.39226906458017, "grad_norm": 314.8751220703125, "learning_rate": 1.3054225544007162e-05, "loss": 0.5611, "step": 189900 }, { "epoch": 7.394215422943672, "grad_norm": 15.158757209777832, "learning_rate": 1.3044493752189654e-05, "loss": 0.3719, "step": 189950 }, { "epoch": 7.396161781307175, "grad_norm": 0.2697979211807251, "learning_rate": 1.3034761960372143e-05, "loss": 0.3527, "step": 190000 }, { "epoch": 7.3981081396706765, "grad_norm": 95.49191284179688, "learning_rate": 1.3025030168554635e-05, "loss": 0.4071, "step": 190050 }, { "epoch": 7.400054498034178, "grad_norm": 
0.8181177973747253, "learning_rate": 1.3015298376737126e-05, "loss": 0.4046, "step": 190100 }, { "epoch": 7.40200085639768, "grad_norm": 67.75640869140625, "learning_rate": 1.3005566584919615e-05, "loss": 0.4751, "step": 190150 }, { "epoch": 7.403947214761182, "grad_norm": 16.033540725708008, "learning_rate": 1.2995834793102107e-05, "loss": 0.3282, "step": 190200 }, { "epoch": 7.4058935731246835, "grad_norm": 35.41672897338867, "learning_rate": 1.2986103001284597e-05, "loss": 0.3716, "step": 190250 }, { "epoch": 7.407839931488185, "grad_norm": 20.560293197631836, "learning_rate": 1.2976371209467086e-05, "loss": 0.3221, "step": 190300 }, { "epoch": 7.409786289851687, "grad_norm": 52.77455520629883, "learning_rate": 1.2966639417649578e-05, "loss": 0.5433, "step": 190350 }, { "epoch": 7.41173264821519, "grad_norm": 36.49356460571289, "learning_rate": 1.2956907625832069e-05, "loss": 0.3378, "step": 190400 }, { "epoch": 7.4136790065786915, "grad_norm": 0.897151529788971, "learning_rate": 1.2947175834014561e-05, "loss": 0.4373, "step": 190450 }, { "epoch": 7.415625364942193, "grad_norm": 5.585019111633301, "learning_rate": 1.293744404219705e-05, "loss": 0.4319, "step": 190500 }, { "epoch": 7.417571723305695, "grad_norm": 21.270456314086914, "learning_rate": 1.292771225037954e-05, "loss": 0.4363, "step": 190550 }, { "epoch": 7.419518081669197, "grad_norm": 189.29531860351562, "learning_rate": 1.2917980458562032e-05, "loss": 0.4691, "step": 190600 }, { "epoch": 7.421464440032699, "grad_norm": 671.4727172851562, "learning_rate": 1.2908248666744521e-05, "loss": 0.349, "step": 190650 }, { "epoch": 7.4234107983962, "grad_norm": 22.685836791992188, "learning_rate": 1.2898516874927013e-05, "loss": 0.4076, "step": 190700 }, { "epoch": 7.425357156759703, "grad_norm": 6.094360828399658, "learning_rate": 1.2888785083109502e-05, "loss": 0.3305, "step": 190750 }, { "epoch": 7.427303515123205, "grad_norm": 0.2681405544281006, "learning_rate": 1.2879053291291993e-05, "loss": 0.3048, 
"step": 190800 }, { "epoch": 7.4292498734867065, "grad_norm": 17.841318130493164, "learning_rate": 1.2869321499474485e-05, "loss": 0.3772, "step": 190850 }, { "epoch": 7.431196231850208, "grad_norm": 73.83751678466797, "learning_rate": 1.2859589707656974e-05, "loss": 0.4298, "step": 190900 }, { "epoch": 7.43314259021371, "grad_norm": 16.88973045349121, "learning_rate": 1.2849857915839464e-05, "loss": 0.4648, "step": 190950 }, { "epoch": 7.435088948577212, "grad_norm": 0.6701951026916504, "learning_rate": 1.2840126124021956e-05, "loss": 0.3624, "step": 191000 }, { "epoch": 7.437035306940714, "grad_norm": 68.44085693359375, "learning_rate": 1.2830394332204445e-05, "loss": 0.4246, "step": 191050 }, { "epoch": 7.438981665304215, "grad_norm": 0.497310608625412, "learning_rate": 1.2820662540386937e-05, "loss": 0.3324, "step": 191100 }, { "epoch": 7.440928023667718, "grad_norm": 10.203828811645508, "learning_rate": 1.2810930748569428e-05, "loss": 0.3264, "step": 191150 }, { "epoch": 7.44287438203122, "grad_norm": 65.833251953125, "learning_rate": 1.2801198956751917e-05, "loss": 0.4685, "step": 191200 }, { "epoch": 7.4448207403947215, "grad_norm": 20.098148345947266, "learning_rate": 1.2791467164934409e-05, "loss": 0.4115, "step": 191250 }, { "epoch": 7.446767098758223, "grad_norm": 17.783605575561523, "learning_rate": 1.27817353731169e-05, "loss": 0.4841, "step": 191300 }, { "epoch": 7.448713457121725, "grad_norm": 24.287437438964844, "learning_rate": 1.2772003581299388e-05, "loss": 0.4325, "step": 191350 }, { "epoch": 7.450659815485227, "grad_norm": 58.30049514770508, "learning_rate": 1.276227178948188e-05, "loss": 0.3916, "step": 191400 }, { "epoch": 7.452606173848729, "grad_norm": 19.279565811157227, "learning_rate": 1.275253999766437e-05, "loss": 0.411, "step": 191450 }, { "epoch": 7.454552532212231, "grad_norm": 26.751937866210938, "learning_rate": 1.2742808205846863e-05, "loss": 0.3277, "step": 191500 }, { "epoch": 7.456498890575733, "grad_norm": 119.24364471435547, 
"learning_rate": 1.2733271049865703e-05, "loss": 0.3364, "step": 191550 }, { "epoch": 7.458445248939235, "grad_norm": 34.998775482177734, "learning_rate": 1.2723539258048192e-05, "loss": 0.4021, "step": 191600 }, { "epoch": 7.4603916073027365, "grad_norm": 593.449951171875, "learning_rate": 1.2713807466230681e-05, "loss": 0.3519, "step": 191650 }, { "epoch": 7.462337965666238, "grad_norm": 3.435620069503784, "learning_rate": 1.2704075674413175e-05, "loss": 0.3897, "step": 191700 }, { "epoch": 7.46428432402974, "grad_norm": 4.364711284637451, "learning_rate": 1.2694343882595664e-05, "loss": 0.4586, "step": 191750 }, { "epoch": 7.466230682393242, "grad_norm": 3.779606342315674, "learning_rate": 1.2684612090778156e-05, "loss": 0.391, "step": 191800 }, { "epoch": 7.4681770407567445, "grad_norm": 0.9348109364509583, "learning_rate": 1.2674880298960644e-05, "loss": 0.3847, "step": 191850 }, { "epoch": 7.470123399120246, "grad_norm": 55.43354034423828, "learning_rate": 1.2665148507143135e-05, "loss": 0.4589, "step": 191900 }, { "epoch": 7.472069757483748, "grad_norm": 2.38199520111084, "learning_rate": 1.2655416715325627e-05, "loss": 0.2922, "step": 191950 }, { "epoch": 7.47401611584725, "grad_norm": 185.17327880859375, "learning_rate": 1.2645684923508116e-05, "loss": 0.4715, "step": 192000 }, { "epoch": 7.4759624742107516, "grad_norm": 23.261920928955078, "learning_rate": 1.2635953131690608e-05, "loss": 0.4082, "step": 192050 }, { "epoch": 7.477908832574253, "grad_norm": 29.234046936035156, "learning_rate": 1.2626221339873099e-05, "loss": 0.4574, "step": 192100 }, { "epoch": 7.479855190937755, "grad_norm": 4.2683610916137695, "learning_rate": 1.2616489548055587e-05, "loss": 0.3809, "step": 192150 }, { "epoch": 7.481801549301258, "grad_norm": 29.76538848876953, "learning_rate": 1.260675775623808e-05, "loss": 0.5053, "step": 192200 }, { "epoch": 7.4837479076647595, "grad_norm": 0.5820913314819336, "learning_rate": 1.259702596442057e-05, "loss": 0.3172, "step": 192250 }, { 
"epoch": 7.485694266028261, "grad_norm": 1.3702598810195923, "learning_rate": 1.2587294172603059e-05, "loss": 0.3235, "step": 192300 }, { "epoch": 7.487640624391763, "grad_norm": 7.18239688873291, "learning_rate": 1.2577562380785551e-05, "loss": 0.3896, "step": 192350 }, { "epoch": 7.489586982755265, "grad_norm": 56.785133361816406, "learning_rate": 1.2567830588968042e-05, "loss": 0.4989, "step": 192400 }, { "epoch": 7.491533341118767, "grad_norm": 28.612390518188477, "learning_rate": 1.2558098797150534e-05, "loss": 0.45, "step": 192450 }, { "epoch": 7.493479699482268, "grad_norm": 12.122247695922852, "learning_rate": 1.2548367005333022e-05, "loss": 0.5404, "step": 192500 }, { "epoch": 7.495426057845771, "grad_norm": 40.39155578613281, "learning_rate": 1.2538829849351863e-05, "loss": 0.4258, "step": 192550 }, { "epoch": 7.497372416209273, "grad_norm": 11.01158332824707, "learning_rate": 1.2529098057534353e-05, "loss": 0.5214, "step": 192600 }, { "epoch": 7.4993187745727745, "grad_norm": 56.53664016723633, "learning_rate": 1.2519366265716846e-05, "loss": 0.4449, "step": 192650 }, { "epoch": 7.501265132936276, "grad_norm": 27.814659118652344, "learning_rate": 1.2509634473899334e-05, "loss": 0.4236, "step": 192700 }, { "epoch": 7.503211491299778, "grad_norm": 5.061644077301025, "learning_rate": 1.2499902682081825e-05, "loss": 0.4125, "step": 192750 }, { "epoch": 7.50515784966328, "grad_norm": 2.625807523727417, "learning_rate": 1.2490170890264315e-05, "loss": 0.3534, "step": 192800 }, { "epoch": 7.507104208026782, "grad_norm": 30.839242935180664, "learning_rate": 1.2480439098446807e-05, "loss": 0.4087, "step": 192850 }, { "epoch": 7.509050566390284, "grad_norm": 26.66027069091797, "learning_rate": 1.2470707306629298e-05, "loss": 0.4043, "step": 192900 }, { "epoch": 7.510996924753786, "grad_norm": 14.638086318969727, "learning_rate": 1.2460975514811787e-05, "loss": 0.3698, "step": 192950 }, { "epoch": 7.512943283117288, "grad_norm": 36.91025161743164, "learning_rate": 
1.2451243722994279e-05, "loss": 0.344, "step": 193000 }, { "epoch": 7.5148896414807895, "grad_norm": 53.20779037475586, "learning_rate": 1.244151193117677e-05, "loss": 0.4247, "step": 193050 }, { "epoch": 7.516835999844291, "grad_norm": 0.4034773111343384, "learning_rate": 1.243178013935926e-05, "loss": 0.3967, "step": 193100 }, { "epoch": 7.518782358207793, "grad_norm": 17.56313133239746, "learning_rate": 1.242204834754175e-05, "loss": 0.4333, "step": 193150 }, { "epoch": 7.520728716571295, "grad_norm": 12.84005069732666, "learning_rate": 1.2412316555724241e-05, "loss": 0.4203, "step": 193200 }, { "epoch": 7.5226750749347975, "grad_norm": 25.710418701171875, "learning_rate": 1.2402584763906731e-05, "loss": 0.3088, "step": 193250 }, { "epoch": 7.524621433298299, "grad_norm": 1.667039155960083, "learning_rate": 1.2392852972089222e-05, "loss": 0.3615, "step": 193300 }, { "epoch": 7.526567791661801, "grad_norm": 31.60535430908203, "learning_rate": 1.2383121180271712e-05, "loss": 0.4544, "step": 193350 }, { "epoch": 7.528514150025303, "grad_norm": 38.2503547668457, "learning_rate": 1.2373389388454203e-05, "loss": 0.4388, "step": 193400 }, { "epoch": 7.5304605083888045, "grad_norm": 44.81056594848633, "learning_rate": 1.2363657596636693e-05, "loss": 0.436, "step": 193450 }, { "epoch": 7.532406866752306, "grad_norm": 24.36215591430664, "learning_rate": 1.2353925804819184e-05, "loss": 0.3991, "step": 193500 }, { "epoch": 7.534353225115808, "grad_norm": 7.509572505950928, "learning_rate": 1.2344388648838024e-05, "loss": 0.4643, "step": 193550 }, { "epoch": 7.53629958347931, "grad_norm": 0.6520997881889343, "learning_rate": 1.2334656857020516e-05, "loss": 0.3265, "step": 193600 }, { "epoch": 7.538245941842812, "grad_norm": 33.840572357177734, "learning_rate": 1.2324925065203005e-05, "loss": 0.3945, "step": 193650 }, { "epoch": 7.540192300206314, "grad_norm": 38.76713943481445, "learning_rate": 1.2315193273385496e-05, "loss": 0.4175, "step": 193700 }, { "epoch": 
7.542138658569816, "grad_norm": 9.215863227844238, "learning_rate": 1.2305461481567988e-05, "loss": 0.4084, "step": 193750 }, { "epoch": 7.544085016933318, "grad_norm": 26.90289306640625, "learning_rate": 1.2295729689750478e-05, "loss": 0.3675, "step": 193800 }, { "epoch": 7.5460313752968196, "grad_norm": 10.263150215148926, "learning_rate": 1.2285997897932967e-05, "loss": 0.3777, "step": 193850 }, { "epoch": 7.547977733660321, "grad_norm": 21.98690414428711, "learning_rate": 1.2276266106115458e-05, "loss": 0.4322, "step": 193900 }, { "epoch": 7.549924092023823, "grad_norm": 14.788856506347656, "learning_rate": 1.226653431429795e-05, "loss": 0.3482, "step": 193950 }, { "epoch": 7.551870450387325, "grad_norm": 36.267826080322266, "learning_rate": 1.225680252248044e-05, "loss": 0.4929, "step": 194000 }, { "epoch": 7.5538168087508275, "grad_norm": 4.193339824676514, "learning_rate": 1.224707073066293e-05, "loss": 0.3895, "step": 194050 }, { "epoch": 7.555763167114329, "grad_norm": 12.389144897460938, "learning_rate": 1.2237338938845421e-05, "loss": 0.3923, "step": 194100 }, { "epoch": 7.557709525477831, "grad_norm": 84.3876724243164, "learning_rate": 1.2227607147027912e-05, "loss": 0.4322, "step": 194150 }, { "epoch": 7.559655883841333, "grad_norm": 15.501864433288574, "learning_rate": 1.2217875355210402e-05, "loss": 0.3357, "step": 194200 }, { "epoch": 7.561602242204835, "grad_norm": 39.80124282836914, "learning_rate": 1.2208143563392893e-05, "loss": 0.4399, "step": 194250 }, { "epoch": 7.563548600568336, "grad_norm": 17.584796905517578, "learning_rate": 1.2198411771575383e-05, "loss": 0.5053, "step": 194300 }, { "epoch": 7.565494958931838, "grad_norm": 14.90321159362793, "learning_rate": 1.2188679979757874e-05, "loss": 0.2591, "step": 194350 }, { "epoch": 7.567441317295341, "grad_norm": 49.947635650634766, "learning_rate": 1.2178948187940364e-05, "loss": 0.3722, "step": 194400 }, { "epoch": 7.5693876756588425, "grad_norm": 41.29994201660156, "learning_rate": 
1.2169216396122855e-05, "loss": 0.4562, "step": 194450 }, { "epoch": 7.571334034022344, "grad_norm": 54.0233154296875, "learning_rate": 1.2159484604305345e-05, "loss": 0.427, "step": 194500 }, { "epoch": 7.573280392385846, "grad_norm": 43.192771911621094, "learning_rate": 1.2149752812487836e-05, "loss": 0.4648, "step": 194550 }, { "epoch": 7.575226750749348, "grad_norm": 11.478323936462402, "learning_rate": 1.2140021020670326e-05, "loss": 0.3662, "step": 194600 }, { "epoch": 7.57717310911285, "grad_norm": 24.49593162536621, "learning_rate": 1.2130289228852818e-05, "loss": 0.3581, "step": 194650 }, { "epoch": 7.579119467476351, "grad_norm": 71.53462982177734, "learning_rate": 1.2120557437035307e-05, "loss": 0.4686, "step": 194700 }, { "epoch": 7.581065825839854, "grad_norm": 11.09736442565918, "learning_rate": 1.2110825645217797e-05, "loss": 0.3183, "step": 194750 }, { "epoch": 7.583012184203356, "grad_norm": 138.43569946289062, "learning_rate": 1.2101093853400288e-05, "loss": 0.2965, "step": 194800 }, { "epoch": 7.5849585425668575, "grad_norm": 4.429830074310303, "learning_rate": 1.209136206158278e-05, "loss": 0.3158, "step": 194850 }, { "epoch": 7.586904900930359, "grad_norm": 0.17773869633674622, "learning_rate": 1.2081630269765269e-05, "loss": 0.4561, "step": 194900 }, { "epoch": 7.588851259293861, "grad_norm": 290.0849609375, "learning_rate": 1.207189847794776e-05, "loss": 0.471, "step": 194950 }, { "epoch": 7.590797617657363, "grad_norm": 148.2359161376953, "learning_rate": 1.2062166686130252e-05, "loss": 0.4238, "step": 195000 }, { "epoch": 7.592743976020865, "grad_norm": 16.815574645996094, "learning_rate": 1.2052434894312742e-05, "loss": 0.4021, "step": 195050 }, { "epoch": 7.594690334384367, "grad_norm": 8.539803504943848, "learning_rate": 1.2042703102495231e-05, "loss": 0.3965, "step": 195100 }, { "epoch": 7.596636692747869, "grad_norm": 12.410489082336426, "learning_rate": 1.2032971310677723e-05, "loss": 0.4585, "step": 195150 }, { "epoch": 
7.598583051111371, "grad_norm": 37.39759063720703, "learning_rate": 1.2023239518860214e-05, "loss": 0.3727, "step": 195200 }, { "epoch": 7.6005294094748725, "grad_norm": 40.50080871582031, "learning_rate": 1.2013507727042704e-05, "loss": 0.4195, "step": 195250 }, { "epoch": 7.602475767838374, "grad_norm": 23.269105911254883, "learning_rate": 1.2003775935225193e-05, "loss": 0.3576, "step": 195300 }, { "epoch": 7.604422126201876, "grad_norm": 36.43637466430664, "learning_rate": 1.1994044143407685e-05, "loss": 0.4364, "step": 195350 }, { "epoch": 7.606368484565378, "grad_norm": 12.388874053955078, "learning_rate": 1.1984312351590175e-05, "loss": 0.3797, "step": 195400 }, { "epoch": 7.6083148429288805, "grad_norm": 0.14508052170276642, "learning_rate": 1.1974580559772666e-05, "loss": 0.3676, "step": 195450 }, { "epoch": 7.610261201292382, "grad_norm": 0.25158122181892395, "learning_rate": 1.1964848767955156e-05, "loss": 0.459, "step": 195500 }, { "epoch": 7.612207559655884, "grad_norm": 7.569283962249756, "learning_rate": 1.1955116976137647e-05, "loss": 0.3211, "step": 195550 }, { "epoch": 7.614153918019386, "grad_norm": 165.2548370361328, "learning_rate": 1.1945385184320137e-05, "loss": 0.4353, "step": 195600 }, { "epoch": 7.616100276382888, "grad_norm": 22.94586753845215, "learning_rate": 1.1935653392502628e-05, "loss": 0.4086, "step": 195650 }, { "epoch": 7.618046634746389, "grad_norm": 9.106801986694336, "learning_rate": 1.192592160068512e-05, "loss": 0.4119, "step": 195700 }, { "epoch": 7.619992993109891, "grad_norm": 24.481117248535156, "learning_rate": 1.1916189808867609e-05, "loss": 0.5111, "step": 195750 }, { "epoch": 7.621939351473394, "grad_norm": 53.41339111328125, "learning_rate": 1.19064580170501e-05, "loss": 0.3464, "step": 195800 }, { "epoch": 7.6238857098368955, "grad_norm": 0.35791051387786865, "learning_rate": 1.189672622523259e-05, "loss": 0.3432, "step": 195850 }, { "epoch": 7.625832068200397, "grad_norm": 99.39058685302734, "learning_rate": 
1.1886994433415082e-05, "loss": 0.3677, "step": 195900 }, { "epoch": 7.627778426563899, "grad_norm": 19.86051368713379, "learning_rate": 1.187726264159757e-05, "loss": 0.5236, "step": 195950 }, { "epoch": 7.629724784927401, "grad_norm": 138.26278686523438, "learning_rate": 1.1867530849780061e-05, "loss": 0.4224, "step": 196000 }, { "epoch": 7.631671143290903, "grad_norm": 5.800327301025391, "learning_rate": 1.1857799057962553e-05, "loss": 0.4261, "step": 196050 }, { "epoch": 7.633617501654404, "grad_norm": 1.6222518682479858, "learning_rate": 1.1848067266145044e-05, "loss": 0.488, "step": 196100 }, { "epoch": 7.635563860017907, "grad_norm": 84.01456451416016, "learning_rate": 1.1838335474327533e-05, "loss": 0.4498, "step": 196150 }, { "epoch": 7.637510218381409, "grad_norm": 2.419691801071167, "learning_rate": 1.1828603682510025e-05, "loss": 0.3457, "step": 196200 }, { "epoch": 7.6394565767449105, "grad_norm": 4.687898635864258, "learning_rate": 1.1818871890692515e-05, "loss": 0.2926, "step": 196250 }, { "epoch": 7.641402935108412, "grad_norm": 34.30110549926758, "learning_rate": 1.1809140098875006e-05, "loss": 0.4432, "step": 196300 }, { "epoch": 7.643349293471914, "grad_norm": 26.789134979248047, "learning_rate": 1.1799408307057495e-05, "loss": 0.3441, "step": 196350 }, { "epoch": 7.645295651835416, "grad_norm": 109.98148345947266, "learning_rate": 1.1789676515239987e-05, "loss": 0.4433, "step": 196400 }, { "epoch": 7.647242010198918, "grad_norm": 26.97587013244629, "learning_rate": 1.1779944723422477e-05, "loss": 0.5078, "step": 196450 }, { "epoch": 7.649188368562419, "grad_norm": 0.2268209010362625, "learning_rate": 1.1770212931604968e-05, "loss": 0.4059, "step": 196500 }, { "epoch": 7.651134726925921, "grad_norm": 63.31412887573242, "learning_rate": 1.1760481139787458e-05, "loss": 0.552, "step": 196550 }, { "epoch": 7.653081085289424, "grad_norm": 0.2538148760795593, "learning_rate": 1.1750749347969949e-05, "loss": 0.4081, "step": 196600 }, { "epoch": 
7.6550274436529255, "grad_norm": 16.11227035522461, "learning_rate": 1.174101755615244e-05, "loss": 0.2997, "step": 196650 }, { "epoch": 7.656973802016427, "grad_norm": 22.72163963317871, "learning_rate": 1.173128576433493e-05, "loss": 0.4428, "step": 196700 }, { "epoch": 7.658920160379929, "grad_norm": 52.86534118652344, "learning_rate": 1.1721553972517422e-05, "loss": 0.4489, "step": 196750 }, { "epoch": 7.660866518743431, "grad_norm": 43.2994384765625, "learning_rate": 1.171182218069991e-05, "loss": 0.4617, "step": 196800 }, { "epoch": 7.662812877106933, "grad_norm": 33.574405670166016, "learning_rate": 1.1702090388882401e-05, "loss": 0.3464, "step": 196850 }, { "epoch": 7.664759235470434, "grad_norm": 23.53083038330078, "learning_rate": 1.1692358597064892e-05, "loss": 0.4578, "step": 196900 }, { "epoch": 7.666705593833937, "grad_norm": 54.79582214355469, "learning_rate": 1.1682626805247384e-05, "loss": 0.4185, "step": 196950 }, { "epoch": 7.668651952197439, "grad_norm": 3.413828134536743, "learning_rate": 1.1672895013429873e-05, "loss": 0.4231, "step": 197000 }, { "epoch": 7.6705983105609405, "grad_norm": 52.55984115600586, "learning_rate": 1.1663357857448715e-05, "loss": 0.4675, "step": 197050 }, { "epoch": 7.672544668924442, "grad_norm": 5.365964889526367, "learning_rate": 1.1653626065631204e-05, "loss": 0.3557, "step": 197100 }, { "epoch": 7.674491027287944, "grad_norm": 39.04832077026367, "learning_rate": 1.1643894273813696e-05, "loss": 0.3549, "step": 197150 }, { "epoch": 7.676437385651446, "grad_norm": 24.280200958251953, "learning_rate": 1.1634162481996186e-05, "loss": 0.4082, "step": 197200 }, { "epoch": 7.678383744014948, "grad_norm": 9.341414451599121, "learning_rate": 1.1624430690178677e-05, "loss": 0.3827, "step": 197250 }, { "epoch": 7.68033010237845, "grad_norm": 54.29928970336914, "learning_rate": 1.1614698898361166e-05, "loss": 0.4523, "step": 197300 }, { "epoch": 7.682276460741952, "grad_norm": 0.6860129833221436, "learning_rate": 
1.1604967106543658e-05, "loss": 0.4533, "step": 197350 }, { "epoch": 7.684222819105454, "grad_norm": 54.9898796081543, "learning_rate": 1.1595235314726148e-05, "loss": 0.4723, "step": 197400 }, { "epoch": 7.686169177468956, "grad_norm": 0.4962465167045593, "learning_rate": 1.1585503522908639e-05, "loss": 0.4551, "step": 197450 }, { "epoch": 7.688115535832457, "grad_norm": 0.8603480458259583, "learning_rate": 1.1575771731091129e-05, "loss": 0.3405, "step": 197500 }, { "epoch": 7.690061894195959, "grad_norm": 20.470951080322266, "learning_rate": 1.156603993927362e-05, "loss": 0.4331, "step": 197550 }, { "epoch": 7.692008252559461, "grad_norm": 12.106364250183105, "learning_rate": 1.155630814745611e-05, "loss": 0.3492, "step": 197600 }, { "epoch": 7.6939546109229635, "grad_norm": 5.039571762084961, "learning_rate": 1.15465763556386e-05, "loss": 0.3899, "step": 197650 }, { "epoch": 7.695900969286465, "grad_norm": 97.04281616210938, "learning_rate": 1.1536844563821091e-05, "loss": 0.3882, "step": 197700 }, { "epoch": 7.697847327649967, "grad_norm": 5.4996657371521, "learning_rate": 1.1527112772003582e-05, "loss": 0.3849, "step": 197750 }, { "epoch": 7.699793686013469, "grad_norm": 53.18754196166992, "learning_rate": 1.1517380980186072e-05, "loss": 0.4058, "step": 197800 }, { "epoch": 7.701740044376971, "grad_norm": 5.751142978668213, "learning_rate": 1.1507649188368563e-05, "loss": 0.3583, "step": 197850 }, { "epoch": 7.703686402740472, "grad_norm": 2.913818597793579, "learning_rate": 1.1497917396551055e-05, "loss": 0.4445, "step": 197900 }, { "epoch": 7.705632761103974, "grad_norm": 3.853438377380371, "learning_rate": 1.1488185604733544e-05, "loss": 0.2887, "step": 197950 }, { "epoch": 7.707579119467477, "grad_norm": 21.59657096862793, "learning_rate": 1.1478453812916034e-05, "loss": 0.4016, "step": 198000 }, { "epoch": 7.7095254778309785, "grad_norm": 11.333215713500977, "learning_rate": 1.1468722021098526e-05, "loss": 0.4108, "step": 198050 }, { "epoch": 
7.71147183619448, "grad_norm": 1264.625732421875, "learning_rate": 1.1458990229281017e-05, "loss": 0.3648, "step": 198100 }, { "epoch": 7.713418194557982, "grad_norm": 8.435986518859863, "learning_rate": 1.1449258437463505e-05, "loss": 0.3839, "step": 198150 }, { "epoch": 7.715364552921484, "grad_norm": 15.1345853805542, "learning_rate": 1.1439526645645998e-05, "loss": 0.58, "step": 198200 }, { "epoch": 7.717310911284986, "grad_norm": 0.4895493984222412, "learning_rate": 1.1429794853828488e-05, "loss": 0.3914, "step": 198250 }, { "epoch": 7.719257269648487, "grad_norm": 2.9578657150268555, "learning_rate": 1.1420063062010979e-05, "loss": 0.3956, "step": 198300 }, { "epoch": 7.72120362801199, "grad_norm": 10.801392555236816, "learning_rate": 1.1410331270193467e-05, "loss": 0.384, "step": 198350 }, { "epoch": 7.723149986375492, "grad_norm": 0.2327081561088562, "learning_rate": 1.140059947837596e-05, "loss": 0.3803, "step": 198400 }, { "epoch": 7.7250963447389935, "grad_norm": 5.7982177734375, "learning_rate": 1.139086768655845e-05, "loss": 0.388, "step": 198450 }, { "epoch": 7.727042703102495, "grad_norm": 23.801372528076172, "learning_rate": 1.138113589474094e-05, "loss": 0.4608, "step": 198500 }, { "epoch": 7.728989061465997, "grad_norm": 50.89640808105469, "learning_rate": 1.1371404102923431e-05, "loss": 0.5263, "step": 198550 }, { "epoch": 7.730935419829499, "grad_norm": 60.11725616455078, "learning_rate": 1.1361672311105922e-05, "loss": 0.427, "step": 198600 }, { "epoch": 7.732881778193001, "grad_norm": 1.293814778327942, "learning_rate": 1.1351940519288412e-05, "loss": 0.4547, "step": 198650 }, { "epoch": 7.734828136556503, "grad_norm": 90.57634735107422, "learning_rate": 1.1342208727470902e-05, "loss": 0.3351, "step": 198700 }, { "epoch": 7.736774494920005, "grad_norm": 8.582649230957031, "learning_rate": 1.1332476935653393e-05, "loss": 0.5329, "step": 198750 }, { "epoch": 7.738720853283507, "grad_norm": 81.3638916015625, "learning_rate": 
1.1322745143835883e-05, "loss": 0.3995, "step": 198800 }, { "epoch": 7.7406672116470085, "grad_norm": 8.912217140197754, "learning_rate": 1.1313013352018374e-05, "loss": 0.3576, "step": 198850 }, { "epoch": 7.74261357001051, "grad_norm": 11.14588737487793, "learning_rate": 1.1303281560200864e-05, "loss": 0.3799, "step": 198900 }, { "epoch": 7.744559928374012, "grad_norm": 58.205955505371094, "learning_rate": 1.1293549768383355e-05, "loss": 0.4596, "step": 198950 }, { "epoch": 7.746506286737514, "grad_norm": 28.86973762512207, "learning_rate": 1.1283817976565845e-05, "loss": 0.4365, "step": 199000 }, { "epoch": 7.748452645101016, "grad_norm": 55.369319915771484, "learning_rate": 1.1274086184748336e-05, "loss": 0.3976, "step": 199050 }, { "epoch": 7.750399003464518, "grad_norm": 24.391925811767578, "learning_rate": 1.1264354392930828e-05, "loss": 0.4718, "step": 199100 }, { "epoch": 7.75234536182802, "grad_norm": 1.9247472286224365, "learning_rate": 1.1254622601113317e-05, "loss": 0.4834, "step": 199150 }, { "epoch": 7.754291720191522, "grad_norm": 118.99787139892578, "learning_rate": 1.1244890809295807e-05, "loss": 0.4069, "step": 199200 }, { "epoch": 7.756238078555024, "grad_norm": 9.369443893432617, "learning_rate": 1.1235159017478298e-05, "loss": 0.3136, "step": 199250 }, { "epoch": 7.758184436918525, "grad_norm": 14.563655853271484, "learning_rate": 1.122542722566079e-05, "loss": 0.3865, "step": 199300 }, { "epoch": 7.760130795282027, "grad_norm": 65.21219635009766, "learning_rate": 1.121569543384328e-05, "loss": 0.3665, "step": 199350 }, { "epoch": 7.762077153645529, "grad_norm": 15.834604263305664, "learning_rate": 1.120596364202577e-05, "loss": 0.396, "step": 199400 }, { "epoch": 7.764023512009031, "grad_norm": 13.643835067749023, "learning_rate": 1.1196231850208261e-05, "loss": 0.3425, "step": 199450 }, { "epoch": 7.765969870372533, "grad_norm": 34.831871032714844, "learning_rate": 1.1186500058390752e-05, "loss": 0.4292, "step": 199500 }, { "epoch": 
7.767916228736035, "grad_norm": 33.96308517456055, "learning_rate": 1.1176768266573242e-05, "loss": 0.3905, "step": 199550 }, { "epoch": 7.769862587099537, "grad_norm": 65.32780456542969, "learning_rate": 1.1167036474755733e-05, "loss": 0.3237, "step": 199600 }, { "epoch": 7.771808945463039, "grad_norm": 71.80028533935547, "learning_rate": 1.1157304682938223e-05, "loss": 0.3175, "step": 199650 }, { "epoch": 7.77375530382654, "grad_norm": 72.10408782958984, "learning_rate": 1.1147572891120714e-05, "loss": 0.3276, "step": 199700 }, { "epoch": 7.775701662190042, "grad_norm": 2.7103981971740723, "learning_rate": 1.1137841099303204e-05, "loss": 0.4466, "step": 199750 }, { "epoch": 7.777648020553544, "grad_norm": 0.21312932670116425, "learning_rate": 1.1128109307485695e-05, "loss": 0.3843, "step": 199800 }, { "epoch": 7.7795943789170465, "grad_norm": 18.886823654174805, "learning_rate": 1.1118377515668185e-05, "loss": 0.3563, "step": 199850 }, { "epoch": 7.781540737280548, "grad_norm": 34.071533203125, "learning_rate": 1.1108645723850676e-05, "loss": 0.4887, "step": 199900 }, { "epoch": 7.78348709564405, "grad_norm": 54.07440948486328, "learning_rate": 1.1098913932033166e-05, "loss": 0.4104, "step": 199950 }, { "epoch": 7.785433454007552, "grad_norm": 21.006654739379883, "learning_rate": 1.1089182140215657e-05, "loss": 0.423, "step": 200000 }, { "epoch": 7.787379812371054, "grad_norm": 69.35192108154297, "learning_rate": 1.1079450348398147e-05, "loss": 0.4564, "step": 200050 }, { "epoch": 7.789326170734555, "grad_norm": 148.49676513671875, "learning_rate": 1.1069718556580638e-05, "loss": 0.3803, "step": 200100 }, { "epoch": 7.791272529098057, "grad_norm": 7.674904823303223, "learning_rate": 1.105998676476313e-05, "loss": 0.3813, "step": 200150 }, { "epoch": 7.79321888746156, "grad_norm": 39.11200714111328, "learning_rate": 1.1050254972945619e-05, "loss": 0.3376, "step": 200200 }, { "epoch": 7.7951652458250615, "grad_norm": 15.698495864868164, "learning_rate": 
1.104052318112811e-05, "loss": 0.4442, "step": 200250 }, { "epoch": 7.797111604188563, "grad_norm": 0.2548670768737793, "learning_rate": 1.10307913893106e-05, "loss": 0.3605, "step": 200300 }, { "epoch": 7.799057962552065, "grad_norm": 52.73774337768555, "learning_rate": 1.1021059597493092e-05, "loss": 0.396, "step": 200350 }, { "epoch": 7.801004320915567, "grad_norm": 36.18342971801758, "learning_rate": 1.101132780567558e-05, "loss": 0.4458, "step": 200400 }, { "epoch": 7.802950679279069, "grad_norm": 7.056611061096191, "learning_rate": 1.1001596013858071e-05, "loss": 0.3447, "step": 200450 }, { "epoch": 7.80489703764257, "grad_norm": 21.030773162841797, "learning_rate": 1.0991864222040563e-05, "loss": 0.415, "step": 200500 }, { "epoch": 7.806843396006073, "grad_norm": 38.261234283447266, "learning_rate": 1.0982132430223054e-05, "loss": 0.4713, "step": 200550 }, { "epoch": 7.808789754369575, "grad_norm": 1.045000433921814, "learning_rate": 1.0972400638405544e-05, "loss": 0.3746, "step": 200600 }, { "epoch": 7.8107361127330766, "grad_norm": 36.07394790649414, "learning_rate": 1.0962668846588035e-05, "loss": 0.3246, "step": 200650 }, { "epoch": 7.812682471096578, "grad_norm": 1.2884905338287354, "learning_rate": 1.0952937054770525e-05, "loss": 0.4227, "step": 200700 }, { "epoch": 7.81462882946008, "grad_norm": 41.651248931884766, "learning_rate": 1.0943205262953016e-05, "loss": 0.4099, "step": 200750 }, { "epoch": 7.816575187823582, "grad_norm": 5.3015546798706055, "learning_rate": 1.0933473471135506e-05, "loss": 0.3679, "step": 200800 }, { "epoch": 7.818521546187084, "grad_norm": 163.08164978027344, "learning_rate": 1.0923741679317997e-05, "loss": 0.4787, "step": 200850 }, { "epoch": 7.820467904550586, "grad_norm": 50.48542404174805, "learning_rate": 1.0914009887500487e-05, "loss": 0.4286, "step": 200900 }, { "epoch": 7.822414262914088, "grad_norm": 9.718600273132324, "learning_rate": 1.0904278095682978e-05, "loss": 0.3808, "step": 200950 }, { "epoch": 
7.82436062127759, "grad_norm": 1.3452165126800537, "learning_rate": 1.0894546303865468e-05, "loss": 0.3959, "step": 201000 }, { "epoch": 7.826306979641092, "grad_norm": 0.09584139287471771, "learning_rate": 1.0884814512047959e-05, "loss": 0.3455, "step": 201050 }, { "epoch": 7.828253338004593, "grad_norm": 8.462895393371582, "learning_rate": 1.0875082720230449e-05, "loss": 0.3884, "step": 201100 }, { "epoch": 7.830199696368095, "grad_norm": 2.560997247695923, "learning_rate": 1.086535092841294e-05, "loss": 0.4023, "step": 201150 }, { "epoch": 7.832146054731597, "grad_norm": 4.037003993988037, "learning_rate": 1.0855619136595432e-05, "loss": 0.4133, "step": 201200 }, { "epoch": 7.8340924130950995, "grad_norm": 7.236379146575928, "learning_rate": 1.084588734477792e-05, "loss": 0.4815, "step": 201250 }, { "epoch": 7.836038771458601, "grad_norm": 29.818082809448242, "learning_rate": 1.0836155552960411e-05, "loss": 0.3298, "step": 201300 }, { "epoch": 7.837985129822103, "grad_norm": 23.262510299682617, "learning_rate": 1.0826423761142902e-05, "loss": 0.4498, "step": 201350 }, { "epoch": 7.839931488185605, "grad_norm": 21.815969467163086, "learning_rate": 1.0816691969325394e-05, "loss": 0.4967, "step": 201400 }, { "epoch": 7.841877846549107, "grad_norm": 36.90623474121094, "learning_rate": 1.0806960177507883e-05, "loss": 0.2806, "step": 201450 }, { "epoch": 7.843824204912608, "grad_norm": 48.93798828125, "learning_rate": 1.0797228385690373e-05, "loss": 0.4896, "step": 201500 }, { "epoch": 7.84577056327611, "grad_norm": 1.7547160387039185, "learning_rate": 1.0787496593872865e-05, "loss": 0.3939, "step": 201550 }, { "epoch": 7.847716921639613, "grad_norm": 1.0092178583145142, "learning_rate": 1.0777764802055356e-05, "loss": 0.2706, "step": 201600 }, { "epoch": 7.8496632800031145, "grad_norm": 6.940141677856445, "learning_rate": 1.0768033010237844e-05, "loss": 0.3741, "step": 201650 }, { "epoch": 7.851609638366616, "grad_norm": 26.16375160217285, "learning_rate": 
1.0758301218420335e-05, "loss": 0.4209, "step": 201700 }, { "epoch": 7.853555996730118, "grad_norm": 138.69476318359375, "learning_rate": 1.0748569426602827e-05, "loss": 0.3825, "step": 201750 }, { "epoch": 7.85550235509362, "grad_norm": 10.950705528259277, "learning_rate": 1.0738837634785318e-05, "loss": 0.374, "step": 201800 }, { "epoch": 7.857448713457122, "grad_norm": 29.628215789794922, "learning_rate": 1.0729105842967806e-05, "loss": 0.446, "step": 201850 }, { "epoch": 7.859395071820623, "grad_norm": 12.16657829284668, "learning_rate": 1.0719374051150299e-05, "loss": 0.3275, "step": 201900 }, { "epoch": 7.861341430184125, "grad_norm": 6.357035160064697, "learning_rate": 1.0709642259332789e-05, "loss": 0.4805, "step": 201950 }, { "epoch": 7.863287788547628, "grad_norm": 32.60056686401367, "learning_rate": 1.069991046751528e-05, "loss": 0.3873, "step": 202000 }, { "epoch": 7.8652341469111295, "grad_norm": 53.584842681884766, "learning_rate": 1.069017867569777e-05, "loss": 0.4723, "step": 202050 }, { "epoch": 7.867180505274631, "grad_norm": 33.86785125732422, "learning_rate": 1.068044688388026e-05, "loss": 0.4567, "step": 202100 }, { "epoch": 7.869126863638133, "grad_norm": 22.454679489135742, "learning_rate": 1.0670715092062751e-05, "loss": 0.4006, "step": 202150 }, { "epoch": 7.871073222001635, "grad_norm": 47.98367691040039, "learning_rate": 1.0660983300245242e-05, "loss": 0.4864, "step": 202200 }, { "epoch": 7.873019580365137, "grad_norm": 1.4539780616760254, "learning_rate": 1.0651251508427732e-05, "loss": 0.3488, "step": 202250 }, { "epoch": 7.874965938728638, "grad_norm": 0.49950698018074036, "learning_rate": 1.0641519716610222e-05, "loss": 0.4148, "step": 202300 }, { "epoch": 7.87691229709214, "grad_norm": 55.633934020996094, "learning_rate": 1.0631787924792713e-05, "loss": 0.4412, "step": 202350 }, { "epoch": 7.878858655455643, "grad_norm": 4.395534992218018, "learning_rate": 1.0622056132975203e-05, "loss": 0.3151, "step": 202400 }, { "epoch": 
7.8808050138191446, "grad_norm": 21.725080490112305, "learning_rate": 1.0612324341157696e-05, "loss": 0.3338, "step": 202450 }, { "epoch": 7.882751372182646, "grad_norm": 72.32022094726562, "learning_rate": 1.0602592549340184e-05, "loss": 0.3813, "step": 202500 }, { "epoch": 7.884697730546148, "grad_norm": 8.84114933013916, "learning_rate": 1.0592860757522675e-05, "loss": 0.2553, "step": 202550 }, { "epoch": 7.88664408890965, "grad_norm": 152.07254028320312, "learning_rate": 1.0583128965705167e-05, "loss": 0.256, "step": 202600 }, { "epoch": 7.888590447273152, "grad_norm": 156.19683837890625, "learning_rate": 1.0573397173887658e-05, "loss": 0.3554, "step": 202650 }, { "epoch": 7.890536805636653, "grad_norm": 36.649436950683594, "learning_rate": 1.0563665382070146e-05, "loss": 0.5795, "step": 202700 }, { "epoch": 7.892483164000156, "grad_norm": 25.40126609802246, "learning_rate": 1.0553933590252637e-05, "loss": 0.3129, "step": 202750 }, { "epoch": 7.894429522363658, "grad_norm": 98.39879608154297, "learning_rate": 1.0544201798435129e-05, "loss": 0.3341, "step": 202800 }, { "epoch": 7.89637588072716, "grad_norm": 47.598323822021484, "learning_rate": 1.053447000661762e-05, "loss": 0.4855, "step": 202850 }, { "epoch": 7.898322239090661, "grad_norm": 0.4219442903995514, "learning_rate": 1.0524738214800108e-05, "loss": 0.3837, "step": 202900 }, { "epoch": 7.900268597454163, "grad_norm": 0.18401573598384857, "learning_rate": 1.05150064229826e-05, "loss": 0.4368, "step": 202950 }, { "epoch": 7.902214955817665, "grad_norm": 39.588741302490234, "learning_rate": 1.0505274631165091e-05, "loss": 0.4759, "step": 203000 }, { "epoch": 7.904161314181167, "grad_norm": 15.19081974029541, "learning_rate": 1.0495542839347581e-05, "loss": 0.2891, "step": 203050 }, { "epoch": 7.906107672544669, "grad_norm": 82.86119079589844, "learning_rate": 1.0485811047530072e-05, "loss": 0.3954, "step": 203100 }, { "epoch": 7.908054030908171, "grad_norm": 24.2208251953125, "learning_rate": 
1.0476079255712562e-05, "loss": 0.517, "step": 203150 }, { "epoch": 7.910000389271673, "grad_norm": 5.304823875427246, "learning_rate": 1.0466347463895053e-05, "loss": 0.4321, "step": 203200 }, { "epoch": 7.911946747635175, "grad_norm": 23.053625106811523, "learning_rate": 1.0456615672077543e-05, "loss": 0.4216, "step": 203250 }, { "epoch": 7.913893105998676, "grad_norm": 60.08042526245117, "learning_rate": 1.0446883880260034e-05, "loss": 0.4335, "step": 203300 }, { "epoch": 7.915839464362178, "grad_norm": 4.211606502532959, "learning_rate": 1.0437152088442524e-05, "loss": 0.3922, "step": 203350 }, { "epoch": 7.91778582272568, "grad_norm": 9.731739044189453, "learning_rate": 1.0427420296625015e-05, "loss": 0.4421, "step": 203400 }, { "epoch": 7.9197321810891825, "grad_norm": 1.461081862449646, "learning_rate": 1.0417688504807505e-05, "loss": 0.3231, "step": 203450 }, { "epoch": 7.921678539452684, "grad_norm": 36.696170806884766, "learning_rate": 1.0407956712989998e-05, "loss": 0.3565, "step": 203500 }, { "epoch": 7.923624897816186, "grad_norm": 10.428070068359375, "learning_rate": 1.0398224921172486e-05, "loss": 0.329, "step": 203550 }, { "epoch": 7.925571256179688, "grad_norm": 46.56930160522461, "learning_rate": 1.0388493129354977e-05, "loss": 0.3746, "step": 203600 }, { "epoch": 7.92751761454319, "grad_norm": 24.959083557128906, "learning_rate": 1.0378761337537469e-05, "loss": 0.398, "step": 203650 }, { "epoch": 7.929463972906691, "grad_norm": 45.63177490234375, "learning_rate": 1.036902954571996e-05, "loss": 0.4474, "step": 203700 }, { "epoch": 7.931410331270193, "grad_norm": 15.986963272094727, "learning_rate": 1.0359297753902448e-05, "loss": 0.3797, "step": 203750 }, { "epoch": 7.933356689633696, "grad_norm": 7.784165382385254, "learning_rate": 1.0349565962084939e-05, "loss": 0.3274, "step": 203800 }, { "epoch": 7.9353030479971975, "grad_norm": 1.6878647804260254, "learning_rate": 1.0339834170267431e-05, "loss": 0.39, "step": 203850 }, { "epoch": 
7.937249406360699, "grad_norm": 12.919089317321777, "learning_rate": 1.0330102378449921e-05, "loss": 0.385, "step": 203900 }, { "epoch": 7.939195764724201, "grad_norm": 20.16663360595703, "learning_rate": 1.032037058663241e-05, "loss": 0.4104, "step": 203950 }, { "epoch": 7.941142123087703, "grad_norm": 32.20284652709961, "learning_rate": 1.0310638794814902e-05, "loss": 0.4716, "step": 204000 }, { "epoch": 7.943088481451205, "grad_norm": 4.086779594421387, "learning_rate": 1.0300907002997393e-05, "loss": 0.377, "step": 204050 }, { "epoch": 7.945034839814706, "grad_norm": 0.6102976202964783, "learning_rate": 1.0291175211179883e-05, "loss": 0.3839, "step": 204100 }, { "epoch": 7.946981198178209, "grad_norm": 127.1929931640625, "learning_rate": 1.0281443419362372e-05, "loss": 0.4365, "step": 204150 }, { "epoch": 7.948927556541711, "grad_norm": 9.6278715133667, "learning_rate": 1.0271711627544864e-05, "loss": 0.4491, "step": 204200 }, { "epoch": 7.950873914905213, "grad_norm": 62.74033737182617, "learning_rate": 1.0261979835727355e-05, "loss": 0.3079, "step": 204250 }, { "epoch": 7.952820273268714, "grad_norm": 5.622780799865723, "learning_rate": 1.0252248043909845e-05, "loss": 0.3093, "step": 204300 }, { "epoch": 7.954766631632216, "grad_norm": 34.50833511352539, "learning_rate": 1.0242516252092336e-05, "loss": 0.4271, "step": 204350 }, { "epoch": 7.956712989995718, "grad_norm": 8.31539249420166, "learning_rate": 1.0232784460274826e-05, "loss": 0.3615, "step": 204400 }, { "epoch": 7.95865934835922, "grad_norm": 40.227901458740234, "learning_rate": 1.0223052668457317e-05, "loss": 0.2974, "step": 204450 }, { "epoch": 7.960605706722722, "grad_norm": 11.37437915802002, "learning_rate": 1.0213320876639807e-05, "loss": 0.4998, "step": 204500 }, { "epoch": 7.962552065086224, "grad_norm": 0.8098180294036865, "learning_rate": 1.0203589084822298e-05, "loss": 0.3205, "step": 204550 }, { "epoch": 7.964498423449726, "grad_norm": 10.22897720336914, "learning_rate": 
1.0193857293004788e-05, "loss": 0.4607, "step": 204600 }, { "epoch": 7.966444781813228, "grad_norm": 31.568525314331055, "learning_rate": 1.0184125501187279e-05, "loss": 0.4831, "step": 204650 }, { "epoch": 7.968391140176729, "grad_norm": 14.73270034790039, "learning_rate": 1.017439370936977e-05, "loss": 0.382, "step": 204700 }, { "epoch": 7.970337498540231, "grad_norm": 77.84832000732422, "learning_rate": 1.0164661917552261e-05, "loss": 0.3496, "step": 204750 }, { "epoch": 7.972283856903733, "grad_norm": 7.671836853027344, "learning_rate": 1.015493012573475e-05, "loss": 0.5164, "step": 204800 }, { "epoch": 7.974230215267235, "grad_norm": 16.1517391204834, "learning_rate": 1.014519833391724e-05, "loss": 0.4895, "step": 204850 }, { "epoch": 7.976176573630736, "grad_norm": 4.2142839431762695, "learning_rate": 1.0135466542099733e-05, "loss": 0.3646, "step": 204900 }, { "epoch": 7.978122931994239, "grad_norm": 95.8125228881836, "learning_rate": 1.0125734750282223e-05, "loss": 0.4066, "step": 204950 }, { "epoch": 7.980069290357741, "grad_norm": 7.96625280380249, "learning_rate": 1.0116197594301064e-05, "loss": 0.3783, "step": 205000 }, { "epoch": 7.982015648721243, "grad_norm": 26.06406021118164, "learning_rate": 1.0106465802483554e-05, "loss": 0.4439, "step": 205050 }, { "epoch": 7.983962007084744, "grad_norm": 7.118045330047607, "learning_rate": 1.0096734010666045e-05, "loss": 0.4087, "step": 205100 }, { "epoch": 7.985908365448246, "grad_norm": 2.2053661346435547, "learning_rate": 1.0087002218848535e-05, "loss": 0.3754, "step": 205150 }, { "epoch": 7.987854723811748, "grad_norm": 59.032711029052734, "learning_rate": 1.0077270427031026e-05, "loss": 0.4278, "step": 205200 }, { "epoch": 7.98980108217525, "grad_norm": 67.55097198486328, "learning_rate": 1.0067538635213516e-05, "loss": 0.4424, "step": 205250 }, { "epoch": 7.991747440538752, "grad_norm": 74.5265884399414, "learning_rate": 1.0057806843396007e-05, "loss": 0.4863, "step": 205300 }, { "epoch": 
7.993693798902254, "grad_norm": 5.816012859344482, "learning_rate": 1.0048075051578497e-05, "loss": 0.4241, "step": 205350 }, { "epoch": 7.995640157265756, "grad_norm": 20.300312042236328, "learning_rate": 1.0038343259760988e-05, "loss": 0.44, "step": 205400 }, { "epoch": 7.997586515629258, "grad_norm": 53.80659103393555, "learning_rate": 1.0028611467943478e-05, "loss": 0.4022, "step": 205450 }, { "epoch": 7.999532873992759, "grad_norm": 12.25969409942627, "learning_rate": 1.0018879676125968e-05, "loss": 0.3398, "step": 205500 }, { "epoch": 8.0, "eval_accuracy": 0.8026003347736386, "eval_f1_macro": 0.7593482845398875, "eval_f1_weighted": 0.8003979335254302, "eval_loss": 0.7740474343299866, "eval_roc_auc": 0.9516197368626681, "eval_runtime": 29.1169, "eval_samples_per_second": 882.272, "eval_steps_per_second": 110.314, "step": 205512 }, { "epoch": 8.001479232356262, "grad_norm": 26.278038024902344, "learning_rate": 1.0009147884308459e-05, "loss": 0.2967, "step": 205550 }, { "epoch": 8.003425590719763, "grad_norm": 24.645666122436523, "learning_rate": 9.99941609249095e-06, "loss": 0.3282, "step": 205600 }, { "epoch": 8.005371949083266, "grad_norm": 11.696399688720703, "learning_rate": 9.989684300673442e-06, "loss": 0.282, "step": 205650 }, { "epoch": 8.007318307446766, "grad_norm": 18.27579116821289, "learning_rate": 9.97995250885593e-06, "loss": 0.4308, "step": 205700 }, { "epoch": 8.009264665810269, "grad_norm": 2.619457721710205, "learning_rate": 9.970220717038421e-06, "loss": 0.3897, "step": 205750 }, { "epoch": 8.011211024173772, "grad_norm": 19.732501983642578, "learning_rate": 9.960488925220911e-06, "loss": 0.3836, "step": 205800 }, { "epoch": 8.013157382537273, "grad_norm": 14.662297248840332, "learning_rate": 9.950757133403404e-06, "loss": 0.36, "step": 205850 }, { "epoch": 8.015103740900775, "grad_norm": 19.679969787597656, "learning_rate": 9.941025341585894e-06, "loss": 0.4385, "step": 205900 }, { "epoch": 8.017050099264276, "grad_norm": 
0.6841996312141418, "learning_rate": 9.931293549768383e-06, "loss": 0.332, "step": 205950 }, { "epoch": 8.018996457627779, "grad_norm": 759.7100219726562, "learning_rate": 9.921561757950875e-06, "loss": 0.3986, "step": 206000 }, { "epoch": 8.02094281599128, "grad_norm": 0.17681734263896942, "learning_rate": 9.911829966133366e-06, "loss": 0.3055, "step": 206050 }, { "epoch": 8.022889174354782, "grad_norm": 303.51019287109375, "learning_rate": 9.902098174315856e-06, "loss": 0.3112, "step": 206100 }, { "epoch": 8.024835532718283, "grad_norm": 24.21787452697754, "learning_rate": 9.892366382498345e-06, "loss": 0.4214, "step": 206150 }, { "epoch": 8.026781891081786, "grad_norm": 9.570271492004395, "learning_rate": 9.882634590680837e-06, "loss": 0.3705, "step": 206200 }, { "epoch": 8.028728249445289, "grad_norm": 19.258638381958008, "learning_rate": 9.872902798863327e-06, "loss": 0.4543, "step": 206250 }, { "epoch": 8.03067460780879, "grad_norm": 0.8751875758171082, "learning_rate": 9.863171007045818e-06, "loss": 0.2982, "step": 206300 }, { "epoch": 8.032620966172292, "grad_norm": 14.873605728149414, "learning_rate": 9.853439215228308e-06, "loss": 0.2989, "step": 206350 }, { "epoch": 8.034567324535793, "grad_norm": 2.7973859310150146, "learning_rate": 9.843707423410799e-06, "loss": 0.2073, "step": 206400 }, { "epoch": 8.036513682899296, "grad_norm": 49.09610366821289, "learning_rate": 9.83397563159329e-06, "loss": 0.3685, "step": 206450 }, { "epoch": 8.038460041262796, "grad_norm": 9.364380836486816, "learning_rate": 9.82424383977578e-06, "loss": 0.375, "step": 206500 }, { "epoch": 8.040406399626299, "grad_norm": 85.02690124511719, "learning_rate": 9.81451204795827e-06, "loss": 0.3896, "step": 206550 }, { "epoch": 8.042352757989802, "grad_norm": 1.8385483026504517, "learning_rate": 9.804780256140761e-06, "loss": 0.3921, "step": 206600 }, { "epoch": 8.044299116353303, "grad_norm": 8.314380645751953, "learning_rate": 9.795048464323251e-06, "loss": 0.415, "step": 206650 }, { 
"epoch": 8.046245474716805, "grad_norm": 48.39614486694336, "learning_rate": 9.785316672505744e-06, "loss": 0.3925, "step": 206700 }, { "epoch": 8.048191833080306, "grad_norm": 3.7507903575897217, "learning_rate": 9.775584880688232e-06, "loss": 0.3203, "step": 206750 }, { "epoch": 8.050138191443809, "grad_norm": 15.769351959228516, "learning_rate": 9.765853088870723e-06, "loss": 0.3793, "step": 206800 }, { "epoch": 8.05208454980731, "grad_norm": 0.6068198084831238, "learning_rate": 9.756121297053213e-06, "loss": 0.4002, "step": 206850 }, { "epoch": 8.054030908170812, "grad_norm": 0.5870898365974426, "learning_rate": 9.746389505235705e-06, "loss": 0.3226, "step": 206900 }, { "epoch": 8.055977266534315, "grad_norm": 18.84095573425293, "learning_rate": 9.736657713418194e-06, "loss": 0.4426, "step": 206950 }, { "epoch": 8.057923624897816, "grad_norm": 2.3566524982452393, "learning_rate": 9.726925921600685e-06, "loss": 0.4175, "step": 207000 }, { "epoch": 8.059869983261319, "grad_norm": 32.56481170654297, "learning_rate": 9.717194129783177e-06, "loss": 0.4011, "step": 207050 }, { "epoch": 8.06181634162482, "grad_norm": 28.797786712646484, "learning_rate": 9.707462337965667e-06, "loss": 0.281, "step": 207100 }, { "epoch": 8.063762699988322, "grad_norm": 23.138938903808594, "learning_rate": 9.697730546148156e-06, "loss": 0.3857, "step": 207150 }, { "epoch": 8.065709058351823, "grad_norm": 24.45941925048828, "learning_rate": 9.687998754330647e-06, "loss": 0.3746, "step": 207200 }, { "epoch": 8.067655416715326, "grad_norm": 36.87044906616211, "learning_rate": 9.678266962513139e-06, "loss": 0.4266, "step": 207250 }, { "epoch": 8.069601775078828, "grad_norm": 1.8044096231460571, "learning_rate": 9.66853517069563e-06, "loss": 0.3836, "step": 207300 }, { "epoch": 8.07154813344233, "grad_norm": 43.42618942260742, "learning_rate": 9.65880337887812e-06, "loss": 0.2595, "step": 207350 }, { "epoch": 8.073494491805832, "grad_norm": 25.02806854248047, "learning_rate": 
9.64907158706061e-06, "loss": 0.3258, "step": 207400 }, { "epoch": 8.075440850169333, "grad_norm": 0.4955123960971832, "learning_rate": 9.6393397952431e-06, "loss": 0.3374, "step": 207450 }, { "epoch": 8.077387208532835, "grad_norm": 1.0109410285949707, "learning_rate": 9.629608003425591e-06, "loss": 0.3433, "step": 207500 }, { "epoch": 8.079333566896336, "grad_norm": 22.56977081298828, "learning_rate": 9.619876211608082e-06, "loss": 0.2676, "step": 207550 }, { "epoch": 8.081279925259839, "grad_norm": 76.18679809570312, "learning_rate": 9.610144419790572e-06, "loss": 0.4354, "step": 207600 }, { "epoch": 8.083226283623342, "grad_norm": 52.71858215332031, "learning_rate": 9.600412627973063e-06, "loss": 0.3658, "step": 207650 }, { "epoch": 8.085172641986842, "grad_norm": 106.28515625, "learning_rate": 9.590680836155553e-06, "loss": 0.4434, "step": 207700 }, { "epoch": 8.087119000350345, "grad_norm": 17.562801361083984, "learning_rate": 9.580949044338044e-06, "loss": 0.3463, "step": 207750 }, { "epoch": 8.089065358713846, "grad_norm": 2.484236478805542, "learning_rate": 9.571217252520534e-06, "loss": 0.3269, "step": 207800 }, { "epoch": 8.091011717077349, "grad_norm": 129.06259155273438, "learning_rate": 9.561485460703025e-06, "loss": 0.345, "step": 207850 }, { "epoch": 8.09295807544085, "grad_norm": 8.581806182861328, "learning_rate": 9.551753668885515e-06, "loss": 0.4583, "step": 207900 }, { "epoch": 8.094904433804352, "grad_norm": 22.862043380737305, "learning_rate": 9.542021877068007e-06, "loss": 0.3683, "step": 207950 }, { "epoch": 8.096850792167855, "grad_norm": 5.296134948730469, "learning_rate": 9.532290085250496e-06, "loss": 0.3907, "step": 208000 }, { "epoch": 8.098797150531356, "grad_norm": 61.901554107666016, "learning_rate": 9.522558293432987e-06, "loss": 0.3975, "step": 208050 }, { "epoch": 8.100743508894858, "grad_norm": 4.137235641479492, "learning_rate": 9.512826501615479e-06, "loss": 0.3715, "step": 208100 }, { "epoch": 8.10268986725836, "grad_norm": 
141.8504638671875, "learning_rate": 9.50309470979797e-06, "loss": 0.3086, "step": 208150 }, { "epoch": 8.104636225621862, "grad_norm": 93.7691421508789, "learning_rate": 9.493362917980458e-06, "loss": 0.3354, "step": 208200 }, { "epoch": 8.106582583985363, "grad_norm": 47.881683349609375, "learning_rate": 9.4838257619993e-06, "loss": 0.2273, "step": 208250 }, { "epoch": 8.108528942348865, "grad_norm": 50.40843200683594, "learning_rate": 9.47409397018179e-06, "loss": 0.2572, "step": 208300 }, { "epoch": 8.110475300712368, "grad_norm": 25.090253829956055, "learning_rate": 9.464362178364281e-06, "loss": 0.3735, "step": 208350 }, { "epoch": 8.112421659075869, "grad_norm": 12.534561157226562, "learning_rate": 9.454630386546772e-06, "loss": 0.3326, "step": 208400 }, { "epoch": 8.114368017439372, "grad_norm": 657.9789428710938, "learning_rate": 9.444898594729262e-06, "loss": 0.4011, "step": 208450 }, { "epoch": 8.116314375802872, "grad_norm": 0.8463770151138306, "learning_rate": 9.435166802911753e-06, "loss": 0.3689, "step": 208500 }, { "epoch": 8.118260734166375, "grad_norm": 31.5478458404541, "learning_rate": 9.425435011094243e-06, "loss": 0.3415, "step": 208550 }, { "epoch": 8.120207092529876, "grad_norm": 5.358351230621338, "learning_rate": 9.415703219276734e-06, "loss": 0.4273, "step": 208600 }, { "epoch": 8.122153450893379, "grad_norm": 111.81636810302734, "learning_rate": 9.405971427459224e-06, "loss": 0.4283, "step": 208650 }, { "epoch": 8.124099809256881, "grad_norm": 5.34293270111084, "learning_rate": 9.396239635641716e-06, "loss": 0.2488, "step": 208700 }, { "epoch": 8.126046167620382, "grad_norm": 0.5881949663162231, "learning_rate": 9.386507843824205e-06, "loss": 0.3083, "step": 208750 }, { "epoch": 8.127992525983885, "grad_norm": 6.420180797576904, "learning_rate": 9.376776052006695e-06, "loss": 0.309, "step": 208800 }, { "epoch": 8.129938884347386, "grad_norm": 31.88120460510254, "learning_rate": 9.367044260189186e-06, "loss": 0.3754, "step": 208850 }, { 
"epoch": 8.131885242710888, "grad_norm": 68.37137603759766, "learning_rate": 9.357312468371678e-06, "loss": 0.3702, "step": 208900 }, { "epoch": 8.13383160107439, "grad_norm": 61.04022979736328, "learning_rate": 9.347580676554167e-06, "loss": 0.4077, "step": 208950 }, { "epoch": 8.135777959437892, "grad_norm": 28.539833068847656, "learning_rate": 9.337848884736657e-06, "loss": 0.3193, "step": 209000 }, { "epoch": 8.137724317801393, "grad_norm": 18.343812942504883, "learning_rate": 9.32811709291915e-06, "loss": 0.3127, "step": 209050 }, { "epoch": 8.139670676164895, "grad_norm": 50.558006286621094, "learning_rate": 9.31838530110164e-06, "loss": 0.4132, "step": 209100 }, { "epoch": 8.141617034528398, "grad_norm": 20.995834350585938, "learning_rate": 9.308653509284129e-06, "loss": 0.3472, "step": 209150 }, { "epoch": 8.143563392891899, "grad_norm": 16.543066024780273, "learning_rate": 9.29892171746662e-06, "loss": 0.3166, "step": 209200 }, { "epoch": 8.145509751255402, "grad_norm": 3.0608890056610107, "learning_rate": 9.289189925649112e-06, "loss": 0.3942, "step": 209250 }, { "epoch": 8.147456109618902, "grad_norm": 8.117815971374512, "learning_rate": 9.279458133831602e-06, "loss": 0.3837, "step": 209300 }, { "epoch": 8.149402467982405, "grad_norm": 32.156124114990234, "learning_rate": 9.26972634201409e-06, "loss": 0.3407, "step": 209350 }, { "epoch": 8.151348826345906, "grad_norm": 0.032980479300022125, "learning_rate": 9.259994550196583e-06, "loss": 0.2517, "step": 209400 }, { "epoch": 8.153295184709409, "grad_norm": 0.5044978260993958, "learning_rate": 9.250262758379073e-06, "loss": 0.4421, "step": 209450 }, { "epoch": 8.155241543072911, "grad_norm": 5.142784595489502, "learning_rate": 9.240530966561564e-06, "loss": 0.3551, "step": 209500 }, { "epoch": 8.157187901436412, "grad_norm": 11.260842323303223, "learning_rate": 9.230799174744054e-06, "loss": 0.24, "step": 209550 }, { "epoch": 8.159134259799915, "grad_norm": 1.538138747215271, "learning_rate": 
9.221067382926545e-06, "loss": 0.4247, "step": 209600 }, { "epoch": 8.161080618163416, "grad_norm": 0.7053439021110535, "learning_rate": 9.211335591109035e-06, "loss": 0.2846, "step": 209650 }, { "epoch": 8.163026976526918, "grad_norm": 15.06623649597168, "learning_rate": 9.201603799291526e-06, "loss": 0.2412, "step": 209700 }, { "epoch": 8.16497333489042, "grad_norm": 24.015018463134766, "learning_rate": 9.191872007474016e-06, "loss": 0.3955, "step": 209750 }, { "epoch": 8.166919693253922, "grad_norm": 27.806354522705078, "learning_rate": 9.182140215656507e-06, "loss": 0.3331, "step": 209800 }, { "epoch": 8.168866051617425, "grad_norm": 41.307708740234375, "learning_rate": 9.172408423838997e-06, "loss": 0.3961, "step": 209850 }, { "epoch": 8.170812409980925, "grad_norm": 112.48374938964844, "learning_rate": 9.162871267857838e-06, "loss": 0.4505, "step": 209900 }, { "epoch": 8.172758768344428, "grad_norm": 0.983919620513916, "learning_rate": 9.153139476040328e-06, "loss": 0.375, "step": 209950 }, { "epoch": 8.174705126707929, "grad_norm": 18.549116134643555, "learning_rate": 9.14340768422282e-06, "loss": 0.3215, "step": 210000 }, { "epoch": 8.176651485071432, "grad_norm": 0.3909394443035126, "learning_rate": 9.133675892405311e-06, "loss": 0.3183, "step": 210050 }, { "epoch": 8.178597843434932, "grad_norm": 0.6646482944488525, "learning_rate": 9.1239441005878e-06, "loss": 0.3091, "step": 210100 }, { "epoch": 8.180544201798435, "grad_norm": 5.470911502838135, "learning_rate": 9.114212308770292e-06, "loss": 0.3356, "step": 210150 }, { "epoch": 8.182490560161938, "grad_norm": 0.4599927067756653, "learning_rate": 9.104480516952782e-06, "loss": 0.3538, "step": 210200 }, { "epoch": 8.184436918525439, "grad_norm": 96.37556457519531, "learning_rate": 9.094748725135273e-06, "loss": 0.3066, "step": 210250 }, { "epoch": 8.186383276888941, "grad_norm": 2.7455122470855713, "learning_rate": 9.085016933317762e-06, "loss": 0.3574, "step": 210300 }, { "epoch": 8.188329635252442, 
"grad_norm": 17.33638572692871, "learning_rate": 9.075285141500254e-06, "loss": 0.292, "step": 210350 }, { "epoch": 8.190275993615945, "grad_norm": 2.3058464527130127, "learning_rate": 9.065553349682744e-06, "loss": 0.3861, "step": 210400 }, { "epoch": 8.192222351979446, "grad_norm": 13.59524917602539, "learning_rate": 9.055821557865235e-06, "loss": 0.3497, "step": 210450 }, { "epoch": 8.194168710342948, "grad_norm": 0.864811360836029, "learning_rate": 9.046089766047725e-06, "loss": 0.3169, "step": 210500 }, { "epoch": 8.196115068706451, "grad_norm": 0.06367062032222748, "learning_rate": 9.036357974230216e-06, "loss": 0.2824, "step": 210550 }, { "epoch": 8.198061427069952, "grad_norm": 79.61901092529297, "learning_rate": 9.026626182412706e-06, "loss": 0.4361, "step": 210600 }, { "epoch": 8.200007785433455, "grad_norm": 43.876094818115234, "learning_rate": 9.016894390595197e-06, "loss": 0.399, "step": 210650 }, { "epoch": 8.201954143796955, "grad_norm": 6.227982044219971, "learning_rate": 9.007162598777687e-06, "loss": 0.3566, "step": 210700 }, { "epoch": 8.203900502160458, "grad_norm": 132.6034393310547, "learning_rate": 8.997430806960178e-06, "loss": 0.397, "step": 210750 }, { "epoch": 8.205846860523959, "grad_norm": 8.047165870666504, "learning_rate": 8.987699015142668e-06, "loss": 0.4135, "step": 210800 }, { "epoch": 8.207793218887462, "grad_norm": 19.737377166748047, "learning_rate": 8.977967223325159e-06, "loss": 0.372, "step": 210850 }, { "epoch": 8.209739577250964, "grad_norm": 33.11486053466797, "learning_rate": 8.96823543150765e-06, "loss": 0.3188, "step": 210900 }, { "epoch": 8.211685935614465, "grad_norm": 5.008458614349365, "learning_rate": 8.95850363969014e-06, "loss": 0.4065, "step": 210950 }, { "epoch": 8.213632293977968, "grad_norm": 15.817551612854004, "learning_rate": 8.94877184787263e-06, "loss": 0.3408, "step": 211000 }, { "epoch": 8.215578652341469, "grad_norm": 22.816967010498047, "learning_rate": 8.939040056055122e-06, "loss": 0.3657, "step": 
211050 }, { "epoch": 8.217525010704971, "grad_norm": 10.342336654663086, "learning_rate": 8.929308264237613e-06, "loss": 0.3527, "step": 211100 }, { "epoch": 8.219471369068472, "grad_norm": 65.87459564208984, "learning_rate": 8.919576472420102e-06, "loss": 0.3369, "step": 211150 }, { "epoch": 8.221417727431975, "grad_norm": 23.595853805541992, "learning_rate": 8.909844680602592e-06, "loss": 0.3038, "step": 211200 }, { "epoch": 8.223364085795478, "grad_norm": 15.080907821655273, "learning_rate": 8.900112888785084e-06, "loss": 0.3542, "step": 211250 }, { "epoch": 8.225310444158978, "grad_norm": 57.30108642578125, "learning_rate": 8.890381096967575e-06, "loss": 0.3709, "step": 211300 }, { "epoch": 8.227256802522481, "grad_norm": 9.041622161865234, "learning_rate": 8.880649305150063e-06, "loss": 0.4329, "step": 211350 }, { "epoch": 8.229203160885982, "grad_norm": 18.411407470703125, "learning_rate": 8.870917513332556e-06, "loss": 0.3614, "step": 211400 }, { "epoch": 8.231149519249485, "grad_norm": 18.76998519897461, "learning_rate": 8.861185721515046e-06, "loss": 0.3048, "step": 211450 }, { "epoch": 8.233095877612985, "grad_norm": 17.16579818725586, "learning_rate": 8.851453929697537e-06, "loss": 0.2802, "step": 211500 }, { "epoch": 8.235042235976488, "grad_norm": 0.23132488131523132, "learning_rate": 8.841722137880027e-06, "loss": 0.3872, "step": 211550 }, { "epoch": 8.23698859433999, "grad_norm": 8.182238578796387, "learning_rate": 8.831990346062518e-06, "loss": 0.3601, "step": 211600 }, { "epoch": 8.238934952703492, "grad_norm": 16.215723037719727, "learning_rate": 8.822258554245008e-06, "loss": 0.3561, "step": 211650 }, { "epoch": 8.240881311066994, "grad_norm": 0.842173159122467, "learning_rate": 8.812526762427499e-06, "loss": 0.3373, "step": 211700 }, { "epoch": 8.242827669430495, "grad_norm": 3.5194003582000732, "learning_rate": 8.802794970609989e-06, "loss": 0.3611, "step": 211750 }, { "epoch": 8.244774027793998, "grad_norm": 76.2630844116211, "learning_rate": 
8.79306317879248e-06, "loss": 0.3385, "step": 211800 }, { "epoch": 8.246720386157499, "grad_norm": 213.95130920410156, "learning_rate": 8.78333138697497e-06, "loss": 0.251, "step": 211850 }, { "epoch": 8.248666744521001, "grad_norm": 37.597476959228516, "learning_rate": 8.77359959515746e-06, "loss": 0.3741, "step": 211900 }, { "epoch": 8.250613102884504, "grad_norm": 15.035717964172363, "learning_rate": 8.763867803339951e-06, "loss": 0.4342, "step": 211950 }, { "epoch": 8.252559461248005, "grad_norm": 31.582317352294922, "learning_rate": 8.754136011522441e-06, "loss": 0.4014, "step": 212000 }, { "epoch": 8.254505819611508, "grad_norm": 44.54512023925781, "learning_rate": 8.744404219704932e-06, "loss": 0.3802, "step": 212050 }, { "epoch": 8.256452177975008, "grad_norm": 18.239288330078125, "learning_rate": 8.734672427887424e-06, "loss": 0.3343, "step": 212100 }, { "epoch": 8.258398536338511, "grad_norm": 54.7878532409668, "learning_rate": 8.724940636069915e-06, "loss": 0.3852, "step": 212150 }, { "epoch": 8.260344894702012, "grad_norm": 42.963985443115234, "learning_rate": 8.715208844252403e-06, "loss": 0.3091, "step": 212200 }, { "epoch": 8.262291253065515, "grad_norm": 23.66748046875, "learning_rate": 8.705477052434894e-06, "loss": 0.4715, "step": 212250 }, { "epoch": 8.264237611429015, "grad_norm": 20.31688117980957, "learning_rate": 8.695745260617386e-06, "loss": 0.3487, "step": 212300 }, { "epoch": 8.266183969792518, "grad_norm": 32.89665985107422, "learning_rate": 8.686013468799877e-06, "loss": 0.3379, "step": 212350 }, { "epoch": 8.26813032815602, "grad_norm": 0.4706190526485443, "learning_rate": 8.676281676982365e-06, "loss": 0.3777, "step": 212400 }, { "epoch": 8.270076686519522, "grad_norm": 1.6783398389816284, "learning_rate": 8.666549885164858e-06, "loss": 0.3577, "step": 212450 }, { "epoch": 8.272023044883024, "grad_norm": 70.33980560302734, "learning_rate": 8.656818093347348e-06, "loss": 0.367, "step": 212500 }, { "epoch": 8.273969403246525, 
"grad_norm": 29.576419830322266, "learning_rate": 8.647086301529839e-06, "loss": 0.2943, "step": 212550 }, { "epoch": 8.275915761610028, "grad_norm": 0.2740004062652588, "learning_rate": 8.637354509712329e-06, "loss": 0.4347, "step": 212600 }, { "epoch": 8.277862119973529, "grad_norm": 84.86604309082031, "learning_rate": 8.62762271789482e-06, "loss": 0.3833, "step": 212650 }, { "epoch": 8.279808478337031, "grad_norm": 19.118986129760742, "learning_rate": 8.61789092607731e-06, "loss": 0.375, "step": 212700 }, { "epoch": 8.281754836700534, "grad_norm": 25.87902069091797, "learning_rate": 8.6081591342598e-06, "loss": 0.3818, "step": 212750 }, { "epoch": 8.283701195064035, "grad_norm": 3.6347382068634033, "learning_rate": 8.598427342442291e-06, "loss": 0.3731, "step": 212800 }, { "epoch": 8.285647553427538, "grad_norm": 0.29332590103149414, "learning_rate": 8.588695550624781e-06, "loss": 0.3094, "step": 212850 }, { "epoch": 8.287593911791038, "grad_norm": 0.44279155135154724, "learning_rate": 8.578963758807272e-06, "loss": 0.2227, "step": 212900 }, { "epoch": 8.289540270154541, "grad_norm": 0.22298859059810638, "learning_rate": 8.569231966989762e-06, "loss": 0.3432, "step": 212950 }, { "epoch": 8.291486628518042, "grad_norm": 13.157524108886719, "learning_rate": 8.559500175172253e-06, "loss": 0.4281, "step": 213000 }, { "epoch": 8.293432986881545, "grad_norm": 39.38589096069336, "learning_rate": 8.549768383354743e-06, "loss": 0.3301, "step": 213050 }, { "epoch": 8.295379345245047, "grad_norm": 74.72953033447266, "learning_rate": 8.540036591537234e-06, "loss": 0.2952, "step": 213100 }, { "epoch": 8.297325703608548, "grad_norm": 28.534133911132812, "learning_rate": 8.530304799719726e-06, "loss": 0.3049, "step": 213150 }, { "epoch": 8.29927206197205, "grad_norm": 15.04012393951416, "learning_rate": 8.520573007902215e-06, "loss": 0.4827, "step": 213200 }, { "epoch": 8.301218420335552, "grad_norm": 3.026059865951538, "learning_rate": 8.510841216084705e-06, "loss": 0.3401, 
"step": 213250 }, { "epoch": 8.303164778699054, "grad_norm": 0.28233930468559265, "learning_rate": 8.501109424267196e-06, "loss": 0.4141, "step": 213300 }, { "epoch": 8.305111137062555, "grad_norm": 0.1117667630314827, "learning_rate": 8.491377632449688e-06, "loss": 0.3504, "step": 213350 }, { "epoch": 8.307057495426058, "grad_norm": 44.17383575439453, "learning_rate": 8.481645840632177e-06, "loss": 0.2798, "step": 213400 }, { "epoch": 8.30900385378956, "grad_norm": 13.077094078063965, "learning_rate": 8.471914048814667e-06, "loss": 0.3347, "step": 213450 }, { "epoch": 8.310950212153061, "grad_norm": 0.6649339199066162, "learning_rate": 8.46218225699716e-06, "loss": 0.4175, "step": 213500 }, { "epoch": 8.312896570516564, "grad_norm": 22.989967346191406, "learning_rate": 8.45245046517965e-06, "loss": 0.3685, "step": 213550 }, { "epoch": 8.314842928880065, "grad_norm": 0.393192857503891, "learning_rate": 8.44271867336214e-06, "loss": 0.3401, "step": 213600 }, { "epoch": 8.316789287243568, "grad_norm": 66.30856323242188, "learning_rate": 8.43298688154463e-06, "loss": 0.3905, "step": 213650 }, { "epoch": 8.318735645607068, "grad_norm": 25.084535598754883, "learning_rate": 8.423255089727121e-06, "loss": 0.3959, "step": 213700 }, { "epoch": 8.320682003970571, "grad_norm": 1.4300577640533447, "learning_rate": 8.413523297909612e-06, "loss": 0.399, "step": 213750 }, { "epoch": 8.322628362334074, "grad_norm": 31.679689407348633, "learning_rate": 8.403791506092102e-06, "loss": 0.3678, "step": 213800 }, { "epoch": 8.324574720697575, "grad_norm": 0.44044116139411926, "learning_rate": 8.394059714274593e-06, "loss": 0.4209, "step": 213850 }, { "epoch": 8.326521079061077, "grad_norm": 51.093605041503906, "learning_rate": 8.384327922457083e-06, "loss": 0.3888, "step": 213900 }, { "epoch": 8.328467437424578, "grad_norm": 46.244022369384766, "learning_rate": 8.374596130639574e-06, "loss": 0.3715, "step": 213950 }, { "epoch": 8.33041379578808, "grad_norm": 182.81985473632812, 
"learning_rate": 8.364864338822064e-06, "loss": 0.356, "step": 214000 }, { "epoch": 8.332360154151582, "grad_norm": 7.939088821411133, "learning_rate": 8.355132547004555e-06, "loss": 0.2851, "step": 214050 }, { "epoch": 8.334306512515084, "grad_norm": 4.292377948760986, "learning_rate": 8.345400755187045e-06, "loss": 0.3988, "step": 214100 }, { "epoch": 8.336252870878587, "grad_norm": 28.541500091552734, "learning_rate": 8.335668963369536e-06, "loss": 0.3632, "step": 214150 }, { "epoch": 8.338199229242088, "grad_norm": 4.114736557006836, "learning_rate": 8.325937171552026e-06, "loss": 0.3763, "step": 214200 }, { "epoch": 8.34014558760559, "grad_norm": 29.326017379760742, "learning_rate": 8.316205379734517e-06, "loss": 0.4274, "step": 214250 }, { "epoch": 8.342091945969091, "grad_norm": 56.973533630371094, "learning_rate": 8.306473587917007e-06, "loss": 0.3132, "step": 214300 }, { "epoch": 8.344038304332594, "grad_norm": 20.859210968017578, "learning_rate": 8.296741796099498e-06, "loss": 0.3179, "step": 214350 }, { "epoch": 8.345984662696095, "grad_norm": 0.14887459576129913, "learning_rate": 8.28701000428199e-06, "loss": 0.2733, "step": 214400 }, { "epoch": 8.347931021059598, "grad_norm": 29.25450325012207, "learning_rate": 8.277278212464479e-06, "loss": 0.3622, "step": 214450 }, { "epoch": 8.349877379423098, "grad_norm": 83.5732421875, "learning_rate": 8.267546420646969e-06, "loss": 0.3507, "step": 214500 }, { "epoch": 8.351823737786601, "grad_norm": 22.79537582397461, "learning_rate": 8.257814628829461e-06, "loss": 0.4422, "step": 214550 }, { "epoch": 8.353770096150104, "grad_norm": 22.95654296875, "learning_rate": 8.248082837011952e-06, "loss": 0.3786, "step": 214600 }, { "epoch": 8.355716454513605, "grad_norm": 26.920888900756836, "learning_rate": 8.23835104519444e-06, "loss": 0.2258, "step": 214650 }, { "epoch": 8.357662812877107, "grad_norm": 34.35489273071289, "learning_rate": 8.228619253376931e-06, "loss": 0.4225, "step": 214700 }, { "epoch": 
8.359609171240608, "grad_norm": 27.42578125, "learning_rate": 8.219082097395773e-06, "loss": 0.4179, "step": 214750 }, { "epoch": 8.36155552960411, "grad_norm": 2.337683916091919, "learning_rate": 8.209350305578264e-06, "loss": 0.4937, "step": 214800 }, { "epoch": 8.363501887967612, "grad_norm": 26.396242141723633, "learning_rate": 8.199618513760754e-06, "loss": 0.4563, "step": 214850 }, { "epoch": 8.365448246331114, "grad_norm": 409.46075439453125, "learning_rate": 8.189886721943245e-06, "loss": 0.359, "step": 214900 }, { "epoch": 8.367394604694617, "grad_norm": 38.23627471923828, "learning_rate": 8.180154930125735e-06, "loss": 0.3941, "step": 214950 }, { "epoch": 8.369340963058118, "grad_norm": 0.36533650755882263, "learning_rate": 8.170423138308226e-06, "loss": 0.4064, "step": 215000 }, { "epoch": 8.37128732142162, "grad_norm": 19.258520126342773, "learning_rate": 8.160691346490716e-06, "loss": 0.3917, "step": 215050 }, { "epoch": 8.373233679785121, "grad_norm": 18.61333465576172, "learning_rate": 8.150959554673207e-06, "loss": 0.3698, "step": 215100 }, { "epoch": 8.375180038148624, "grad_norm": 4.67263650894165, "learning_rate": 8.141227762855699e-06, "loss": 0.3629, "step": 215150 }, { "epoch": 8.377126396512125, "grad_norm": 116.76776123046875, "learning_rate": 8.131495971038188e-06, "loss": 0.3311, "step": 215200 }, { "epoch": 8.379072754875628, "grad_norm": 55.410560607910156, "learning_rate": 8.121764179220678e-06, "loss": 0.2722, "step": 215250 }, { "epoch": 8.38101911323913, "grad_norm": 32.822357177734375, "learning_rate": 8.112032387403168e-06, "loss": 0.4309, "step": 215300 }, { "epoch": 8.382965471602631, "grad_norm": 1.0277888774871826, "learning_rate": 8.10230059558566e-06, "loss": 0.48, "step": 215350 }, { "epoch": 8.384911829966134, "grad_norm": 20.92021369934082, "learning_rate": 8.09256880376815e-06, "loss": 0.2877, "step": 215400 }, { "epoch": 8.386858188329635, "grad_norm": 0.9754750728607178, "learning_rate": 8.08283701195064e-06, "loss": 
0.334, "step": 215450 }, { "epoch": 8.388804546693137, "grad_norm": 2.775167942047119, "learning_rate": 8.073105220133132e-06, "loss": 0.4043, "step": 215500 }, { "epoch": 8.390750905056638, "grad_norm": 0.9066095352172852, "learning_rate": 8.063373428315623e-06, "loss": 0.3269, "step": 215550 }, { "epoch": 8.39269726342014, "grad_norm": 9.651918411254883, "learning_rate": 8.053641636498111e-06, "loss": 0.5544, "step": 215600 }, { "epoch": 8.394643621783644, "grad_norm": 17.844741821289062, "learning_rate": 8.043909844680602e-06, "loss": 0.4071, "step": 215650 }, { "epoch": 8.396589980147144, "grad_norm": 0.17656905949115753, "learning_rate": 8.034178052863094e-06, "loss": 0.2882, "step": 215700 }, { "epoch": 8.398536338510647, "grad_norm": 5.114139556884766, "learning_rate": 8.024446261045585e-06, "loss": 0.2849, "step": 215750 }, { "epoch": 8.400482696874148, "grad_norm": 0.2529154121875763, "learning_rate": 8.014714469228073e-06, "loss": 0.2911, "step": 215800 }, { "epoch": 8.40242905523765, "grad_norm": 45.78586959838867, "learning_rate": 8.004982677410566e-06, "loss": 0.3509, "step": 215850 }, { "epoch": 8.404375413601151, "grad_norm": 0.34544605016708374, "learning_rate": 7.995250885593056e-06, "loss": 0.5309, "step": 215900 }, { "epoch": 8.406321771964654, "grad_norm": 7.244893550872803, "learning_rate": 7.985519093775546e-06, "loss": 0.3148, "step": 215950 }, { "epoch": 8.408268130328157, "grad_norm": 81.75678253173828, "learning_rate": 7.975787301958037e-06, "loss": 0.2875, "step": 216000 }, { "epoch": 8.410214488691658, "grad_norm": 25.47453498840332, "learning_rate": 7.966055510140527e-06, "loss": 0.3976, "step": 216050 }, { "epoch": 8.41216084705516, "grad_norm": 2.903407096862793, "learning_rate": 7.956323718323018e-06, "loss": 0.3558, "step": 216100 }, { "epoch": 8.414107205418661, "grad_norm": 13.733667373657227, "learning_rate": 7.946591926505508e-06, "loss": 0.3691, "step": 216150 }, { "epoch": 8.416053563782164, "grad_norm": 3.6683170795440674, 
"learning_rate": 7.936860134688e-06, "loss": 0.3827, "step": 216200 }, { "epoch": 8.417999922145665, "grad_norm": 6.774913787841797, "learning_rate": 7.92712834287049e-06, "loss": 0.3052, "step": 216250 }, { "epoch": 8.419946280509167, "grad_norm": 37.53233337402344, "learning_rate": 7.91739655105298e-06, "loss": 0.3525, "step": 216300 }, { "epoch": 8.42189263887267, "grad_norm": 136.4558563232422, "learning_rate": 7.90766475923547e-06, "loss": 0.3775, "step": 216350 }, { "epoch": 8.423838997236171, "grad_norm": 8.245142936706543, "learning_rate": 7.897932967417963e-06, "loss": 0.3925, "step": 216400 }, { "epoch": 8.425785355599674, "grad_norm": 97.74210357666016, "learning_rate": 7.888201175600451e-06, "loss": 0.3942, "step": 216450 }, { "epoch": 8.427731713963174, "grad_norm": 42.45359802246094, "learning_rate": 7.878469383782942e-06, "loss": 0.4142, "step": 216500 }, { "epoch": 8.429678072326677, "grad_norm": 17.55264663696289, "learning_rate": 7.868737591965434e-06, "loss": 0.3104, "step": 216550 }, { "epoch": 8.431624430690178, "grad_norm": 0.6540454030036926, "learning_rate": 7.859005800147924e-06, "loss": 0.3279, "step": 216600 }, { "epoch": 8.43357078905368, "grad_norm": 122.44757843017578, "learning_rate": 7.849274008330413e-06, "loss": 0.3611, "step": 216650 }, { "epoch": 8.435517147417183, "grad_norm": 59.21687316894531, "learning_rate": 7.839542216512904e-06, "loss": 0.3481, "step": 216700 }, { "epoch": 8.437463505780684, "grad_norm": 0.314205139875412, "learning_rate": 7.829810424695396e-06, "loss": 0.307, "step": 216750 }, { "epoch": 8.439409864144187, "grad_norm": 17.416723251342773, "learning_rate": 7.820078632877886e-06, "loss": 0.3424, "step": 216800 }, { "epoch": 8.441356222507688, "grad_norm": 14.674776077270508, "learning_rate": 7.810346841060375e-06, "loss": 0.4169, "step": 216850 }, { "epoch": 8.44330258087119, "grad_norm": 38.95624542236328, "learning_rate": 7.800615049242867e-06, "loss": 0.3676, "step": 216900 }, { "epoch": 
8.445248939234691, "grad_norm": 6.135432243347168, "learning_rate": 7.790883257425358e-06, "loss": 0.3501, "step": 216950 }, { "epoch": 8.447195297598194, "grad_norm": 13.802192687988281, "learning_rate": 7.781151465607848e-06, "loss": 0.3833, "step": 217000 }, { "epoch": 8.449141655961697, "grad_norm": 7.109553337097168, "learning_rate": 7.771419673790339e-06, "loss": 0.3803, "step": 217050 }, { "epoch": 8.451088014325197, "grad_norm": 16.336557388305664, "learning_rate": 7.76168788197283e-06, "loss": 0.3798, "step": 217100 }, { "epoch": 8.4530343726887, "grad_norm": 62.24176025390625, "learning_rate": 7.75195609015532e-06, "loss": 0.2729, "step": 217150 }, { "epoch": 8.454980731052201, "grad_norm": 16.262617111206055, "learning_rate": 7.74222429833781e-06, "loss": 0.3852, "step": 217200 }, { "epoch": 8.456927089415704, "grad_norm": 94.38314056396484, "learning_rate": 7.7324925065203e-06, "loss": 0.3455, "step": 217250 }, { "epoch": 8.458873447779204, "grad_norm": 0.19787168502807617, "learning_rate": 7.722760714702791e-06, "loss": 0.4853, "step": 217300 }, { "epoch": 8.460819806142707, "grad_norm": 65.49479675292969, "learning_rate": 7.713028922885282e-06, "loss": 0.3769, "step": 217350 }, { "epoch": 8.46276616450621, "grad_norm": 81.45146942138672, "learning_rate": 7.703297131067772e-06, "loss": 0.2918, "step": 217400 }, { "epoch": 8.46471252286971, "grad_norm": 21.403350830078125, "learning_rate": 7.693565339250264e-06, "loss": 0.4463, "step": 217450 }, { "epoch": 8.466658881233213, "grad_norm": 32.25508499145508, "learning_rate": 7.683833547432753e-06, "loss": 0.377, "step": 217500 }, { "epoch": 8.468605239596714, "grad_norm": 102.52825927734375, "learning_rate": 7.674101755615244e-06, "loss": 0.3758, "step": 217550 }, { "epoch": 8.470551597960217, "grad_norm": 0.22965359687805176, "learning_rate": 7.664369963797736e-06, "loss": 0.3802, "step": 217600 }, { "epoch": 8.472497956323718, "grad_norm": 33.80280685424805, "learning_rate": 7.654638171980226e-06, 
"loss": 0.5047, "step": 217650 }, { "epoch": 8.47444431468722, "grad_norm": 0.823825478553772, "learning_rate": 7.644906380162715e-06, "loss": 0.3853, "step": 217700 }, { "epoch": 8.476390673050721, "grad_norm": 14.897957801818848, "learning_rate": 7.635174588345206e-06, "loss": 0.4147, "step": 217750 }, { "epoch": 8.478337031414224, "grad_norm": 14.810110092163086, "learning_rate": 7.625442796527697e-06, "loss": 0.3612, "step": 217800 }, { "epoch": 8.480283389777727, "grad_norm": 20.07162857055664, "learning_rate": 7.615905640546537e-06, "loss": 0.4309, "step": 217850 }, { "epoch": 8.482229748141227, "grad_norm": 0.1594788283109665, "learning_rate": 7.606173848729029e-06, "loss": 0.4237, "step": 217900 }, { "epoch": 8.48417610650473, "grad_norm": 9.68181324005127, "learning_rate": 7.596442056911519e-06, "loss": 0.3575, "step": 217950 }, { "epoch": 8.486122464868231, "grad_norm": 27.49397850036621, "learning_rate": 7.586710265094009e-06, "loss": 0.4236, "step": 218000 }, { "epoch": 8.488068823231734, "grad_norm": 3.782043695449829, "learning_rate": 7.5769784732765e-06, "loss": 0.3663, "step": 218050 }, { "epoch": 8.490015181595234, "grad_norm": 70.58650970458984, "learning_rate": 7.567246681458991e-06, "loss": 0.3182, "step": 218100 }, { "epoch": 8.491961539958737, "grad_norm": 61.00771713256836, "learning_rate": 7.557514889641482e-06, "loss": 0.4433, "step": 218150 }, { "epoch": 8.49390789832224, "grad_norm": 15.120667457580566, "learning_rate": 7.547783097823971e-06, "loss": 0.3984, "step": 218200 }, { "epoch": 8.49585425668574, "grad_norm": 0.31695830821990967, "learning_rate": 7.538051306006462e-06, "loss": 0.3372, "step": 218250 }, { "epoch": 8.497800615049243, "grad_norm": 6.036387920379639, "learning_rate": 7.5283195141889526e-06, "loss": 0.3704, "step": 218300 }, { "epoch": 8.499746973412744, "grad_norm": 30.501209259033203, "learning_rate": 7.518587722371444e-06, "loss": 0.3477, "step": 218350 }, { "epoch": 8.501693331776247, "grad_norm": 
94.53404235839844, "learning_rate": 7.5088559305539335e-06, "loss": 0.27, "step": 218400 }, { "epoch": 8.503639690139748, "grad_norm": 0.1715928614139557, "learning_rate": 7.499124138736424e-06, "loss": 0.3403, "step": 218450 }, { "epoch": 8.50558604850325, "grad_norm": 6.8963799476623535, "learning_rate": 7.489392346918915e-06, "loss": 0.3045, "step": 218500 }, { "epoch": 8.507532406866753, "grad_norm": 1.3019670248031616, "learning_rate": 7.479660555101406e-06, "loss": 0.3465, "step": 218550 }, { "epoch": 8.509478765230254, "grad_norm": 0.8486109972000122, "learning_rate": 7.469928763283897e-06, "loss": 0.3531, "step": 218600 }, { "epoch": 8.511425123593757, "grad_norm": 27.578414916992188, "learning_rate": 7.460196971466386e-06, "loss": 0.331, "step": 218650 }, { "epoch": 8.513371481957257, "grad_norm": 24.29524040222168, "learning_rate": 7.450465179648877e-06, "loss": 0.3701, "step": 218700 }, { "epoch": 8.51531784032076, "grad_norm": 42.9409065246582, "learning_rate": 7.440733387831368e-06, "loss": 0.428, "step": 218750 }, { "epoch": 8.517264198684261, "grad_norm": 260.6474609375, "learning_rate": 7.431001596013859e-06, "loss": 0.3596, "step": 218800 }, { "epoch": 8.519210557047764, "grad_norm": 27.411518096923828, "learning_rate": 7.421269804196349e-06, "loss": 0.3244, "step": 218850 }, { "epoch": 8.521156915411266, "grad_norm": 14.322549819946289, "learning_rate": 7.411538012378839e-06, "loss": 0.5402, "step": 218900 }, { "epoch": 8.523103273774767, "grad_norm": 14.03453254699707, "learning_rate": 7.4018062205613306e-06, "loss": 0.2651, "step": 218950 }, { "epoch": 8.52504963213827, "grad_norm": 2.9811289310455322, "learning_rate": 7.392074428743821e-06, "loss": 0.3877, "step": 219000 }, { "epoch": 8.52699599050177, "grad_norm": 21.31934928894043, "learning_rate": 7.382342636926311e-06, "loss": 0.4106, "step": 219050 }, { "epoch": 8.528942348865273, "grad_norm": 32.3565559387207, "learning_rate": 7.372610845108802e-06, "loss": 0.339, "step": 219100 }, { 
"epoch": 8.530888707228774, "grad_norm": 21.443838119506836, "learning_rate": 7.3628790532912925e-06, "loss": 0.3255, "step": 219150 }, { "epoch": 8.532835065592277, "grad_norm": 96.8027114868164, "learning_rate": 7.353147261473784e-06, "loss": 0.4233, "step": 219200 }, { "epoch": 8.53478142395578, "grad_norm": 0.2646442651748657, "learning_rate": 7.343415469656273e-06, "loss": 0.346, "step": 219250 }, { "epoch": 8.53672778231928, "grad_norm": 0.09790492057800293, "learning_rate": 7.333683677838764e-06, "loss": 0.4321, "step": 219300 }, { "epoch": 8.538674140682783, "grad_norm": 12.052342414855957, "learning_rate": 7.3239518860212544e-06, "loss": 0.3443, "step": 219350 }, { "epoch": 8.540620499046284, "grad_norm": 0.3464350998401642, "learning_rate": 7.314220094203746e-06, "loss": 0.3517, "step": 219400 }, { "epoch": 8.542566857409787, "grad_norm": 0.4015164375305176, "learning_rate": 7.304488302386235e-06, "loss": 0.3264, "step": 219450 }, { "epoch": 8.544513215773287, "grad_norm": 0.43963247537612915, "learning_rate": 7.294756510568726e-06, "loss": 0.3747, "step": 219500 }, { "epoch": 8.54645957413679, "grad_norm": 67.11590576171875, "learning_rate": 7.285024718751217e-06, "loss": 0.3552, "step": 219550 }, { "epoch": 8.548405932500291, "grad_norm": 8.320755004882812, "learning_rate": 7.275292926933708e-06, "loss": 0.4437, "step": 219600 }, { "epoch": 8.550352290863794, "grad_norm": 13.893418312072754, "learning_rate": 7.265561135116197e-06, "loss": 0.3312, "step": 219650 }, { "epoch": 8.552298649227296, "grad_norm": 37.70243835449219, "learning_rate": 7.255829343298688e-06, "loss": 0.3398, "step": 219700 }, { "epoch": 8.554245007590797, "grad_norm": 46.875003814697266, "learning_rate": 7.246097551481179e-06, "loss": 0.4007, "step": 219750 }, { "epoch": 8.5561913659543, "grad_norm": 8.304167747497559, "learning_rate": 7.23636575966367e-06, "loss": 0.3477, "step": 219800 }, { "epoch": 8.5581377243178, "grad_norm": 68.84445190429688, "learning_rate": 
7.226633967846161e-06, "loss": 0.3628, "step": 219850 }, { "epoch": 8.560084082681303, "grad_norm": 79.2291488647461, "learning_rate": 7.216902176028651e-06, "loss": 0.4522, "step": 219900 }, { "epoch": 8.562030441044804, "grad_norm": 8.03372859954834, "learning_rate": 7.207170384211141e-06, "loss": 0.3334, "step": 219950 }, { "epoch": 8.563976799408307, "grad_norm": 14.909876823425293, "learning_rate": 7.1974385923936324e-06, "loss": 0.3417, "step": 220000 }, { "epoch": 8.56592315777181, "grad_norm": 36.058502197265625, "learning_rate": 7.187706800576123e-06, "loss": 0.3343, "step": 220050 }, { "epoch": 8.56786951613531, "grad_norm": 49.10384750366211, "learning_rate": 7.1779750087586126e-06, "loss": 0.2588, "step": 220100 }, { "epoch": 8.569815874498813, "grad_norm": 32.03388214111328, "learning_rate": 7.168243216941103e-06, "loss": 0.3521, "step": 220150 }, { "epoch": 8.571762232862314, "grad_norm": 25.493316650390625, "learning_rate": 7.158511425123594e-06, "loss": 0.4129, "step": 220200 }, { "epoch": 8.573708591225817, "grad_norm": 358.3221130371094, "learning_rate": 7.148779633306085e-06, "loss": 0.441, "step": 220250 }, { "epoch": 8.575654949589318, "grad_norm": 39.40644454956055, "learning_rate": 7.1390478414885745e-06, "loss": 0.4615, "step": 220300 }, { "epoch": 8.57760130795282, "grad_norm": 349.61578369140625, "learning_rate": 7.129316049671066e-06, "loss": 0.3682, "step": 220350 }, { "epoch": 8.579547666316323, "grad_norm": 5.462282657623291, "learning_rate": 7.119584257853556e-06, "loss": 0.3387, "step": 220400 }, { "epoch": 8.581494024679824, "grad_norm": 11.50753116607666, "learning_rate": 7.109852466036048e-06, "loss": 0.376, "step": 220450 }, { "epoch": 8.583440383043326, "grad_norm": 0.5181344747543335, "learning_rate": 7.100120674218537e-06, "loss": 0.2778, "step": 220500 }, { "epoch": 8.585386741406827, "grad_norm": 4.212285995483398, "learning_rate": 7.090388882401028e-06, "loss": 0.457, "step": 220550 }, { "epoch": 8.58733309977033, 
"grad_norm": 19.974802017211914, "learning_rate": 7.080657090583519e-06, "loss": 0.4399, "step": 220600 }, { "epoch": 8.58927945813383, "grad_norm": 0.6270450353622437, "learning_rate": 7.07092529876601e-06, "loss": 0.2593, "step": 220650 }, { "epoch": 8.591225816497333, "grad_norm": 32.24148941040039, "learning_rate": 7.061193506948499e-06, "loss": 0.3265, "step": 220700 }, { "epoch": 8.593172174860836, "grad_norm": 16.12700843811035, "learning_rate": 7.05146171513099e-06, "loss": 0.3955, "step": 220750 }, { "epoch": 8.595118533224337, "grad_norm": 33.40711975097656, "learning_rate": 7.041729923313481e-06, "loss": 0.296, "step": 220800 }, { "epoch": 8.59706489158784, "grad_norm": 1.9151540994644165, "learning_rate": 7.0319981314959715e-06, "loss": 0.5311, "step": 220850 }, { "epoch": 8.59901124995134, "grad_norm": 0.4307612478733063, "learning_rate": 7.022266339678461e-06, "loss": 0.347, "step": 220900 }, { "epoch": 8.600957608314843, "grad_norm": 1.7006497383117676, "learning_rate": 7.0125345478609525e-06, "loss": 0.3651, "step": 220950 }, { "epoch": 8.602903966678344, "grad_norm": 19.940242767333984, "learning_rate": 7.002802756043443e-06, "loss": 0.3757, "step": 221000 }, { "epoch": 8.604850325041847, "grad_norm": 1.6810246706008911, "learning_rate": 6.993070964225934e-06, "loss": 0.3303, "step": 221050 }, { "epoch": 8.60679668340535, "grad_norm": 40.92107391357422, "learning_rate": 6.983339172408423e-06, "loss": 0.4411, "step": 221100 }, { "epoch": 8.60874304176885, "grad_norm": 10.826274871826172, "learning_rate": 6.9736073805909144e-06, "loss": 0.3136, "step": 221150 }, { "epoch": 8.610689400132353, "grad_norm": 34.17304992675781, "learning_rate": 6.963875588773405e-06, "loss": 0.4161, "step": 221200 }, { "epoch": 8.612635758495854, "grad_norm": 33.585304260253906, "learning_rate": 6.954143796955896e-06, "loss": 0.3567, "step": 221250 }, { "epoch": 8.614582116859356, "grad_norm": 9.976914405822754, "learning_rate": 6.944412005138387e-06, "loss": 0.3675, 
"step": 221300 }, { "epoch": 8.616528475222857, "grad_norm": 2.487536668777466, "learning_rate": 6.934874849157227e-06, "loss": 0.4628, "step": 221350 }, { "epoch": 8.61847483358636, "grad_norm": 23.0937442779541, "learning_rate": 6.9251430573397184e-06, "loss": 0.3276, "step": 221400 }, { "epoch": 8.620421191949863, "grad_norm": 32.612430572509766, "learning_rate": 6.915411265522208e-06, "loss": 0.3401, "step": 221450 }, { "epoch": 8.622367550313363, "grad_norm": 8.451699256896973, "learning_rate": 6.9056794737046986e-06, "loss": 0.4482, "step": 221500 }, { "epoch": 8.624313908676866, "grad_norm": 4.424816608428955, "learning_rate": 6.89594768188719e-06, "loss": 0.3403, "step": 221550 }, { "epoch": 8.626260267040367, "grad_norm": 3.166138172149658, "learning_rate": 6.88621589006968e-06, "loss": 0.421, "step": 221600 }, { "epoch": 8.62820662540387, "grad_norm": 81.44871520996094, "learning_rate": 6.87648409825217e-06, "loss": 0.2604, "step": 221650 }, { "epoch": 8.63015298376737, "grad_norm": 13.346428871154785, "learning_rate": 6.8667523064346605e-06, "loss": 0.3042, "step": 221700 }, { "epoch": 8.632099342130873, "grad_norm": 25.034902572631836, "learning_rate": 6.857020514617152e-06, "loss": 0.4023, "step": 221750 }, { "epoch": 8.634045700494376, "grad_norm": 33.09614944458008, "learning_rate": 6.847288722799642e-06, "loss": 0.2874, "step": 221800 }, { "epoch": 8.635992058857877, "grad_norm": 6.6905317306518555, "learning_rate": 6.837556930982132e-06, "loss": 0.296, "step": 221850 }, { "epoch": 8.63793841722138, "grad_norm": 3.0893237590789795, "learning_rate": 6.827825139164623e-06, "loss": 0.3186, "step": 221900 }, { "epoch": 8.63988477558488, "grad_norm": 70.92662811279297, "learning_rate": 6.818093347347114e-06, "loss": 0.3371, "step": 221950 }, { "epoch": 8.641831133948383, "grad_norm": 2.214918375015259, "learning_rate": 6.808361555529605e-06, "loss": 0.3246, "step": 222000 }, { "epoch": 8.643777492311884, "grad_norm": 40.39765548706055, "learning_rate": 
6.798629763712095e-06, "loss": 0.3817, "step": 222050 }, { "epoch": 8.645723850675386, "grad_norm": 85.87996673583984, "learning_rate": 6.788897971894585e-06, "loss": 0.2981, "step": 222100 }, { "epoch": 8.647670209038889, "grad_norm": 1.841369867324829, "learning_rate": 6.7791661800770766e-06, "loss": 0.3941, "step": 222150 }, { "epoch": 8.64961656740239, "grad_norm": 16.443140029907227, "learning_rate": 6.769434388259567e-06, "loss": 0.3968, "step": 222200 }, { "epoch": 8.651562925765893, "grad_norm": 25.667510986328125, "learning_rate": 6.759702596442057e-06, "loss": 0.2987, "step": 222250 }, { "epoch": 8.653509284129393, "grad_norm": 5.0715436935424805, "learning_rate": 6.749970804624547e-06, "loss": 0.3814, "step": 222300 }, { "epoch": 8.655455642492896, "grad_norm": 0.1812075972557068, "learning_rate": 6.7402390128070385e-06, "loss": 0.2977, "step": 222350 }, { "epoch": 8.657402000856397, "grad_norm": 5.179006099700928, "learning_rate": 6.730507220989529e-06, "loss": 0.351, "step": 222400 }, { "epoch": 8.6593483592199, "grad_norm": 17.574840545654297, "learning_rate": 6.72077542917202e-06, "loss": 0.2921, "step": 222450 }, { "epoch": 8.661294717583402, "grad_norm": 1.014943242073059, "learning_rate": 6.71104363735451e-06, "loss": 0.5191, "step": 222500 }, { "epoch": 8.663241075946903, "grad_norm": 1.7706698179244995, "learning_rate": 6.7013118455370005e-06, "loss": 0.3174, "step": 222550 }, { "epoch": 8.665187434310406, "grad_norm": 1.09407377243042, "learning_rate": 6.691580053719492e-06, "loss": 0.4271, "step": 222600 }, { "epoch": 8.667133792673907, "grad_norm": 110.30030059814453, "learning_rate": 6.681848261901982e-06, "loss": 0.3918, "step": 222650 }, { "epoch": 8.66908015103741, "grad_norm": 5.543716907501221, "learning_rate": 6.672116470084472e-06, "loss": 0.304, "step": 222700 }, { "epoch": 8.67102650940091, "grad_norm": 7.529234409332275, "learning_rate": 6.662384678266962e-06, "loss": 0.3901, "step": 222750 }, { "epoch": 8.672972867764413, 
"grad_norm": 37.54677200317383, "learning_rate": 6.652652886449454e-06, "loss": 0.359, "step": 222800 }, { "epoch": 8.674919226127916, "grad_norm": 2.961869478225708, "learning_rate": 6.642921094631944e-06, "loss": 0.3567, "step": 222850 }, { "epoch": 8.676865584491416, "grad_norm": 124.39166259765625, "learning_rate": 6.633189302814434e-06, "loss": 0.3479, "step": 222900 }, { "epoch": 8.678811942854919, "grad_norm": 9.771851539611816, "learning_rate": 6.623457510996925e-06, "loss": 0.424, "step": 222950 }, { "epoch": 8.68075830121842, "grad_norm": 34.31972885131836, "learning_rate": 6.613725719179416e-06, "loss": 0.2294, "step": 223000 }, { "epoch": 8.682704659581923, "grad_norm": 21.864376068115234, "learning_rate": 6.603993927361907e-06, "loss": 0.4277, "step": 223050 }, { "epoch": 8.684651017945423, "grad_norm": 167.41160583496094, "learning_rate": 6.594456771380747e-06, "loss": 0.3455, "step": 223100 }, { "epoch": 8.686597376308926, "grad_norm": 17.1745662689209, "learning_rate": 6.584724979563238e-06, "loss": 0.4008, "step": 223150 }, { "epoch": 8.688543734672429, "grad_norm": 48.082000732421875, "learning_rate": 6.5749931877457275e-06, "loss": 0.4371, "step": 223200 }, { "epoch": 8.69049009303593, "grad_norm": 7.061744213104248, "learning_rate": 6.565261395928218e-06, "loss": 0.3482, "step": 223250 }, { "epoch": 8.692436451399432, "grad_norm": 2.691284418106079, "learning_rate": 6.555529604110709e-06, "loss": 0.3073, "step": 223300 }, { "epoch": 8.694382809762933, "grad_norm": 21.241867065429688, "learning_rate": 6.5457978122932e-06, "loss": 0.251, "step": 223350 }, { "epoch": 8.696329168126436, "grad_norm": 31.078702926635742, "learning_rate": 6.5360660204756894e-06, "loss": 0.3353, "step": 223400 }, { "epoch": 8.698275526489937, "grad_norm": 17.49899673461914, "learning_rate": 6.526334228658181e-06, "loss": 0.3652, "step": 223450 }, { "epoch": 8.70022188485344, "grad_norm": 5.2942585945129395, "learning_rate": 6.516602436840671e-06, "loss": 0.3901, "step": 
223500 }, { "epoch": 8.702168243216942, "grad_norm": 47.62997817993164, "learning_rate": 6.506870645023163e-06, "loss": 0.3001, "step": 223550 }, { "epoch": 8.704114601580443, "grad_norm": 48.599185943603516, "learning_rate": 6.497138853205653e-06, "loss": 0.3753, "step": 223600 }, { "epoch": 8.706060959943946, "grad_norm": 53.04881286621094, "learning_rate": 6.487407061388143e-06, "loss": 0.3809, "step": 223650 }, { "epoch": 8.708007318307446, "grad_norm": 83.82718658447266, "learning_rate": 6.477675269570633e-06, "loss": 0.2599, "step": 223700 }, { "epoch": 8.709953676670949, "grad_norm": 79.01872253417969, "learning_rate": 6.4679434777531245e-06, "loss": 0.4158, "step": 223750 }, { "epoch": 8.71190003503445, "grad_norm": 20.358524322509766, "learning_rate": 6.458211685935615e-06, "loss": 0.3461, "step": 223800 }, { "epoch": 8.713846393397953, "grad_norm": 4.175461769104004, "learning_rate": 6.448479894118105e-06, "loss": 0.3547, "step": 223850 }, { "epoch": 8.715792751761454, "grad_norm": 23.78647804260254, "learning_rate": 6.438748102300596e-06, "loss": 0.3999, "step": 223900 }, { "epoch": 8.717739110124956, "grad_norm": 20.422916412353516, "learning_rate": 6.4290163104830865e-06, "loss": 0.3589, "step": 223950 }, { "epoch": 8.719685468488459, "grad_norm": 75.97553253173828, "learning_rate": 6.419284518665578e-06, "loss": 0.3778, "step": 224000 }, { "epoch": 8.72163182685196, "grad_norm": 36.335487365722656, "learning_rate": 6.4095527268480674e-06, "loss": 0.4033, "step": 224050 }, { "epoch": 8.723578185215462, "grad_norm": 0.6676282286643982, "learning_rate": 6.399820935030558e-06, "loss": 0.3725, "step": 224100 }, { "epoch": 8.725524543578963, "grad_norm": 1.4138561487197876, "learning_rate": 6.390089143213049e-06, "loss": 0.2921, "step": 224150 }, { "epoch": 8.727470901942466, "grad_norm": 35.6706428527832, "learning_rate": 6.38035735139554e-06, "loss": 0.3439, "step": 224200 }, { "epoch": 8.729417260305967, "grad_norm": 38.529510498046875, "learning_rate": 
6.370625559578029e-06, "loss": 0.3457, "step": 224250 }, { "epoch": 8.73136361866947, "grad_norm": 4.901052951812744, "learning_rate": 6.36089376776052e-06, "loss": 0.2931, "step": 224300 }, { "epoch": 8.733309977032972, "grad_norm": 150.7915496826172, "learning_rate": 6.351161975943011e-06, "loss": 0.4102, "step": 224350 }, { "epoch": 8.735256335396473, "grad_norm": 0.9786162972450256, "learning_rate": 6.341430184125502e-06, "loss": 0.3777, "step": 224400 }, { "epoch": 8.737202693759976, "grad_norm": 8.3831787109375, "learning_rate": 6.331698392307991e-06, "loss": 0.3142, "step": 224450 }, { "epoch": 8.739149052123476, "grad_norm": 16.863611221313477, "learning_rate": 6.321966600490483e-06, "loss": 0.3512, "step": 224500 }, { "epoch": 8.74109541048698, "grad_norm": 0.8706159591674805, "learning_rate": 6.312234808672973e-06, "loss": 0.3525, "step": 224550 }, { "epoch": 8.74304176885048, "grad_norm": 0.16685530543327332, "learning_rate": 6.3025030168554645e-06, "loss": 0.3285, "step": 224600 }, { "epoch": 8.744988127213983, "grad_norm": 0.10372503846883774, "learning_rate": 6.292771225037953e-06, "loss": 0.3197, "step": 224650 }, { "epoch": 8.746934485577485, "grad_norm": 204.88613891601562, "learning_rate": 6.283039433220445e-06, "loss": 0.3891, "step": 224700 }, { "epoch": 8.748880843940986, "grad_norm": 0.20557746291160583, "learning_rate": 6.273307641402935e-06, "loss": 0.3433, "step": 224750 }, { "epoch": 8.750827202304489, "grad_norm": 49.39046859741211, "learning_rate": 6.263575849585426e-06, "loss": 0.359, "step": 224800 }, { "epoch": 8.75277356066799, "grad_norm": 17.51939582824707, "learning_rate": 6.253844057767917e-06, "loss": 0.2855, "step": 224850 }, { "epoch": 8.754719919031492, "grad_norm": 53.27047348022461, "learning_rate": 6.244112265950407e-06, "loss": 0.3882, "step": 224900 }, { "epoch": 8.756666277394993, "grad_norm": 11.507645606994629, "learning_rate": 6.234380474132898e-06, "loss": 0.3737, "step": 224950 }, { "epoch": 8.758612635758496, 
"grad_norm": 116.23258972167969, "learning_rate": 6.224648682315388e-06, "loss": 0.3551, "step": 225000 }, { "epoch": 8.760558994121999, "grad_norm": 23.435815811157227, "learning_rate": 6.214916890497879e-06, "loss": 0.397, "step": 225050 }, { "epoch": 8.7625053524855, "grad_norm": 1.3348294496536255, "learning_rate": 6.205185098680369e-06, "loss": 0.3706, "step": 225100 }, { "epoch": 8.764451710849002, "grad_norm": 9.011042594909668, "learning_rate": 6.19545330686286e-06, "loss": 0.3388, "step": 225150 }, { "epoch": 8.766398069212503, "grad_norm": 12.70809555053711, "learning_rate": 6.18572151504535e-06, "loss": 0.314, "step": 225200 }, { "epoch": 8.768344427576006, "grad_norm": 74.03276062011719, "learning_rate": 6.175989723227841e-06, "loss": 0.3799, "step": 225250 }, { "epoch": 8.770290785939507, "grad_norm": 1.324950933456421, "learning_rate": 6.166257931410332e-06, "loss": 0.3316, "step": 225300 }, { "epoch": 8.77223714430301, "grad_norm": 6.953982830047607, "learning_rate": 6.156526139592822e-06, "loss": 0.3679, "step": 225350 }, { "epoch": 8.77418350266651, "grad_norm": 0.06860756129026413, "learning_rate": 6.146794347775313e-06, "loss": 0.3455, "step": 225400 }, { "epoch": 8.776129861030013, "grad_norm": 11.077356338500977, "learning_rate": 6.137062555957803e-06, "loss": 0.4001, "step": 225450 }, { "epoch": 8.778076219393515, "grad_norm": 2.815685272216797, "learning_rate": 6.127330764140294e-06, "loss": 0.3702, "step": 225500 }, { "epoch": 8.780022577757016, "grad_norm": 20.536880493164062, "learning_rate": 6.1175989723227845e-06, "loss": 0.3403, "step": 225550 }, { "epoch": 8.781968936120519, "grad_norm": 54.75350570678711, "learning_rate": 6.107867180505275e-06, "loss": 0.2879, "step": 225600 }, { "epoch": 8.78391529448402, "grad_norm": 2.695492744445801, "learning_rate": 6.0981353886877655e-06, "loss": 0.3582, "step": 225650 }, { "epoch": 8.785861652847522, "grad_norm": 34.32632827758789, "learning_rate": 6.088403596870256e-06, "loss": 0.4763, "step": 
225700 }, { "epoch": 8.787808011211023, "grad_norm": 11.949219703674316, "learning_rate": 6.0786718050527465e-06, "loss": 0.3589, "step": 225750 }, { "epoch": 8.789754369574526, "grad_norm": 251.01019287109375, "learning_rate": 6.068940013235237e-06, "loss": 0.3512, "step": 225800 }, { "epoch": 8.791700727938029, "grad_norm": 1.0615158081054688, "learning_rate": 6.0592082214177274e-06, "loss": 0.3883, "step": 225850 }, { "epoch": 8.79364708630153, "grad_norm": 44.95500564575195, "learning_rate": 6.049476429600219e-06, "loss": 0.3395, "step": 225900 }, { "epoch": 8.795593444665032, "grad_norm": 17.43313217163086, "learning_rate": 6.039744637782708e-06, "loss": 0.3079, "step": 225950 }, { "epoch": 8.797539803028533, "grad_norm": 16.837614059448242, "learning_rate": 6.0300128459652e-06, "loss": 0.2626, "step": 226000 }, { "epoch": 8.799486161392036, "grad_norm": 38.90789031982422, "learning_rate": 6.020281054147689e-06, "loss": 0.4063, "step": 226050 }, { "epoch": 8.801432519755537, "grad_norm": 199.47140502929688, "learning_rate": 6.010549262330181e-06, "loss": 0.3521, "step": 226100 }, { "epoch": 8.80337887811904, "grad_norm": 7.4376654624938965, "learning_rate": 6.00081747051267e-06, "loss": 0.2858, "step": 226150 }, { "epoch": 8.805325236482542, "grad_norm": 12.45365047454834, "learning_rate": 5.991085678695162e-06, "loss": 0.4156, "step": 226200 }, { "epoch": 8.807271594846043, "grad_norm": 20.5790958404541, "learning_rate": 5.981353886877652e-06, "loss": 0.3309, "step": 226250 }, { "epoch": 8.809217953209545, "grad_norm": 2.531684160232544, "learning_rate": 5.971622095060143e-06, "loss": 0.3656, "step": 226300 }, { "epoch": 8.811164311573046, "grad_norm": 15.505149841308594, "learning_rate": 5.961890303242634e-06, "loss": 0.2775, "step": 226350 }, { "epoch": 8.813110669936549, "grad_norm": 7.218366622924805, "learning_rate": 5.952158511425124e-06, "loss": 0.3666, "step": 226400 }, { "epoch": 8.81505702830005, "grad_norm": 30.13496971130371, "learning_rate": 
5.942426719607615e-06, "loss": 0.3912, "step": 226450 }, { "epoch": 8.817003386663552, "grad_norm": 4.954912185668945, "learning_rate": 5.932694927790105e-06, "loss": 0.4352, "step": 226500 }, { "epoch": 8.818949745027055, "grad_norm": 28.696453094482422, "learning_rate": 5.922963135972596e-06, "loss": 0.4124, "step": 226550 }, { "epoch": 8.820896103390556, "grad_norm": 4.26706075668335, "learning_rate": 5.913231344155086e-06, "loss": 0.4419, "step": 226600 }, { "epoch": 8.822842461754059, "grad_norm": 8.161088943481445, "learning_rate": 5.903499552337577e-06, "loss": 0.3652, "step": 226650 }, { "epoch": 8.82478882011756, "grad_norm": 30.508033752441406, "learning_rate": 5.893767760520067e-06, "loss": 0.3061, "step": 226700 }, { "epoch": 8.826735178481062, "grad_norm": 6.0639214515686035, "learning_rate": 5.884035968702558e-06, "loss": 0.4109, "step": 226750 }, { "epoch": 8.828681536844563, "grad_norm": 19.079130172729492, "learning_rate": 5.874304176885048e-06, "loss": 0.2951, "step": 226800 }, { "epoch": 8.830627895208066, "grad_norm": 9.895259857177734, "learning_rate": 5.864572385067539e-06, "loss": 0.369, "step": 226850 }, { "epoch": 8.832574253571568, "grad_norm": 19.91774559020996, "learning_rate": 5.854840593250029e-06, "loss": 0.3176, "step": 226900 }, { "epoch": 8.83452061193507, "grad_norm": 13.7282075881958, "learning_rate": 5.84510880143252e-06, "loss": 0.3685, "step": 226950 }, { "epoch": 8.836466970298572, "grad_norm": 31.074377059936523, "learning_rate": 5.83537700961501e-06, "loss": 0.3658, "step": 227000 }, { "epoch": 8.838413328662073, "grad_norm": 0.16780517995357513, "learning_rate": 5.825645217797502e-06, "loss": 0.2828, "step": 227050 }, { "epoch": 8.840359687025575, "grad_norm": 14.044483184814453, "learning_rate": 5.815913425979991e-06, "loss": 0.328, "step": 227100 }, { "epoch": 8.842306045389076, "grad_norm": 0.38573983311653137, "learning_rate": 5.806181634162483e-06, "loss": 0.3597, "step": 227150 }, { "epoch": 8.844252403752579, 
"grad_norm": 50.809871673583984, "learning_rate": 5.796449842344972e-06, "loss": 0.3304, "step": 227200 }, { "epoch": 8.846198762116082, "grad_norm": 68.22897338867188, "learning_rate": 5.7867180505274636e-06, "loss": 0.3941, "step": 227250 }, { "epoch": 8.848145120479582, "grad_norm": 0.1677882820367813, "learning_rate": 5.776986258709954e-06, "loss": 0.3433, "step": 227300 }, { "epoch": 8.850091478843085, "grad_norm": 12.835567474365234, "learning_rate": 5.7672544668924445e-06, "loss": 0.4505, "step": 227350 }, { "epoch": 8.852037837206586, "grad_norm": 4.1731672286987305, "learning_rate": 5.757522675074935e-06, "loss": 0.4289, "step": 227400 }, { "epoch": 8.853984195570089, "grad_norm": 13.509185791015625, "learning_rate": 5.7477908832574255e-06, "loss": 0.3246, "step": 227450 }, { "epoch": 8.85593055393359, "grad_norm": 20.432531356811523, "learning_rate": 5.738059091439916e-06, "loss": 0.2522, "step": 227500 }, { "epoch": 8.857876912297092, "grad_norm": 3.8568308353424072, "learning_rate": 5.7283272996224065e-06, "loss": 0.347, "step": 227550 }, { "epoch": 8.859823270660595, "grad_norm": 10.229039192199707, "learning_rate": 5.718595507804898e-06, "loss": 0.3905, "step": 227600 }, { "epoch": 8.861769629024096, "grad_norm": 5.799859046936035, "learning_rate": 5.7088637159873874e-06, "loss": 0.4006, "step": 227650 }, { "epoch": 8.863715987387598, "grad_norm": 6.207911968231201, "learning_rate": 5.699131924169879e-06, "loss": 0.402, "step": 227700 }, { "epoch": 8.8656623457511, "grad_norm": 24.720172882080078, "learning_rate": 5.689400132352369e-06, "loss": 0.2995, "step": 227750 }, { "epoch": 8.867608704114602, "grad_norm": 50.7036247253418, "learning_rate": 5.67966834053486e-06, "loss": 0.3477, "step": 227800 }, { "epoch": 8.869555062478103, "grad_norm": 0.8579446077346802, "learning_rate": 5.66993654871735e-06, "loss": 0.3217, "step": 227850 }, { "epoch": 8.871501420841605, "grad_norm": 15.049579620361328, "learning_rate": 5.660204756899841e-06, "loss": 0.2937, 
"step": 227900 }, { "epoch": 8.873447779205108, "grad_norm": 166.2444610595703, "learning_rate": 5.650472965082331e-06, "loss": 0.4035, "step": 227950 }, { "epoch": 8.875394137568609, "grad_norm": 6.390206336975098, "learning_rate": 5.640741173264822e-06, "loss": 0.3171, "step": 228000 }, { "epoch": 8.877340495932112, "grad_norm": 6.946544647216797, "learning_rate": 5.631009381447312e-06, "loss": 0.4285, "step": 228050 }, { "epoch": 8.879286854295612, "grad_norm": 28.87405014038086, "learning_rate": 5.6212775896298035e-06, "loss": 0.3034, "step": 228100 }, { "epoch": 8.881233212659115, "grad_norm": 55.88542175292969, "learning_rate": 5.611545797812293e-06, "loss": 0.3843, "step": 228150 }, { "epoch": 8.883179571022616, "grad_norm": 4.111444473266602, "learning_rate": 5.6018140059947845e-06, "loss": 0.2505, "step": 228200 }, { "epoch": 8.885125929386119, "grad_norm": 0.3566741645336151, "learning_rate": 5.592276850013625e-06, "loss": 0.4107, "step": 228250 }, { "epoch": 8.887072287749621, "grad_norm": 94.76947784423828, "learning_rate": 5.582545058196115e-06, "loss": 0.3311, "step": 228300 }, { "epoch": 8.889018646113122, "grad_norm": 7.853178977966309, "learning_rate": 5.572813266378606e-06, "loss": 0.3293, "step": 228350 }, { "epoch": 8.890965004476625, "grad_norm": 112.56310272216797, "learning_rate": 5.563081474561096e-06, "loss": 0.3708, "step": 228400 }, { "epoch": 8.892911362840126, "grad_norm": 0.37414416670799255, "learning_rate": 5.553349682743587e-06, "loss": 0.2454, "step": 228450 }, { "epoch": 8.894857721203628, "grad_norm": 6.738828182220459, "learning_rate": 5.543617890926077e-06, "loss": 0.2956, "step": 228500 }, { "epoch": 8.89680407956713, "grad_norm": 0.75816410779953, "learning_rate": 5.533886099108568e-06, "loss": 0.3372, "step": 228550 }, { "epoch": 8.898750437930632, "grad_norm": 35.78632736206055, "learning_rate": 5.524154307291059e-06, "loss": 0.3925, "step": 228600 }, { "epoch": 8.900696796294135, "grad_norm": 58.97236633300781, 
"learning_rate": 5.5144225154735496e-06, "loss": 0.2627, "step": 228650 }, { "epoch": 8.902643154657635, "grad_norm": 1.5802282094955444, "learning_rate": 5.50469072365604e-06, "loss": 0.3924, "step": 228700 }, { "epoch": 8.904589513021138, "grad_norm": 9.241840362548828, "learning_rate": 5.4949589318385305e-06, "loss": 0.2742, "step": 228750 }, { "epoch": 8.906535871384639, "grad_norm": 23.211898803710938, "learning_rate": 5.485227140021021e-06, "loss": 0.4312, "step": 228800 }, { "epoch": 8.908482229748142, "grad_norm": 8.542198181152344, "learning_rate": 5.4754953482035115e-06, "loss": 0.3275, "step": 228850 }, { "epoch": 8.910428588111643, "grad_norm": 0.4717353284358978, "learning_rate": 5.465763556386002e-06, "loss": 0.3613, "step": 228900 }, { "epoch": 8.912374946475145, "grad_norm": 22.798017501831055, "learning_rate": 5.4560317645684925e-06, "loss": 0.2647, "step": 228950 }, { "epoch": 8.914321304838648, "grad_norm": 20.00990104675293, "learning_rate": 5.446299972750983e-06, "loss": 0.4229, "step": 229000 }, { "epoch": 8.916267663202149, "grad_norm": 16.985668182373047, "learning_rate": 5.436568180933474e-06, "loss": 0.3209, "step": 229050 }, { "epoch": 8.918214021565651, "grad_norm": 85.2391586303711, "learning_rate": 5.426836389115964e-06, "loss": 0.2896, "step": 229100 }, { "epoch": 8.920160379929152, "grad_norm": 68.90104675292969, "learning_rate": 5.417104597298455e-06, "loss": 0.4018, "step": 229150 }, { "epoch": 8.922106738292655, "grad_norm": 0.6628040671348572, "learning_rate": 5.407372805480945e-06, "loss": 0.3377, "step": 229200 }, { "epoch": 8.924053096656156, "grad_norm": 10.836268424987793, "learning_rate": 5.397641013663436e-06, "loss": 0.3669, "step": 229250 }, { "epoch": 8.925999455019658, "grad_norm": 3.898082733154297, "learning_rate": 5.387909221845927e-06, "loss": 0.4177, "step": 229300 }, { "epoch": 8.92794581338316, "grad_norm": 0.10330332815647125, "learning_rate": 5.378177430028417e-06, "loss": 0.332, "step": 229350 }, { "epoch": 
8.929892171746662, "grad_norm": 20.305957794189453, "learning_rate": 5.368445638210908e-06, "loss": 0.3938, "step": 229400 }, { "epoch": 8.931838530110165, "grad_norm": 19.907541275024414, "learning_rate": 5.358713846393398e-06, "loss": 0.3219, "step": 229450 }, { "epoch": 8.933784888473665, "grad_norm": 29.59224510192871, "learning_rate": 5.348982054575889e-06, "loss": 0.377, "step": 229500 }, { "epoch": 8.935731246837168, "grad_norm": 59.18272018432617, "learning_rate": 5.339250262758379e-06, "loss": 0.3051, "step": 229550 }, { "epoch": 8.937677605200669, "grad_norm": 0.67954421043396, "learning_rate": 5.32951847094087e-06, "loss": 0.289, "step": 229600 }, { "epoch": 8.939623963564172, "grad_norm": 0.5278038382530212, "learning_rate": 5.31978667912336e-06, "loss": 0.4256, "step": 229650 }, { "epoch": 8.941570321927673, "grad_norm": 49.131717681884766, "learning_rate": 5.310054887305851e-06, "loss": 0.2948, "step": 229700 }, { "epoch": 8.943516680291175, "grad_norm": 17.600290298461914, "learning_rate": 5.300323095488342e-06, "loss": 0.3645, "step": 229750 }, { "epoch": 8.945463038654678, "grad_norm": 94.71209716796875, "learning_rate": 5.2905913036708316e-06, "loss": 0.4662, "step": 229800 }, { "epoch": 8.947409397018179, "grad_norm": 9.701077461242676, "learning_rate": 5.280859511853323e-06, "loss": 0.2909, "step": 229850 }, { "epoch": 8.949355755381681, "grad_norm": 0.7770358920097351, "learning_rate": 5.2711277200358125e-06, "loss": 0.2061, "step": 229900 }, { "epoch": 8.951302113745182, "grad_norm": 1.2425841093063354, "learning_rate": 5.261395928218304e-06, "loss": 0.2936, "step": 229950 }, { "epoch": 8.953248472108685, "grad_norm": 1.4582775831222534, "learning_rate": 5.251664136400794e-06, "loss": 0.4526, "step": 230000 }, { "epoch": 8.955194830472186, "grad_norm": 1.0264467000961304, "learning_rate": 5.241932344583285e-06, "loss": 0.3729, "step": 230050 }, { "epoch": 8.957141188835688, "grad_norm": 93.4474105834961, "learning_rate": 5.232200552765776e-06, 
"loss": 0.4078, "step": 230100 }, { "epoch": 8.959087547199191, "grad_norm": 36.80929183959961, "learning_rate": 5.222468760948266e-06, "loss": 0.2661, "step": 230150 }, { "epoch": 8.961033905562692, "grad_norm": 0.3659721612930298, "learning_rate": 5.212736969130757e-06, "loss": 0.4268, "step": 230200 }, { "epoch": 8.962980263926195, "grad_norm": 134.3724822998047, "learning_rate": 5.203005177313247e-06, "loss": 0.3161, "step": 230250 }, { "epoch": 8.964926622289696, "grad_norm": 0.33224621415138245, "learning_rate": 5.193273385495738e-06, "loss": 0.4299, "step": 230300 }, { "epoch": 8.966872980653198, "grad_norm": 0.34782660007476807, "learning_rate": 5.183541593678229e-06, "loss": 0.3236, "step": 230350 }, { "epoch": 8.968819339016699, "grad_norm": 21.499114990234375, "learning_rate": 5.173809801860719e-06, "loss": 0.4113, "step": 230400 }, { "epoch": 8.970765697380202, "grad_norm": 8.111068725585938, "learning_rate": 5.1640780100432096e-06, "loss": 0.3523, "step": 230450 }, { "epoch": 8.972712055743704, "grad_norm": 20.145055770874023, "learning_rate": 5.1543462182257e-06, "loss": 0.2714, "step": 230500 }, { "epoch": 8.974658414107205, "grad_norm": 0.3969848155975342, "learning_rate": 5.1446144264081905e-06, "loss": 0.3137, "step": 230550 }, { "epoch": 8.976604772470708, "grad_norm": 5.591084003448486, "learning_rate": 5.134882634590681e-06, "loss": 0.3296, "step": 230600 }, { "epoch": 8.978551130834209, "grad_norm": 87.94420623779297, "learning_rate": 5.1251508427731715e-06, "loss": 0.4958, "step": 230650 }, { "epoch": 8.980497489197711, "grad_norm": 12.29471206665039, "learning_rate": 5.115419050955662e-06, "loss": 0.3239, "step": 230700 }, { "epoch": 8.982443847561212, "grad_norm": 0.5806594491004944, "learning_rate": 5.1056872591381525e-06, "loss": 0.3401, "step": 230750 }, { "epoch": 8.984390205924715, "grad_norm": 24.924489974975586, "learning_rate": 5.095955467320644e-06, "loss": 0.3206, "step": 230800 }, { "epoch": 8.986336564288216, "grad_norm": 
11.158859252929688, "learning_rate": 5.0862236755031335e-06, "loss": 0.3804, "step": 230850 }, { "epoch": 8.988282922651718, "grad_norm": 4.931925296783447, "learning_rate": 5.076491883685625e-06, "loss": 0.3759, "step": 230900 }, { "epoch": 8.990229281015221, "grad_norm": 505.4148254394531, "learning_rate": 5.0667600918681144e-06, "loss": 0.4304, "step": 230950 }, { "epoch": 8.992175639378722, "grad_norm": 10.506492614746094, "learning_rate": 5.057028300050606e-06, "loss": 0.4467, "step": 231000 }, { "epoch": 8.994121997742225, "grad_norm": 66.42252349853516, "learning_rate": 5.047296508233096e-06, "loss": 0.3538, "step": 231050 }, { "epoch": 8.996068356105726, "grad_norm": 5.136214256286621, "learning_rate": 5.037564716415587e-06, "loss": 0.5135, "step": 231100 }, { "epoch": 8.998014714469228, "grad_norm": 10.950336456298828, "learning_rate": 5.027832924598077e-06, "loss": 0.3079, "step": 231150 }, { "epoch": 8.999961072832729, "grad_norm": 0.2582404613494873, "learning_rate": 5.018101132780568e-06, "loss": 0.3141, "step": 231200 }, { "epoch": 9.0, "eval_accuracy": 0.8024056989372883, "eval_f1_macro": 0.7597594933424565, "eval_f1_weighted": 0.8012496938947167, "eval_loss": 0.8878054618835449, "eval_roc_auc": 0.9515056005356491, "eval_runtime": 27.7872, "eval_samples_per_second": 924.49, "eval_steps_per_second": 115.593, "step": 231201 } ], "logging_steps": 50, "max_steps": 256890, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2166744767532646e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }