{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 5000, "global_step": 542931, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00276278201097377, "grad_norm": 2.595154047012329, "learning_rate": 4.53088739087192e-07, "loss": 3.7252, "step": 500 }, { "epoch": 0.00552556402194754, "grad_norm": 3.3661036491394043, "learning_rate": 9.135447747449074e-07, "loss": 3.7548, "step": 1000 }, { "epoch": 0.00828834603292131, "grad_norm": 3.4492340087890625, "learning_rate": 1.3730798983313074e-06, "loss": 3.7204, "step": 1500 }, { "epoch": 0.01105112804389508, "grad_norm": 5.218665599822998, "learning_rate": 1.8335359339890227e-06, "loss": 3.7032, "step": 2000 }, { "epoch": 0.013813910054868851, "grad_norm": 5.617623805999756, "learning_rate": 2.293991969646738e-06, "loss": 3.6474, "step": 2500 }, { "epoch": 0.01657669206584262, "grad_norm": 5.512911319732666, "learning_rate": 2.7544480053044535e-06, "loss": 3.5624, "step": 3000 }, { "epoch": 0.019339474076816392, "grad_norm": 5.604681015014648, "learning_rate": 3.2139831288908536e-06, "loss": 3.4398, "step": 3500 }, { "epoch": 0.02210225608779016, "grad_norm": 6.751086235046387, "learning_rate": 3.6744391645485687e-06, "loss": 3.3622, "step": 4000 }, { "epoch": 0.02486503809876393, "grad_norm": 8.164905548095703, "learning_rate": 4.134895200206284e-06, "loss": 3.1909, "step": 4500 }, { "epoch": 0.027627820109737702, "grad_norm": 7.667040824890137, "learning_rate": 4.595351235863999e-06, "loss": 3.2288, "step": 5000 }, { "epoch": 0.027627820109737702, "eval_runtime": 1422.3033, "eval_samples_per_second": 254.484, "eval_steps_per_second": 31.811, "step": 5000 }, { "epoch": 0.03039060212071147, "grad_norm": 7.5037031173706055, "learning_rate": 5.055807271521715e-06, "loss": 3.1382, "step": 5500 }, { "epoch": 0.03315338413168524, "grad_norm": 7.567819595336914, "learning_rate": 5.51626330717943e-06, "loss": 3.1615, "step": 6000 }, { "epoch": 0.03591616614265901, "grad_norm": 9.968245506286621, "learning_rate": 5.976719342837146e-06, "loss": 3.102, "step": 6500 }, { "epoch": 0.038678948153632785, "grad_norm": 9.482606887817383, "learning_rate": 6.4371753784948614e-06, "loss": 3.1102, "step": 7000 }, { "epoch": 0.041441730164606554, "grad_norm": 7.074209213256836, "learning_rate": 6.8967105020812615e-06, "loss": 3.0017, "step": 7500 }, { "epoch": 0.04420451217558032, "grad_norm": 8.135669708251953, "learning_rate": 7.356245625667662e-06, "loss": 3.0574, "step": 8000 }, { "epoch": 0.04696729418655409, "grad_norm": 13.200604438781738, "learning_rate": 7.816701661325378e-06, "loss": 3.0581, "step": 8500 }, { "epoch": 0.04973007619752786, "grad_norm": 8.470757484436035, "learning_rate": 8.277157696983094e-06, "loss": 2.9737, "step": 9000 }, { "epoch": 0.052492858208501636, "grad_norm": 8.881372451782227, "learning_rate": 8.737613732640809e-06, "loss": 3.0145, "step": 9500 }, { "epoch": 0.055255640219475405, "grad_norm": 7.54667854309082, "learning_rate": 9.198069768298522e-06, "loss": 3.0479, "step": 10000 }, { "epoch": 0.055255640219475405, "eval_runtime": 1428.4865, "eval_samples_per_second": 253.382, "eval_steps_per_second": 31.673, "step": 10000 }, { "epoch": 0.058018422230449174, "grad_norm": 9.400116920471191, "learning_rate": 9.658525803956239e-06, "loss": 2.9569, "step": 10500 }, { "epoch": 0.06078120424142294, "grad_norm": 9.827701568603516, "learning_rate": 1.0118981839613954e-05, "loss": 2.9763, "step": 11000 }, { "epoch": 0.06354398625239671, "grad_norm": 9.987720489501953, "learning_rate": 1.0579437875271669e-05, "loss": 2.9257, "step": 11500 }, { "epoch": 0.06630676826337048, "grad_norm": 13.571234703063965, "learning_rate": 1.1039893910929384e-05, "loss": 2.9896, "step": 12000 }, { "epoch": 0.06906955027434425, "grad_norm": 11.866579055786133, "learning_rate": 1.1499429034515786e-05, "loss": 2.978, "step": 12500 }, { "epoch": 0.07183233228531802, "grad_norm": 15.338528633117676, "learning_rate": 1.19598850701735e-05, "loss": 2.9433, "step": 13000 }, { "epoch": 0.0745951142962918, "grad_norm": 8.874979019165039, "learning_rate": 1.2420341105831214e-05, "loss": 2.9817, "step": 13500 }, { "epoch": 0.07735789630726557, "grad_norm": 8.997846603393555, "learning_rate": 1.2880797141488931e-05, "loss": 2.8916, "step": 14000 }, { "epoch": 0.08012067831823934, "grad_norm": 11.824705123901367, "learning_rate": 1.3340332265075331e-05, "loss": 2.9126, "step": 14500 }, { "epoch": 0.08288346032921311, "grad_norm": 11.844013214111328, "learning_rate": 1.3800788300733048e-05, "loss": 2.9078, "step": 15000 }, { "epoch": 0.08288346032921311, "eval_runtime": 1417.0351, "eval_samples_per_second": 255.43, "eval_steps_per_second": 31.929, "step": 15000 }, { "epoch": 0.08564624234018688, "grad_norm": 9.543901443481445, "learning_rate": 1.4261244336390762e-05, "loss": 2.9336, "step": 15500 }, { "epoch": 0.08840902435116064, "grad_norm": 10.792810440063477, "learning_rate": 1.4721700372048478e-05, "loss": 2.9225, "step": 16000 }, { "epoch": 0.09117180636213441, "grad_norm": 9.682255744934082, "learning_rate": 1.5181235495634877e-05, "loss": 2.9388, "step": 16500 }, { "epoch": 0.09393458837310818, "grad_norm": 9.591748237609863, "learning_rate": 1.5641691531292592e-05, "loss": 2.9116, "step": 17000 }, { "epoch": 0.09669737038408195, "grad_norm": 15.961346626281738, "learning_rate": 1.610214756695031e-05, "loss": 2.8788, "step": 17500 }, { "epoch": 0.09946015239505572, "grad_norm": 12.563713073730469, "learning_rate": 1.6562603602608022e-05, "loss": 2.868, "step": 18000 }, { "epoch": 0.10222293440602949, "grad_norm": 9.912450790405273, "learning_rate": 1.702305963826574e-05, "loss": 2.8436, "step": 18500 }, { "epoch": 0.10498571641700327, "grad_norm": 10.168437957763672, "learning_rate": 1.748259476185214e-05, "loss": 2.893, "step": 19000 }, { "epoch": 0.10774849842797704, "grad_norm": 9.45627212524414, "learning_rate": 1.7943050797509856e-05, "loss": 2.9454, "step": 19500 }, { "epoch": 0.11051128043895081, "grad_norm": 11.309224128723145, "learning_rate": 1.840350683316757e-05, "loss": 2.898, "step": 20000 }, { "epoch": 0.11051128043895081, "eval_runtime": 1418.7718, "eval_samples_per_second": 255.117, "eval_steps_per_second": 31.89, "step": 20000 }, { "epoch": 0.11327406244992458, "grad_norm": 9.24001407623291, "learning_rate": 1.8863962868825286e-05, "loss": 2.8393, "step": 20500 }, { "epoch": 0.11603684446089835, "grad_norm": 11.295225143432617, "learning_rate": 1.9324418904483e-05, "loss": 2.8147, "step": 21000 }, { "epoch": 0.11879962647187212, "grad_norm": 16.763473510742188, "learning_rate": 1.97839540280694e-05, "loss": 2.8566, "step": 21500 }, { "epoch": 0.12156240848284589, "grad_norm": 11.213971138000488, "learning_rate": 2.02434891516558e-05, "loss": 2.8748, "step": 22000 }, { "epoch": 0.12432519049381965, "grad_norm": 8.82392406463623, "learning_rate": 2.0703945187313516e-05, "loss": 2.8374, "step": 22500 }, { "epoch": 0.12708797250479342, "grad_norm": 16.63780403137207, "learning_rate": 2.116440122297123e-05, "loss": 2.815, "step": 23000 }, { "epoch": 0.1298507545157672, "grad_norm": 8.42003345489502, "learning_rate": 2.1624857258628947e-05, "loss": 2.8534, "step": 23500 }, { "epoch": 0.13261353652674096, "grad_norm": 18.058134078979492, "learning_rate": 2.2085313294286665e-05, "loss": 2.8309, "step": 24000 }, { "epoch": 0.13537631853771473, "grad_norm": 10.62288761138916, "learning_rate": 2.2545769329944377e-05, "loss": 2.7925, "step": 24500 }, { "epoch": 0.1381391005486885, "grad_norm": 10.44122314453125, "learning_rate": 2.3006225365602095e-05, "loss": 2.8439, "step": 25000 }, { "epoch": 0.1381391005486885, "eval_runtime": 1423.542, "eval_samples_per_second": 254.262, "eval_steps_per_second": 31.783, "step": 25000 }, { "epoch": 0.14090188255966227, "grad_norm": 12.520238876342773, "learning_rate": 2.346668140125981e-05, "loss": 2.7922, "step": 25500 }, { "epoch": 0.14366466457063604, "grad_norm": 9.882099151611328, "learning_rate": 2.3927137436917525e-05, "loss": 2.8186, "step": 26000 }, { "epoch": 0.1464274465816098, "grad_norm": 9.38256549835205, "learning_rate": 2.4386672560503922e-05, "loss": 2.7843, "step": 26500 }, { "epoch": 0.1491902285925836, "grad_norm": 9.543642044067383, "learning_rate": 2.4846207684090326e-05, "loss": 2.8285, "step": 27000 }, { "epoch": 0.15195301060355737, "grad_norm": 11.416987419128418, "learning_rate": 2.530666371974804e-05, "loss": 2.8062, "step": 27500 }, { "epoch": 0.15471579261453114, "grad_norm": 11.19566535949707, "learning_rate": 2.5767119755405756e-05, "loss": 2.8096, "step": 28000 }, { "epoch": 0.1574785746255049, "grad_norm": 11.436910629272461, "learning_rate": 2.6227575791063468e-05, "loss": 2.7607, "step": 28500 }, { "epoch": 0.16024135663647868, "grad_norm": 11.108380317687988, "learning_rate": 2.668803182672119e-05, "loss": 2.8115, "step": 29000 }, { "epoch": 0.16300413864745245, "grad_norm": 8.149287223815918, "learning_rate": 2.7147566950307586e-05, "loss": 2.8093, "step": 29500 }, { "epoch": 0.16576692065842621, "grad_norm": 9.921182632446289, "learning_rate": 2.7608022985965305e-05, "loss": 2.7958, "step": 30000 }, { "epoch": 0.16576692065842621, "eval_runtime": 1420.6642, "eval_samples_per_second": 254.777, "eval_steps_per_second": 31.848, "step": 30000 }, { "epoch": 0.16852970266939998, "grad_norm": 9.527510643005371, "learning_rate": 2.8068479021623016e-05, "loss": 2.8025, "step": 30500 }, { "epoch": 0.17129248468037375, "grad_norm": 8.283650398254395, "learning_rate": 2.852893505728073e-05, "loss": 2.7871, "step": 31000 }, { "epoch": 0.17405526669134752, "grad_norm": 10.247408866882324, "learning_rate": 2.8989391092938447e-05, "loss": 2.8239, "step": 31500 }, { "epoch": 0.1768180487023213, "grad_norm": 9.047866821289062, "learning_rate": 2.9449847128596165e-05, "loss": 2.8061, "step": 32000 }, { "epoch": 0.17958083071329506, "grad_norm": 12.305429458618164, "learning_rate": 2.9910303164253877e-05, "loss": 2.7637, "step": 32500 }, { "epoch": 0.18234361272426883, "grad_norm": 8.979884147644043, "learning_rate": 3.0370759199911592e-05, "loss": 2.8169, "step": 33000 }, { "epoch": 0.1851063947352426, "grad_norm": 10.300788879394531, "learning_rate": 3.0830294323497995e-05, "loss": 2.7801, "step": 33500 }, { "epoch": 0.18786917674621637, "grad_norm": 11.265089988708496, "learning_rate": 3.129075035915571e-05, "loss": 2.7739, "step": 34000 }, { "epoch": 0.19063195875719013, "grad_norm": 7.74532413482666, "learning_rate": 3.1751206394813426e-05, "loss": 2.8098, "step": 34500 }, { "epoch": 0.1933947407681639, "grad_norm": 7.382881164550781, "learning_rate": 3.221166243047114e-05, "loss": 2.7901, "step": 35000 }, { "epoch": 0.1933947407681639, "eval_runtime": 1425.7104, "eval_samples_per_second": 253.876, "eval_steps_per_second": 31.735, "step": 35000 }, { "epoch": 0.19615752277913767, "grad_norm": 11.83242416381836, "learning_rate": 3.2672118466128856e-05, "loss": 2.7753, "step": 35500 }, { "epoch": 0.19892030479011144, "grad_norm": 7.594019412994385, "learning_rate": 3.313165358971525e-05, "loss": 2.7747, "step": 36000 }, { "epoch": 0.2016830868010852, "grad_norm": 7.169086933135986, "learning_rate": 3.359210962537297e-05, "loss": 2.7942, "step": 36500 }, { "epoch": 0.20444586881205898, "grad_norm": 9.127911567687988, "learning_rate": 3.405256566103069e-05, "loss": 2.8336, "step": 37000 }, { "epoch": 0.20720865082303275, "grad_norm": 9.519842147827148, "learning_rate": 3.4512100784617086e-05, "loss": 2.8049, "step": 37500 }, { "epoch": 0.20997143283400654, "grad_norm": 7.047683238983154, "learning_rate": 3.49725568202748e-05, "loss": 2.7987, "step": 38000 }, { "epoch": 0.2127342148449803, "grad_norm": 8.880785942077637, "learning_rate": 3.5433012855932516e-05, "loss": 2.7919, "step": 38500 }, { "epoch": 0.21549699685595408, "grad_norm": 9.917975425720215, "learning_rate": 3.589346889159023e-05, "loss": 2.8002, "step": 39000 }, { "epoch": 0.21825977886692785, "grad_norm": 8.076781272888184, "learning_rate": 3.6353924927247947e-05, "loss": 2.7868, "step": 39500 }, { "epoch": 0.22102256087790162, "grad_norm": 8.734768867492676, "learning_rate": 3.681438096290566e-05, "loss": 2.7937, "step": 40000 }, { "epoch": 0.22102256087790162, "eval_runtime": 1421.6361, "eval_samples_per_second": 254.603, "eval_steps_per_second": 31.826, "step": 40000 }, { "epoch": 0.2237853428888754, "grad_norm": 10.838555335998535, "learning_rate": 3.727483699856338e-05, "loss": 2.8301, "step": 40500 }, { "epoch": 0.22654812489984916, "grad_norm": 10.323356628417969, "learning_rate": 3.773529303422109e-05, "loss": 2.7691, "step": 41000 }, { "epoch": 0.22931090691082293, "grad_norm": 10.885273933410645, "learning_rate": 3.8195749069878814e-05, "loss": 2.8109, "step": 41500 }, { "epoch": 0.2320736889217967, "grad_norm": 7.563793659210205, "learning_rate": 3.865620510553652e-05, "loss": 2.7768, "step": 42000 }, { "epoch": 0.23483647093277046, "grad_norm": 18.850767135620117, "learning_rate": 3.911666114119424e-05, "loss": 2.8198, "step": 42500 }, { "epoch": 0.23759925294374423, "grad_norm": 6.142578601837158, "learning_rate": 3.957619626478064e-05, "loss": 2.7862, "step": 43000 }, { "epoch": 0.240362034954718, "grad_norm": 5.065933704376221, "learning_rate": 4.0036652300438356e-05, "loss": 2.7652, "step": 43500 }, { "epoch": 0.24312481696569177, "grad_norm": 7.932371139526367, "learning_rate": 4.049710833609607e-05, "loss": 2.7517, "step": 44000 }, { "epoch": 0.24588759897666554, "grad_norm": 11.972217559814453, "learning_rate": 4.0957564371753786e-05, "loss": 2.8162, "step": 44500 }, { "epoch": 0.2486503809876393, "grad_norm": 8.804488182067871, "learning_rate": 4.141709949534019e-05, "loss": 2.7491, "step": 45000 }, { "epoch": 0.2486503809876393, "eval_runtime": 1414.8499, "eval_samples_per_second": 255.824, "eval_steps_per_second": 31.979, "step": 45000 }, { "epoch": 0.2514131629986131, "grad_norm": 7.548344135284424, "learning_rate": 4.1877555530997905e-05, "loss": 2.8001, "step": 45500 }, { "epoch": 0.25417594500958685, "grad_norm": 7.751644134521484, "learning_rate": 4.233801156665562e-05, "loss": 2.7754, "step": 46000 }, { "epoch": 0.25693872702056064, "grad_norm": 7.323184013366699, "learning_rate": 4.2798467602313335e-05, "loss": 2.8117, "step": 46500 }, { "epoch": 0.2597015090315344, "grad_norm": 10.177978515625, "learning_rate": 4.325892363797105e-05, "loss": 2.7637, "step": 47000 }, { "epoch": 0.2624642910425082, "grad_norm": 9.300724029541016, "learning_rate": 4.3719379673628765e-05, "loss": 2.7798, "step": 47500 }, { "epoch": 0.2652270730534819, "grad_norm": 7.969640731811523, "learning_rate": 4.417983570928648e-05, "loss": 2.7597, "step": 48000 }, { "epoch": 0.2679898550644557, "grad_norm": 11.944114685058594, "learning_rate": 4.463937083287288e-05, "loss": 2.7944, "step": 48500 }, { "epoch": 0.27075263707542946, "grad_norm": 8.067237854003906, "learning_rate": 4.509982686853059e-05, "loss": 2.797, "step": 49000 }, { "epoch": 0.27351541908640326, "grad_norm": 8.342887878417969, "learning_rate": 4.5560282904188314e-05, "loss": 2.7753, "step": 49500 }, { "epoch": 0.276278201097377, "grad_norm": 7.031680107116699, "learning_rate": 4.602073893984602e-05, "loss": 2.7716, "step": 50000 }, { "epoch": 0.276278201097377, "eval_runtime": 1419.8091, "eval_samples_per_second": 254.931, "eval_steps_per_second": 31.867, "step": 50000 }, { "epoch": 0.2790409831083508, "grad_norm": 6.716032028198242, "learning_rate": 4.648119497550374e-05, "loss": 2.762, "step": 50500 }, { "epoch": 0.28180376511932453, "grad_norm": 6.020242214202881, "learning_rate": 4.694073009909014e-05, "loss": 2.7485, "step": 51000 }, { "epoch": 0.28456654713029833, "grad_norm": 6.298365592956543, "learning_rate": 4.7401186134747856e-05, "loss": 2.733, "step": 51500 }, { "epoch": 0.28732932914127207, "grad_norm": 11.52296257019043, "learning_rate": 4.786164217040557e-05, "loss": 2.7207, "step": 52000 }, { "epoch": 0.29009211115224587, "grad_norm": 7.5664143562316895, "learning_rate": 4.8322098206063286e-05, "loss": 2.7784, "step": 52500 }, { "epoch": 0.2928548931632196, "grad_norm": 7.839147567749023, "learning_rate": 4.8782554241721e-05, "loss": 2.75, "step": 53000 }, { "epoch": 0.2956176751741934, "grad_norm": 7.68875789642334, "learning_rate": 4.9242089365307405e-05, "loss": 2.8226, "step": 53500 }, { "epoch": 0.2983804571851672, "grad_norm": 9.220076560974121, "learning_rate": 4.970254540096512e-05, "loss": 2.7392, "step": 54000 }, { "epoch": 0.30114323919614094, "grad_norm": 6.299682140350342, "learning_rate": 4.998188839568023e-05, "loss": 2.7663, "step": 54500 }, { "epoch": 0.30390602120711474, "grad_norm": 7.302303791046143, "learning_rate": 4.993072567161308e-05, "loss": 2.7241, "step": 55000 }, { "epoch": 0.30390602120711474, "eval_runtime": 1421.4385, "eval_samples_per_second": 254.639, "eval_steps_per_second": 31.83, "step": 55000 }, { "epoch": 0.3066688032180885, "grad_norm": 7.086694240570068, "learning_rate": 4.9879562947545934e-05, "loss": 2.7075, "step": 55500 }, { "epoch": 0.3094315852290623, "grad_norm": 8.3382568359375, "learning_rate": 4.9828502548926916e-05, "loss": 2.7592, "step": 56000 }, { "epoch": 0.312194367240036, "grad_norm": 10.291159629821777, "learning_rate": 4.977733982485976e-05, "loss": 2.7674, "step": 56500 }, { "epoch": 0.3149571492510098, "grad_norm": 10.337152481079102, "learning_rate": 4.972617710079261e-05, "loss": 2.75, "step": 57000 }, { "epoch": 0.31771993126198356, "grad_norm": 5.782974720001221, "learning_rate": 4.9675014376725465e-05, "loss": 2.7324, "step": 57500 }, { "epoch": 0.32048271327295735, "grad_norm": 6.272622108459473, "learning_rate": 4.962385165265832e-05, "loss": 2.746, "step": 58000 }, { "epoch": 0.3232454952839311, "grad_norm": 6.653768539428711, "learning_rate": 4.957268892859116e-05, "loss": 2.7742, "step": 58500 }, { "epoch": 0.3260082772949049, "grad_norm": 6.433887481689453, "learning_rate": 4.9521526204524014e-05, "loss": 2.7635, "step": 59000 }, { "epoch": 0.32877105930587863, "grad_norm": 6.354071617126465, "learning_rate": 4.9470363480456866e-05, "loss": 2.7317, "step": 59500 }, { "epoch": 0.33153384131685243, "grad_norm": 7.874678611755371, "learning_rate": 4.941920075638972e-05, "loss": 2.7431, "step": 60000 }, { "epoch": 0.33153384131685243, "eval_runtime": 1414.971, "eval_samples_per_second": 255.802, "eval_steps_per_second": 31.976, "step": 60000 }, { "epoch": 0.33429662332782617, "grad_norm": 9.324529647827148, "learning_rate": 4.93681403577707e-05, "loss": 2.7606, "step": 60500 }, { "epoch": 0.33705940533879997, "grad_norm": 6.370975017547607, "learning_rate": 4.9316977633703545e-05, "loss": 2.7578, "step": 61000 }, { "epoch": 0.3398221873497737, "grad_norm": 6.50999116897583, "learning_rate": 4.92658149096364e-05, "loss": 2.7627, "step": 61500 }, { "epoch": 0.3425849693607475, "grad_norm": 6.26449728012085, "learning_rate": 4.921465218556925e-05, "loss": 2.7515, "step": 62000 }, { "epoch": 0.34534775137172125, "grad_norm": 5.123514175415039, "learning_rate": 4.916359178695024e-05, "loss": 2.7553, "step": 62500 }, { "epoch": 0.34811053338269504, "grad_norm": 7.093264102935791, "learning_rate": 4.911242906288308e-05, "loss": 2.7338, "step": 63000 }, { "epoch": 0.3508733153936688, "grad_norm": 5.520063400268555, "learning_rate": 4.906126633881593e-05, "loss": 2.7567, "step": 63500 }, { "epoch": 0.3536360974046426, "grad_norm": 6.911723613739014, "learning_rate": 4.901010361474878e-05, "loss": 2.7687, "step": 64000 }, { "epoch": 0.3563988794156164, "grad_norm": 7.70906400680542, "learning_rate": 4.895894089068163e-05, "loss": 2.7228, "step": 64500 }, { "epoch": 0.3591616614265901, "grad_norm": 6.372740745544434, "learning_rate": 4.890788049206262e-05, "loss": 2.733, "step": 65000 }, { "epoch": 0.3591616614265901, "eval_runtime": 1420.2395, "eval_samples_per_second": 254.854, "eval_steps_per_second": 31.857, "step": 65000 }, { "epoch": 0.3619244434375639, "grad_norm": 7.683312892913818, "learning_rate": 4.885671776799547e-05, "loss": 2.7507, "step": 65500 }, { "epoch": 0.36468722544853766, "grad_norm": 6.729420185089111, "learning_rate": 4.880555504392832e-05, "loss": 2.7595, "step": 66000 }, { "epoch": 0.36745000745951145, "grad_norm": 6.871359825134277, "learning_rate": 4.875439231986117e-05, "loss": 2.7179, "step": 66500 }, { "epoch": 0.3702127894704852, "grad_norm": 6.755906581878662, "learning_rate": 4.870343424669029e-05, "loss": 2.7454, "step": 67000 }, { "epoch": 0.372975571481459, "grad_norm": 5.853033065795898, "learning_rate": 4.865227152262313e-05, "loss": 2.7138, "step": 67500 }, { "epoch": 0.37573835349243273, "grad_norm": 7.575068950653076, "learning_rate": 4.8601108798555984e-05, "loss": 2.7798, "step": 68000 }, { "epoch": 0.3785011355034065, "grad_norm": 8.923949241638184, "learning_rate": 4.8549946074488836e-05, "loss": 2.7867, "step": 68500 }, { "epoch": 0.38126391751438027, "grad_norm": 6.704534530639648, "learning_rate": 4.849878335042168e-05, "loss": 2.6885, "step": 69000 }, { "epoch": 0.38402669952535406, "grad_norm": 6.3064117431640625, "learning_rate": 4.8447620626354533e-05, "loss": 2.75, "step": 69500 }, { "epoch": 0.3867894815363278, "grad_norm": 5.934976100921631, "learning_rate": 4.8396457902287385e-05, "loss": 2.7623, "step": 70000 }, { "epoch": 0.3867894815363278, "eval_runtime": 1418.267, "eval_samples_per_second": 255.208, "eval_steps_per_second": 31.902, "step": 70000 }, { "epoch": 0.3895522635473016, "grad_norm": 8.18109130859375, "learning_rate": 4.834529517822024e-05, "loss": 2.7414, "step": 70500 }, { "epoch": 0.39231504555827534, "grad_norm": 5.952932834625244, "learning_rate": 4.829413245415308e-05, "loss": 2.7224, "step": 71000 }, { "epoch": 0.39507782756924914, "grad_norm": 6.907143592834473, "learning_rate": 4.824307205553407e-05, "loss": 2.7661, "step": 71500 }, { "epoch": 0.3978406095802229, "grad_norm": 6.694629192352295, "learning_rate": 4.8191909331466916e-05, "loss": 2.7454, "step": 72000 }, { "epoch": 0.4006033915911967, "grad_norm": 6.9917192459106445, "learning_rate": 4.814074660739977e-05, "loss": 2.7164, "step": 72500 }, { "epoch": 0.4033661736021704, "grad_norm": 7.304172992706299, "learning_rate": 4.808958388333262e-05, "loss": 2.7476, "step": 73000 }, { "epoch": 0.4061289556131442, "grad_norm": 9.685128211975098, "learning_rate": 4.8038421159265465e-05, "loss": 2.7332, "step": 73500 }, { "epoch": 0.40889173762411796, "grad_norm": 5.05424165725708, "learning_rate": 4.798725843519832e-05, "loss": 2.7229, "step": 74000 }, { "epoch": 0.41165451963509175, "grad_norm": 6.11020040512085, "learning_rate": 4.7936198036579306e-05, "loss": 2.75, "step": 74500 }, { "epoch": 0.4144173016460655, "grad_norm": 5.443029403686523, "learning_rate": 4.788503531251216e-05, "loss": 2.7217, "step": 75000 }, { "epoch": 0.4144173016460655, "eval_runtime": 1418.9418, "eval_samples_per_second": 255.087, "eval_steps_per_second": 31.886, "step": 75000 }, { "epoch": 0.4171800836570393, "grad_norm": 6.66157865524292, "learning_rate": 4.7833872588445e-05, "loss": 2.6993, "step": 75500 }, { "epoch": 0.4199428656680131, "grad_norm": 6.773935317993164, "learning_rate": 4.7782709864377855e-05, "loss": 2.7365, "step": 76000 }, { "epoch": 0.42270564767898683, "grad_norm": 5.710464000701904, "learning_rate": 4.77315471403107e-05, "loss": 2.7073, "step": 76500 }, { "epoch": 0.4254684296899606, "grad_norm": 5.999380111694336, "learning_rate": 4.768048674169169e-05, "loss": 2.7089, "step": 77000 }, { "epoch": 0.42823121170093437, "grad_norm": 12.620460510253906, "learning_rate": 4.762932401762454e-05, "loss": 2.7287, "step": 77500 }, { "epoch": 0.43099399371190816, "grad_norm": 5.68431282043457, "learning_rate": 4.7578161293557385e-05, "loss": 2.7433, "step": 78000 }, { "epoch": 0.4337567757228819, "grad_norm": 4.173344135284424, "learning_rate": 4.752699856949024e-05, "loss": 2.7507, "step": 78500 }, { "epoch": 0.4365195577338557, "grad_norm": 7.130237579345703, "learning_rate": 4.747583584542309e-05, "loss": 2.6849, "step": 79000 }, { "epoch": 0.43928233974482944, "grad_norm": 7.622902870178223, "learning_rate": 4.742467312135594e-05, "loss": 2.7398, "step": 79500 }, { "epoch": 0.44204512175580324, "grad_norm": 6.098598003387451, "learning_rate": 4.7373510397288787e-05, "loss": 2.7279, "step": 80000 }, { "epoch": 0.44204512175580324, "eval_runtime": 1421.1349, "eval_samples_per_second": 254.693, "eval_steps_per_second": 31.837, "step": 80000 }, { "epoch": 0.444807903766777, "grad_norm": 5.454360485076904, "learning_rate": 4.732234767322164e-05, "loss": 2.7256, "step": 80500 }, { "epoch": 0.4475706857777508, "grad_norm": 9.29869556427002, "learning_rate": 4.727128727460262e-05, "loss": 2.7272, "step": 81000 }, { "epoch": 0.4503334677887245, "grad_norm": 10.766260147094727, "learning_rate": 4.722012455053547e-05, "loss": 2.7266, "step": 81500 }, { "epoch": 0.4530962497996983, "grad_norm": 5.04358434677124, "learning_rate": 4.7168961826468324e-05, "loss": 2.6443, "step": 82000 }, { "epoch": 0.45585903181067206, "grad_norm": 6.527529716491699, "learning_rate": 4.711779910240117e-05, "loss": 2.7191, "step": 82500 }, { "epoch": 0.45862181382164585, "grad_norm": 4.683417797088623, "learning_rate": 4.706673870378215e-05, "loss": 2.7299, "step": 83000 }, { "epoch": 0.4613845958326196, "grad_norm": 6.090554237365723, "learning_rate": 4.7015575979715e-05, "loss": 2.759, "step": 83500 }, { "epoch": 0.4641473778435934, "grad_norm": 6.470883369445801, "learning_rate": 4.6964413255647855e-05, "loss": 2.734, "step": 84000 }, { "epoch": 0.46691015985456713, "grad_norm": 8.398398399353027, "learning_rate": 4.691325053158071e-05, "loss": 2.7007, "step": 84500 }, { "epoch": 0.4696729418655409, "grad_norm": 5.122215270996094, "learning_rate": 4.686208780751355e-05, "loss": 2.6983, "step": 85000 }, { "epoch": 0.4696729418655409, "eval_runtime": 1425.8823, "eval_samples_per_second": 253.845, "eval_steps_per_second": 31.731, "step": 85000 }, { "epoch": 0.47243572387651467, "grad_norm": 5.900498867034912, "learning_rate": 4.681102740889454e-05, "loss": 2.7391, "step": 85500 }, { "epoch": 0.47519850588748846, "grad_norm": 8.789809226989746, "learning_rate": 4.6759864684827386e-05, "loss": 2.6843, "step": 86000 }, { "epoch": 0.47796128789846226, "grad_norm": 10.296858787536621, "learning_rate": 4.670870196076024e-05, "loss": 2.7054, "step": 86500 }, { "epoch": 0.480724069909436, "grad_norm": 7.866740703582764, "learning_rate": 4.665753923669309e-05, "loss": 2.7313, "step": 87000 }, { "epoch": 0.4834868519204098, "grad_norm": 5.662039756774902, "learning_rate": 4.660647883807408e-05, "loss": 2.6513, "step": 87500 }, { "epoch": 0.48624963393138354, "grad_norm": 6.773980617523193, "learning_rate": 4.655531611400692e-05, "loss": 2.7071, "step": 88000 }, { "epoch": 0.48901241594235734, "grad_norm": 9.490970611572266, "learning_rate": 4.6504153389939775e-05, "loss": 2.7469, "step": 88500 }, { "epoch": 0.4917751979533311, "grad_norm": 6.021182537078857, "learning_rate": 4.645299066587263e-05, "loss": 2.7378, "step": 89000 }, { "epoch": 0.4945379799643049, "grad_norm": 6.2784271240234375, "learning_rate": 4.640182794180548e-05, "loss": 2.691, "step": 89500 }, { "epoch": 0.4973007619752786, "grad_norm": 6.208467960357666, "learning_rate": 4.635076754318646e-05, "loss": 2.6986, "step": 90000 }, { "epoch": 0.4973007619752786, "eval_runtime": 1416.7509, "eval_samples_per_second": 255.481, "eval_steps_per_second": 31.936, "step": 90000 }, { "epoch": 0.5000635439862524, "grad_norm": 7.096754550933838, "learning_rate": 4.6299604819119306e-05, "loss": 2.7249, "step": 90500 }, { "epoch": 0.5028263259972262, "grad_norm": 6.615260601043701, "learning_rate": 4.624844209505216e-05, "loss": 2.691, "step": 91000 }, { "epoch": 0.5055891080081999, "grad_norm": 5.5422043800354, "learning_rate": 4.619727937098501e-05, "loss": 2.7257, "step": 91500 }, { "epoch": 0.5083518900191737, "grad_norm": 6.378222465515137, "learning_rate": 4.614621897236599e-05, "loss": 2.7359, "step": 92000 }, { "epoch": 0.5111146720301475, "grad_norm": 7.653573513031006, "learning_rate": 4.609505624829884e-05, "loss": 2.7233, "step": 92500 }, { "epoch": 0.5138774540411213, "grad_norm": 8.169157981872559, "learning_rate": 4.604389352423169e-05, "loss": 2.6786, "step": 93000 }, { "epoch": 0.516640236052095, "grad_norm": 6.562656402587891, "learning_rate": 4.599273080016454e-05, "loss": 2.705, "step": 93500 }, { "epoch": 0.5194030180630688, "grad_norm": 5.986241340637207, "learning_rate": 4.594167040154553e-05, "loss": 2.717, "step": 94000 }, { "epoch": 0.5221658000740426, "grad_norm": 6.135688304901123, "learning_rate": 4.589061000292651e-05, "loss": 2.6973, "step": 94500 }, { "epoch": 0.5249285820850164, "grad_norm": 5.69881534576416, "learning_rate": 4.583944727885936e-05, "loss": 2.7153, "step": 95000 }, { "epoch": 0.5249285820850164, "eval_runtime": 1415.9132, "eval_samples_per_second": 255.632, "eval_steps_per_second": 31.955, "step": 95000 }, { "epoch": 0.52769136409599, "grad_norm": 11.074015617370605, "learning_rate": 4.578828455479221e-05, "loss": 2.7301, "step": 95500 }, { "epoch": 0.5304541461069638, "grad_norm": 4.483022212982178, "learning_rate": 4.573712183072506e-05, "loss": 2.6829, "step": 96000 }, { "epoch": 0.5332169281179376, "grad_norm": 5.1948561668396, "learning_rate": 4.568595910665791e-05, "loss": 2.7031, "step": 96500 }, { "epoch": 0.5359797101289114, "grad_norm": 7.523544788360596, "learning_rate": 4.5634796382590763e-05, "loss": 2.75, "step": 97000 }, { "epoch": 0.5387424921398852, "grad_norm": 7.069555282592773, "learning_rate": 4.558363365852361e-05, "loss": 2.7049, "step": 97500 }, { "epoch": 0.5415052741508589, "grad_norm": 9.187417984008789, "learning_rate": 4.553247093445646e-05, "loss": 2.7008, "step": 98000 }, { "epoch": 0.5442680561618327, "grad_norm": 6.571780204772949, "learning_rate": 4.548141053583744e-05, "loss": 2.6672, "step": 98500 }, { "epoch": 0.5470308381728065, "grad_norm": 6.857777118682861, "learning_rate": 4.5430247811770294e-05, "loss": 2.6805, "step": 99000 }, { "epoch": 0.5497936201837803, "grad_norm": 4.911254405975342, "learning_rate": 4.5379085087703146e-05, "loss": 2.7083, "step": 99500 }, { "epoch": 0.552556402194754, "grad_norm": 6.255260467529297, "learning_rate": 4.532792236363599e-05, "loss": 2.7035, "step": 100000 }, { "epoch": 0.552556402194754, "eval_runtime": 1418.8162, "eval_samples_per_second": 255.109, "eval_steps_per_second": 31.889, "step": 100000 }, { "epoch": 0.5553191842057278, "grad_norm": 5.266800403594971, "learning_rate": 4.527686196501697e-05, "loss": 2.7331, "step": 100500 }, { "epoch": 0.5580819662167016, "grad_norm": 5.317836284637451, "learning_rate": 4.5225699240949825e-05, "loss": 2.7293, "step": 101000 }, { "epoch": 0.5608447482276754, "grad_norm": 6.019017219543457, "learning_rate": 4.517453651688268e-05, "loss": 2.6752, "step": 101500 }, { "epoch": 0.5636075302386491, "grad_norm": 9.754213333129883, "learning_rate": 4.512337379281553e-05, "loss": 2.7048, "step": 102000 }, { "epoch": 0.5663703122496229, "grad_norm": 5.172014236450195, "learning_rate": 4.507231339419651e-05, "loss": 2.6733, "step": 102500 }, { "epoch": 0.5691330942605967, "grad_norm": 23.669513702392578, "learning_rate": 4.502115067012936e-05, "loss": 2.6909, "step": 103000 }, { "epoch": 0.5718958762715705, "grad_norm": 4.895296573638916, "learning_rate": 4.4969987946062214e-05, "loss": 2.6816, "step": 103500 }, { "epoch": 0.5746586582825441, "grad_norm": 6.87628173828125, "learning_rate": 4.4918825221995066e-05, "loss": 2.7055, "step": 104000 }, { "epoch": 0.5774214402935179, "grad_norm": 6.831465721130371, "learning_rate": 4.486776482337605e-05, "loss": 2.6939, "step": 104500 }, { "epoch": 0.5801842223044917, "grad_norm": 6.297806739807129, "learning_rate": 4.481670442475703e-05, "loss": 2.7159, "step": 105000 }, { "epoch": 0.5801842223044917, "eval_runtime": 1418.3876, "eval_samples_per_second": 255.186, "eval_steps_per_second": 31.899, "step": 105000 }, { "epoch": 0.5829470043154655, "grad_norm": 5.875136852264404, "learning_rate": 4.4765541700689875e-05, "loss": 2.6865, "step": 105500 }, { "epoch": 0.5857097863264392, "grad_norm": 7.282098770141602, "learning_rate": 4.471437897662273e-05, "loss": 2.6504, "step": 106000 }, { "epoch": 0.588472568337413, "grad_norm": 7.123196125030518, "learning_rate": 4.466321625255558e-05, "loss": 2.6962, "step": 106500 }, { "epoch": 0.5912353503483868, "grad_norm": 5.343898296356201, "learning_rate": 4.461205352848843e-05, "loss": 2.6918, "step": 107000 }, { "epoch": 0.5939981323593606, "grad_norm": 7.826199054718018, "learning_rate": 4.4560890804421276e-05, "loss": 2.7273, "step": 107500 }, { "epoch": 0.5967609143703344, "grad_norm": 4.318883895874023, "learning_rate": 4.4509830405802264e-05, "loss": 2.7267, "step": 108000 }, { "epoch": 0.5995236963813081, "grad_norm": 7.37441873550415, "learning_rate": 4.4458667681735116e-05, "loss": 2.6595, "step": 108500 }, { "epoch": 0.6022864783922819, "grad_norm": 8.249720573425293, "learning_rate": 4.440750495766797e-05, "loss": 2.7001, "step": 109000 }, { "epoch": 0.6050492604032557, "grad_norm": 7.008593559265137, "learning_rate": 4.435634223360081e-05, "loss": 2.6998, "step": 109500 }, { "epoch": 0.6078120424142295, "grad_norm": 5.00942850112915, "learning_rate": 4.430517950953366e-05, "loss": 2.6903, "step": 110000 }, { "epoch": 0.6078120424142295, "eval_runtime": 1419.9436, "eval_samples_per_second": 254.907, "eval_steps_per_second": 31.864, "step": 110000 }, { "epoch": 0.6105748244252032, "grad_norm": 9.650525093078613, "learning_rate": 4.425401678546651e-05, "loss": 2.6904, "step": 110500 }, { "epoch": 0.613337606436177, "grad_norm": 8.574694633483887, "learning_rate": 4.420285406139936e-05, "loss": 2.7113, "step": 111000 }, { "epoch": 0.6161003884471508, "grad_norm": 8.437103271484375, "learning_rate": 4.4151691337332214e-05, "loss": 2.6072, "step": 111500 }, { "epoch": 0.6188631704581246, "grad_norm": 7.744716167449951, "learning_rate": 4.41006309387132e-05, "loss": 2.6759, "step": 112000 }, { "epoch": 0.6216259524690982, "grad_norm": 6.564632892608643, "learning_rate": 4.404946821464605e-05, "loss": 2.6762, "step": 112500 }, { "epoch": 0.624388734480072, "grad_norm": 8.12996768951416, "learning_rate": 4.39983054905789e-05, "loss": 2.6773, "step": 113000 }, { "epoch": 0.6271515164910458, "grad_norm": 10.49181842803955, "learning_rate": 4.394714276651175e-05, "loss": 2.7151, "step": 113500 }, { "epoch": 0.6299142985020196, "grad_norm": 10.710319519042969, "learning_rate": 4.3896082367892734e-05, "loss": 2.6788, "step": 114000 }, { "epoch": 0.6326770805129933, "grad_norm": 4.202202796936035, "learning_rate": 4.3844919643825586e-05, "loss": 2.7281, "step": 114500 }, { "epoch": 0.6354398625239671, "grad_norm": 5.33767557144165, "learning_rate": 4.379375691975843e-05, "loss": 2.7071, "step": 115000 }, { "epoch": 0.6354398625239671, "eval_runtime": 1417.9967, "eval_samples_per_second": 255.257, "eval_steps_per_second": 31.908, "step": 115000 }, { "epoch": 0.6382026445349409, "grad_norm": 7.461055278778076, "learning_rate": 4.374259419569128e-05, "loss": 2.7429, "step": 115500 }, { "epoch": 0.6409654265459147, "grad_norm": 5.790754318237305, "learning_rate": 4.3691533797072264e-05, "loss": 2.6729, "step": 116000 }, { "epoch": 0.6437282085568884, "grad_norm": 6.799173831939697, "learning_rate": 4.3640371073005116e-05, "loss": 2.6795, "step": 116500 }, { "epoch": 0.6464909905678622, "grad_norm": 8.002934455871582, "learning_rate": 4.358920834893797e-05, "loss": 2.7077, "step": 117000 }, { "epoch": 0.649253772578836, "grad_norm": 6.33392858505249, "learning_rate": 4.353814795031895e-05, "loss": 2.67, "step": 117500 }, { "epoch": 0.6520165545898098, "grad_norm": 7.54648494720459, "learning_rate": 4.34869852262518e-05, "loss": 2.703, "step": 118000 }, { "epoch": 0.6547793366007836, "grad_norm": 6.37369441986084, "learning_rate": 4.343582250218465e-05, "loss": 2.7147, "step": 118500 }, { "epoch": 0.6575421186117573, "grad_norm": 6.073976516723633, "learning_rate": 4.33846597781175e-05, "loss": 2.6448, "step": 119000 }, { "epoch": 0.6603049006227311, "grad_norm": 6.560715675354004, "learning_rate": 4.333349705405035e-05, "loss": 2.6999, "step": 119500 }, { "epoch": 0.6630676826337049, "grad_norm": 5.832411766052246, "learning_rate": 4.3282334329983196e-05, "loss": 2.6517, "step": 120000 }, { "epoch": 0.6630676826337049, "eval_runtime": 1414.7946, "eval_samples_per_second": 255.834, "eval_steps_per_second": 31.98, "step": 120000 }, { "epoch": 0.6658304646446787, "grad_norm": 7.141829490661621, "learning_rate": 4.323117160591605e-05, "loss": 2.6821, "step": 120500 }, { "epoch": 0.6685932466556523, "grad_norm": 10.832301139831543, "learning_rate": 4.31800088818489e-05, "loss": 2.6649, "step": 121000 }, { "epoch": 0.6713560286666261, "grad_norm": 6.108252048492432, "learning_rate": 4.312894848322989e-05, "loss": 2.6853, "step": 121500 }, { "epoch": 0.6741188106775999, "grad_norm": 6.897459983825684, "learning_rate": 4.3077785759162734e-05, "loss": 2.7084, "step": 122000 }, { "epoch": 0.6768815926885737, "grad_norm": 7.211142063140869, "learning_rate": 4.3026623035095586e-05, "loss": 2.7029, "step": 122500 }, { "epoch": 0.6796443746995474, "grad_norm": 6.063936710357666, "learning_rate": 4.297546031102843e-05, "loss": 2.6938, "step": 123000 }, { "epoch": 0.6824071567105212, "grad_norm": 7.535489082336426, "learning_rate": 4.292439991240942e-05, "loss": 2.71, "step": 123500 }, { "epoch": 0.685169938721495, "grad_norm": 6.275320529937744, "learning_rate": 4.287323718834227e-05, "loss": 2.6889, "step": 124000 }, { "epoch": 0.6879327207324688, "grad_norm": 5.57111930847168, "learning_rate": 4.2822074464275116e-05, "loss": 2.6347, "step": 124500 }, { "epoch": 0.6906955027434425, "grad_norm": 5.227652072906494, "learning_rate": 4.277091174020797e-05, "loss": 2.6816, "step": 125000 }, { "epoch": 0.6906955027434425, "eval_runtime": 1444.2501, "eval_samples_per_second": 250.617, "eval_steps_per_second": 31.328, "step": 125000 }, { "epoch": 0.6934582847544163, "grad_norm": 3.870896339416504, "learning_rate": 4.271974901614082e-05, "loss": 2.6564, "step": 125500 }, { "epoch": 0.6962210667653901, "grad_norm": 5.954395771026611, "learning_rate": 4.26686886175218e-05, "loss": 2.7183, "step": 126000 }, { "epoch": 0.6989838487763639, "grad_norm": 5.926782131195068, "learning_rate": 4.2617525893454654e-05, "loss": 2.7013, "step": 126500 }, { "epoch": 0.7017466307873376, "grad_norm": 6.156914710998535, "learning_rate": 4.25663631693875e-05, "loss": 2.6723, "step": 127000 }, { "epoch": 0.7045094127983114, "grad_norm": 5.73563289642334, "learning_rate": 4.251520044532035e-05, "loss": 2.7047, "step": 127500 }, { "epoch": 0.7072721948092852, "grad_norm": 6.068446636199951, "learning_rate": 4.246414004670133e-05, "loss": 2.6729, "step": 128000 }, { "epoch": 0.710034976820259, "grad_norm": 6.130403995513916, "learning_rate": 4.2412977322634185e-05, "loss": 2.7039, "step": 128500 }, { "epoch": 0.7127977588312328, "grad_norm": 5.924908638000488, "learning_rate": 4.236191692401517e-05, "loss": 2.7127, "step": 129000 }, { "epoch": 0.7155605408422064, "grad_norm": 7.866479396820068, "learning_rate": 4.2310754199948025e-05, "loss": 2.644, "step": 129500 }, { "epoch": 0.7183233228531802, "grad_norm": 6.398780345916748, "learning_rate": 4.225959147588087e-05, "loss": 2.7028, "step": 130000 }, { "epoch": 0.7183233228531802, "eval_runtime": 1453.3668, "eval_samples_per_second": 249.044, "eval_steps_per_second": 31.131, "step": 130000 }, { "epoch": 0.721086104864154, "grad_norm": 7.091272830963135, "learning_rate": 4.220842875181372e-05, "loss": 2.6944, "step": 130500 }, { "epoch": 0.7238488868751278, "grad_norm": 6.755273342132568, "learning_rate": 4.2157266027746574e-05, "loss": 2.7319, "step": 131000 }, { "epoch": 0.7266116688861015, "grad_norm": 7.107387065887451, "learning_rate": 4.2106103303679426e-05, "loss": 2.686, "step": 131500 }, { "epoch": 0.7293744508970753, "grad_norm": 5.511538982391357, "learning_rate": 4.205494057961227e-05, "loss": 2.7141, "step": 132000 }, { "epoch": 0.7321372329080491, "grad_norm": 6.616804599761963, "learning_rate": 4.2003777855545116e-05, "loss": 2.6817, "step": 132500 }, { "epoch": 0.7349000149190229, "grad_norm": 5.216026782989502, "learning_rate": 4.1952717456926105e-05, "loss": 2.6844, "step": 133000 }, { "epoch": 0.7376627969299966, "grad_norm": 6.271154880523682, "learning_rate": 4.190155473285896e-05, "loss": 2.6845, "step": 133500 }, { "epoch": 0.7404255789409704, "grad_norm": 7.05709981918335, "learning_rate": 4.185039200879181e-05, "loss": 2.6943, "step": 134000 }, { "epoch": 0.7431883609519442, "grad_norm": 8.08059024810791, "learning_rate": 4.1799229284724654e-05, "loss": 2.6897, "step": 134500 }, { "epoch": 0.745951142962918, "grad_norm": 9.127315521240234, "learning_rate": 4.1748066560657506e-05, "loss": 2.7364, "step": 135000 }, { "epoch": 0.745951142962918, "eval_runtime": 1433.8013, "eval_samples_per_second": 252.443, "eval_steps_per_second": 31.556, "step": 135000 }, { "epoch": 0.7487139249738917, "grad_norm": 6.763530254364014, "learning_rate": 4.169700616203849e-05, "loss": 2.6775, "step": 135500 }, { "epoch": 0.7514767069848655, "grad_norm": 7.9728617668151855, "learning_rate": 4.164584343797134e-05, "loss": 2.6517, "step": 136000 }, { "epoch": 0.7542394889958393, "grad_norm": 6.352534294128418, "learning_rate": 4.159468071390419e-05, "loss": 2.6726, "step": 136500 }, { "epoch": 0.757002271006813, "grad_norm": 8.01561450958252, "learning_rate": 4.154351798983704e-05, "loss": 2.656, "step": 137000 }, { "epoch": 0.7597650530177867, "grad_norm": 4.679101467132568, "learning_rate": 4.149245759121802e-05, "loss": 2.6909, "step": 137500 }, { "epoch": 0.7625278350287605, "grad_norm": 8.915389060974121, "learning_rate": 4.144129486715087e-05, "loss": 2.7239, "step": 138000 }, { "epoch": 0.7652906170397343, "grad_norm": 9.970344543457031, "learning_rate": 4.139013214308372e-05, "loss": 2.6624, "step": 138500 }, { "epoch": 0.7680533990507081, "grad_norm": 4.899960994720459, "learning_rate": 4.1338969419016574e-05, "loss": 2.6651, "step": 139000 }, { "epoch": 0.7708161810616819, "grad_norm": 6.549561023712158, "learning_rate": 4.128780669494942e-05, "loss": 2.6871, "step": 139500 }, { "epoch": 0.7735789630726556, "grad_norm": 9.052062034606934, "learning_rate": 4.123674629633041e-05, "loss": 2.6841, "step": 140000 }, { "epoch": 0.7735789630726556, "eval_runtime": 1389.3693, "eval_samples_per_second": 260.516, "eval_steps_per_second": 32.565, "step": 140000 }, { "epoch": 0.7763417450836294, "grad_norm": 9.36550521850586, "learning_rate": 4.118558357226326e-05, "loss": 2.6732, "step": 140500 }, { "epoch": 0.7791045270946032, "grad_norm": 6.052969932556152, "learning_rate": 4.113442084819611e-05, "loss": 2.6642, "step": 141000 }, { "epoch": 0.781867309105577, "grad_norm": 5.193731307983398, "learning_rate": 4.108325812412896e-05, "loss": 2.6776, "step": 141500 }, { "epoch": 0.7846300911165507, "grad_norm": 7.808539390563965, "learning_rate": 4.10320954000618e-05, "loss": 2.6496, "step": 142000 }, { "epoch": 0.7873928731275245, "grad_norm": 6.747580051422119, "learning_rate": 4.098103500144279e-05, "loss": 2.6942, "step": 142500 }, { "epoch": 0.7901556551384983, "grad_norm": 7.423492908477783, "learning_rate": 4.092987227737564e-05, "loss": 2.7079, "step": 143000 }, { "epoch": 0.7929184371494721, "grad_norm": 8.380352973937988, "learning_rate": 4.0878709553308494e-05, "loss": 2.7226, "step": 143500 }, { "epoch": 0.7956812191604458, "grad_norm": 5.976553440093994, "learning_rate": 4.082754682924134e-05, "loss": 2.6531, "step": 144000 }, { "epoch": 0.7984440011714196, "grad_norm": 6.945559024810791, "learning_rate": 4.077648643062232e-05, "loss": 2.6888, "step": 144500 }, { "epoch": 0.8012067831823934, "grad_norm": 5.81919002532959, "learning_rate": 4.072532370655517e-05, "loss": 2.7122, "step": 145000 }, { "epoch": 0.8012067831823934, "eval_runtime": 1417.5368, "eval_samples_per_second": 255.339, "eval_steps_per_second": 31.918, "step": 145000 }, { "epoch": 0.8039695651933672, "grad_norm": 4.8362908363342285, "learning_rate": 4.0674160982488025e-05, "loss": 2.7026, "step": 145500 }, { "epoch": 0.8067323472043408, "grad_norm": 5.70801305770874, "learning_rate": 4.062299825842088e-05, "loss": 2.676, "step": 146000 }, { "epoch": 0.8094951292153146, "grad_norm": 5.2062811851501465, "learning_rate": 4.057193785980186e-05, "loss": 2.6539, "step": 146500 }, { "epoch": 0.8122579112262884, "grad_norm": 7.768016815185547, "learning_rate": 4.0520775135734704e-05, "loss": 2.6562, "step": 147000 }, { "epoch": 0.8150206932372622, "grad_norm": 5.368041515350342, "learning_rate": 4.0469612411667556e-05, "loss": 2.6276, "step": 147500 }, { "epoch": 0.8177834752482359, "grad_norm": 8.40014934539795, "learning_rate": 4.041844968760041e-05, "loss": 2.6833, "step": 148000 }, { "epoch": 0.8205462572592097, "grad_norm": 5.6016740798950195, "learning_rate": 4.036728696353326e-05, "loss": 2.6668, "step": 148500 }, { "epoch": 0.8233090392701835, "grad_norm": 7.395069599151611, "learning_rate": 4.031622656491424e-05, "loss": 2.6534, "step": 149000 }, { "epoch": 0.8260718212811573, "grad_norm": 6.262421607971191, "learning_rate": 4.026506384084709e-05, "loss": 2.6487, "step": 149500 }, { "epoch": 0.828834603292131, "grad_norm": 5.546928882598877, "learning_rate": 4.0213901116779945e-05, "loss": 2.678, "step": 150000 }, { "epoch": 0.828834603292131, "eval_runtime": 1413.0655, "eval_samples_per_second": 256.147, "eval_steps_per_second": 32.019, "step": 150000 }, { "epoch": 0.8315973853031048, "grad_norm": 6.29634428024292, "learning_rate": 4.01627383927128e-05, "loss": 2.7104, "step": 150500 }, { "epoch": 0.8343601673140786, "grad_norm": 8.511784553527832, "learning_rate": 4.011167799409378e-05, "loss": 2.6784, "step": 151000 }, { "epoch": 0.8371229493250524, "grad_norm": 6.0729451179504395, "learning_rate": 4.0060515270026624e-05, "loss": 2.6968, "step": 151500 }, { "epoch": 0.8398857313360262, "grad_norm": 4.513732433319092, "learning_rate": 4.0009352545959476e-05, "loss": 2.661, "step": 152000 }, { "epoch": 0.8426485133469999, "grad_norm": 7.522515773773193, "learning_rate": 3.995818982189233e-05, "loss": 2.6892, "step": 152500 }, { "epoch": 0.8454112953579737, "grad_norm": 13.848055839538574, "learning_rate": 3.990712942327331e-05, "loss": 2.6444, "step": 153000 }, { "epoch": 0.8481740773689475, "grad_norm": 7.082030296325684, "learning_rate": 3.985596669920616e-05, "loss": 2.6872, "step": 153500 }, { "epoch": 0.8509368593799213, "grad_norm": 7.098601818084717, "learning_rate": 3.980480397513901e-05, "loss": 2.6684, "step": 154000 }, { "epoch": 0.8536996413908949, "grad_norm": 5.784538269042969, "learning_rate": 3.975364125107186e-05, "loss": 2.7026, "step": 154500 }, { "epoch": 0.8564624234018687, "grad_norm": 7.91291618347168, "learning_rate": 3.970258085245285e-05, "loss": 2.7019, "step": 155000 }, { "epoch": 0.8564624234018687, "eval_runtime": 95374.473, "eval_samples_per_second": 3.795, "eval_steps_per_second": 0.474, "step": 155000 }, { "epoch": 0.8592252054128425, "grad_norm": 6.4150710105896, "learning_rate": 3.96514181283857e-05, "loss": 2.6462, "step": 155500 }, { "epoch": 0.8619879874238163, "grad_norm": 6.692925930023193, "learning_rate": 3.9600255404318544e-05, "loss": 2.6601, "step": 156000 }, { "epoch": 0.86475076943479, "grad_norm": 6.283834934234619, "learning_rate": 3.954909268025139e-05, "loss": 2.6762, "step": 156500 }, { "epoch": 0.8675135514457638, "grad_norm": 7.09011173248291, "learning_rate": 3.949803228163238e-05, "loss": 2.6903, "step": 157000 }, { "epoch": 0.8702763334567376, "grad_norm": 5.456295490264893, "learning_rate": 3.944686955756523e-05, "loss": 2.6861, "step": 157500 }, { "epoch": 0.8730391154677114, "grad_norm": 8.332560539245605, "learning_rate": 3.939570683349808e-05, "loss": 2.6292, "step": 158000 }, { "epoch": 0.8758018974786851, "grad_norm": 11.606867790222168, "learning_rate": 3.9344544109430934e-05, "loss": 2.6681, "step": 158500 }, { "epoch": 0.8785646794896589, "grad_norm": 6.087031364440918, "learning_rate": 3.9293483710811915e-05, "loss": 2.6429, "step": 159000 }, { "epoch": 0.8813274615006327, "grad_norm": 7.540450572967529, "learning_rate": 3.924232098674476e-05, "loss": 2.6526, "step": 159500 }, { "epoch": 0.8840902435116065, "grad_norm": 5.995485305786133, "learning_rate": 3.919115826267761e-05, "loss": 2.6561, "step": 160000 }, { "epoch": 0.8840902435116065, "eval_runtime": 1415.0083, "eval_samples_per_second": 255.796, "eval_steps_per_second": 31.975, "step": 160000 }, { "epoch": 0.8868530255225802, "grad_norm": 6.677637100219727, "learning_rate": 3.9139995538610464e-05, "loss": 2.726, "step": 160500 }, { "epoch": 0.889615807533554, "grad_norm": 4.598792552947998, "learning_rate": 3.9088832814543316e-05, "loss": 2.6989, "step": 161000 }, { "epoch": 0.8923785895445278, "grad_norm": 6.177221775054932, "learning_rate": 3.90377724159243e-05, "loss": 2.6515, "step": 161500 }, { "epoch": 0.8951413715555016, "grad_norm": 4.76786994934082, "learning_rate": 3.898660969185714e-05, "loss": 2.6567, "step": 162000 }, { "epoch": 0.8979041535664753, "grad_norm": 8.788933753967285, "learning_rate": 3.8935446967789995e-05, "loss": 2.6491, "step": 162500 }, { "epoch": 0.900666935577449, "grad_norm": 5.806134223937988, "learning_rate": 3.888428424372285e-05, "loss": 2.638, "step": 163000 }, { "epoch": 0.9034297175884228, "grad_norm": 6.518142223358154, "learning_rate": 3.8833223845103836e-05, "loss": 2.7087, "step": 163500 }, { "epoch": 0.9061924995993966, "grad_norm": 5.603370189666748, "learning_rate": 3.878206112103668e-05, "loss": 2.6832, "step": 164000 }, { "epoch": 0.9089552816103704, "grad_norm": 4.990660667419434, "learning_rate": 3.873089839696953e-05, "loss": 2.6847, "step": 164500 }, { "epoch": 0.9117180636213441, "grad_norm": 7.145622730255127, "learning_rate": 3.8679735672902385e-05, "loss": 2.652, "step": 165000 }, { "epoch": 0.9117180636213441, "eval_runtime": 1419.0902, "eval_samples_per_second": 255.06, "eval_steps_per_second": 31.883, "step": 165000 }, { "epoch": 0.9144808456323179, "grad_norm": 6.253338813781738, "learning_rate": 3.8628675274283366e-05, "loss": 2.6564, "step": 165500 }, { "epoch": 0.9172436276432917, "grad_norm": 9.034163475036621, "learning_rate": 3.857761487566435e-05, "loss": 2.6881, "step": 166000 }, { "epoch": 0.9200064096542655, "grad_norm": 6.8109331130981445, "learning_rate": 3.85264521515972e-05, "loss": 2.6523, "step": 166500 }, { "epoch": 0.9227691916652392, "grad_norm": 6.663868427276611, "learning_rate": 3.8475289427530045e-05, "loss": 2.6115, "step": 167000 }, { "epoch": 0.925531973676213, "grad_norm": 6.121496677398682, "learning_rate": 3.84241267034629e-05, "loss": 2.6596, "step": 167500 }, { "epoch": 0.9282947556871868, "grad_norm": 7.0516533851623535, "learning_rate": 3.837296397939575e-05, "loss": 2.7253, "step": 168000 }, { "epoch": 0.9310575376981606, "grad_norm": 9.789468765258789, "learning_rate": 3.83218012553286e-05, "loss": 2.6783, "step": 168500 }, { "epoch": 0.9338203197091343, "grad_norm": 6.72811222076416, "learning_rate": 3.8270638531261446e-05, "loss": 2.6639, "step": 169000 }, { "epoch": 0.9365831017201081, "grad_norm": 6.445733070373535, "learning_rate": 3.82194758071943e-05, "loss": 2.6683, "step": 169500 }, { "epoch": 0.9393458837310819, "grad_norm": 6.840031623840332, "learning_rate": 3.816841540857528e-05, "loss": 2.6877, "step": 170000 }, { "epoch": 0.9393458837310819, "eval_runtime": 1422.4105, "eval_samples_per_second": 254.465, "eval_steps_per_second": 31.809, "step": 170000 }, { "epoch": 0.9421086657420557, "grad_norm": 7.416438579559326, "learning_rate": 3.811735500995627e-05, "loss": 2.6762, "step": 170500 }, { "epoch": 0.9448714477530293, "grad_norm": 7.09953498840332, "learning_rate": 3.806619228588912e-05, "loss": 2.6424, "step": 171000 }, { "epoch": 0.9476342297640031, "grad_norm": 4.984237194061279, "learning_rate": 3.801502956182197e-05, "loss": 2.615, "step": 171500 }, { "epoch": 0.9503970117749769, "grad_norm": 7.201922416687012, "learning_rate": 3.796386683775482e-05, "loss": 2.6765, "step": 172000 }, { "epoch": 0.9531597937859507, "grad_norm": 5.911804676055908, "learning_rate": 3.791270411368767e-05, "loss": 2.6948, "step": 172500 }, { "epoch": 0.9559225757969245, "grad_norm": 9.068735122680664, "learning_rate": 3.786154138962052e-05, "loss": 2.732, "step": 173000 }, { "epoch": 0.9586853578078982, "grad_norm": 5.268345355987549, "learning_rate": 3.781037866555337e-05, "loss": 2.6845, "step": 173500 }, { "epoch": 0.961448139818872, "grad_norm": 4.931880474090576, "learning_rate": 3.775921594148622e-05, "loss": 2.6898, "step": 174000 }, { "epoch": 0.9642109218298458, "grad_norm": 6.40797233581543, "learning_rate": 3.77081555428672e-05, "loss": 2.6438, "step": 174500 }, { "epoch": 0.9669737038408196, "grad_norm": 8.506610870361328, "learning_rate": 3.765699281880005e-05, "loss": 2.6475, "step": 175000 }, { "epoch": 0.9669737038408196, "eval_runtime": 1421.0183, "eval_samples_per_second": 254.714, "eval_steps_per_second": 31.84, "step": 175000 }, { "epoch": 0.9697364858517933, "grad_norm": 6.606350898742676, "learning_rate": 3.7605932420181033e-05, "loss": 2.649, "step": 175500 }, { "epoch": 0.9724992678627671, "grad_norm": 7.5227580070495605, "learning_rate": 3.7554769696113885e-05, "loss": 2.6666, "step": 176000 }, { "epoch": 0.9752620498737409, "grad_norm": 6.499021530151367, "learning_rate": 3.750360697204674e-05, "loss": 2.6652, "step": 176500 }, { "epoch": 0.9780248318847147, "grad_norm": 6.094892501831055, "learning_rate": 3.745244424797958e-05, "loss": 2.6776, "step": 177000 }, { "epoch": 0.9807876138956884, "grad_norm": 6.329995632171631, "learning_rate": 3.7401281523912435e-05, "loss": 2.7002, "step": 177500 }, { "epoch": 0.9835503959066622, "grad_norm": 7.394835472106934, "learning_rate": 3.7350118799845287e-05, "loss": 2.6647, "step": 178000 }, { "epoch": 0.986313177917636, "grad_norm": 9.899744987487793, "learning_rate": 3.729895607577814e-05, "loss": 2.6873, "step": 178500 }, { "epoch": 0.9890759599286097, "grad_norm": 8.495482444763184, "learning_rate": 3.7247793351710984e-05, "loss": 2.6574, "step": 179000 }, { "epoch": 0.9918387419395834, "grad_norm": 7.411177158355713, "learning_rate": 3.7196630627643836e-05, "loss": 2.6587, "step": 179500 }, { "epoch": 0.9946015239505572, "grad_norm": 6.457353591918945, "learning_rate": 3.714557022902482e-05, "loss": 2.6607, "step": 180000 }, { "epoch": 0.9946015239505572, "eval_runtime": 1418.3578, "eval_samples_per_second": 255.192, "eval_steps_per_second": 31.9, "step": 180000 }, { "epoch": 0.997364305961531, "grad_norm": 4.6210618019104, "learning_rate": 3.709440750495767e-05, "loss": 2.6894, "step": 180500 }, { "epoch": 1.0001270879725048, "grad_norm": 8.51563549041748, "learning_rate": 3.704324478089052e-05, "loss": 2.6624, "step": 181000 }, { "epoch": 1.0028898699834785, "grad_norm": 7.084454536437988, "learning_rate": 3.6992082056823366e-05, "loss": 2.644, "step": 181500 }, { "epoch": 1.0056526519944524, "grad_norm": 6.7502665519714355, "learning_rate": 3.6941021658204355e-05, "loss": 2.719, "step": 182000 }, { "epoch": 1.008415434005426, "grad_norm": 9.962002754211426, "learning_rate": 3.688985893413721e-05, "loss": 2.6581, "step": 182500 }, { "epoch": 1.0111782160163998, "grad_norm": 8.310935020446777, "learning_rate": 3.683869621007006e-05, "loss": 2.6834, "step": 183000 }, { "epoch": 1.0139409980273737, "grad_norm": 6.0927557945251465, "learning_rate": 3.6787533486002904e-05, "loss": 2.6788, "step": 183500 }, { "epoch": 1.0167037800383474, "grad_norm": 7.631921768188477, "learning_rate": 3.6736473087383886e-05, "loss": 2.6882, "step": 184000 }, { "epoch": 1.0194665620493213, "grad_norm": 11.251723289489746, "learning_rate": 3.668531036331674e-05, "loss": 2.6699, "step": 184500 }, { "epoch": 1.022229344060295, "grad_norm": 6.595937252044678, "learning_rate": 3.663414763924959e-05, "loss": 2.6798, "step": 185000 }, { "epoch": 1.022229344060295, "eval_runtime": 1415.123, "eval_samples_per_second": 255.775, "eval_steps_per_second": 31.972, "step": 185000 }, { "epoch": 1.0249921260712687, "grad_norm": 11.095263481140137, "learning_rate": 3.658308724063057e-05, "loss": 2.6569, "step": 185500 }, { "epoch": 1.0277549080822426, "grad_norm": 6.58620023727417, "learning_rate": 3.653192451656342e-05, "loss": 2.6731, "step": 186000 }, { "epoch": 1.0305176900932163, "grad_norm": 8.076006889343262, "learning_rate": 3.648076179249627e-05, "loss": 2.708, "step": 186500 }, { "epoch": 1.03328047210419, "grad_norm": 6.5196990966796875, "learning_rate": 3.642959906842912e-05, "loss": 2.6235, "step": 187000 }, { "epoch": 1.0360432541151638, "grad_norm": 5.743066787719727, "learning_rate": 3.637843634436197e-05, "loss": 2.6598, "step": 187500 }, { "epoch": 1.0388060361261375, "grad_norm": 5.8798112869262695, "learning_rate": 3.6327273620294824e-05, "loss": 2.6463, "step": 188000 }, { "epoch": 1.0415688181371114, "grad_norm": 5.750164031982422, "learning_rate": 3.627611089622767e-05, "loss": 2.6784, "step": 188500 }, { "epoch": 1.0443316001480851, "grad_norm": 10.507771492004395, "learning_rate": 3.622494817216052e-05, "loss": 2.6642, "step": 189000 }, { "epoch": 1.0470943821590588, "grad_norm": 6.932614803314209, "learning_rate": 3.61738877735415e-05, "loss": 2.6819, "step": 189500 }, { "epoch": 1.0498571641700327, "grad_norm": 6.338881969451904, "learning_rate": 3.6122725049474355e-05, "loss": 2.6597, "step": 190000 }, { "epoch": 1.0498571641700327, "eval_runtime": 1409.894, "eval_samples_per_second": 256.724, "eval_steps_per_second": 32.091, "step": 190000 }, { "epoch": 1.0526199461810064, "grad_norm": 6.120572090148926, "learning_rate": 3.607156232540721e-05, "loss": 2.6625, "step": 190500 }, { "epoch": 1.05538272819198, "grad_norm": 7.011288166046143, "learning_rate": 3.602039960134005e-05, "loss": 2.6853, "step": 191000 }, { "epoch": 1.058145510202954, "grad_norm": 6.725677013397217, "learning_rate": 3.5969236877272904e-05, "loss": 2.715, "step": 191500 }, { "epoch": 1.0609082922139277, "grad_norm": 5.5944132804870605, "learning_rate": 3.5918074153205756e-05, "loss": 2.6688, "step": 192000 }, { "epoch": 1.0636710742249016, "grad_norm": 5.993505954742432, "learning_rate": 3.5867013754586744e-05, "loss": 2.6747, "step": 192500 }, { "epoch": 1.0664338562358753, "grad_norm": 6.300167560577393, "learning_rate": 3.581585103051959e-05, "loss": 2.6711, "step": 193000 }, { "epoch": 1.069196638246849, "grad_norm": 4.657979965209961, "learning_rate": 3.5764688306452435e-05, "loss": 2.6409, "step": 193500 }, { "epoch": 1.0719594202578229, "grad_norm": 5.573739051818848, "learning_rate": 3.571352558238529e-05, "loss": 2.6507, "step": 194000 }, { "epoch": 1.0747222022687966, "grad_norm": 9.203325271606445, "learning_rate": 3.566236285831814e-05, "loss": 2.6963, "step": 194500 }, { "epoch": 1.0774849842797702, "grad_norm": 5.0021185874938965, "learning_rate": 3.561120013425099e-05, "loss": 2.6293, "step": 195000 }, { "epoch": 1.0774849842797702, "eval_runtime": 1412.7471, "eval_samples_per_second": 256.205, "eval_steps_per_second": 32.026, "step": 195000 }, { "epoch": 1.0802477662907441, "grad_norm": 5.483398914337158, "learning_rate": 3.5560037410183836e-05, "loss": 2.6449, "step": 195500 }, { "epoch": 1.0830105483017178, "grad_norm": 6.608130931854248, "learning_rate": 3.550887468611669e-05, "loss": 2.7054, "step": 196000 }, { "epoch": 1.0857733303126917, "grad_norm": 6.910079479217529, "learning_rate": 3.5457814287497676e-05, "loss": 2.6568, "step": 196500 }, { "epoch": 1.0885361123236654, "grad_norm": 4.9066619873046875, "learning_rate": 3.540665156343053e-05, "loss": 2.6738, "step": 197000 }, { "epoch": 1.0912988943346391, "grad_norm": 10.586669921875, "learning_rate": 3.535548883936337e-05, "loss": 2.6878, "step": 197500 }, { "epoch": 1.094061676345613, "grad_norm": 6.523357391357422, "learning_rate": 3.530432611529622e-05, "loss": 2.6964, "step": 198000 }, { "epoch": 1.0968244583565867, "grad_norm": 6.5124831199646, "learning_rate": 3.525326571667721e-05, "loss": 2.6358, "step": 198500 }, { "epoch": 1.0995872403675606, "grad_norm": 5.297051429748535, "learning_rate": 3.520220531805819e-05, "loss": 2.609, "step": 199000 }, { "epoch": 1.1023500223785343, "grad_norm": 9.636564254760742, "learning_rate": 3.515104259399104e-05, "loss": 2.6268, "step": 199500 }, { "epoch": 1.105112804389508, "grad_norm": 6.009031772613525, "learning_rate": 3.509987986992389e-05, "loss": 2.6668, "step": 200000 }, { "epoch": 1.105112804389508, "eval_runtime": 1415.3759, "eval_samples_per_second": 255.729, "eval_steps_per_second": 31.967, "step": 200000 }, { "epoch": 1.107875586400482, "grad_norm": 6.43049430847168, "learning_rate": 3.504871714585674e-05, "loss": 2.5873, "step": 200500 }, { "epoch": 1.1106383684114556, "grad_norm": 5.201763153076172, "learning_rate": 3.4997656747237726e-05, "loss": 2.6485, "step": 201000 }, { "epoch": 1.1134011504224293, "grad_norm": 8.699557304382324, "learning_rate": 3.494659634861871e-05, "loss": 2.6795, "step": 201500 }, { "epoch": 1.1161639324334032, "grad_norm": 6.473239421844482, "learning_rate": 3.489543362455156e-05, "loss": 2.6479, "step": 202000 }, { "epoch": 1.1189267144443769, "grad_norm": 5.315208435058594, "learning_rate": 3.484427090048441e-05, "loss": 2.6804, "step": 202500 }, { "epoch": 1.1216894964553508, "grad_norm": 5.999909400939941, "learning_rate": 3.4793108176417264e-05, "loss": 2.6287, "step": 203000 }, { "epoch": 1.1244522784663245, "grad_norm": 5.583693504333496, "learning_rate": 3.474194545235011e-05, "loss": 2.6928, "step": 203500 }, { "epoch": 1.1272150604772981, "grad_norm": 12.50479793548584, "learning_rate": 3.469078272828296e-05, "loss": 2.6234, "step": 204000 }, { "epoch": 1.129977842488272, "grad_norm": 7.096823692321777, "learning_rate": 3.463962000421581e-05, "loss": 2.6131, "step": 204500 }, { "epoch": 1.1327406244992457, "grad_norm": 8.93995475769043, "learning_rate": 3.4588457280148665e-05, "loss": 2.6765, "step": 205000 }, { "epoch": 1.1327406244992457, "eval_runtime": 1410.6732, "eval_samples_per_second": 256.582, "eval_steps_per_second": 32.073, "step": 205000 }, { "epoch": 1.1355034065102196, "grad_norm": 7.607180595397949, "learning_rate": 3.453729455608151e-05, "loss": 2.6506, "step": 205500 }, { "epoch": 1.1382661885211933, "grad_norm": 5.653107166290283, "learning_rate": 3.448613183201436e-05, "loss": 2.6471, "step": 206000 }, { "epoch": 1.141028970532167, "grad_norm": 5.887621879577637, "learning_rate": 3.4434969107947214e-05, "loss": 2.6733, "step": 206500 }, { "epoch": 1.143791752543141, "grad_norm": 6.376918315887451, "learning_rate": 3.438380638388006e-05, "loss": 2.6874, "step": 207000 }, { "epoch": 1.1465545345541146, "grad_norm": 5.099799156188965, "learning_rate": 3.433274598526105e-05, "loss": 2.6371, "step": 207500 }, { "epoch": 1.1493173165650883, "grad_norm": 6.011958122253418, "learning_rate": 3.428168558664203e-05, "loss": 2.6578, "step": 208000 }, { "epoch": 1.1520800985760622, "grad_norm": 6.049017429351807, "learning_rate": 3.4230522862574874e-05, "loss": 2.6667, "step": 208500 }, { "epoch": 1.1548428805870359, "grad_norm": 7.92437744140625, "learning_rate": 3.4179360138507726e-05, "loss": 2.6889, "step": 209000 }, { "epoch": 1.1576056625980098, "grad_norm": 6.145605087280273, "learning_rate": 3.412819741444058e-05, "loss": 2.6537, "step": 209500 }, { "epoch": 1.1603684446089835, "grad_norm": 7.511498928070068, "learning_rate": 3.407703469037343e-05, "loss": 2.6694, "step": 210000 }, { "epoch": 1.1603684446089835, "eval_runtime": 1410.6383, "eval_samples_per_second": 256.588, "eval_steps_per_second": 32.074, "step": 210000 }, { "epoch": 1.1631312266199572, "grad_norm": 13.444281578063965, "learning_rate": 3.4025871966306275e-05, "loss": 2.6653, "step": 210500 }, { "epoch": 1.165894008630931, "grad_norm": 6.2352399826049805, "learning_rate": 3.397470924223913e-05, "loss": 2.6504, "step": 211000 }, { "epoch": 1.1686567906419048, "grad_norm": 6.435050964355469, "learning_rate": 3.392354651817198e-05, "loss": 2.6625, "step": 211500 }, { "epoch": 1.1714195726528787, "grad_norm": 5.893118858337402, "learning_rate": 3.387248611955296e-05, "loss": 2.6887, "step": 212000 }, { "epoch": 1.1741823546638523, "grad_norm": 7.816985607147217, "learning_rate": 3.382132339548581e-05, "loss": 2.6655, "step": 212500 }, { "epoch": 1.176945136674826, "grad_norm": 6.382891654968262, "learning_rate": 3.377016067141866e-05, "loss": 2.675, "step": 213000 }, { "epoch": 1.1797079186858, "grad_norm": 5.761401653289795, "learning_rate": 3.371899794735151e-05, "loss": 2.6846, "step": 213500 }, { "epoch": 1.1824707006967736, "grad_norm": 6.107725620269775, "learning_rate": 3.366783522328436e-05, "loss": 2.6129, "step": 214000 }, { "epoch": 1.1852334827077473, "grad_norm": 6.5399394035339355, "learning_rate": 3.361677482466535e-05, "loss": 2.6943, "step": 214500 }, { "epoch": 1.1879962647187212, "grad_norm": 10.632125854492188, "learning_rate": 3.356571442604633e-05, "loss": 2.6334, "step": 215000 }, { "epoch": 1.1879962647187212, "eval_runtime": 1416.4537, "eval_samples_per_second": 255.535, "eval_steps_per_second": 31.942, "step": 215000 }, { "epoch": 1.190759046729695, "grad_norm": 8.258727073669434, "learning_rate": 3.351455170197918e-05, "loss": 2.6625, "step": 215500 }, { "epoch": 1.1935218287406686, "grad_norm": 6.275572776794434, "learning_rate": 3.346338897791203e-05, "loss": 2.6586, "step": 216000 }, { "epoch": 1.1962846107516425, "grad_norm": 5.8179144859313965, "learning_rate": 3.341222625384488e-05, "loss": 2.7107, "step": 216500 }, { "epoch": 1.1990473927626162, "grad_norm": 7.048791885375977, "learning_rate": 3.336106352977773e-05, "loss": 2.6541, "step": 217000 }, { "epoch": 1.20181017477359, "grad_norm": 5.731837272644043, "learning_rate": 3.330990080571058e-05, "loss": 2.6137, "step": 217500 }, { "epoch": 1.2045729567845638, "grad_norm": 6.205833911895752, "learning_rate": 3.325873808164343e-05, "loss": 2.6608, "step": 218000 }, { "epoch": 1.2073357387955375, "grad_norm": 7.904666900634766, "learning_rate": 3.320757535757628e-05, "loss": 2.6514, "step": 218500 }, { "epoch": 1.2100985208065114, "grad_norm": 7.223947525024414, "learning_rate": 3.3156514958957264e-05, "loss": 2.7057, "step": 219000 }, { "epoch": 1.212861302817485, "grad_norm": 5.2091569900512695, "learning_rate": 3.3105352234890116e-05, "loss": 2.6155, "step": 219500 }, { "epoch": 1.215624084828459, "grad_norm": 13.029759407043457, "learning_rate": 3.305418951082296e-05, "loss": 2.6393, "step": 220000 }, { "epoch": 1.215624084828459, "eval_runtime": 1409.2915, "eval_samples_per_second": 256.833, "eval_steps_per_second": 32.105, "step": 220000 }, { "epoch": 1.2183868668394326, "grad_norm": 5.841119766235352, "learning_rate": 3.300302678675581e-05, "loss": 2.6914, "step": 220500 }, { "epoch": 1.2211496488504063, "grad_norm": 5.692915916442871, "learning_rate": 3.2951966388136794e-05, "loss": 2.674, "step": 221000 }, { "epoch": 1.2239124308613802, "grad_norm": 6.484999656677246, "learning_rate": 3.2900803664069646e-05, "loss": 2.6225, "step": 221500 }, { "epoch": 1.226675212872354, "grad_norm": 8.07515811920166, "learning_rate": 3.28496409400025e-05, "loss": 2.6609, "step": 222000 }, { "epoch": 1.2294379948833276, "grad_norm": 7.687187194824219, "learning_rate": 3.2798478215935343e-05, "loss": 2.6666, "step": 222500 }, { "epoch": 1.2322007768943015, "grad_norm": 5.499644756317139, "learning_rate": 3.274741781731633e-05, "loss": 2.6853, "step": 223000 }, { "epoch": 1.2349635589052752, "grad_norm": 6.094354629516602, "learning_rate": 3.2696255093249184e-05, "loss": 2.665, "step": 223500 }, { "epoch": 1.2377263409162491, "grad_norm": 6.3320159912109375, "learning_rate": 3.2645092369182036e-05, "loss": 2.6958, "step": 224000 }, { "epoch": 1.2404891229272228, "grad_norm": 5.882307529449463, "learning_rate": 3.259392964511489e-05, "loss": 2.6822, "step": 224500 }, { "epoch": 1.2432519049381965, "grad_norm": 6.465645790100098, "learning_rate": 3.254276692104773e-05, "loss": 2.6648, "step": 225000 }, { "epoch": 1.2432519049381965, "eval_runtime": 1411.6928, "eval_samples_per_second": 256.396, "eval_steps_per_second": 32.05, "step": 225000 }, { "epoch": 1.2460146869491704, "grad_norm": 7.901124477386475, "learning_rate": 3.2491706522428715e-05, "loss": 2.6548, "step": 225500 }, { "epoch": 1.248777468960144, "grad_norm": 11.486516952514648, "learning_rate": 3.2440543798361567e-05, "loss": 2.6362, "step": 226000 }, { "epoch": 1.251540250971118, "grad_norm": 8.68649959564209, "learning_rate": 3.238938107429442e-05, "loss": 2.6175, "step": 226500 }, { "epoch": 1.2543030329820917, "grad_norm": 9.17063045501709, "learning_rate": 3.233821835022727e-05, "loss": 2.6319, "step": 227000 }, { "epoch": 1.2570658149930654, "grad_norm": 7.608591556549072, "learning_rate": 3.228715795160825e-05, "loss": 2.6814, "step": 227500 }, { "epoch": 1.2598285970040393, "grad_norm": 6.055707931518555, "learning_rate": 3.2236097552989234e-05, "loss": 2.6131, "step": 228000 }, { "epoch": 1.262591379015013, "grad_norm": 10.52354621887207, "learning_rate": 3.2184934828922086e-05, "loss": 2.6803, "step": 228500 }, { "epoch": 1.2653541610259866, "grad_norm": 4.819145679473877, "learning_rate": 3.213377210485494e-05, "loss": 2.6665, "step": 229000 }, { "epoch": 1.2681169430369605, "grad_norm": 6.71164608001709, "learning_rate": 3.208260938078779e-05, "loss": 2.6405, "step": 229500 }, { "epoch": 1.2708797250479342, "grad_norm": 6.727443218231201, "learning_rate": 3.2031446656720635e-05, "loss": 2.6668, "step": 230000 }, { "epoch": 1.2708797250479342, "eval_runtime": 1397.8356, "eval_samples_per_second": 258.938, "eval_steps_per_second": 32.368, "step": 230000 }, { "epoch": 1.273642507058908, "grad_norm": 6.408928871154785, "learning_rate": 3.198028393265348e-05, "loss": 2.6714, "step": 230500 }, { "epoch": 1.2764052890698818, "grad_norm": 7.555359363555908, "learning_rate": 3.192912120858633e-05, "loss": 2.6462, "step": 231000 }, { "epoch": 1.2791680710808555, "grad_norm": 7.9627909660339355, "learning_rate": 3.1877958484519184e-05, "loss": 2.6649, "step": 231500 }, { "epoch": 1.2819308530918294, "grad_norm": 5.883249759674072, "learning_rate": 3.182689808590017e-05, "loss": 2.6482, "step": 232000 }, { "epoch": 1.284693635102803, "grad_norm": 6.337319850921631, "learning_rate": 3.177573536183302e-05, "loss": 2.653, "step": 232500 }, { "epoch": 1.287456417113777, "grad_norm": 9.221954345703125, "learning_rate": 3.172457263776587e-05, "loss": 2.6429, "step": 233000 }, { "epoch": 1.2902191991247507, "grad_norm": 8.365209579467773, "learning_rate": 3.167340991369872e-05, "loss": 2.6898, "step": 233500 }, { "epoch": 1.2929819811357244, "grad_norm": 13.809211730957031, "learning_rate": 3.16223495150797e-05, "loss": 2.6034, "step": 234000 }, { "epoch": 1.2957447631466983, "grad_norm": 6.561621189117432, "learning_rate": 3.1571186791012555e-05, "loss": 2.6831, "step": 234500 }, { "epoch": 1.298507545157672, "grad_norm": 7.049484729766846, "learning_rate": 3.15200240669454e-05, "loss": 2.6741, "step": 235000 }, { "epoch": 1.298507545157672, "eval_runtime": 1410.5655, "eval_samples_per_second": 256.601, "eval_steps_per_second": 32.076, "step": 235000 }, { "epoch": 1.3012703271686457, "grad_norm": 7.707888126373291, "learning_rate": 3.146886134287825e-05, "loss": 2.6391, "step": 235500 }, { "epoch": 1.3040331091796196, "grad_norm": 8.038480758666992, "learning_rate": 3.1417800944259234e-05, "loss": 2.6347, "step": 236000 }, { "epoch": 1.3067958911905933, "grad_norm": 8.12757396697998, "learning_rate": 3.1366638220192086e-05, "loss": 2.6362, "step": 236500 }, { "epoch": 1.309558673201567, "grad_norm": 14.125542640686035, "learning_rate": 3.131547549612494e-05, "loss": 2.6423, "step": 237000 }, { "epoch": 1.3123214552125408, "grad_norm": 7.672112941741943, "learning_rate": 3.126431277205778e-05, "loss": 2.6628, "step": 237500 }, { "epoch": 1.3150842372235145, "grad_norm": 5.344297409057617, "learning_rate": 3.1213150047990635e-05, "loss": 2.6825, "step": 238000 }, { "epoch": 1.3178470192344884, "grad_norm": 5.527090549468994, "learning_rate": 3.116208964937162e-05, "loss": 2.6757, "step": 238500 }, { "epoch": 1.3206098012454621, "grad_norm": 6.49380350112915, "learning_rate": 3.1110926925304475e-05, "loss": 2.6798, "step": 239000 }, { "epoch": 1.323372583256436, "grad_norm": 6.0890069007873535, "learning_rate": 3.105976420123732e-05, "loss": 2.6273, "step": 239500 }, { "epoch": 1.3261353652674097, "grad_norm": 6.064700126647949, "learning_rate": 3.1008601477170166e-05, "loss": 2.6461, "step": 240000 }, { "epoch": 1.3261353652674097, "eval_runtime": 1411.2023, "eval_samples_per_second": 256.486, "eval_steps_per_second": 32.061, "step": 240000 }, { "epoch": 1.3288981472783834, "grad_norm": 8.562914848327637, "learning_rate": 3.0957541078551154e-05, "loss": 2.6699, "step": 240500 }, { "epoch": 1.3316609292893573, "grad_norm": 4.0414958000183105, "learning_rate": 3.0906378354484006e-05, "loss": 2.7082, "step": 241000 }, { "epoch": 1.334423711300331, "grad_norm": 6.873857021331787, "learning_rate": 3.085531795586499e-05, "loss": 2.6486, "step": 241500 }, { "epoch": 1.3371864933113047, "grad_norm": 7.180528163909912, "learning_rate": 3.080415523179784e-05, "loss": 2.6766, "step": 242000 }, { "epoch": 1.3399492753222786, "grad_norm": 3.9526586532592773, "learning_rate": 3.0752992507730685e-05, "loss": 2.6523, "step": 242500 }, { "epoch": 1.3427120573332523, "grad_norm": 7.868597030639648, "learning_rate": 3.070182978366354e-05, "loss": 2.7141, "step": 243000 }, { "epoch": 1.345474839344226, "grad_norm": 6.393147945404053, "learning_rate": 3.065066705959639e-05, "loss": 2.661, "step": 243500 }, { "epoch": 1.3482376213551999, "grad_norm": 6.155392646789551, "learning_rate": 3.059950433552924e-05, "loss": 2.6336, "step": 244000 }, { "epoch": 1.3510004033661736, "grad_norm": 5.36915922164917, "learning_rate": 3.0548341611462086e-05, "loss": 2.6183, "step": 244500 }, { "epoch": 1.3537631853771472, "grad_norm": 8.455395698547363, "learning_rate": 3.0497178887394938e-05, "loss": 2.6629, "step": 245000 }, { "epoch": 1.3537631853771472, "eval_runtime": 1408.8422, "eval_samples_per_second": 256.915, "eval_steps_per_second": 32.115, "step": 245000 }, { "epoch": 1.3565259673881211, "grad_norm": 7.414444446563721, "learning_rate": 3.044601616332779e-05, "loss": 2.6668, "step": 245500 }, { "epoch": 1.3592887493990948, "grad_norm": 5.821547031402588, "learning_rate": 3.039495576470877e-05, "loss": 2.6888, "step": 246000 }, { "epoch": 1.3620515314100687, "grad_norm": 6.702820301055908, "learning_rate": 3.034379304064162e-05, "loss": 2.6647, "step": 246500 }, { "epoch": 1.3648143134210424, "grad_norm": 8.23851203918457, "learning_rate": 3.0292630316574472e-05, "loss": 2.6668, "step": 247000 }, { "epoch": 1.3675770954320163, "grad_norm": 6.016136646270752, "learning_rate": 3.024146759250732e-05, "loss": 2.6509, "step": 247500 }, { "epoch": 1.37033987744299, "grad_norm": 6.836232662200928, "learning_rate": 3.019040719388831e-05, "loss": 2.6904, "step": 248000 }, { "epoch": 1.3731026594539637, "grad_norm": 7.973288059234619, "learning_rate": 3.013924446982116e-05, "loss": 2.6805, "step": 248500 }, { "epoch": 1.3758654414649376, "grad_norm": 6.736196517944336, "learning_rate": 3.0088081745754003e-05, "loss": 2.6456, "step": 249000 }, { "epoch": 1.3786282234759113, "grad_norm": 6.223706245422363, "learning_rate": 3.0036919021686855e-05, "loss": 2.6705, "step": 249500 }, { "epoch": 1.381391005486885, "grad_norm": 6.599213600158691, "learning_rate": 2.9985858623067843e-05, "loss": 2.6088, "step": 250000 }, { "epoch": 1.381391005486885, "eval_runtime": 1412.4913, "eval_samples_per_second": 256.251, "eval_steps_per_second": 32.032, "step": 250000 }, { "epoch": 1.384153787497859, "grad_norm": 5.85990571975708, "learning_rate": 2.993469589900069e-05, "loss": 2.6636, "step": 250500 }, { "epoch": 1.3869165695088326, "grad_norm": 8.427452087402344, "learning_rate": 2.9883533174933544e-05, "loss": 2.641, "step": 251000 }, { "epoch": 1.3896793515198063, "grad_norm": 4.338545799255371, "learning_rate": 2.9832472776314525e-05, "loss": 2.6391, "step": 251500 }, { "epoch": 1.3924421335307802, "grad_norm": 6.209786891937256, "learning_rate": 2.9781310052247374e-05, "loss": 2.6357, "step": 252000 }, { "epoch": 1.3952049155417539, "grad_norm": 5.6584672927856445, "learning_rate": 2.9730147328180226e-05, "loss": 2.6903, "step": 252500 }, { "epoch": 1.3979676975527278, "grad_norm": 6.956233501434326, "learning_rate": 2.9678984604113074e-05, "loss": 2.6452, "step": 253000 }, { "epoch": 1.4007304795637014, "grad_norm": 7.020050048828125, "learning_rate": 2.9627821880045926e-05, "loss": 2.707, "step": 253500 }, { "epoch": 1.4034932615746754, "grad_norm": 6.412283420562744, "learning_rate": 2.9576659155978775e-05, "loss": 2.5976, "step": 254000 }, { "epoch": 1.406256043585649, "grad_norm": 6.7848711013793945, "learning_rate": 2.9525598757359756e-05, "loss": 2.6319, "step": 254500 }, { "epoch": 1.4090188255966227, "grad_norm": 4.951188564300537, "learning_rate": 2.947443603329261e-05, "loss": 2.6127, "step": 255000 }, { "epoch": 1.4090188255966227, "eval_runtime": 1426.0103, "eval_samples_per_second": 253.822, "eval_steps_per_second": 31.728, "step": 255000 }, { "epoch": 1.4117816076075966, "grad_norm": 7.517430305480957, "learning_rate": 2.9423273309225457e-05, "loss": 2.6338, "step": 255500 }, { "epoch": 1.4145443896185703, "grad_norm": 8.3431978225708, "learning_rate": 2.937211058515831e-05, "loss": 2.6741, "step": 256000 }, { "epoch": 1.417307171629544, "grad_norm": 6.5295867919921875, "learning_rate": 2.9321050186539294e-05, "loss": 2.6293, "step": 256500 }, { "epoch": 1.420069953640518, "grad_norm": 7.589269638061523, "learning_rate": 2.9269887462472146e-05, "loss": 2.6028, "step": 257000 }, { "epoch": 1.4228327356514916, "grad_norm": 6.105846881866455, "learning_rate": 2.9218724738404994e-05, "loss": 2.6168, "step": 257500 }, { "epoch": 1.4255955176624653, "grad_norm": 5.840164661407471, "learning_rate": 2.9167562014337846e-05, "loss": 2.642, "step": 258000 }, { "epoch": 1.4283582996734392, "grad_norm": 7.00549840927124, "learning_rate": 2.911639929027069e-05, "loss": 2.6812, "step": 258500 }, { "epoch": 1.4311210816844129, "grad_norm": 8.726187705993652, "learning_rate": 2.906523656620354e-05, "loss": 2.6415, "step": 259000 }, { "epoch": 1.4338838636953868, "grad_norm": 7.919028282165527, "learning_rate": 2.9014073842136392e-05, "loss": 2.5798, "step": 259500 }, { "epoch": 1.4366466457063605, "grad_norm": 4.848925590515137, "learning_rate": 2.896291111806924e-05, "loss": 2.6991, "step": 260000 }, { "epoch": 1.4366466457063605, "eval_runtime": 1376.0192, "eval_samples_per_second": 263.044, "eval_steps_per_second": 32.881, "step": 260000 }, { "epoch": 1.4394094277173344, "grad_norm": 7.420814514160156, "learning_rate": 2.8911748394002093e-05, "loss": 2.6541, "step": 260500 }, { "epoch": 1.442172209728308, "grad_norm": 7.090695381164551, "learning_rate": 2.8860687995383078e-05, "loss": 2.6506, "step": 261000 }, { "epoch": 1.4449349917392817, "grad_norm": 6.3192338943481445, "learning_rate": 2.880952527131593e-05, "loss": 2.6555, "step": 261500 }, { "epoch": 1.4476977737502557, "grad_norm": 5.872584819793701, "learning_rate": 2.8758362547248778e-05, "loss": 2.615, "step": 262000 }, { "epoch": 1.4504605557612293, "grad_norm": 7.909795761108398, "learning_rate": 2.870719982318163e-05, "loss": 2.6214, "step": 262500 }, { "epoch": 1.453223337772203, "grad_norm": 6.419271469116211, "learning_rate": 2.8656139424562612e-05, "loss": 2.6541, "step": 263000 }, { "epoch": 1.455986119783177, "grad_norm": 8.628451347351074, "learning_rate": 2.860497670049546e-05, "loss": 2.6178, "step": 263500 }, { "epoch": 1.4587489017941506, "grad_norm": 6.384825706481934, "learning_rate": 2.8553813976428312e-05, "loss": 2.6617, "step": 264000 }, { "epoch": 1.4615116838051243, "grad_norm": 7.782327651977539, "learning_rate": 2.850265125236116e-05, "loss": 2.6056, "step": 264500 }, { "epoch": 1.4642744658160982, "grad_norm": 6.750179767608643, "learning_rate": 2.8451590853742143e-05, "loss": 2.6685, "step": 265000 }, { "epoch": 1.4642744658160982, "eval_runtime": 1431.562, "eval_samples_per_second": 252.838, "eval_steps_per_second": 31.605, "step": 265000 }, { "epoch": 1.467037247827072, "grad_norm": 6.400202751159668, "learning_rate": 2.8400428129674995e-05, "loss": 2.6467, "step": 265500 }, { "epoch": 1.4698000298380456, "grad_norm": 9.842098236083984, "learning_rate": 2.8349265405607843e-05, "loss": 2.5767, "step": 266000 }, { "epoch": 1.4725628118490195, "grad_norm": 6.3928680419921875, "learning_rate": 2.8298102681540695e-05, "loss": 2.643, "step": 266500 }, { "epoch": 1.4753255938599932, "grad_norm": 5.774998188018799, "learning_rate": 2.8246939957473544e-05, "loss": 2.7011, "step": 267000 }, { "epoch": 1.478088375870967, "grad_norm": 7.253933429718018, "learning_rate": 2.8195879558854525e-05, "loss": 2.6298, "step": 267500 }, { "epoch": 1.4808511578819408, "grad_norm": 6.2444658279418945, "learning_rate": 2.8144716834787377e-05, "loss": 2.661, "step": 268000 }, { "epoch": 1.4836139398929147, "grad_norm": 7.06601619720459, "learning_rate": 2.8093554110720226e-05, "loss": 2.6499, "step": 268500 }, { "epoch": 1.4863767219038884, "grad_norm": 8.167895317077637, "learning_rate": 2.8042391386653078e-05, "loss": 2.6404, "step": 269000 }, { "epoch": 1.489139503914862, "grad_norm": 6.798631191253662, "learning_rate": 2.7991330988034066e-05, "loss": 2.6585, "step": 269500 }, { "epoch": 1.491902285925836, "grad_norm": 6.69813346862793, "learning_rate": 2.7940168263966915e-05, "loss": 2.6658, "step": 270000 }, { "epoch": 1.491902285925836, "eval_runtime": 1470.9324, "eval_samples_per_second": 246.07, "eval_steps_per_second": 30.759, "step": 270000 }, { "epoch": 1.4946650679368096, "grad_norm": 6.37416410446167, "learning_rate": 2.7889005539899767e-05, "loss": 2.6653, "step": 270500 }, { "epoch": 1.4974278499477833, "grad_norm": 5.052603244781494, "learning_rate": 2.7837842815832615e-05, "loss": 2.6822, "step": 271000 }, { "epoch": 1.5001906319587572, "grad_norm": 7.3138861656188965, "learning_rate": 2.7786782417213597e-05, "loss": 2.637, "step": 271500 }, { "epoch": 1.502953413969731, "grad_norm": 12.509490013122559, "learning_rate": 2.773561969314645e-05, "loss": 2.6139, "step": 272000 }, { "epoch": 1.5057161959807046, "grad_norm": 8.668211936950684, "learning_rate": 2.7684456969079297e-05, "loss": 2.6967, "step": 272500 }, { "epoch": 1.5084789779916785, "grad_norm": 6.163717269897461, "learning_rate": 2.763339657046028e-05, "loss": 2.6344, "step": 273000 }, { "epoch": 1.5112417600026524, "grad_norm": 5.849397659301758, "learning_rate": 2.758223384639313e-05, "loss": 2.7212, "step": 273500 }, { "epoch": 1.5140045420136259, "grad_norm": 5.386920928955078, "learning_rate": 2.753107112232598e-05, "loss": 2.6579, "step": 274000 }, { "epoch": 1.5167673240245998, "grad_norm": 7.916058540344238, "learning_rate": 2.747990839825883e-05, "loss": 2.6906, "step": 274500 }, { "epoch": 1.5195301060355737, "grad_norm": 5.125283241271973, "learning_rate": 2.742874567419168e-05, "loss": 2.6266, "step": 275000 }, { "epoch": 1.5195301060355737, "eval_runtime": 1424.4614, "eval_samples_per_second": 254.098, "eval_steps_per_second": 31.763, "step": 275000 }, { "epoch": 1.5222928880465474, "grad_norm": 8.393288612365723, "learning_rate": 2.7377582950124532e-05, "loss": 2.6324, "step": 275500 }, { "epoch": 1.525055670057521, "grad_norm": 6.936960697174072, "learning_rate": 2.732642022605738e-05, "loss": 2.6751, "step": 276000 }, { "epoch": 1.527818452068495, "grad_norm": 7.31864595413208, "learning_rate": 2.7275257501990233e-05, "loss": 2.7046, "step": 276500 }, { "epoch": 1.5305812340794687, "grad_norm": 7.4950151443481445, "learning_rate": 2.7224197103371214e-05, "loss": 2.6602, "step": 277000 }, { "epoch": 1.5333440160904424, "grad_norm": 8.419631004333496, "learning_rate": 2.7173034379304063e-05, "loss": 2.6288, "step": 277500 }, { "epoch": 1.5361067981014163, "grad_norm": 4.573643684387207, "learning_rate": 2.7121871655236915e-05, "loss": 2.6075, "step": 278000 }, { "epoch": 1.53886958011239, "grad_norm": 6.746376991271973, "learning_rate": 2.7070708931169763e-05, "loss": 2.6455, "step": 278500 }, { "epoch": 1.5416323621233636, "grad_norm": 7.2786478996276855, "learning_rate": 2.7019546207102615e-05, "loss": 2.6467, "step": 279000 }, { "epoch": 1.5443951441343375, "grad_norm": 8.315926551818848, "learning_rate": 2.69684858084836e-05, "loss": 2.6329, "step": 279500 }, { "epoch": 1.5471579261453114, "grad_norm": 6.038636207580566, "learning_rate": 2.6917323084416452e-05, "loss": 2.6422, "step": 280000 }, { "epoch": 1.5471579261453114, "eval_runtime": 1397.1141, "eval_samples_per_second": 259.072, "eval_steps_per_second": 32.385, "step": 280000 }, { "epoch": 1.549920708156285, "grad_norm": 5.000608921051025, "learning_rate": 2.68661603603493e-05, "loss": 2.5991, "step": 280500 }, { "epoch": 1.5526834901672588, "grad_norm": 5.943995952606201, "learning_rate": 2.6814997636282153e-05, "loss": 2.5922, "step": 281000 }, { "epoch": 1.5554462721782327, "grad_norm": 8.255182266235352, "learning_rate": 2.6763834912214998e-05, "loss": 2.6483, "step": 281500 }, { "epoch": 1.5582090541892064, "grad_norm": 8.202108383178711, "learning_rate": 2.6712774513595983e-05, "loss": 2.6235, "step": 282000 }, { "epoch": 1.56097183620018, "grad_norm": 5.840571880340576, "learning_rate": 2.6661611789528835e-05, "loss": 2.652, "step": 282500 }, { "epoch": 1.563734618211154, "grad_norm": 5.939957141876221, "learning_rate": 2.6610449065461684e-05, "loss": 2.6501, "step": 283000 }, { "epoch": 1.5664974002221277, "grad_norm": 5.314937114715576, "learning_rate": 2.6559286341394536e-05, "loss": 2.656, "step": 283500 }, { "epoch": 1.5692601822331014, "grad_norm": 6.23870849609375, "learning_rate": 2.6508123617327384e-05, "loss": 2.6776, "step": 284000 }, { "epoch": 1.5720229642440753, "grad_norm": 6.62495231628418, "learning_rate": 2.6457063218708366e-05, "loss": 2.6351, "step": 284500 }, { "epoch": 1.574785746255049, "grad_norm": 6.557297706604004, "learning_rate": 2.6406002820089354e-05, "loss": 2.6637, "step": 285000 }, { "epoch": 1.574785746255049, "eval_runtime": 1399.3188, "eval_samples_per_second": 258.664, "eval_steps_per_second": 32.334, "step": 285000 }, { "epoch": 1.5775485282660227, "grad_norm": 7.950584411621094, "learning_rate": 2.6354840096022203e-05, "loss": 2.6292, "step": 285500 }, { "epoch": 1.5803113102769966, "grad_norm": 6.725704193115234, "learning_rate": 2.6303677371955055e-05, "loss": 2.64, "step": 286000 }, { "epoch": 1.5830740922879702, "grad_norm": 8.884140014648438, "learning_rate": 2.62525146478879e-05, "loss": 2.6596, "step": 286500 }, { "epoch": 1.585836874298944, "grad_norm": 6.812872409820557, "learning_rate": 2.620135192382075e-05, "loss": 2.6034, "step": 287000 }, { "epoch": 1.5885996563099178, "grad_norm": 7.91174840927124, "learning_rate": 2.61501891997536e-05, "loss": 2.6564, "step": 287500 }, { "epoch": 1.5913624383208917, "grad_norm": 5.403963565826416, "learning_rate": 2.6099128801134585e-05, "loss": 2.6303, "step": 288000 }, { "epoch": 1.5941252203318652, "grad_norm": 7.750992774963379, "learning_rate": 2.6047966077067437e-05, "loss": 2.6337, "step": 288500 }, { "epoch": 1.5968880023428391, "grad_norm": 8.077462196350098, "learning_rate": 2.5996803353000286e-05, "loss": 2.6428, "step": 289000 }, { "epoch": 1.599650784353813, "grad_norm": 5.293886184692383, "learning_rate": 2.5945640628933138e-05, "loss": 2.6432, "step": 289500 }, { "epoch": 1.6024135663647867, "grad_norm": 6.613586902618408, "learning_rate": 2.5894477904865986e-05, "loss": 2.618, "step": 290000 }, { "epoch": 1.6024135663647867, "eval_runtime": 1431.565, "eval_samples_per_second": 252.837, "eval_steps_per_second": 31.605, "step": 290000 }, { "epoch": 1.6051763483757604, "grad_norm": 7.696370601654053, "learning_rate": 2.584331518079884e-05, "loss": 2.6043, "step": 290500 }, { "epoch": 1.6079391303867343, "grad_norm": 14.686103820800781, "learning_rate": 2.579225478217982e-05, "loss": 2.6113, "step": 291000 }, { "epoch": 1.610701912397708, "grad_norm": 7.173743724822998, "learning_rate": 2.574109205811267e-05, "loss": 2.6496, "step": 291500 }, { "epoch": 1.6134646944086817, "grad_norm": 5.398017883300781, "learning_rate": 2.568992933404552e-05, "loss": 2.6616, "step": 292000 }, { "epoch": 1.6162274764196556, "grad_norm": 4.810672760009766, "learning_rate": 2.563876660997837e-05, "loss": 2.6557, "step": 292500 }, { "epoch": 1.6189902584306293, "grad_norm": 5.541525840759277, "learning_rate": 2.558760388591122e-05, "loss": 2.6657, "step": 293000 }, { "epoch": 1.621753040441603, "grad_norm": 6.207642555236816, "learning_rate": 2.5536543487292203e-05, "loss": 2.6438, "step": 293500 }, { "epoch": 1.6245158224525769, "grad_norm": 5.112069129943848, "learning_rate": 2.548538076322505e-05, "loss": 2.617, "step": 294000 }, { "epoch": 1.6272786044635508, "grad_norm": 5.147789001464844, "learning_rate": 2.5434218039157903e-05, "loss": 2.6407, "step": 294500 }, { "epoch": 1.6300413864745242, "grad_norm": 7.100889205932617, "learning_rate": 2.5383055315090752e-05, "loss": 2.6411, "step": 295000 }, { "epoch": 1.6300413864745242, "eval_runtime": 1371.2042, "eval_samples_per_second": 263.967, "eval_steps_per_second": 32.997, "step": 295000 }, { "epoch": 1.6328041684854981, "grad_norm": 8.297256469726562, "learning_rate": 2.5331892591023604e-05, "loss": 2.6792, "step": 295500 }, { "epoch": 1.635566950496472, "grad_norm": 7.450379371643066, "learning_rate": 2.5280729866956452e-05, "loss": 2.6567, "step": 296000 }, { "epoch": 1.6383297325074457, "grad_norm": 4.418615818023682, "learning_rate": 2.5229567142889304e-05, "loss": 2.5833, "step": 296500 }, { "epoch": 1.6410925145184194, "grad_norm": 8.853099822998047, "learning_rate": 2.5178404418822153e-05, "loss": 2.6617, "step": 297000 }, { "epoch": 1.6438552965293933, "grad_norm": 6.378116607666016, "learning_rate": 2.5127344020203138e-05, "loss": 2.6225, "step": 297500 }, { "epoch": 1.646618078540367, "grad_norm": 9.61796760559082, "learning_rate": 2.5076181296135987e-05, "loss": 2.7009, "step": 298000 }, { "epoch": 1.6493808605513407, "grad_norm": 6.160669803619385, "learning_rate": 2.502501857206884e-05, "loss": 2.667, "step": 298500 }, { "epoch": 1.6521436425623146, "grad_norm": 5.313681602478027, "learning_rate": 2.497395817344982e-05, "loss": 2.5937, "step": 299000 }, { "epoch": 1.6549064245732883, "grad_norm": 6.531844139099121, "learning_rate": 2.4922795449382672e-05, "loss": 2.6204, "step": 299500 }, { "epoch": 1.657669206584262, "grad_norm": 7.415525436401367, "learning_rate": 2.487163272531552e-05, "loss": 2.6015, "step": 300000 }, { "epoch": 1.657669206584262, "eval_runtime": 1437.4291, "eval_samples_per_second": 251.806, "eval_steps_per_second": 31.476, "step": 300000 }, { "epoch": 1.6604319885952359, "grad_norm": 7.35875129699707, "learning_rate": 2.4820470001248373e-05, "loss": 2.611, "step": 300500 }, { "epoch": 1.6631947706062098, "grad_norm": 6.453457355499268, "learning_rate": 2.476930727718122e-05, "loss": 2.6368, "step": 301000 }, { "epoch": 1.6659575526171833, "grad_norm": 5.64149808883667, "learning_rate": 2.4718144553114073e-05, "loss": 2.6352, "step": 301500 }, { "epoch": 1.6687203346281572, "grad_norm": 6.376221656799316, "learning_rate": 2.4666981829046922e-05, "loss": 2.6224, "step": 302000 }, { "epoch": 1.671483116639131, "grad_norm": 7.666605472564697, "learning_rate": 2.461581910497977e-05, "loss": 2.6664, "step": 302500 }, { "epoch": 1.6742458986501048, "grad_norm": 5.104877471923828, "learning_rate": 2.4564758706360755e-05, "loss": 2.5975, "step": 303000 }, { "epoch": 1.6770086806610784, "grad_norm": 14.055898666381836, "learning_rate": 2.4513595982293604e-05, "loss": 2.628, "step": 303500 }, { "epoch": 1.6797714626720524, "grad_norm": 9.033441543579102, "learning_rate": 2.4462433258226456e-05, "loss": 2.6281, "step": 304000 }, { "epoch": 1.682534244683026, "grad_norm": 12.3050537109375, "learning_rate": 2.4411270534159304e-05, "loss": 2.6701, "step": 304500 }, { "epoch": 1.6852970266939997, "grad_norm": 8.409795761108398, "learning_rate": 2.436021013554029e-05, "loss": 2.6023, "step": 305000 }, { "epoch": 1.6852970266939997, "eval_runtime": 1387.5318, "eval_samples_per_second": 260.861, "eval_steps_per_second": 32.608, "step": 305000 }, { "epoch": 1.6880598087049736, "grad_norm": 5.9802937507629395, "learning_rate": 2.4309149736921274e-05, "loss": 2.665, "step": 305500 }, { "epoch": 1.6908225907159473, "grad_norm": 6.1783270835876465, "learning_rate": 2.4257987012854126e-05, "loss": 2.6356, "step": 306000 }, { "epoch": 1.693585372726921, "grad_norm": 6.058241367340088, "learning_rate": 2.4206824288786975e-05, "loss": 2.6219, "step": 306500 }, { "epoch": 1.696348154737895, "grad_norm": 6.79514741897583, "learning_rate": 2.4155661564719824e-05, "loss": 2.6272, "step": 307000 }, { "epoch": 1.6991109367488686, "grad_norm": 9.22230052947998, "learning_rate": 2.4104498840652672e-05, "loss": 2.5919, "step": 307500 }, { "epoch": 1.7018737187598423, "grad_norm": 5.048295021057129, "learning_rate": 2.4053336116585524e-05, "loss": 2.6379, "step": 308000 }, { "epoch": 1.7046365007708162, "grad_norm": 7.282494068145752, "learning_rate": 2.4002173392518373e-05, "loss": 2.6391, "step": 308500 }, { "epoch": 1.70739928278179, "grad_norm": 6.831259727478027, "learning_rate": 2.3951010668451225e-05, "loss": 2.6218, "step": 309000 }, { "epoch": 1.7101620647927636, "grad_norm": 6.001838207244873, "learning_rate": 2.389995026983221e-05, "loss": 2.6372, "step": 309500 }, { "epoch": 1.7129248468037375, "grad_norm": 5.721564769744873, "learning_rate": 2.3848787545765058e-05, "loss": 2.7011, "step": 310000 }, { "epoch": 1.7129248468037375, "eval_runtime": 1416.3525, "eval_samples_per_second": 255.553, "eval_steps_per_second": 31.945, "step": 310000 }, { "epoch": 1.7156876288147114, "grad_norm": 7.709352970123291, "learning_rate": 2.379762482169791e-05, "loss": 2.6193, "step": 310500 }, { "epoch": 1.718450410825685, "grad_norm": 7.23681640625, "learning_rate": 2.374646209763076e-05, "loss": 2.6307, "step": 311000 }, { "epoch": 1.7212131928366587, "grad_norm": 6.505390167236328, "learning_rate": 2.3695401699011744e-05, "loss": 2.6409, "step": 311500 }, { "epoch": 1.7239759748476327, "grad_norm": 8.059307098388672, "learning_rate": 2.3644238974944592e-05, "loss": 2.6348, "step": 312000 }, { "epoch": 1.7267387568586063, "grad_norm": 7.6500749588012695, "learning_rate": 2.359307625087744e-05, "loss": 2.6435, "step": 312500 }, { "epoch": 1.72950153886958, "grad_norm": 9.657527923583984, "learning_rate": 2.3541913526810293e-05, "loss": 2.661, "step": 313000 }, { "epoch": 1.732264320880554, "grad_norm": 5.21886682510376, "learning_rate": 2.3490853128191275e-05, "loss": 2.6794, "step": 313500 }, { "epoch": 1.7350271028915276, "grad_norm": 6.33572244644165, "learning_rate": 2.3439690404124126e-05, "loss": 2.6372, "step": 314000 }, { "epoch": 1.7377898849025013, "grad_norm": 6.692564010620117, "learning_rate": 2.3388527680056975e-05, "loss": 2.6583, "step": 314500 }, { "epoch": 1.7405526669134752, "grad_norm": 5.055424690246582, "learning_rate": 2.3337364955989827e-05, "loss": 2.6386, "step": 315000 }, { "epoch": 1.7405526669134752, "eval_runtime": 1401.7623, "eval_samples_per_second": 258.213, "eval_steps_per_second": 32.277, "step": 315000 }, { "epoch": 1.7433154489244491, "grad_norm": 5.959291934967041, "learning_rate": 2.3286202231922676e-05, "loss": 2.6501, "step": 315500 }, { "epoch": 1.7460782309354226, "grad_norm": 7.027371406555176, "learning_rate": 2.323514183330366e-05, "loss": 2.6522, "step": 316000 }, { "epoch": 1.7488410129463965, "grad_norm": 6.8300557136535645, "learning_rate": 2.318397910923651e-05, "loss": 2.6386, "step": 316500 }, { "epoch": 1.7516037949573704, "grad_norm": 5.422798156738281, "learning_rate": 2.3132816385169358e-05, "loss": 2.6825, "step": 317000 }, { "epoch": 1.754366576968344, "grad_norm": 7.326968669891357, "learning_rate": 2.308165366110221e-05, "loss": 2.628, "step": 317500 }, { "epoch": 1.7571293589793178, "grad_norm": 6.498944282531738, "learning_rate": 2.3030490937035058e-05, "loss": 2.6193, "step": 318000 }, { "epoch": 1.7598921409902917, "grad_norm": 7.064229965209961, "learning_rate": 2.297953286386418e-05, "loss": 2.6416, "step": 318500 }, { "epoch": 1.7626549230012654, "grad_norm": 6.315282344818115, "learning_rate": 2.2928370139797028e-05, "loss": 2.6887, "step": 319000 }, { "epoch": 1.765417705012239, "grad_norm": 8.035995483398438, "learning_rate": 2.287720741572988e-05, "loss": 2.6238, "step": 319500 }, { "epoch": 1.768180487023213, "grad_norm": 7.513897895812988, "learning_rate": 2.282604469166273e-05, "loss": 2.626, "step": 320000 }, { "epoch": 1.768180487023213, "eval_runtime": 1385.5831, "eval_samples_per_second": 261.228, "eval_steps_per_second": 32.654, "step": 320000 }, { "epoch": 1.7709432690341866, "grad_norm": 6.0088090896606445, "learning_rate": 2.277488196759558e-05, "loss": 2.6455, "step": 320500 }, { "epoch": 1.7737060510451603, "grad_norm": 4.775638103485107, "learning_rate": 2.2723719243528426e-05, "loss": 2.6581, "step": 321000 }, { "epoch": 1.7764688330561342, "grad_norm": 5.797138690948486, "learning_rate": 2.2672556519461278e-05, "loss": 2.6307, "step": 321500 }, { "epoch": 1.7792316150671081, "grad_norm": 6.206060886383057, "learning_rate": 2.2621393795394127e-05, "loss": 2.6399, "step": 322000 }, { "epoch": 1.7819943970780816, "grad_norm": 6.536865711212158, "learning_rate": 2.257023107132698e-05, "loss": 2.6514, "step": 322500 }, { "epoch": 1.7847571790890555, "grad_norm": 5.105484962463379, "learning_rate": 2.2519170672707964e-05, "loss": 2.6481, "step": 323000 }, { "epoch": 1.7875199611000294, "grad_norm": 5.873786926269531, "learning_rate": 2.2468007948640812e-05, "loss": 2.6467, "step": 323500 }, { "epoch": 1.790282743111003, "grad_norm": 5.885590553283691, "learning_rate": 2.2416947550021797e-05, "loss": 2.6207, "step": 324000 }, { "epoch": 1.7930455251219768, "grad_norm": 9.117544174194336, "learning_rate": 2.2365784825954646e-05, "loss": 2.6161, "step": 324500 }, { "epoch": 1.7958083071329507, "grad_norm": 7.810193061828613, "learning_rate": 2.2314622101887498e-05, "loss": 2.6701, "step": 325000 }, { "epoch": 1.7958083071329507, "eval_runtime": 1431.4837, "eval_samples_per_second": 252.852, "eval_steps_per_second": 31.607, "step": 325000 }, { "epoch": 1.7985710891439244, "grad_norm": 5.142136573791504, "learning_rate": 2.2263459377820346e-05, "loss": 2.6457, "step": 325500 }, { "epoch": 1.801333871154898, "grad_norm": 6.691473960876465, "learning_rate": 2.2212398979201328e-05, "loss": 2.6182, "step": 326000 }, { "epoch": 1.804096653165872, "grad_norm": 10.478597640991211, "learning_rate": 2.216123625513418e-05, "loss": 2.6797, "step": 326500 }, { "epoch": 1.8068594351768457, "grad_norm": 6.4801554679870605, "learning_rate": 2.211007353106703e-05, "loss": 2.6638, "step": 327000 }, { "epoch": 1.8096222171878193, "grad_norm": 5.350027561187744, "learning_rate": 2.205891080699988e-05, "loss": 2.6383, "step": 327500 }, { "epoch": 1.8123849991987933, "grad_norm": 7.608794689178467, "learning_rate": 2.200774808293273e-05, "loss": 2.6547, "step": 328000 }, { "epoch": 1.815147781209767, "grad_norm": 7.434188365936279, "learning_rate": 2.195658535886558e-05, "loss": 2.6311, "step": 328500 }, { "epoch": 1.8179105632207406, "grad_norm": 5.700359344482422, "learning_rate": 2.190542263479843e-05, "loss": 2.641, "step": 329000 }, { "epoch": 1.8206733452317145, "grad_norm": 5.555663585662842, "learning_rate": 2.185425991073128e-05, "loss": 2.6202, "step": 329500 }, { "epoch": 1.8234361272426884, "grad_norm": 6.527945518493652, "learning_rate": 2.180309718666413e-05, "loss": 2.6888, "step": 330000 }, { "epoch": 1.8234361272426884, "eval_runtime": 1420.615, "eval_samples_per_second": 254.786, "eval_steps_per_second": 31.849, "step": 330000 }, { "epoch": 1.826198909253662, "grad_norm": 8.153132438659668, "learning_rate": 2.175193446259698e-05, "loss": 2.6267, "step": 330500 }, { "epoch": 1.8289616912646358, "grad_norm": 5.93485164642334, "learning_rate": 2.170077173852983e-05, "loss": 2.642, "step": 331000 }, { "epoch": 1.8317244732756097, "grad_norm": 7.894295692443848, "learning_rate": 2.164960901446268e-05, "loss": 2.6249, "step": 331500 }, { "epoch": 1.8344872552865834, "grad_norm": 7.009608268737793, "learning_rate": 2.159844629039553e-05, "loss": 2.6824, "step": 332000 }, { "epoch": 1.837250037297557, "grad_norm": 23.06879997253418, "learning_rate": 2.1547385891776513e-05, "loss": 2.6113, "step": 332500 }, { "epoch": 1.840012819308531, "grad_norm": 5.4768290519714355, "learning_rate": 2.1496223167709365e-05, "loss": 2.6372, "step": 333000 }, { "epoch": 1.8427756013195047, "grad_norm": 5.850235939025879, "learning_rate": 2.1445060443642213e-05, "loss": 2.6308, "step": 333500 }, { "epoch": 1.8455383833304784, "grad_norm": 6.897058963775635, "learning_rate": 2.1393897719575065e-05, "loss": 2.6435, "step": 334000 }, { "epoch": 1.8483011653414523, "grad_norm": 7.006948947906494, "learning_rate": 2.134283732095605e-05, "loss": 2.6398, "step": 334500 }, { "epoch": 1.851063947352426, "grad_norm": 5.789132595062256, "learning_rate": 2.1291674596888895e-05, "loss": 2.6249, "step": 335000 }, { "epoch": 1.851063947352426, "eval_runtime": 1441.3667, "eval_samples_per_second": 251.118, "eval_steps_per_second": 31.39, "step": 335000 }, { "epoch": 1.8538267293633997, "grad_norm": 7.754148006439209, "learning_rate": 2.1240511872821747e-05, "loss": 2.6279, "step": 335500 }, { "epoch": 1.8565895113743736, "grad_norm": 5.5116071701049805, "learning_rate": 2.1189349148754596e-05, "loss": 2.6443, "step": 336000 }, { "epoch": 1.8593522933853475, "grad_norm": 7.665276050567627, "learning_rate": 2.113828875013558e-05, "loss": 2.5898, "step": 336500 }, { "epoch": 1.862115075396321, "grad_norm": 6.607998371124268, "learning_rate": 2.1087126026068433e-05, "loss": 2.6224, "step": 337000 }, { "epoch": 1.8648778574072948, "grad_norm": 7.938060760498047, "learning_rate": 2.103596330200128e-05, "loss": 2.6023, "step": 337500 }, { "epoch": 1.8676406394182687, "grad_norm": 5.741148948669434, "learning_rate": 2.0984800577934133e-05, "loss": 2.6425, "step": 338000 }, { "epoch": 1.8704034214292424, "grad_norm": 7.50128173828125, "learning_rate": 2.093374017931512e-05, "loss": 2.6474, "step": 338500 }, { "epoch": 1.8731662034402161, "grad_norm": 5.097824573516846, "learning_rate": 2.0882577455247967e-05, "loss": 2.682, "step": 339000 }, { "epoch": 1.87592898545119, "grad_norm": 7.523733139038086, "learning_rate": 2.0831414731180816e-05, "loss": 2.6237, "step": 339500 }, { "epoch": 1.8786917674621637, "grad_norm": 10.524862289428711, "learning_rate": 2.0780252007113664e-05, "loss": 2.6293, "step": 340000 }, { "epoch": 1.8786917674621637, "eval_runtime": 1428.3224, "eval_samples_per_second": 253.411, "eval_steps_per_second": 31.677, "step": 340000 }, { "epoch": 1.8814545494731374, "grad_norm": 5.725772380828857, "learning_rate": 2.072919160849465e-05, "loss": 2.6238, "step": 340500 }, { "epoch": 1.8842173314841113, "grad_norm": 6.34156608581543, "learning_rate": 2.06780288844275e-05, "loss": 2.5932, "step": 341000 }, { "epoch": 1.886980113495085, "grad_norm": 9.06069278717041, "learning_rate": 2.062686616036035e-05, "loss": 2.6577, "step": 341500 }, { "epoch": 1.8897428955060587, "grad_norm": 7.7342329025268555, "learning_rate": 2.05757034362932e-05, "loss": 2.6486, "step": 342000 }, { "epoch": 1.8925056775170326, "grad_norm": 7.23144006729126, "learning_rate": 2.052454071222605e-05, "loss": 2.6266, "step": 342500 }, { "epoch": 1.8952684595280063, "grad_norm": 6.990833759307861, "learning_rate": 2.0473480313607035e-05, "loss": 2.6279, "step": 343000 }, { "epoch": 1.89803124153898, "grad_norm": 9.42507553100586, "learning_rate": 2.0422317589539884e-05, "loss": 2.61, "step": 343500 }, { "epoch": 1.9007940235499539, "grad_norm": 4.919162750244141, "learning_rate": 2.037125719092087e-05, "loss": 2.6752, "step": 344000 }, { "epoch": 1.9035568055609278, "grad_norm": 6.697198390960693, "learning_rate": 2.0320094466853717e-05, "loss": 2.6724, "step": 344500 }, { "epoch": 1.9063195875719015, "grad_norm": 8.001893043518066, "learning_rate": 2.026893174278657e-05, "loss": 2.6417, "step": 345000 }, { "epoch": 1.9063195875719015, "eval_runtime": 1396.1909, "eval_samples_per_second": 259.243, "eval_steps_per_second": 32.406, "step": 345000 }, { "epoch": 1.9090823695828751, "grad_norm": 6.076798915863037, "learning_rate": 2.0217769018719418e-05, "loss": 2.6188, "step": 345500 }, { "epoch": 1.911845151593849, "grad_norm": 6.7825398445129395, "learning_rate": 2.016660629465227e-05, "loss": 2.6218, "step": 346000 }, { "epoch": 1.9146079336048227, "grad_norm": 7.529403209686279, "learning_rate": 2.011544357058512e-05, "loss": 2.6696, "step": 346500 }, { "epoch": 1.9173707156157964, "grad_norm": 6.596738815307617, "learning_rate": 2.006428084651797e-05, "loss": 2.5924, "step": 347000 }, { "epoch": 1.9201334976267703, "grad_norm": 8.123401641845703, "learning_rate": 2.001311812245082e-05, "loss": 2.6367, "step": 347500 }, { "epoch": 1.922896279637744, "grad_norm": 7.292053699493408, "learning_rate": 1.996195539838367e-05, "loss": 2.6255, "step": 348000 }, { "epoch": 1.9256590616487177, "grad_norm": 8.276845932006836, "learning_rate": 1.9910894999764653e-05, "loss": 2.6787, "step": 348500 }, { "epoch": 1.9284218436596916, "grad_norm": 7.750123023986816, "learning_rate": 1.98597322756975e-05, "loss": 2.6608, "step": 349000 }, { "epoch": 1.9311846256706653, "grad_norm": 5.726837158203125, "learning_rate": 1.9808569551630353e-05, "loss": 2.6488, "step": 349500 }, { "epoch": 1.933947407681639, "grad_norm": 6.344818592071533, "learning_rate": 1.9757406827563202e-05, "loss": 2.6391, "step": 350000 }, { "epoch": 1.933947407681639, "eval_runtime": 1416.31, "eval_samples_per_second": 255.561, "eval_steps_per_second": 31.946, "step": 350000 }, { "epoch": 1.9367101896926129, "grad_norm": 6.042297840118408, "learning_rate": 1.9706346428944187e-05, "loss": 2.6575, "step": 350500 }, { "epoch": 1.9394729717035868, "grad_norm": 7.41777229309082, "learning_rate": 1.9655183704877035e-05, "loss": 2.6125, "step": 351000 }, { "epoch": 1.9422357537145603, "grad_norm": 7.600329875946045, "learning_rate": 1.960412330625802e-05, "loss": 2.5912, "step": 351500 }, { "epoch": 1.9449985357255342, "grad_norm": 8.74294376373291, "learning_rate": 1.9552960582190872e-05, "loss": 2.6447, "step": 352000 }, { "epoch": 1.947761317736508, "grad_norm": 7.785200595855713, "learning_rate": 1.950179785812372e-05, "loss": 2.6272, "step": 352500 }, { "epoch": 1.9505240997474818, "grad_norm": 5.1395263671875, "learning_rate": 1.9450635134056573e-05, "loss": 2.6262, "step": 353000 }, { "epoch": 1.9532868817584554, "grad_norm": 6.273059368133545, "learning_rate": 1.9399472409989418e-05, "loss": 2.6474, "step": 353500 }, { "epoch": 1.9560496637694293, "grad_norm": 7.929372787475586, "learning_rate": 1.9348412011370403e-05, "loss": 2.6075, "step": 354000 }, { "epoch": 1.958812445780403, "grad_norm": 5.680710792541504, "learning_rate": 1.9297249287303255e-05, "loss": 2.6348, "step": 354500 }, { "epoch": 1.9615752277913767, "grad_norm": 7.282249450683594, "learning_rate": 1.9246086563236104e-05, "loss": 2.6393, "step": 355000 }, { "epoch": 1.9615752277913767, "eval_runtime": 1435.1937, "eval_samples_per_second": 252.198, "eval_steps_per_second": 31.525, "step": 355000 }, { "epoch": 1.9643380098023506, "grad_norm": 7.032031536102295, "learning_rate": 1.9194923839168956e-05, "loss": 2.6573, "step": 355500 }, { "epoch": 1.9671007918133243, "grad_norm": 7.789410591125488, "learning_rate": 1.9143761115101804e-05, "loss": 2.6282, "step": 356000 }, { "epoch": 1.969863573824298, "grad_norm": 7.624570369720459, "learning_rate": 1.9092598391034656e-05, "loss": 2.6147, "step": 356500 }, { "epoch": 1.972626355835272, "grad_norm": 7.583735942840576, "learning_rate": 1.9041435666967505e-05, "loss": 2.6087, "step": 357000 }, { "epoch": 1.9753891378462458, "grad_norm": 7.545061111450195, "learning_rate": 1.8990272942900357e-05, "loss": 2.6363, "step": 357500 }, { "epoch": 1.9781519198572193, "grad_norm": 8.852106094360352, "learning_rate": 1.8939110218833202e-05, "loss": 2.6375, "step": 358000 }, { "epoch": 1.9809147018681932, "grad_norm": 9.523889541625977, "learning_rate": 1.8888049820214187e-05, "loss": 2.6647, "step": 358500 }, { "epoch": 1.983677483879167, "grad_norm": 5.024425029754639, "learning_rate": 1.883688709614704e-05, "loss": 2.6493, "step": 359000 }, { "epoch": 1.9864402658901408, "grad_norm": 6.810407638549805, "learning_rate": 1.8785724372079887e-05, "loss": 2.6589, "step": 359500 }, { "epoch": 1.9892030479011145, "grad_norm": 7.492327690124512, "learning_rate": 1.873456164801274e-05, "loss": 2.6184, "step": 360000 }, { "epoch": 1.9892030479011145, "eval_runtime": 1381.387, "eval_samples_per_second": 262.021, "eval_steps_per_second": 32.753, "step": 360000 }, { "epoch": 1.9919658299120884, "grad_norm": 7.694727420806885, "learning_rate": 1.8683501249393724e-05, "loss": 2.6087, "step": 360500 }, { "epoch": 1.994728611923062, "grad_norm": 6.121093273162842, "learning_rate": 1.8632338525326573e-05, "loss": 2.6543, "step": 361000 }, { "epoch": 1.9974913939340357, "grad_norm": 11.24258041381836, "learning_rate": 1.8581175801259425e-05, "loss": 2.5845, "step": 361500 }, { "epoch": 2.0002541759450096, "grad_norm": 6.372257709503174, "learning_rate": 1.8530013077192273e-05, "loss": 2.6031, "step": 362000 }, { "epoch": 2.0030169579559836, "grad_norm": 6.463737964630127, "learning_rate": 1.847895267857326e-05, "loss": 2.6096, "step": 362500 }, { "epoch": 2.005779739966957, "grad_norm": 5.803626537322998, "learning_rate": 1.8427789954506107e-05, "loss": 2.6181, "step": 363000 }, { "epoch": 2.008542521977931, "grad_norm": 6.860798358917236, "learning_rate": 1.8376627230438956e-05, "loss": 2.6372, "step": 363500 }, { "epoch": 2.011305303988905, "grad_norm": 6.21894645690918, "learning_rate": 1.832556683181994e-05, "loss": 2.5802, "step": 364000 }, { "epoch": 2.0140680859998783, "grad_norm": 5.939208507537842, "learning_rate": 1.827440410775279e-05, "loss": 2.6489, "step": 364500 }, { "epoch": 2.016830868010852, "grad_norm": 11.368240356445312, "learning_rate": 1.822324138368564e-05, "loss": 2.665, "step": 365000 }, { "epoch": 2.016830868010852, "eval_runtime": 1404.9151, "eval_samples_per_second": 257.633, "eval_steps_per_second": 32.205, "step": 365000 }, { "epoch": 2.019593650021826, "grad_norm": 6.604458808898926, "learning_rate": 1.817207865961849e-05, "loss": 2.6483, "step": 365500 }, { "epoch": 2.0223564320327996, "grad_norm": 9.250690460205078, "learning_rate": 1.812091593555134e-05, "loss": 2.5957, "step": 366000 }, { "epoch": 2.0251192140437735, "grad_norm": 4.720149517059326, "learning_rate": 1.806975321148419e-05, "loss": 2.6207, "step": 366500 }, { "epoch": 2.0278819960547474, "grad_norm": 4.749586582183838, "learning_rate": 1.801859048741704e-05, "loss": 2.5841, "step": 367000 }, { "epoch": 2.030644778065721, "grad_norm": 9.431694984436035, "learning_rate": 1.796742776334989e-05, "loss": 2.645, "step": 367500 }, { "epoch": 2.0334075600766948, "grad_norm": 9.072124481201172, "learning_rate": 1.7916367364730872e-05, "loss": 2.6951, "step": 368000 }, { "epoch": 2.0361703420876687, "grad_norm": 5.056208610534668, "learning_rate": 1.7865204640663724e-05, "loss": 2.5918, "step": 368500 }, { "epoch": 2.0389331240986426, "grad_norm": 6.752665996551514, "learning_rate": 1.7814041916596573e-05, "loss": 2.6602, "step": 369000 }, { "epoch": 2.041695906109616, "grad_norm": 10.628358840942383, "learning_rate": 1.7762879192529425e-05, "loss": 2.6218, "step": 369500 }, { "epoch": 2.04445868812059, "grad_norm": 6.070361614227295, "learning_rate": 1.7711716468462274e-05, "loss": 2.7051, "step": 370000 }, { "epoch": 2.04445868812059, "eval_runtime": 1429.0226, "eval_samples_per_second": 253.287, "eval_steps_per_second": 31.662, "step": 370000 }, { "epoch": 2.047221470131564, "grad_norm": 7.342000961303711, "learning_rate": 1.766065606984326e-05, "loss": 2.6368, "step": 370500 }, { "epoch": 2.0499842521425373, "grad_norm": 10.825027465820312, "learning_rate": 1.760949334577611e-05, "loss": 2.6431, "step": 371000 }, { "epoch": 2.0527470341535112, "grad_norm": 5.528331756591797, "learning_rate": 1.755833062170896e-05, "loss": 2.6038, "step": 371500 }, { "epoch": 2.055509816164485, "grad_norm": 5.892696380615234, "learning_rate": 1.7507167897641808e-05, "loss": 2.6242, "step": 372000 }, { "epoch": 2.0582725981754586, "grad_norm": 6.996720790863037, "learning_rate": 1.7456107499022793e-05, "loss": 2.6515, "step": 372500 }, { "epoch": 2.0610353801864325, "grad_norm": 6.8381757736206055, "learning_rate": 1.740494477495564e-05, "loss": 2.6302, "step": 373000 }, { "epoch": 2.0637981621974064, "grad_norm": 8.656445503234863, "learning_rate": 1.7353782050888493e-05, "loss": 2.6484, "step": 373500 }, { "epoch": 2.06656094420838, "grad_norm": 6.725839138031006, "learning_rate": 1.7302619326821342e-05, "loss": 2.6319, "step": 374000 }, { "epoch": 2.069323726219354, "grad_norm": 5.520457744598389, "learning_rate": 1.7251558928202327e-05, "loss": 2.626, "step": 374500 }, { "epoch": 2.0720865082303277, "grad_norm": 5.802083969116211, "learning_rate": 1.720039620413518e-05, "loss": 2.6161, "step": 375000 }, { "epoch": 2.0720865082303277, "eval_runtime": 1404.6799, "eval_samples_per_second": 257.676, "eval_steps_per_second": 32.21, "step": 375000 }, { "epoch": 2.0748492902413016, "grad_norm": 6.6860551834106445, "learning_rate": 1.7149233480068027e-05, "loss": 2.6162, "step": 375500 }, { "epoch": 2.077612072252275, "grad_norm": 6.858133792877197, "learning_rate": 1.709807075600088e-05, "loss": 2.6101, "step": 376000 }, { "epoch": 2.080374854263249, "grad_norm": 6.421977996826172, "learning_rate": 1.7046908031933724e-05, "loss": 2.6648, "step": 376500 }, { "epoch": 2.083137636274223, "grad_norm": 5.524794578552246, "learning_rate": 1.699584763331471e-05, "loss": 2.616, "step": 377000 }, { "epoch": 2.0859004182851963, "grad_norm": 11.743040084838867, "learning_rate": 1.694468490924756e-05, "loss": 2.5836, "step": 377500 }, { "epoch": 2.0886632002961703, "grad_norm": 11.61206340789795, "learning_rate": 1.689352218518041e-05, "loss": 2.6349, "step": 378000 }, { "epoch": 2.091425982307144, "grad_norm": 10.494318962097168, "learning_rate": 1.6842359461113262e-05, "loss": 2.6134, "step": 378500 }, { "epoch": 2.0941887643181176, "grad_norm": 5.6387619972229, "learning_rate": 1.679119673704611e-05, "loss": 2.6245, "step": 379000 }, { "epoch": 2.0969515463290915, "grad_norm": 8.307207107543945, "learning_rate": 1.6740136338427096e-05, "loss": 2.6092, "step": 379500 }, { "epoch": 2.0997143283400654, "grad_norm": 8.203621864318848, "learning_rate": 1.6688973614359944e-05, "loss": 2.6709, "step": 380000 }, { "epoch": 2.0997143283400654, "eval_runtime": 1387.6269, "eval_samples_per_second": 260.843, "eval_steps_per_second": 32.606, "step": 380000 }, { "epoch": 2.102477110351039, "grad_norm": 7.687305927276611, "learning_rate": 1.6637810890292796e-05, "loss": 2.6405, "step": 380500 }, { "epoch": 2.105239892362013, "grad_norm": 5.860333442687988, "learning_rate": 1.6586648166225645e-05, "loss": 2.6511, "step": 381000 }, { "epoch": 2.1080026743729867, "grad_norm": 6.413809299468994, "learning_rate": 1.6535587767606626e-05, "loss": 2.634, "step": 381500 }, { "epoch": 2.11076545638396, "grad_norm": 6.205860137939453, "learning_rate": 1.6484425043539478e-05, "loss": 2.6624, "step": 382000 }, { "epoch": 2.113528238394934, "grad_norm": 7.94976806640625, "learning_rate": 1.6433262319472327e-05, "loss": 2.592, "step": 382500 }, { "epoch": 2.116291020405908, "grad_norm": 6.3407793045043945, "learning_rate": 1.638209959540518e-05, "loss": 2.6391, "step": 383000 }, { "epoch": 2.119053802416882, "grad_norm": 5.911262512207031, "learning_rate": 1.6331039196786164e-05, "loss": 2.6936, "step": 383500 }, { "epoch": 2.1218165844278554, "grad_norm": 6.195751667022705, "learning_rate": 1.6279876472719012e-05, "loss": 2.6127, "step": 384000 }, { "epoch": 2.1245793664388293, "grad_norm": 7.307173252105713, "learning_rate": 1.6228713748651864e-05, "loss": 2.6623, "step": 384500 }, { "epoch": 2.127342148449803, "grad_norm": 7.353754043579102, "learning_rate": 1.6177551024584713e-05, "loss": 2.6114, "step": 385000 }, { "epoch": 2.127342148449803, "eval_runtime": 1366.6131, "eval_samples_per_second": 264.854, "eval_steps_per_second": 33.107, "step": 385000 }, { "epoch": 2.1301049304607766, "grad_norm": 6.172619342803955, "learning_rate": 1.6126490625965698e-05, "loss": 2.6657, "step": 385500 }, { "epoch": 2.1328677124717506, "grad_norm": 7.605554580688477, "learning_rate": 1.6075327901898546e-05, "loss": 2.6024, "step": 386000 }, { "epoch": 2.1356304944827245, "grad_norm": 8.817626953125, "learning_rate": 1.6024165177831395e-05, "loss": 2.6079, "step": 386500 }, { "epoch": 2.138393276493698, "grad_norm": 7.332306861877441, "learning_rate": 1.5973002453764247e-05, "loss": 2.6872, "step": 387000 }, { "epoch": 2.141156058504672, "grad_norm": 4.464954853057861, "learning_rate": 1.5921839729697096e-05, "loss": 2.6496, "step": 387500 }, { "epoch": 2.1439188405156457, "grad_norm": 5.7703962326049805, "learning_rate": 1.5870677005629948e-05, "loss": 2.6303, "step": 388000 }, { "epoch": 2.146681622526619, "grad_norm": 7.109230041503906, "learning_rate": 1.5819616607010933e-05, "loss": 2.6188, "step": 388500 }, { "epoch": 2.149444404537593, "grad_norm": 10.503727912902832, "learning_rate": 1.576845388294378e-05, "loss": 2.6147, "step": 389000 }, { "epoch": 2.152207186548567, "grad_norm": 7.042636394500732, "learning_rate": 1.5717291158876633e-05, "loss": 2.6058, "step": 389500 }, { "epoch": 2.1549699685595405, "grad_norm": 6.6826252937316895, "learning_rate": 1.5666230760257615e-05, "loss": 2.6158, "step": 390000 }, { "epoch": 2.1549699685595405, "eval_runtime": 1433.6301, "eval_samples_per_second": 252.473, "eval_steps_per_second": 31.56, "step": 390000 }, { "epoch": 2.1577327505705144, "grad_norm": 8.209315299987793, "learning_rate": 1.5615068036190463e-05, "loss": 2.648, "step": 390500 }, { "epoch": 2.1604955325814883, "grad_norm": 6.613926887512207, "learning_rate": 1.5563905312123315e-05, "loss": 2.6458, "step": 391000 }, { "epoch": 2.163258314592462, "grad_norm": 7.016421794891357, "learning_rate": 1.5512742588056164e-05, "loss": 2.6386, "step": 391500 }, { "epoch": 2.1660210966034357, "grad_norm": 6.227564811706543, "learning_rate": 1.5461579863989016e-05, "loss": 2.6648, "step": 392000 }, { "epoch": 2.1687838786144096, "grad_norm": 6.598555088043213, "learning_rate": 1.5410417139921864e-05, "loss": 2.6187, "step": 392500 }, { "epoch": 2.1715466606253835, "grad_norm": 8.013922691345215, "learning_rate": 1.5359254415854716e-05, "loss": 2.6079, "step": 393000 }, { "epoch": 2.174309442636357, "grad_norm": 5.305454730987549, "learning_rate": 1.53081940172357e-05, "loss": 2.6232, "step": 393500 }, { "epoch": 2.177072224647331, "grad_norm": 7.661605358123779, "learning_rate": 1.525703129316855e-05, "loss": 2.6102, "step": 394000 }, { "epoch": 2.1798350066583048, "grad_norm": 7.636397838592529, "learning_rate": 1.52058685691014e-05, "loss": 2.635, "step": 394500 }, { "epoch": 2.1825977886692782, "grad_norm": 9.935632705688477, "learning_rate": 1.5154705845034247e-05, "loss": 2.5679, "step": 395000 }, { "epoch": 2.1825977886692782, "eval_runtime": 1450.0546, "eval_samples_per_second": 249.613, "eval_steps_per_second": 31.202, "step": 395000 }, { "epoch": 2.185360570680252, "grad_norm": 6.385195255279541, "learning_rate": 1.5103543120967097e-05, "loss": 2.6055, "step": 395500 }, { "epoch": 2.188123352691226, "grad_norm": 4.359088897705078, "learning_rate": 1.5052380396899948e-05, "loss": 2.6116, "step": 396000 }, { "epoch": 2.1908861347021995, "grad_norm": 6.967292308807373, "learning_rate": 1.5001217672832798e-05, "loss": 2.5738, "step": 396500 }, { "epoch": 2.1936489167131734, "grad_norm": 5.064013481140137, "learning_rate": 1.4950054948765648e-05, "loss": 2.59, "step": 397000 }, { "epoch": 2.1964116987241473, "grad_norm": 7.287230014801025, "learning_rate": 1.4898892224698498e-05, "loss": 2.5935, "step": 397500 }, { "epoch": 2.1991744807351212, "grad_norm": 5.277096271514893, "learning_rate": 1.4847831826079483e-05, "loss": 2.6603, "step": 398000 }, { "epoch": 2.2019372627460947, "grad_norm": 5.027023792266846, "learning_rate": 1.4796771427460468e-05, "loss": 2.6749, "step": 398500 }, { "epoch": 2.2047000447570686, "grad_norm": 5.998363494873047, "learning_rate": 1.4745608703393319e-05, "loss": 2.6365, "step": 399000 }, { "epoch": 2.2074628267680425, "grad_norm": 5.35511589050293, "learning_rate": 1.4694445979326169e-05, "loss": 2.6342, "step": 399500 }, { "epoch": 2.210225608779016, "grad_norm": 4.967937469482422, "learning_rate": 1.4643283255259016e-05, "loss": 2.6388, "step": 400000 }, { "epoch": 2.210225608779016, "eval_runtime": 1400.0792, "eval_samples_per_second": 258.523, "eval_steps_per_second": 32.316, "step": 400000 }, { "epoch": 2.21298839078999, "grad_norm": 5.5652055740356445, "learning_rate": 1.4592120531191866e-05, "loss": 2.629, "step": 400500 }, { "epoch": 2.215751172800964, "grad_norm": 6.4792962074279785, "learning_rate": 1.4540957807124716e-05, "loss": 2.6253, "step": 401000 }, { "epoch": 2.2185139548119372, "grad_norm": 5.494840621948242, "learning_rate": 1.4489795083057567e-05, "loss": 2.5912, "step": 401500 }, { "epoch": 2.221276736822911, "grad_norm": 6.507066249847412, "learning_rate": 1.4438632358990417e-05, "loss": 2.5945, "step": 402000 }, { "epoch": 2.224039518833885, "grad_norm": 5.496526718139648, "learning_rate": 1.4387571960371402e-05, "loss": 2.6173, "step": 402500 }, { "epoch": 2.2268023008448585, "grad_norm": 6.222531795501709, "learning_rate": 1.4336511561752385e-05, "loss": 2.6482, "step": 403000 }, { "epoch": 2.2295650828558324, "grad_norm": 5.230762481689453, "learning_rate": 1.4285348837685236e-05, "loss": 2.6678, "step": 403500 }, { "epoch": 2.2323278648668063, "grad_norm": 7.752573490142822, "learning_rate": 1.4234186113618086e-05, "loss": 2.6552, "step": 404000 }, { "epoch": 2.2350906468777803, "grad_norm": 6.026094436645508, "learning_rate": 1.4183023389550934e-05, "loss": 2.6387, "step": 404500 }, { "epoch": 2.2378534288887537, "grad_norm": 8.043586730957031, "learning_rate": 1.4131962990931918e-05, "loss": 2.6137, "step": 405000 }, { "epoch": 2.2378534288887537, "eval_runtime": 1371.0294, "eval_samples_per_second": 264.001, "eval_steps_per_second": 33.001, "step": 405000 }, { "epoch": 2.2406162108997276, "grad_norm": 13.741228103637695, "learning_rate": 1.4080800266864768e-05, "loss": 2.621, "step": 405500 }, { "epoch": 2.2433789929107015, "grad_norm": 7.889692306518555, "learning_rate": 1.4029637542797618e-05, "loss": 2.645, "step": 406000 }, { "epoch": 2.246141774921675, "grad_norm": 7.462569236755371, "learning_rate": 1.3978474818730469e-05, "loss": 2.5896, "step": 406500 }, { "epoch": 2.248904556932649, "grad_norm": 6.514028072357178, "learning_rate": 1.3927312094663319e-05, "loss": 2.6266, "step": 407000 }, { "epoch": 2.251667338943623, "grad_norm": 6.088305950164795, "learning_rate": 1.3876149370596169e-05, "loss": 2.5982, "step": 407500 }, { "epoch": 2.2544301209545963, "grad_norm": 8.39070987701416, "learning_rate": 1.382498664652902e-05, "loss": 2.6371, "step": 408000 }, { "epoch": 2.25719290296557, "grad_norm": 7.8665361404418945, "learning_rate": 1.377382392246187e-05, "loss": 2.621, "step": 408500 }, { "epoch": 2.259955684976544, "grad_norm": 6.9274373054504395, "learning_rate": 1.3722661198394718e-05, "loss": 2.6896, "step": 409000 }, { "epoch": 2.2627184669875176, "grad_norm": 7.833282947540283, "learning_rate": 1.3671498474327568e-05, "loss": 2.6513, "step": 409500 }, { "epoch": 2.2654812489984915, "grad_norm": 7.596132278442383, "learning_rate": 1.3620438075708552e-05, "loss": 2.6452, "step": 410000 }, { "epoch": 2.2654812489984915, "eval_runtime": 1432.6263, "eval_samples_per_second": 252.65, "eval_steps_per_second": 31.582, "step": 410000 }, { "epoch": 2.2682440310094654, "grad_norm": 7.686011791229248, "learning_rate": 1.3569275351641402e-05, "loss": 2.6487, "step": 410500 }, { "epoch": 2.2710068130204393, "grad_norm": 10.180373191833496, "learning_rate": 1.3518112627574252e-05, "loss": 2.6355, "step": 411000 }, { "epoch": 2.2737695950314127, "grad_norm": 5.1875410079956055, "learning_rate": 1.3466949903507103e-05, "loss": 2.6251, "step": 411500 }, { "epoch": 2.2765323770423866, "grad_norm": 5.864450931549072, "learning_rate": 1.3415787179439953e-05, "loss": 2.5926, "step": 412000 }, { "epoch": 2.2792951590533606, "grad_norm": 6.403237342834473, "learning_rate": 1.3364624455372803e-05, "loss": 2.5844, "step": 412500 }, { "epoch": 2.282057941064334, "grad_norm": 6.299551963806152, "learning_rate": 1.3313461731305653e-05, "loss": 2.6534, "step": 413000 }, { "epoch": 2.284820723075308, "grad_norm": 5.631259441375732, "learning_rate": 1.3262299007238502e-05, "loss": 2.6481, "step": 413500 }, { "epoch": 2.287583505086282, "grad_norm": 6.804217338562012, "learning_rate": 1.3211136283171352e-05, "loss": 2.6417, "step": 414000 }, { "epoch": 2.2903462870972553, "grad_norm": 6.593264102935791, "learning_rate": 1.3160075884552337e-05, "loss": 2.6293, "step": 414500 }, { "epoch": 2.293109069108229, "grad_norm": 7.17709493637085, "learning_rate": 1.3108913160485188e-05, "loss": 2.6037, "step": 415000 }, { "epoch": 2.293109069108229, "eval_runtime": 1394.243, "eval_samples_per_second": 259.605, "eval_steps_per_second": 32.451, "step": 415000 }, { "epoch": 2.295871851119203, "grad_norm": 8.601012229919434, "learning_rate": 1.3057750436418038e-05, "loss": 2.6404, "step": 415500 }, { "epoch": 2.2986346331301766, "grad_norm": 5.984838485717773, "learning_rate": 1.3006587712350888e-05, "loss": 2.6162, "step": 416000 }, { "epoch": 2.3013974151411505, "grad_norm": 6.601894378662109, "learning_rate": 1.2955629639180006e-05, "loss": 2.5866, "step": 416500 }, { "epoch": 2.3041601971521244, "grad_norm": 7.93733024597168, "learning_rate": 1.2904466915112856e-05, "loss": 2.6262, "step": 417000 }, { "epoch": 2.3069229791630983, "grad_norm": 7.534053325653076, "learning_rate": 1.2853304191045707e-05, "loss": 2.6565, "step": 417500 }, { "epoch": 2.3096857611740718, "grad_norm": 5.987677574157715, "learning_rate": 1.2802141466978554e-05, "loss": 2.6128, "step": 418000 }, { "epoch": 2.3124485431850457, "grad_norm": 9.730072021484375, "learning_rate": 1.2750978742911404e-05, "loss": 2.6266, "step": 418500 }, { "epoch": 2.3152113251960196, "grad_norm": 5.9827799797058105, "learning_rate": 1.2699816018844254e-05, "loss": 2.6411, "step": 419000 }, { "epoch": 2.317974107206993, "grad_norm": 8.000412940979004, "learning_rate": 1.2648653294777104e-05, "loss": 2.6197, "step": 419500 }, { "epoch": 2.320736889217967, "grad_norm": 5.673067092895508, "learning_rate": 1.2597490570709955e-05, "loss": 2.6396, "step": 420000 }, { "epoch": 2.320736889217967, "eval_runtime": 1399.3763, "eval_samples_per_second": 258.653, "eval_steps_per_second": 32.332, "step": 420000 }, { "epoch": 2.323499671228941, "grad_norm": 11.707938194274902, "learning_rate": 1.254643017209094e-05, "loss": 2.685, "step": 420500 }, { "epoch": 2.3262624532399143, "grad_norm": 9.935530662536621, "learning_rate": 1.249526744802379e-05, "loss": 2.6652, "step": 421000 }, { "epoch": 2.3290252352508882, "grad_norm": 7.645023345947266, "learning_rate": 1.2444104723956638e-05, "loss": 2.6288, "step": 421500 }, { "epoch": 2.331788017261862, "grad_norm": 8.301952362060547, "learning_rate": 1.2392941999889489e-05, "loss": 2.6653, "step": 422000 }, { "epoch": 2.3345507992728356, "grad_norm": 8.863719940185547, "learning_rate": 1.2341881601270474e-05, "loss": 2.6188, "step": 422500 }, { "epoch": 2.3373135812838095, "grad_norm": 6.79737663269043, "learning_rate": 1.2290718877203324e-05, "loss": 2.6198, "step": 423000 }, { "epoch": 2.3400763632947834, "grad_norm": 13.54198932647705, "learning_rate": 1.2239556153136174e-05, "loss": 2.6335, "step": 423500 }, { "epoch": 2.3428391453057573, "grad_norm": 6.236546039581299, "learning_rate": 1.2188393429069023e-05, "loss": 2.5878, "step": 424000 }, { "epoch": 2.345601927316731, "grad_norm": 6.494855880737305, "learning_rate": 1.2137230705001873e-05, "loss": 2.6116, "step": 424500 }, { "epoch": 2.3483647093277047, "grad_norm": 5.994902610778809, "learning_rate": 1.2086170306382858e-05, "loss": 2.6368, "step": 425000 }, { "epoch": 2.3483647093277047, "eval_runtime": 1385.8563, "eval_samples_per_second": 261.176, "eval_steps_per_second": 32.648, "step": 425000 }, { "epoch": 2.351127491338678, "grad_norm": 6.626513481140137, "learning_rate": 1.2035007582315708e-05, "loss": 2.6542, "step": 425500 }, { "epoch": 2.353890273349652, "grad_norm": 6.740534782409668, "learning_rate": 1.1983844858248559e-05, "loss": 2.6394, "step": 426000 }, { "epoch": 2.356653055360626, "grad_norm": 6.561714172363281, "learning_rate": 1.1932682134181407e-05, "loss": 2.6508, "step": 426500 }, { "epoch": 2.3594158373716, "grad_norm": 7.288315773010254, "learning_rate": 1.188162173556239e-05, "loss": 2.673, "step": 427000 }, { "epoch": 2.3621786193825733, "grad_norm": 6.247045040130615, "learning_rate": 1.183045901149524e-05, "loss": 2.6219, "step": 427500 }, { "epoch": 2.3649414013935472, "grad_norm": 6.165623664855957, "learning_rate": 1.1779296287428091e-05, "loss": 2.567, "step": 428000 }, { "epoch": 2.367704183404521, "grad_norm": 7.5528717041015625, "learning_rate": 1.1728133563360941e-05, "loss": 2.6683, "step": 428500 }, { "epoch": 2.3704669654154946, "grad_norm": 6.398986339569092, "learning_rate": 1.1677073164741925e-05, "loss": 2.5937, "step": 429000 }, { "epoch": 2.3732297474264685, "grad_norm": 5.770337104797363, "learning_rate": 1.162601276612291e-05, "loss": 2.6698, "step": 429500 }, { "epoch": 2.3759925294374424, "grad_norm": 11.233945846557617, "learning_rate": 1.157485004205576e-05, "loss": 2.6207, "step": 430000 }, { "epoch": 2.3759925294374424, "eval_runtime": 1412.4375, "eval_samples_per_second": 256.261, "eval_steps_per_second": 32.033, "step": 430000 }, { "epoch": 2.3787553114484163, "grad_norm": 8.665916442871094, "learning_rate": 1.152368731798861e-05, "loss": 2.6306, "step": 430500 }, { "epoch": 2.38151809345939, "grad_norm": 5.553136825561523, "learning_rate": 1.1472524593921459e-05, "loss": 2.6256, "step": 431000 }, { "epoch": 2.3842808754703637, "grad_norm": 5.725644588470459, "learning_rate": 1.1421361869854309e-05, "loss": 2.6169, "step": 431500 }, { "epoch": 2.387043657481337, "grad_norm": 5.656550407409668, "learning_rate": 1.137019914578716e-05, "loss": 2.614, "step": 432000 }, { "epoch": 2.389806439492311, "grad_norm": 5.470634460449219, "learning_rate": 1.131903642172001e-05, "loss": 2.6239, "step": 432500 }, { "epoch": 2.392569221503285, "grad_norm": 6.324733257293701, "learning_rate": 1.1267873697652858e-05, "loss": 2.6097, "step": 433000 }, { "epoch": 2.395332003514259, "grad_norm": 5.290309906005859, "learning_rate": 1.1216710973585708e-05, "loss": 2.6635, "step": 433500 }, { "epoch": 2.3980947855252324, "grad_norm": 9.409131050109863, "learning_rate": 1.1165650574966693e-05, "loss": 2.6287, "step": 434000 }, { "epoch": 2.4008575675362063, "grad_norm": 6.079099655151367, "learning_rate": 1.1114487850899544e-05, "loss": 2.5926, "step": 434500 }, { "epoch": 2.40362034954718, "grad_norm": 5.747387886047363, "learning_rate": 1.1063427452280529e-05, "loss": 2.6502, "step": 435000 }, { "epoch": 2.40362034954718, "eval_runtime": 1441.4109, "eval_samples_per_second": 251.11, "eval_steps_per_second": 31.389, "step": 435000 }, { "epoch": 2.4063831315581536, "grad_norm": 5.655724048614502, "learning_rate": 1.1012264728213379e-05, "loss": 2.6302, "step": 435500 }, { "epoch": 2.4091459135691276, "grad_norm": 7.379015922546387, "learning_rate": 1.0961102004146228e-05, "loss": 2.6013, "step": 436000 }, { "epoch": 2.4119086955801015, "grad_norm": 5.20357608795166, "learning_rate": 1.0909939280079078e-05, "loss": 2.593, "step": 436500 }, { "epoch": 2.414671477591075, "grad_norm": 7.364123344421387, "learning_rate": 1.0858776556011928e-05, "loss": 2.5955, "step": 437000 }, { "epoch": 2.417434259602049, "grad_norm": 6.859920978546143, "learning_rate": 1.0807716157392913e-05, "loss": 2.6142, "step": 437500 }, { "epoch": 2.4201970416130227, "grad_norm": 8.261401176452637, "learning_rate": 1.0756553433325762e-05, "loss": 2.6348, "step": 438000 }, { "epoch": 2.422959823623996, "grad_norm": 6.4325852394104, "learning_rate": 1.0705390709258612e-05, "loss": 2.6208, "step": 438500 }, { "epoch": 2.42572260563497, "grad_norm": 7.540378093719482, "learning_rate": 1.0654227985191462e-05, "loss": 2.6543, "step": 439000 }, { "epoch": 2.428485387645944, "grad_norm": 4.978431701660156, "learning_rate": 1.0603065261124313e-05, "loss": 2.6658, "step": 439500 }, { "epoch": 2.431248169656918, "grad_norm": 7.280527114868164, "learning_rate": 1.0551902537057161e-05, "loss": 2.6231, "step": 440000 }, { "epoch": 2.431248169656918, "eval_runtime": 1447.4545, "eval_samples_per_second": 250.062, "eval_steps_per_second": 31.258, "step": 440000 }, { "epoch": 2.4340109516678914, "grad_norm": 8.55695915222168, "learning_rate": 1.0500739812990011e-05, "loss": 2.6355, "step": 440500 }, { "epoch": 2.4367737336788653, "grad_norm": 6.825678825378418, "learning_rate": 1.0449577088922862e-05, "loss": 2.5972, "step": 441000 }, { "epoch": 2.439536515689839, "grad_norm": 9.022064208984375, "learning_rate": 1.0398414364855712e-05, "loss": 2.6085, "step": 441500 }, { "epoch": 2.4422992977008127, "grad_norm": 7.041652202606201, "learning_rate": 1.0347353966236697e-05, "loss": 2.628, "step": 442000 }, { "epoch": 2.4450620797117866, "grad_norm": 8.78257942199707, "learning_rate": 1.029629356761768e-05, "loss": 2.5947, "step": 442500 }, { "epoch": 2.4478248617227605, "grad_norm": 3.7992634773254395, "learning_rate": 1.0245130843550529e-05, "loss": 2.6069, "step": 443000 }, { "epoch": 2.450587643733734, "grad_norm": 5.678961753845215, "learning_rate": 1.0193968119483379e-05, "loss": 2.6333, "step": 443500 }, { "epoch": 2.453350425744708, "grad_norm": 6.932492256164551, "learning_rate": 1.014280539541623e-05, "loss": 2.6285, "step": 444000 }, { "epoch": 2.4561132077556818, "grad_norm": 8.48609447479248, "learning_rate": 1.009164267134908e-05, "loss": 2.6003, "step": 444500 }, { "epoch": 2.4588759897666552, "grad_norm": 7.256680488586426, "learning_rate": 1.004047994728193e-05, "loss": 2.5648, "step": 445000 }, { "epoch": 2.4588759897666552, "eval_runtime": 1380.4966, "eval_samples_per_second": 262.19, "eval_steps_per_second": 32.774, "step": 445000 }, { "epoch": 2.461638771777629, "grad_norm": 10.294569969177246, "learning_rate": 9.98931722321478e-06, "loss": 2.6668, "step": 445500 }, { "epoch": 2.464401553788603, "grad_norm": 7.309881687164307, "learning_rate": 9.93815449914763e-06, "loss": 2.6188, "step": 446000 }, { "epoch": 2.467164335799577, "grad_norm": 8.109071731567383, "learning_rate": 9.887094100528614e-06, "loss": 2.5915, "step": 446500 }, { "epoch": 2.4699271178105504, "grad_norm": 6.958956718444824, "learning_rate": 9.835931376461462e-06, "loss": 2.6043, "step": 447000 }, { "epoch": 2.4726898998215243, "grad_norm": 5.835160732269287, "learning_rate": 9.784768652394313e-06, "loss": 2.6273, "step": 447500 }, { "epoch": 2.4754526818324982, "grad_norm": 7.2995781898498535, "learning_rate": 9.733605928327163e-06, "loss": 2.6368, "step": 448000 }, { "epoch": 2.4782154638434717, "grad_norm": 6.141138553619385, "learning_rate": 9.682443204260013e-06, "loss": 2.6429, "step": 448500 }, { "epoch": 2.4809782458544456, "grad_norm": 7.309754371643066, "learning_rate": 9.631382805640998e-06, "loss": 2.6084, "step": 449000 }, { "epoch": 2.4837410278654195, "grad_norm": 6.5357794761657715, "learning_rate": 9.580220081573847e-06, "loss": 2.6064, "step": 449500 }, { "epoch": 2.486503809876393, "grad_norm": 5.566898822784424, "learning_rate": 9.529057357506697e-06, "loss": 2.6275, "step": 450000 }, { "epoch": 2.486503809876393, "eval_runtime": 1385.5398, "eval_samples_per_second": 261.236, "eval_steps_per_second": 32.655, "step": 450000 }, { "epoch": 2.489266591887367, "grad_norm": 7.904833793640137, "learning_rate": 9.477894633439547e-06, "loss": 2.6019, "step": 450500 }, { "epoch": 2.492029373898341, "grad_norm": 7.342651844024658, "learning_rate": 9.426731909372398e-06, "loss": 2.675, "step": 451000 }, { "epoch": 2.4947921559093142, "grad_norm": 6.255519390106201, "learning_rate": 9.375569185305246e-06, "loss": 2.6526, "step": 451500 }, { "epoch": 2.497554937920288, "grad_norm": 5.555826663970947, "learning_rate": 9.324406461238096e-06, "loss": 2.6009, "step": 452000 }, { "epoch": 2.500317719931262, "grad_norm": 9.351966857910156, "learning_rate": 9.273346062619081e-06, "loss": 2.6516, "step": 452500 }, { "epoch": 2.503080501942236, "grad_norm": 13.138755798339844, "learning_rate": 9.222183338551932e-06, "loss": 2.5894, "step": 453000 }, { "epoch": 2.5058432839532094, "grad_norm": 5.904870986938477, "learning_rate": 9.171020614484782e-06, "loss": 2.6366, "step": 453500 }, { "epoch": 2.5086060659641833, "grad_norm": 7.674947261810303, "learning_rate": 9.11985789041763e-06, "loss": 2.6441, "step": 454000 }, { "epoch": 2.511368847975157, "grad_norm": 7.656473636627197, "learning_rate": 9.068797491798615e-06, "loss": 2.6298, "step": 454500 }, { "epoch": 2.5141316299861307, "grad_norm": 5.670429706573486, "learning_rate": 9.017634767731466e-06, "loss": 2.6246, "step": 455000 }, { "epoch": 2.5141316299861307, "eval_runtime": 1400.9354, "eval_samples_per_second": 258.365, "eval_steps_per_second": 32.296, "step": 455000 }, { "epoch": 2.5168944119971046, "grad_norm": 7.4133148193359375, "learning_rate": 8.966472043664316e-06, "loss": 2.6301, "step": 455500 }, { "epoch": 2.5196571940080785, "grad_norm": 12.215228080749512, "learning_rate": 8.915309319597166e-06, "loss": 2.5877, "step": 456000 }, { "epoch": 2.522419976019052, "grad_norm": 7.437780857086182, "learning_rate": 8.864146595530015e-06, "loss": 2.6486, "step": 456500 }, { "epoch": 2.525182758030026, "grad_norm": 6.193426609039307, "learning_rate": 8.812983871462865e-06, "loss": 2.6423, "step": 457000 }, { "epoch": 2.527945540041, "grad_norm": 6.621194362640381, "learning_rate": 8.761821147395715e-06, "loss": 2.6323, "step": 457500 }, { "epoch": 2.5307083220519733, "grad_norm": 8.068601608276367, "learning_rate": 8.710658423328566e-06, "loss": 2.6727, "step": 458000 }, { "epoch": 2.533471104062947, "grad_norm": 4.508535385131836, "learning_rate": 8.659598024709549e-06, "loss": 2.6195, "step": 458500 }, { "epoch": 2.536233886073921, "grad_norm": 7.6524128913879395, "learning_rate": 8.6084353006424e-06, "loss": 2.6293, "step": 459000 }, { "epoch": 2.538996668084895, "grad_norm": 6.510564804077148, "learning_rate": 8.55727257657525e-06, "loss": 2.5563, "step": 459500 }, { "epoch": 2.5417594500958685, "grad_norm": 7.351913928985596, "learning_rate": 8.5061098525081e-06, "loss": 2.6318, "step": 460000 }, { "epoch": 2.5417594500958685, "eval_runtime": 1443.1461, "eval_samples_per_second": 250.808, "eval_steps_per_second": 31.352, "step": 460000 }, { "epoch": 2.5445222321068424, "grad_norm": 10.461015701293945, "learning_rate": 8.45494712844095e-06, "loss": 2.623, "step": 460500 }, { "epoch": 2.547285014117816, "grad_norm": 7.475493907928467, "learning_rate": 8.403886729821933e-06, "loss": 2.6073, "step": 461000 }, { "epoch": 2.5500477961287897, "grad_norm": 5.767341136932373, "learning_rate": 8.352724005754784e-06, "loss": 2.6477, "step": 461500 }, { "epoch": 2.5528105781397636, "grad_norm": 6.720097541809082, "learning_rate": 8.301561281687634e-06, "loss": 2.6244, "step": 462000 }, { "epoch": 2.5555733601507375, "grad_norm": 9.576379776000977, "learning_rate": 8.250398557620484e-06, "loss": 2.6118, "step": 462500 }, { "epoch": 2.558336142161711, "grad_norm": 7.282556056976318, "learning_rate": 8.19933815900147e-06, "loss": 2.6138, "step": 463000 }, { "epoch": 2.561098924172685, "grad_norm": 8.628390312194824, "learning_rate": 8.148175434934318e-06, "loss": 2.6197, "step": 463500 }, { "epoch": 2.563861706183659, "grad_norm": 6.606767654418945, "learning_rate": 8.097012710867168e-06, "loss": 2.6339, "step": 464000 }, { "epoch": 2.5666244881946323, "grad_norm": 5.061454772949219, "learning_rate": 8.045849986800018e-06, "loss": 2.6692, "step": 464500 }, { "epoch": 2.569387270205606, "grad_norm": 5.3237104415893555, "learning_rate": 7.994687262732869e-06, "loss": 2.6403, "step": 465000 }, { "epoch": 2.569387270205606, "eval_runtime": 1446.7808, "eval_samples_per_second": 250.178, "eval_steps_per_second": 31.273, "step": 465000 }, { "epoch": 2.57215005221658, "grad_norm": 7.996010780334473, "learning_rate": 7.943626864113852e-06, "loss": 2.6071, "step": 465500 }, { "epoch": 2.574912834227554, "grad_norm": 5.897533416748047, "learning_rate": 7.892464140046702e-06, "loss": 2.6137, "step": 466000 }, { "epoch": 2.5776756162385275, "grad_norm": 6.0307488441467285, "learning_rate": 7.841301415979552e-06, "loss": 2.6316, "step": 466500 }, { "epoch": 2.5804383982495014, "grad_norm": 8.005854606628418, "learning_rate": 7.790138691912403e-06, "loss": 2.6063, "step": 467000 }, { "epoch": 2.583201180260475, "grad_norm": 5.740025997161865, "learning_rate": 7.739078293293386e-06, "loss": 2.615, "step": 467500 }, { "epoch": 2.5859639622714488, "grad_norm": 8.913529396057129, "learning_rate": 7.687915569226235e-06, "loss": 2.619, "step": 468000 }, { "epoch": 2.5887267442824227, "grad_norm": 7.639087677001953, "learning_rate": 7.636752845159085e-06, "loss": 2.6082, "step": 468500 }, { "epoch": 2.5914895262933966, "grad_norm": 6.564584732055664, "learning_rate": 7.585590121091936e-06, "loss": 2.6301, "step": 469000 }, { "epoch": 2.59425230830437, "grad_norm": 7.4024834632873535, "learning_rate": 7.534427397024786e-06, "loss": 2.63, "step": 469500 }, { "epoch": 2.597015090315344, "grad_norm": 6.080173969268799, "learning_rate": 7.4833669984057704e-06, "loss": 2.64, "step": 470000 }, { "epoch": 2.597015090315344, "eval_runtime": 1388.5169, "eval_samples_per_second": 260.676, "eval_steps_per_second": 32.585, "step": 470000 }, { "epoch": 2.599777872326318, "grad_norm": 5.737242698669434, "learning_rate": 7.43220427433862e-06, "loss": 2.6081, "step": 470500 }, { "epoch": 2.6025406543372913, "grad_norm": 5.815971851348877, "learning_rate": 7.38104155027147e-06, "loss": 2.6399, "step": 471000 }, { "epoch": 2.6053034363482652, "grad_norm": 7.472295761108398, "learning_rate": 7.32987882620432e-06, "loss": 2.6019, "step": 471500 }, { "epoch": 2.608066218359239, "grad_norm": 10.439508438110352, "learning_rate": 7.278920753033438e-06, "loss": 2.6021, "step": 472000 }, { "epoch": 2.610829000370213, "grad_norm": 4.827859878540039, "learning_rate": 7.227758028966289e-06, "loss": 2.6533, "step": 472500 }, { "epoch": 2.6135917823811865, "grad_norm": 7.332652568817139, "learning_rate": 7.176595304899138e-06, "loss": 2.6024, "step": 473000 }, { "epoch": 2.6163545643921604, "grad_norm": 8.309117317199707, "learning_rate": 7.125432580831988e-06, "loss": 2.6284, "step": 473500 }, { "epoch": 2.619117346403134, "grad_norm": 5.733798503875732, "learning_rate": 7.074269856764839e-06, "loss": 2.5479, "step": 474000 }, { "epoch": 2.621880128414108, "grad_norm": 7.765644073486328, "learning_rate": 7.023107132697689e-06, "loss": 2.6764, "step": 474500 }, { "epoch": 2.6246429104250817, "grad_norm": 7.007179260253906, "learning_rate": 6.9719444086305375e-06, "loss": 2.6276, "step": 475000 }, { "epoch": 2.6246429104250817, "eval_runtime": 1395.7273, "eval_samples_per_second": 259.329, "eval_steps_per_second": 32.417, "step": 475000 }, { "epoch": 2.6274056924360556, "grad_norm": 8.562434196472168, "learning_rate": 6.920781684563388e-06, "loss": 2.5855, "step": 475500 }, { "epoch": 2.630168474447029, "grad_norm": 6.0895867347717285, "learning_rate": 6.869618960496238e-06, "loss": 2.622, "step": 476000 }, { "epoch": 2.632931256458003, "grad_norm": 5.998204708099365, "learning_rate": 6.818558561877222e-06, "loss": 2.6498, "step": 476500 }, { "epoch": 2.635694038468977, "grad_norm": 10.850279808044434, "learning_rate": 6.7673958378100725e-06, "loss": 2.6251, "step": 477000 }, { "epoch": 2.6384568204799503, "grad_norm": 8.587841987609863, "learning_rate": 6.716233113742922e-06, "loss": 2.6448, "step": 477500 }, { "epoch": 2.6412196024909242, "grad_norm": 7.590404510498047, "learning_rate": 6.665070389675772e-06, "loss": 2.6155, "step": 478000 }, { "epoch": 2.643982384501898, "grad_norm": 9.195626258850098, "learning_rate": 6.614009991056756e-06, "loss": 2.6585, "step": 478500 }, { "epoch": 2.646745166512872, "grad_norm": 7.80164909362793, "learning_rate": 6.562847266989607e-06, "loss": 2.6388, "step": 479000 }, { "epoch": 2.6495079485238455, "grad_norm": 8.529934883117676, "learning_rate": 6.511684542922455e-06, "loss": 2.5775, "step": 479500 }, { "epoch": 2.6522707305348194, "grad_norm": 4.80623722076416, "learning_rate": 6.4605218188553055e-06, "loss": 2.6404, "step": 480000 }, { "epoch": 2.6522707305348194, "eval_runtime": 1401.4337, "eval_samples_per_second": 258.273, "eval_steps_per_second": 32.285, "step": 480000 }, { "epoch": 2.655033512545793, "grad_norm": 7.216457366943359, "learning_rate": 6.4094614202362905e-06, "loss": 2.5943, "step": 480500 }, { "epoch": 2.657796294556767, "grad_norm": 9.760845184326172, "learning_rate": 6.358298696169141e-06, "loss": 2.6424, "step": 481000 }, { "epoch": 2.6605590765677407, "grad_norm": 5.927933692932129, "learning_rate": 6.307135972101991e-06, "loss": 2.6467, "step": 481500 }, { "epoch": 2.6633218585787146, "grad_norm": 12.73469352722168, "learning_rate": 6.25597324803484e-06, "loss": 2.657, "step": 482000 }, { "epoch": 2.666084640589688, "grad_norm": 10.275845527648926, "learning_rate": 6.204912849415824e-06, "loss": 2.6168, "step": 482500 }, { "epoch": 2.668847422600662, "grad_norm": 6.86333703994751, "learning_rate": 6.153750125348674e-06, "loss": 2.6537, "step": 483000 }, { "epoch": 2.671610204611636, "grad_norm": 7.216489315032959, "learning_rate": 6.102587401281524e-06, "loss": 2.6347, "step": 483500 }, { "epoch": 2.6743729866226094, "grad_norm": 15.533758163452148, "learning_rate": 6.0514246772143745e-06, "loss": 2.6386, "step": 484000 }, { "epoch": 2.6771357686335833, "grad_norm": 9.071037292480469, "learning_rate": 6.000364278595359e-06, "loss": 2.6346, "step": 484500 }, { "epoch": 2.679898550644557, "grad_norm": 6.715532302856445, "learning_rate": 5.949201554528208e-06, "loss": 2.5978, "step": 485000 }, { "epoch": 2.679898550644557, "eval_runtime": 1414.4263, "eval_samples_per_second": 255.901, "eval_steps_per_second": 31.988, "step": 485000 }, { "epoch": 2.682661332655531, "grad_norm": 6.801553249359131, "learning_rate": 5.898038830461058e-06, "loss": 2.6455, "step": 485500 }, { "epoch": 2.6854241146665045, "grad_norm": 8.986194610595703, "learning_rate": 5.846876106393908e-06, "loss": 2.6106, "step": 486000 }, { "epoch": 2.6881868966774785, "grad_norm": 9.200784683227539, "learning_rate": 5.795713382326758e-06, "loss": 2.628, "step": 486500 }, { "epoch": 2.690949678688452, "grad_norm": 12.348143577575684, "learning_rate": 5.744652983707743e-06, "loss": 2.6519, "step": 487000 }, { "epoch": 2.693712460699426, "grad_norm": 5.376158237457275, "learning_rate": 5.6934902596405925e-06, "loss": 2.5868, "step": 487500 }, { "epoch": 2.6964752427103997, "grad_norm": 6.574330806732178, "learning_rate": 5.642327535573443e-06, "loss": 2.6171, "step": 488000 }, { "epoch": 2.6992380247213736, "grad_norm": 7.850590229034424, "learning_rate": 5.591164811506292e-06, "loss": 2.6335, "step": 488500 }, { "epoch": 2.702000806732347, "grad_norm": 4.8061113357543945, "learning_rate": 5.540104412887276e-06, "loss": 2.6141, "step": 489000 }, { "epoch": 2.704763588743321, "grad_norm": 5.521638870239258, "learning_rate": 5.488941688820126e-06, "loss": 2.5994, "step": 489500 }, { "epoch": 2.7075263707542945, "grad_norm": 7.759128093719482, "learning_rate": 5.437778964752976e-06, "loss": 2.6212, "step": 490000 }, { "epoch": 2.7075263707542945, "eval_runtime": 1444.9579, "eval_samples_per_second": 250.494, "eval_steps_per_second": 31.312, "step": 490000 }, { "epoch": 2.7102891527652684, "grad_norm": 7.741880893707275, "learning_rate": 5.386616240685826e-06, "loss": 2.6377, "step": 490500 }, { "epoch": 2.7130519347762423, "grad_norm": 6.901477813720703, "learning_rate": 5.3355558420668105e-06, "loss": 2.6113, "step": 491000 }, { "epoch": 2.715814716787216, "grad_norm": 4.808909893035889, "learning_rate": 5.284393117999661e-06, "loss": 2.6073, "step": 491500 }, { "epoch": 2.7185774987981897, "grad_norm": 5.946444034576416, "learning_rate": 5.23323039393251e-06, "loss": 2.5861, "step": 492000 }, { "epoch": 2.7213402808091636, "grad_norm": 6.473993301391602, "learning_rate": 5.1820676698653605e-06, "loss": 2.5685, "step": 492500 }, { "epoch": 2.7241030628201375, "grad_norm": 5.835826873779297, "learning_rate": 5.131007271246345e-06, "loss": 2.5697, "step": 493000 }, { "epoch": 2.726865844831111, "grad_norm": 6.624295711517334, "learning_rate": 5.079844547179195e-06, "loss": 2.5782, "step": 493500 }, { "epoch": 2.729628626842085, "grad_norm": 9.765020370483398, "learning_rate": 5.028681823112045e-06, "loss": 2.6423, "step": 494000 }, { "epoch": 2.7323914088530588, "grad_norm": 7.017053127288818, "learning_rate": 4.9775190990448946e-06, "loss": 2.6669, "step": 494500 }, { "epoch": 2.7351541908640327, "grad_norm": 6.12160587310791, "learning_rate": 4.926356374977745e-06, "loss": 2.6144, "step": 495000 }, { "epoch": 2.7351541908640327, "eval_runtime": 1425.3262, "eval_samples_per_second": 253.944, "eval_steps_per_second": 31.744, "step": 495000 }, { "epoch": 2.737916972875006, "grad_norm": 7.875446796417236, "learning_rate": 4.875398301806864e-06, "loss": 2.6609, "step": 495500 }, { "epoch": 2.74067975488598, "grad_norm": 5.206502914428711, "learning_rate": 4.824235577739713e-06, "loss": 2.6007, "step": 496000 }, { "epoch": 2.7434425368969535, "grad_norm": 6.697471618652344, "learning_rate": 4.773072853672563e-06, "loss": 2.6205, "step": 496500 }, { "epoch": 2.7462053189079274, "grad_norm": 6.960028648376465, "learning_rate": 4.721910129605413e-06, "loss": 2.5956, "step": 497000 }, { "epoch": 2.7489681009189013, "grad_norm": 5.795044422149658, "learning_rate": 4.670747405538263e-06, "loss": 2.6254, "step": 497500 }, { "epoch": 2.751730882929875, "grad_norm": 5.511195182800293, "learning_rate": 4.6195846814711125e-06, "loss": 2.6216, "step": 498000 }, { "epoch": 2.7544936649408487, "grad_norm": 7.889344215393066, "learning_rate": 4.568421957403963e-06, "loss": 2.5932, "step": 498500 }, { "epoch": 2.7572564469518226, "grad_norm": 6.596147060394287, "learning_rate": 4.517259233336812e-06, "loss": 2.6322, "step": 499000 }, { "epoch": 2.7600192289627965, "grad_norm": 8.392708778381348, "learning_rate": 4.466198834717797e-06, "loss": 2.6114, "step": 499500 }, { "epoch": 2.76278201097377, "grad_norm": 5.404835224151611, "learning_rate": 4.4150361106506475e-06, "loss": 2.6497, "step": 500000 }, { "epoch": 2.76278201097377, "eval_runtime": 1379.3919, "eval_samples_per_second": 262.4, "eval_steps_per_second": 32.801, "step": 500000 }, { "epoch": 2.765544792984744, "grad_norm": 10.54916000366211, "learning_rate": 4.363873386583497e-06, "loss": 2.6555, "step": 500500 }, { "epoch": 2.768307574995718, "grad_norm": 7.687312602996826, "learning_rate": 4.312710662516347e-06, "loss": 2.611, "step": 501000 }, { "epoch": 2.7710703570066917, "grad_norm": 5.376524448394775, "learning_rate": 4.261547938449197e-06, "loss": 2.686, "step": 501500 }, { "epoch": 2.773833139017665, "grad_norm": 6.104116439819336, "learning_rate": 4.210487539830181e-06, "loss": 2.5986, "step": 502000 }, { "epoch": 2.776595921028639, "grad_norm": 6.0707597732543945, "learning_rate": 4.15932481576303e-06, "loss": 2.6303, "step": 502500 }, { "epoch": 2.7793587030396125, "grad_norm": 7.738794803619385, "learning_rate": 4.1081620916958805e-06, "loss": 2.6115, "step": 503000 }, { "epoch": 2.7821214850505864, "grad_norm": 6.483746528625488, "learning_rate": 4.056999367628731e-06, "loss": 2.6179, "step": 503500 }, { "epoch": 2.7848842670615603, "grad_norm": 8.825859069824219, "learning_rate": 4.005938969009715e-06, "loss": 2.6217, "step": 504000 }, { "epoch": 2.7876470490725342, "grad_norm": 6.525907039642334, "learning_rate": 3.954776244942565e-06, "loss": 2.6415, "step": 504500 }, { "epoch": 2.7904098310835077, "grad_norm": 6.188871383666992, "learning_rate": 3.903613520875415e-06, "loss": 2.629, "step": 505000 }, { "epoch": 2.7904098310835077, "eval_runtime": 1400.5744, "eval_samples_per_second": 258.432, "eval_steps_per_second": 32.305, "step": 505000 }, { "epoch": 2.7931726130944816, "grad_norm": 5.803805828094482, "learning_rate": 3.852450796808265e-06, "loss": 2.6107, "step": 505500 }, { "epoch": 2.7959353951054555, "grad_norm": 5.849481105804443, "learning_rate": 3.8013903981892486e-06, "loss": 2.6773, "step": 506000 }, { "epoch": 2.798698177116429, "grad_norm": 5.97512674331665, "learning_rate": 3.750227674122099e-06, "loss": 2.622, "step": 506500 }, { "epoch": 2.801460959127403, "grad_norm": 23.599891662597656, "learning_rate": 3.6990649500549487e-06, "loss": 2.6565, "step": 507000 }, { "epoch": 2.804223741138377, "grad_norm": 8.95606803894043, "learning_rate": 3.647902225987799e-06, "loss": 2.6444, "step": 507500 }, { "epoch": 2.8069865231493507, "grad_norm": 6.165973663330078, "learning_rate": 3.5967395019206493e-06, "loss": 2.6061, "step": 508000 }, { "epoch": 2.809749305160324, "grad_norm": 5.899477958679199, "learning_rate": 3.545679103301633e-06, "loss": 2.6525, "step": 508500 }, { "epoch": 2.812512087171298, "grad_norm": 12.357131958007812, "learning_rate": 3.4945163792344833e-06, "loss": 2.601, "step": 509000 }, { "epoch": 2.8152748691822715, "grad_norm": 5.383533000946045, "learning_rate": 3.4433536551673327e-06, "loss": 2.6151, "step": 509500 }, { "epoch": 2.8180376511932455, "grad_norm": 6.1412153244018555, "learning_rate": 3.392190931100183e-06, "loss": 2.6158, "step": 510000 }, { "epoch": 2.8180376511932455, "eval_runtime": 1441.3572, "eval_samples_per_second": 251.12, "eval_steps_per_second": 31.391, "step": 510000 }, { "epoch": 2.8208004332042194, "grad_norm": 9.799259185791016, "learning_rate": 3.3411305324811675e-06, "loss": 2.6548, "step": 510500 }, { "epoch": 2.8235632152151933, "grad_norm": 6.84127140045166, "learning_rate": 3.289967808414017e-06, "loss": 2.6121, "step": 511000 }, { "epoch": 2.8263259972261667, "grad_norm": 5.669933795928955, "learning_rate": 3.2388050843468672e-06, "loss": 2.647, "step": 511500 }, { "epoch": 2.8290887792371406, "grad_norm": 4.693601608276367, "learning_rate": 3.187744685727851e-06, "loss": 2.6306, "step": 512000 }, { "epoch": 2.8318515612481145, "grad_norm": 4.971369743347168, "learning_rate": 3.1365819616607012e-06, "loss": 2.6419, "step": 512500 }, { "epoch": 2.834614343259088, "grad_norm": 6.460732460021973, "learning_rate": 3.085419237593551e-06, "loss": 2.5998, "step": 513000 }, { "epoch": 2.837377125270062, "grad_norm": 7.711912155151367, "learning_rate": 3.0342565135264013e-06, "loss": 2.6082, "step": 513500 }, { "epoch": 2.840139907281036, "grad_norm": 6.650845527648926, "learning_rate": 2.983093789459251e-06, "loss": 2.6067, "step": 514000 }, { "epoch": 2.8429026892920097, "grad_norm": 6.2975664138793945, "learning_rate": 2.931931065392101e-06, "loss": 2.6717, "step": 514500 }, { "epoch": 2.845665471302983, "grad_norm": 8.976877212524414, "learning_rate": 2.880768341324951e-06, "loss": 2.6582, "step": 515000 }, { "epoch": 2.845665471302983, "eval_runtime": 1422.7904, "eval_samples_per_second": 254.397, "eval_steps_per_second": 31.8, "step": 515000 }, { "epoch": 2.848428253313957, "grad_norm": 7.009723663330078, "learning_rate": 2.8296056172578008e-06, "loss": 2.6485, "step": 515500 }, { "epoch": 2.8511910353249306, "grad_norm": 12.484458923339844, "learning_rate": 2.7785452186387853e-06, "loss": 2.5926, "step": 516000 }, { "epoch": 2.8539538173359045, "grad_norm": 8.153068542480469, "learning_rate": 2.727382494571635e-06, "loss": 2.6376, "step": 516500 }, { "epoch": 2.8567165993468784, "grad_norm": 8.453513145446777, "learning_rate": 2.676219770504485e-06, "loss": 2.5791, "step": 517000 }, { "epoch": 2.8594793813578523, "grad_norm": 5.08599853515625, "learning_rate": 2.6250570464373353e-06, "loss": 2.6142, "step": 517500 }, { "epoch": 2.8622421633688258, "grad_norm": 6.9135589599609375, "learning_rate": 2.5740989732664537e-06, "loss": 2.6435, "step": 518000 }, { "epoch": 2.8650049453797997, "grad_norm": 7.915692329406738, "learning_rate": 2.5229362491993036e-06, "loss": 2.6459, "step": 518500 }, { "epoch": 2.8677677273907736, "grad_norm": 6.149202823638916, "learning_rate": 2.4717735251321534e-06, "loss": 2.6444, "step": 519000 }, { "epoch": 2.870530509401747, "grad_norm": 5.931761741638184, "learning_rate": 2.4206108010650033e-06, "loss": 2.6396, "step": 519500 }, { "epoch": 2.873293291412721, "grad_norm": 7.582653045654297, "learning_rate": 2.3694480769978536e-06, "loss": 2.6285, "step": 520000 }, { "epoch": 2.873293291412721, "eval_runtime": 1392.8266, "eval_samples_per_second": 259.869, "eval_steps_per_second": 32.484, "step": 520000 }, { "epoch": 2.876056073423695, "grad_norm": 5.626070976257324, "learning_rate": 2.3182853529307034e-06, "loss": 2.5947, "step": 520500 }, { "epoch": 2.8788188554346688, "grad_norm": 6.177218914031982, "learning_rate": 2.2671226288635533e-06, "loss": 2.6351, "step": 521000 }, { "epoch": 2.881581637445642, "grad_norm": 7.875889778137207, "learning_rate": 2.215959904796403e-06, "loss": 2.6291, "step": 521500 }, { "epoch": 2.884344419456616, "grad_norm": 6.308676242828369, "learning_rate": 2.164797180729253e-06, "loss": 2.5921, "step": 522000 }, { "epoch": 2.8871072014675896, "grad_norm": 6.715153217315674, "learning_rate": 2.1137367821102375e-06, "loss": 2.6217, "step": 522500 }, { "epoch": 2.8898699834785635, "grad_norm": 6.99116849899292, "learning_rate": 2.0625740580430874e-06, "loss": 2.645, "step": 523000 }, { "epoch": 2.8926327654895374, "grad_norm": 7.715075969696045, "learning_rate": 2.0114113339759372e-06, "loss": 2.6156, "step": 523500 }, { "epoch": 2.8953955475005113, "grad_norm": 8.207829475402832, "learning_rate": 1.9602486099087875e-06, "loss": 2.6027, "step": 524000 }, { "epoch": 2.8981583295114848, "grad_norm": 6.43826150894165, "learning_rate": 1.9091882112897712e-06, "loss": 2.6053, "step": 524500 }, { "epoch": 2.9009211115224587, "grad_norm": 9.398484230041504, "learning_rate": 1.8580254872226213e-06, "loss": 2.5925, "step": 525000 }, { "epoch": 2.9009211115224587, "eval_runtime": 1412.1705, "eval_samples_per_second": 256.31, "eval_steps_per_second": 32.039, "step": 525000 }, { "epoch": 2.903683893533432, "grad_norm": 6.568800926208496, "learning_rate": 1.8068627631554713e-06, "loss": 2.5867, "step": 525500 }, { "epoch": 2.906446675544406, "grad_norm": 6.8241963386535645, "learning_rate": 1.7557000390883212e-06, "loss": 2.5929, "step": 526000 }, { "epoch": 2.90920945755538, "grad_norm": 4.7916436195373535, "learning_rate": 1.7046396404693056e-06, "loss": 2.6441, "step": 526500 }, { "epoch": 2.911972239566354, "grad_norm": 8.739401817321777, "learning_rate": 1.6534769164021556e-06, "loss": 2.6121, "step": 527000 }, { "epoch": 2.914735021577328, "grad_norm": 8.750603675842285, "learning_rate": 1.6023141923350055e-06, "loss": 2.6541, "step": 527500 }, { "epoch": 2.9174978035883012, "grad_norm": 7.22824239730835, "learning_rate": 1.5511514682678553e-06, "loss": 2.6455, "step": 528000 }, { "epoch": 2.920260585599275, "grad_norm": 8.114001274108887, "learning_rate": 1.4999887442007054e-06, "loss": 2.5991, "step": 528500 }, { "epoch": 2.9230233676102486, "grad_norm": 4.913358688354492, "learning_rate": 1.4488260201335552e-06, "loss": 2.6597, "step": 529000 }, { "epoch": 2.9257861496212225, "grad_norm": 4.944784164428711, "learning_rate": 1.397663296066405e-06, "loss": 2.5788, "step": 529500 }, { "epoch": 2.9285489316321964, "grad_norm": 6.435703754425049, "learning_rate": 1.3465005719992551e-06, "loss": 2.5839, "step": 530000 }, { "epoch": 2.9285489316321964, "eval_runtime": 1465.041, "eval_samples_per_second": 247.06, "eval_steps_per_second": 30.883, "step": 530000 }, { "epoch": 2.9313117136431703, "grad_norm": 6.146181583404541, "learning_rate": 1.295337847932105e-06, "loss": 2.5753, "step": 530500 }, { "epoch": 2.934074495654144, "grad_norm": 6.585812568664551, "learning_rate": 1.2442774493130894e-06, "loss": 2.6388, "step": 531000 }, { "epoch": 2.9368372776651177, "grad_norm": 5.040027618408203, "learning_rate": 1.1931147252459392e-06, "loss": 2.6404, "step": 531500 }, { "epoch": 2.939600059676091, "grad_norm": 6.62531042098999, "learning_rate": 1.141952001178789e-06, "loss": 2.6357, "step": 532000 }, { "epoch": 2.942362841687065, "grad_norm": 6.749676704406738, "learning_rate": 1.0907892771116393e-06, "loss": 2.6343, "step": 532500 }, { "epoch": 2.945125623698039, "grad_norm": 6.896877765655518, "learning_rate": 1.0397288784926235e-06, "loss": 2.5909, "step": 533000 }, { "epoch": 2.947888405709013, "grad_norm": 5.896803855895996, "learning_rate": 9.885661544254733e-07, "loss": 2.6058, "step": 533500 }, { "epoch": 2.9506511877199864, "grad_norm": 7.961572170257568, "learning_rate": 9.374034303583234e-07, "loss": 2.5977, "step": 534000 }, { "epoch": 2.9534139697309603, "grad_norm": 5.0403594970703125, "learning_rate": 8.862407062911732e-07, "loss": 2.6199, "step": 534500 }, { "epoch": 2.956176751741934, "grad_norm": 5.320588111877441, "learning_rate": 8.351803076721575e-07, "loss": 2.6392, "step": 535000 }, { "epoch": 2.956176751741934, "eval_runtime": 1384.9303, "eval_samples_per_second": 261.351, "eval_steps_per_second": 32.67, "step": 535000 }, { "epoch": 2.9589395337529076, "grad_norm": 6.019600868225098, "learning_rate": 7.840175836050074e-07, "loss": 2.6634, "step": 535500 }, { "epoch": 2.9617023157638815, "grad_norm": 8.975756645202637, "learning_rate": 7.328548595378574e-07, "loss": 2.6451, "step": 536000 }, { "epoch": 2.9644650977748555, "grad_norm": 5.559593200683594, "learning_rate": 6.816921354707074e-07, "loss": 2.579, "step": 536500 }, { "epoch": 2.9672278797858294, "grad_norm": 5.876176834106445, "learning_rate": 6.305294114035572e-07, "loss": 2.6417, "step": 537000 }, { "epoch": 2.969990661796803, "grad_norm": 6.175969123840332, "learning_rate": 5.793666873364073e-07, "loss": 2.6182, "step": 537500 }, { "epoch": 2.9727534438077767, "grad_norm": 7.48173713684082, "learning_rate": 5.283062887173914e-07, "loss": 2.6455, "step": 538000 }, { "epoch": 2.97551622581875, "grad_norm": 7.331089496612549, "learning_rate": 4.771435646502415e-07, "loss": 2.625, "step": 538500 }, { "epoch": 2.978279007829724, "grad_norm": 4.610020637512207, "learning_rate": 4.259808405830914e-07, "loss": 2.6334, "step": 539000 }, { "epoch": 2.981041789840698, "grad_norm": 8.355249404907227, "learning_rate": 3.748181165159413e-07, "loss": 2.6034, "step": 539500 }, { "epoch": 2.983804571851672, "grad_norm": 6.447065830230713, "learning_rate": 3.2365539244879124e-07, "loss": 2.6019, "step": 540000 }, { "epoch": 2.983804571851672, "eval_runtime": 1372.3117, "eval_samples_per_second": 263.754, "eval_steps_per_second": 32.97, "step": 540000 }, { "epoch": 2.9865673538626454, "grad_norm": 8.215147018432617, "learning_rate": 2.724926683816412e-07, "loss": 2.6186, "step": 540500 }, { "epoch": 2.9893301358736193, "grad_norm": 5.153554439544678, "learning_rate": 2.2132994431449112e-07, "loss": 2.6318, "step": 541000 }, { "epoch": 2.992092917884593, "grad_norm": 6.764389991760254, "learning_rate": 1.7016722024734108e-07, "loss": 2.6113, "step": 541500 }, { "epoch": 2.9948556998955667, "grad_norm": 8.090177536010742, "learning_rate": 1.1910682162832533e-07, "loss": 2.6307, "step": 542000 }, { "epoch": 2.9976184819065406, "grad_norm": 10.311904907226562, "learning_rate": 6.794409756117527e-08, "loss": 2.6157, "step": 542500 } ], "logging_steps": 500, "max_steps": 542931, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8505448008628067e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }