diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13523 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 192734, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010376996274658336, + "grad_norm": 2.8394250869750977, + "learning_rate": 0.0002998972677368809, + "loss": 7.61689697265625, + "step": 100 + }, + { + "epoch": 0.0020753992549316673, + "grad_norm": 1.2515239715576172, + "learning_rate": 0.00029979349777413427, + "loss": 6.9781201171875, + "step": 200 + }, + { + "epoch": 0.0031130988823975013, + "grad_norm": 3.6369314193725586, + "learning_rate": 0.0002996897278113877, + "loss": 6.69011474609375, + "step": 300 + }, + { + "epoch": 0.0041507985098633345, + "grad_norm": 2.6945459842681885, + "learning_rate": 0.0002995859578486411, + "loss": 6.55205078125, + "step": 400 + }, + { + "epoch": 0.005188498137329169, + "grad_norm": 1.4870922565460205, + "learning_rate": 0.0002994821878858945, + "loss": 6.272236938476563, + "step": 500 + }, + { + "epoch": 0.006226197764795003, + "grad_norm": 2.198580265045166, + "learning_rate": 0.00029937841792314796, + "loss": 6.2509613037109375, + "step": 600 + }, + { + "epoch": 0.007263897392260836, + "grad_norm": 1.332912564277649, + "learning_rate": 0.00029927464796040135, + "loss": 6.2750787353515625, + "step": 700 + }, + { + "epoch": 0.008301597019726669, + "grad_norm": 1.6891261339187622, + "learning_rate": 0.0002991708779976548, + "loss": 6.012156372070312, + "step": 800 + }, + { + "epoch": 0.009339296647192503, + "grad_norm": 2.389779806137085, + "learning_rate": 0.0002990671080349082, + "loss": 6.011610717773437, + "step": 900 + }, + { + "epoch": 0.010376996274658337, + "grad_norm": 3.896207332611084, + "learning_rate": 0.0002989633380721616, + "loss": 5.872296752929688, + "step": 1000 + }, + { + "epoch": 0.011414695902124171, + "grad_norm": 1.2714102268218994, + "learning_rate": 0.00029885956810941504, + "loss": 5.8444580078125, + "step": 1100 + }, + { + "epoch": 0.012452395529590005, + "grad_norm": 1.9793014526367188, + "learning_rate": 0.00029875579814666844, + "loss": 5.780259399414063, + "step": 1200 + }, + { + "epoch": 0.01349009515705584, + "grad_norm": 1.7210673093795776, + "learning_rate": 0.0002986520281839219, + "loss": 5.784580688476563, + "step": 1300 + }, + { + "epoch": 0.014527794784521672, + "grad_norm": 3.133103609085083, + "learning_rate": 0.0002985482582211753, + "loss": 5.726546020507812, + "step": 1400 + }, + { + "epoch": 0.015565494411987506, + "grad_norm": 3.7988669872283936, + "learning_rate": 0.0002984444882584287, + "loss": 5.659859619140625, + "step": 1500 + }, + { + "epoch": 0.016603194039453338, + "grad_norm": 1.580628514289856, + "learning_rate": 0.00029834071829568207, + "loss": 5.710869140625, + "step": 1600 + }, + { + "epoch": 0.017640893666919174, + "grad_norm": 2.1428017616271973, + "learning_rate": 0.0002982369483329355, + "loss": 5.61485107421875, + "step": 1700 + }, + { + "epoch": 0.018678593294385006, + "grad_norm": 1.9413044452667236, + "learning_rate": 0.00029813317837018897, + "loss": 5.542117309570313, + "step": 1800 + }, + { + "epoch": 0.019716292921850842, + "grad_norm": 1.9118558168411255, + "learning_rate": 0.00029802940840744236, + "loss": 5.524238891601563, + "step": 1900 + }, + { + "epoch": 0.020753992549316674, + "grad_norm": 1.9226549863815308, + "learning_rate": 0.00029792563844469576, + "loss": 5.544407348632813, + "step": 2000 + }, + { + "epoch": 0.02179169217678251, + "grad_norm": 3.6845390796661377, + "learning_rate": 0.00029782186848194915, + "loss": 5.507258911132812, + "step": 2100 + }, + { + "epoch": 0.022829391804248342, + "grad_norm": 1.113272786140442, + "learning_rate": 0.0002977180985192026, + "loss": 5.420562133789063, + "step": 2200 + }, + { + "epoch": 0.023867091431714175, + "grad_norm": 1.05723237991333, + "learning_rate": 0.00029761432855645605, + "loss": 5.467652587890625, + "step": 2300 + }, + { + "epoch": 0.02490479105918001, + "grad_norm": 3.3967299461364746, + "learning_rate": 0.00029751055859370944, + "loss": 5.412258911132812, + "step": 2400 + }, + { + "epoch": 0.025942490686645843, + "grad_norm": 2.4142208099365234, + "learning_rate": 0.0002974067886309629, + "loss": 5.421605224609375, + "step": 2500 + }, + { + "epoch": 0.02698019031411168, + "grad_norm": 1.577314853668213, + "learning_rate": 0.0002973030186682163, + "loss": 5.2732666015625, + "step": 2600 + }, + { + "epoch": 0.02801788994157751, + "grad_norm": 2.5680480003356934, + "learning_rate": 0.0002971992487054697, + "loss": 5.42623779296875, + "step": 2700 + }, + { + "epoch": 0.029055589569043343, + "grad_norm": 1.665701150894165, + "learning_rate": 0.0002970954787427231, + "loss": 5.345192260742188, + "step": 2800 + }, + { + "epoch": 0.03009328919650918, + "grad_norm": 1.3420246839523315, + "learning_rate": 0.0002969917087799765, + "loss": 5.259754028320312, + "step": 2900 + }, + { + "epoch": 0.03113098882397501, + "grad_norm": 1.4943575859069824, + "learning_rate": 0.00029688793881723, + "loss": 5.325694580078125, + "step": 3000 + }, + { + "epoch": 0.032168688451440844, + "grad_norm": 1.7797436714172363, + "learning_rate": 0.00029678416885448337, + "loss": 5.393818359375, + "step": 3100 + }, + { + "epoch": 0.033206388078906676, + "grad_norm": 3.023359537124634, + "learning_rate": 0.00029668039889173677, + "loss": 5.23187255859375, + "step": 3200 + }, + { + "epoch": 0.034244087706372515, + "grad_norm": 1.9899531602859497, + "learning_rate": 0.00029657662892899016, + "loss": 5.1434765625, + "step": 3300 + }, + { + "epoch": 0.03528178733383835, + "grad_norm": 1.0039557218551636, + "learning_rate": 0.0002964728589662436, + "loss": 5.28422607421875, + "step": 3400 + }, + { + "epoch": 0.03631948696130418, + "grad_norm": 1.9204686880111694, + "learning_rate": 0.000296369089003497, + "loss": 5.149194946289063, + "step": 3500 + }, + { + "epoch": 0.03735718658877001, + "grad_norm": 1.5530883073806763, + "learning_rate": 0.00029626531904075045, + "loss": 5.0889456176757815, + "step": 3600 + }, + { + "epoch": 0.03839488621623585, + "grad_norm": 1.4477442502975464, + "learning_rate": 0.00029616154907800385, + "loss": 5.225645751953125, + "step": 3700 + }, + { + "epoch": 0.039432585843701684, + "grad_norm": 2.998966693878174, + "learning_rate": 0.00029605777911525724, + "loss": 5.127691650390625, + "step": 3800 + }, + { + "epoch": 0.040470285471167516, + "grad_norm": 1.1760146617889404, + "learning_rate": 0.0002959540091525107, + "loss": 5.099805908203125, + "step": 3900 + }, + { + "epoch": 0.04150798509863335, + "grad_norm": 1.6684191226959229, + "learning_rate": 0.0002958502391897641, + "loss": 5.195625, + "step": 4000 + }, + { + "epoch": 0.04254568472609918, + "grad_norm": 3.276620864868164, + "learning_rate": 0.00029574646922701754, + "loss": 5.0514678955078125, + "step": 4100 + }, + { + "epoch": 0.04358338435356502, + "grad_norm": 1.505712628364563, + "learning_rate": 0.00029564269926427093, + "loss": 5.234470825195313, + "step": 4200 + }, + { + "epoch": 0.04462108398103085, + "grad_norm": 1.561785101890564, + "learning_rate": 0.0002955389293015243, + "loss": 5.18435302734375, + "step": 4300 + }, + { + "epoch": 0.045658783608496685, + "grad_norm": 2.103935956954956, + "learning_rate": 0.0002954351593387778, + "loss": 5.127916259765625, + "step": 4400 + }, + { + "epoch": 0.04669648323596252, + "grad_norm": 1.1984394788742065, + "learning_rate": 0.00029533138937603117, + "loss": 5.009371032714844, + "step": 4500 + }, + { + "epoch": 0.04773418286342835, + "grad_norm": 1.35122549533844, + "learning_rate": 0.0002952276194132846, + "loss": 4.988144836425781, + "step": 4600 + }, + { + "epoch": 0.04877188249089419, + "grad_norm": 1.7199909687042236, + "learning_rate": 0.000295123849450538, + "loss": 5.139700317382813, + "step": 4700 + }, + { + "epoch": 0.04980958211836002, + "grad_norm": 2.299783706665039, + "learning_rate": 0.00029502007948779146, + "loss": 5.189196166992187, + "step": 4800 + }, + { + "epoch": 0.050847281745825854, + "grad_norm": 1.251342535018921, + "learning_rate": 0.00029491630952504486, + "loss": 5.0067724609375, + "step": 4900 + }, + { + "epoch": 0.051884981373291686, + "grad_norm": 1.7228055000305176, + "learning_rate": 0.00029481253956229825, + "loss": 5.058696594238281, + "step": 5000 + }, + { + "epoch": 0.05292268100075752, + "grad_norm": 1.2999722957611084, + "learning_rate": 0.0002947087695995517, + "loss": 4.953595275878906, + "step": 5100 + }, + { + "epoch": 0.05396038062822336, + "grad_norm": 2.576788902282715, + "learning_rate": 0.0002946049996368051, + "loss": 4.935113220214844, + "step": 5200 + }, + { + "epoch": 0.05499808025568919, + "grad_norm": 3.006600856781006, + "learning_rate": 0.00029450122967405854, + "loss": 5.13054931640625, + "step": 5300 + }, + { + "epoch": 0.05603577988315502, + "grad_norm": 1.5450797080993652, + "learning_rate": 0.00029439745971131194, + "loss": 4.888633117675782, + "step": 5400 + }, + { + "epoch": 0.057073479510620855, + "grad_norm": 1.9071307182312012, + "learning_rate": 0.00029429368974856533, + "loss": 4.968219299316406, + "step": 5500 + }, + { + "epoch": 0.05811117913808669, + "grad_norm": 1.2374857664108276, + "learning_rate": 0.0002941899197858188, + "loss": 5.0035269165039065, + "step": 5600 + }, + { + "epoch": 0.059148878765552526, + "grad_norm": 1.270337462425232, + "learning_rate": 0.0002940861498230722, + "loss": 4.9964404296875, + "step": 5700 + }, + { + "epoch": 0.06018657839301836, + "grad_norm": 2.112285614013672, + "learning_rate": 0.0002939823798603256, + "loss": 4.882070007324219, + "step": 5800 + }, + { + "epoch": 0.06122427802048419, + "grad_norm": 1.2048200368881226, + "learning_rate": 0.000293878609897579, + "loss": 4.689561767578125, + "step": 5900 + }, + { + "epoch": 0.06226197764795002, + "grad_norm": 1.213274359703064, + "learning_rate": 0.0002937748399348324, + "loss": 4.969376525878906, + "step": 6000 + }, + { + "epoch": 0.06329967727541586, + "grad_norm": 1.1453360319137573, + "learning_rate": 0.00029367106997208587, + "loss": 4.848797302246094, + "step": 6100 + }, + { + "epoch": 0.06433737690288169, + "grad_norm": 1.78568696975708, + "learning_rate": 0.00029356730000933926, + "loss": 4.889250793457031, + "step": 6200 + }, + { + "epoch": 0.06537507653034752, + "grad_norm": 1.004668951034546, + "learning_rate": 0.0002934635300465927, + "loss": 4.881064758300782, + "step": 6300 + }, + { + "epoch": 0.06641277615781335, + "grad_norm": 3.34089994430542, + "learning_rate": 0.0002933597600838461, + "loss": 4.922989501953125, + "step": 6400 + }, + { + "epoch": 0.0674504757852792, + "grad_norm": 1.7132960557937622, + "learning_rate": 0.00029325599012109955, + "loss": 4.900790405273438, + "step": 6500 + }, + { + "epoch": 0.06848817541274503, + "grad_norm": 3.6154215335845947, + "learning_rate": 0.00029315222015835295, + "loss": 4.858998718261719, + "step": 6600 + }, + { + "epoch": 0.06952587504021086, + "grad_norm": 2.199787139892578, + "learning_rate": 0.00029304845019560634, + "loss": 4.776265258789063, + "step": 6700 + }, + { + "epoch": 0.0705635746676767, + "grad_norm": 1.193831443786621, + "learning_rate": 0.0002929446802328598, + "loss": 4.933597717285156, + "step": 6800 + }, + { + "epoch": 0.07160127429514253, + "grad_norm": 1.0364950895309448, + "learning_rate": 0.0002928409102701132, + "loss": 4.812368469238281, + "step": 6900 + }, + { + "epoch": 0.07263897392260836, + "grad_norm": 4.54287576675415, + "learning_rate": 0.00029273714030736664, + "loss": 4.874449157714844, + "step": 7000 + }, + { + "epoch": 0.07367667355007419, + "grad_norm": 1.9481868743896484, + "learning_rate": 0.00029263337034462003, + "loss": 4.836025390625, + "step": 7100 + }, + { + "epoch": 0.07471437317754002, + "grad_norm": 1.5283995866775513, + "learning_rate": 0.0002925296003818734, + "loss": 4.789447631835937, + "step": 7200 + }, + { + "epoch": 0.07575207280500586, + "grad_norm": 1.1243209838867188, + "learning_rate": 0.0002924258304191268, + "loss": 4.771495971679688, + "step": 7300 + }, + { + "epoch": 0.0767897724324717, + "grad_norm": 1.2010672092437744, + "learning_rate": 0.00029232206045638027, + "loss": 4.796032104492188, + "step": 7400 + }, + { + "epoch": 0.07782747205993754, + "grad_norm": 1.3179821968078613, + "learning_rate": 0.0002922182904936337, + "loss": 4.949848022460937, + "step": 7500 + }, + { + "epoch": 0.07886517168740337, + "grad_norm": 2.766585111618042, + "learning_rate": 0.0002921145205308871, + "loss": 4.7913055419921875, + "step": 7600 + }, + { + "epoch": 0.0799028713148692, + "grad_norm": 1.301639437675476, + "learning_rate": 0.0002920107505681405, + "loss": 4.828057556152344, + "step": 7700 + }, + { + "epoch": 0.08094057094233503, + "grad_norm": 1.205676794052124, + "learning_rate": 0.0002919069806053939, + "loss": 4.7562734985351565, + "step": 7800 + }, + { + "epoch": 0.08197827056980087, + "grad_norm": 2.1412694454193115, + "learning_rate": 0.00029180321064264735, + "loss": 4.7240576171875, + "step": 7900 + }, + { + "epoch": 0.0830159701972667, + "grad_norm": 1.9297393560409546, + "learning_rate": 0.0002916994406799008, + "loss": 4.752750244140625, + "step": 8000 + }, + { + "epoch": 0.08405366982473253, + "grad_norm": 1.5971039533615112, + "learning_rate": 0.0002915956707171542, + "loss": 4.7790225219726565, + "step": 8100 + }, + { + "epoch": 0.08509136945219836, + "grad_norm": 1.4667614698410034, + "learning_rate": 0.0002914919007544076, + "loss": 4.823405151367187, + "step": 8200 + }, + { + "epoch": 0.0861290690796642, + "grad_norm": 1.8018951416015625, + "learning_rate": 0.000291388130791661, + "loss": 4.806950378417969, + "step": 8300 + }, + { + "epoch": 0.08716676870713004, + "grad_norm": 3.0917904376983643, + "learning_rate": 0.00029128436082891443, + "loss": 4.716513977050782, + "step": 8400 + }, + { + "epoch": 0.08820446833459587, + "grad_norm": 1.8211461305618286, + "learning_rate": 0.00029118059086616783, + "loss": 4.803590393066406, + "step": 8500 + }, + { + "epoch": 0.0892421679620617, + "grad_norm": 1.4940656423568726, + "learning_rate": 0.0002910768209034213, + "loss": 4.682643737792969, + "step": 8600 + }, + { + "epoch": 0.09027986758952754, + "grad_norm": 1.432560682296753, + "learning_rate": 0.00029097305094067473, + "loss": 4.758638610839844, + "step": 8700 + }, + { + "epoch": 0.09131756721699337, + "grad_norm": 1.0015602111816406, + "learning_rate": 0.0002908692809779281, + "loss": 4.829322204589844, + "step": 8800 + }, + { + "epoch": 0.0923552668444592, + "grad_norm": 1.3050769567489624, + "learning_rate": 0.0002907655110151815, + "loss": 4.62219482421875, + "step": 8900 + }, + { + "epoch": 0.09339296647192503, + "grad_norm": 1.0704928636550903, + "learning_rate": 0.0002906617410524349, + "loss": 4.6304998779296875, + "step": 9000 + }, + { + "epoch": 0.09443066609939087, + "grad_norm": 2.2267684936523438, + "learning_rate": 0.00029055797108968836, + "loss": 4.664536437988281, + "step": 9100 + }, + { + "epoch": 0.0954683657268567, + "grad_norm": 2.4608747959136963, + "learning_rate": 0.00029045420112694176, + "loss": 4.759125366210937, + "step": 9200 + }, + { + "epoch": 0.09650606535432253, + "grad_norm": 1.5068875551223755, + "learning_rate": 0.0002903504311641952, + "loss": 4.665271606445312, + "step": 9300 + }, + { + "epoch": 0.09754376498178838, + "grad_norm": 2.078646421432495, + "learning_rate": 0.0002902466612014486, + "loss": 4.739638671875, + "step": 9400 + }, + { + "epoch": 0.09858146460925421, + "grad_norm": 1.3762885332107544, + "learning_rate": 0.000290142891238702, + "loss": 4.698047485351562, + "step": 9500 + }, + { + "epoch": 0.09961916423672004, + "grad_norm": 1.2879425287246704, + "learning_rate": 0.00029003912127595544, + "loss": 4.619927673339844, + "step": 9600 + }, + { + "epoch": 0.10065686386418587, + "grad_norm": 1.584159016609192, + "learning_rate": 0.00028993535131320884, + "loss": 4.748394165039063, + "step": 9700 + }, + { + "epoch": 0.10169456349165171, + "grad_norm": 1.453415870666504, + "learning_rate": 0.0002898315813504623, + "loss": 4.62876220703125, + "step": 9800 + }, + { + "epoch": 0.10273226311911754, + "grad_norm": 0.965919017791748, + "learning_rate": 0.0002897278113877157, + "loss": 4.665562438964844, + "step": 9900 + }, + { + "epoch": 0.10376996274658337, + "grad_norm": 1.2607330083847046, + "learning_rate": 0.0002896240414249691, + "loss": 4.7940805053710935, + "step": 10000 + }, + { + "epoch": 0.1048076623740492, + "grad_norm": 1.0126069784164429, + "learning_rate": 0.0002895202714622225, + "loss": 4.7508541870117185, + "step": 10100 + }, + { + "epoch": 0.10584536200151504, + "grad_norm": 1.541813850402832, + "learning_rate": 0.0002894165014994759, + "loss": 4.57702880859375, + "step": 10200 + }, + { + "epoch": 0.10688306162898087, + "grad_norm": 2.78938889503479, + "learning_rate": 0.00028931273153672937, + "loss": 4.652121887207032, + "step": 10300 + }, + { + "epoch": 0.10792076125644672, + "grad_norm": 2.3567938804626465, + "learning_rate": 0.00028920896157398276, + "loss": 4.566509094238281, + "step": 10400 + }, + { + "epoch": 0.10895846088391255, + "grad_norm": 1.0480419397354126, + "learning_rate": 0.0002891051916112362, + "loss": 4.611513977050781, + "step": 10500 + }, + { + "epoch": 0.10999616051137838, + "grad_norm": 1.577042579650879, + "learning_rate": 0.0002890014216484896, + "loss": 4.62977783203125, + "step": 10600 + }, + { + "epoch": 0.11103386013884421, + "grad_norm": 1.5839786529541016, + "learning_rate": 0.000288897651685743, + "loss": 4.569055786132813, + "step": 10700 + }, + { + "epoch": 0.11207155976631004, + "grad_norm": 3.9769680500030518, + "learning_rate": 0.00028879388172299645, + "loss": 4.6786282348632815, + "step": 10800 + }, + { + "epoch": 0.11310925939377588, + "grad_norm": 1.8089715242385864, + "learning_rate": 0.00028869011176024985, + "loss": 4.630350036621094, + "step": 10900 + }, + { + "epoch": 0.11414695902124171, + "grad_norm": 1.4216063022613525, + "learning_rate": 0.0002885863417975033, + "loss": 4.669395751953125, + "step": 11000 + }, + { + "epoch": 0.11518465864870754, + "grad_norm": 1.2107151746749878, + "learning_rate": 0.0002884825718347567, + "loss": 4.612738342285156, + "step": 11100 + }, + { + "epoch": 0.11622235827617337, + "grad_norm": 1.5037158727645874, + "learning_rate": 0.0002883788018720101, + "loss": 4.534631958007813, + "step": 11200 + }, + { + "epoch": 0.1172600579036392, + "grad_norm": 1.1375142335891724, + "learning_rate": 0.00028827503190926353, + "loss": 4.803286437988281, + "step": 11300 + }, + { + "epoch": 0.11829775753110505, + "grad_norm": 1.8553053140640259, + "learning_rate": 0.00028817126194651693, + "loss": 4.684965515136719, + "step": 11400 + }, + { + "epoch": 0.11933545715857088, + "grad_norm": 5.896717071533203, + "learning_rate": 0.0002880674919837704, + "loss": 4.533707275390625, + "step": 11500 + }, + { + "epoch": 0.12037315678603672, + "grad_norm": 0.9495351910591125, + "learning_rate": 0.0002879637220210238, + "loss": 4.481864013671875, + "step": 11600 + }, + { + "epoch": 0.12141085641350255, + "grad_norm": 1.2148685455322266, + "learning_rate": 0.00028785995205827717, + "loss": 4.508511047363282, + "step": 11700 + }, + { + "epoch": 0.12244855604096838, + "grad_norm": 1.2658835649490356, + "learning_rate": 0.0002877561820955306, + "loss": 4.453274841308594, + "step": 11800 + }, + { + "epoch": 0.12348625566843421, + "grad_norm": 1.0808942317962646, + "learning_rate": 0.000287652412132784, + "loss": 4.470396118164063, + "step": 11900 + }, + { + "epoch": 0.12452395529590005, + "grad_norm": 2.0280075073242188, + "learning_rate": 0.00028754864217003746, + "loss": 4.629884643554687, + "step": 12000 + }, + { + "epoch": 0.12556165492336588, + "grad_norm": 1.6987171173095703, + "learning_rate": 0.00028744487220729086, + "loss": 4.673434143066406, + "step": 12100 + }, + { + "epoch": 0.1265993545508317, + "grad_norm": 1.076246976852417, + "learning_rate": 0.00028734110224454425, + "loss": 4.707933349609375, + "step": 12200 + }, + { + "epoch": 0.12763705417829754, + "grad_norm": 1.4878133535385132, + "learning_rate": 0.00028723733228179765, + "loss": 4.649747924804688, + "step": 12300 + }, + { + "epoch": 0.12867475380576338, + "grad_norm": 1.132073163986206, + "learning_rate": 0.0002871335623190511, + "loss": 4.510395812988281, + "step": 12400 + }, + { + "epoch": 0.1297124534332292, + "grad_norm": 1.172968864440918, + "learning_rate": 0.00028702979235630454, + "loss": 4.7042324829101565, + "step": 12500 + }, + { + "epoch": 0.13075015306069504, + "grad_norm": 1.331409215927124, + "learning_rate": 0.00028692602239355794, + "loss": 4.478284912109375, + "step": 12600 + }, + { + "epoch": 0.13178785268816087, + "grad_norm": 0.9544440507888794, + "learning_rate": 0.0002868222524308114, + "loss": 4.574405517578125, + "step": 12700 + }, + { + "epoch": 0.1328255523156267, + "grad_norm": 1.3560587167739868, + "learning_rate": 0.0002867184824680648, + "loss": 4.359691467285156, + "step": 12800 + }, + { + "epoch": 0.13386325194309256, + "grad_norm": 1.4807325601577759, + "learning_rate": 0.0002866147125053182, + "loss": 4.541731872558594, + "step": 12900 + }, + { + "epoch": 0.1349009515705584, + "grad_norm": 1.0621514320373535, + "learning_rate": 0.00028651094254257157, + "loss": 4.442927551269531, + "step": 13000 + }, + { + "epoch": 0.13593865119802423, + "grad_norm": 0.9886642098426819, + "learning_rate": 0.000286407172579825, + "loss": 4.690697326660156, + "step": 13100 + }, + { + "epoch": 0.13697635082549006, + "grad_norm": 1.9239803552627563, + "learning_rate": 0.00028630340261707847, + "loss": 4.497586669921875, + "step": 13200 + }, + { + "epoch": 0.1380140504529559, + "grad_norm": 1.644500494003296, + "learning_rate": 0.00028619963265433186, + "loss": 4.598764038085937, + "step": 13300 + }, + { + "epoch": 0.13905175008042173, + "grad_norm": 1.3600581884384155, + "learning_rate": 0.00028609586269158526, + "loss": 4.550304260253906, + "step": 13400 + }, + { + "epoch": 0.14008944970788756, + "grad_norm": 1.4329279661178589, + "learning_rate": 0.00028599209272883865, + "loss": 4.506571960449219, + "step": 13500 + }, + { + "epoch": 0.1411271493353534, + "grad_norm": 1.386486291885376, + "learning_rate": 0.0002858883227660921, + "loss": 4.419360046386719, + "step": 13600 + }, + { + "epoch": 0.14216484896281922, + "grad_norm": 0.9777548909187317, + "learning_rate": 0.00028578455280334555, + "loss": 4.371921691894531, + "step": 13700 + }, + { + "epoch": 0.14320254859028506, + "grad_norm": 1.323614239692688, + "learning_rate": 0.00028568078284059895, + "loss": 4.449886474609375, + "step": 13800 + }, + { + "epoch": 0.1442402482177509, + "grad_norm": 2.0104715824127197, + "learning_rate": 0.00028557701287785234, + "loss": 4.498194885253906, + "step": 13900 + }, + { + "epoch": 0.14527794784521672, + "grad_norm": 1.040453314781189, + "learning_rate": 0.00028547324291510574, + "loss": 4.410159301757813, + "step": 14000 + }, + { + "epoch": 0.14631564747268255, + "grad_norm": 1.6704965829849243, + "learning_rate": 0.0002853694729523592, + "loss": 4.4047763061523435, + "step": 14100 + }, + { + "epoch": 0.14735334710014839, + "grad_norm": 1.1640102863311768, + "learning_rate": 0.0002852657029896126, + "loss": 4.482722778320312, + "step": 14200 + }, + { + "epoch": 0.14839104672761422, + "grad_norm": 1.5910676717758179, + "learning_rate": 0.00028516193302686603, + "loss": 4.464485473632813, + "step": 14300 + }, + { + "epoch": 0.14942874635508005, + "grad_norm": 2.349853277206421, + "learning_rate": 0.0002850581630641194, + "loss": 4.478161010742188, + "step": 14400 + }, + { + "epoch": 0.15046644598254588, + "grad_norm": 1.6594980955123901, + "learning_rate": 0.0002849543931013728, + "loss": 4.524984741210938, + "step": 14500 + }, + { + "epoch": 0.15150414561001171, + "grad_norm": 1.0867830514907837, + "learning_rate": 0.00028485062313862627, + "loss": 4.444278259277343, + "step": 14600 + }, + { + "epoch": 0.15254184523747755, + "grad_norm": 1.4026222229003906, + "learning_rate": 0.00028474685317587966, + "loss": 4.562846374511719, + "step": 14700 + }, + { + "epoch": 0.1535795448649434, + "grad_norm": 1.7118810415267944, + "learning_rate": 0.0002846430832131331, + "loss": 4.434857177734375, + "step": 14800 + }, + { + "epoch": 0.15461724449240924, + "grad_norm": 1.3377333879470825, + "learning_rate": 0.0002845393132503865, + "loss": 4.50284912109375, + "step": 14900 + }, + { + "epoch": 0.15565494411987507, + "grad_norm": 1.0628588199615479, + "learning_rate": 0.00028443554328763996, + "loss": 4.467984924316406, + "step": 15000 + }, + { + "epoch": 0.1566926437473409, + "grad_norm": 1.122900366783142, + "learning_rate": 0.00028433177332489335, + "loss": 4.477691650390625, + "step": 15100 + }, + { + "epoch": 0.15773034337480674, + "grad_norm": 1.0721949338912964, + "learning_rate": 0.00028422800336214675, + "loss": 4.566653137207031, + "step": 15200 + }, + { + "epoch": 0.15876804300227257, + "grad_norm": 2.0959179401397705, + "learning_rate": 0.0002841242333994002, + "loss": 4.459400939941406, + "step": 15300 + }, + { + "epoch": 0.1598057426297384, + "grad_norm": 1.832321047782898, + "learning_rate": 0.0002840204634366536, + "loss": 4.441622009277344, + "step": 15400 + }, + { + "epoch": 0.16084344225720423, + "grad_norm": 1.9756203889846802, + "learning_rate": 0.00028391669347390704, + "loss": 4.5193002319335935, + "step": 15500 + }, + { + "epoch": 0.16188114188467007, + "grad_norm": 1.9734655618667603, + "learning_rate": 0.00028381292351116043, + "loss": 4.403963012695312, + "step": 15600 + }, + { + "epoch": 0.1629188415121359, + "grad_norm": 1.0987114906311035, + "learning_rate": 0.00028370915354841383, + "loss": 4.3827951049804685, + "step": 15700 + }, + { + "epoch": 0.16395654113960173, + "grad_norm": 1.0084813833236694, + "learning_rate": 0.0002836053835856673, + "loss": 4.431182861328125, + "step": 15800 + }, + { + "epoch": 0.16499424076706756, + "grad_norm": 0.8771688342094421, + "learning_rate": 0.00028350161362292067, + "loss": 4.386305236816407, + "step": 15900 + }, + { + "epoch": 0.1660319403945334, + "grad_norm": 1.960618495941162, + "learning_rate": 0.0002833978436601741, + "loss": 4.450301513671875, + "step": 16000 + }, + { + "epoch": 0.16706964002199923, + "grad_norm": 2.016059398651123, + "learning_rate": 0.0002832940736974275, + "loss": 4.443774719238281, + "step": 16100 + }, + { + "epoch": 0.16810733964946506, + "grad_norm": 2.1017072200775146, + "learning_rate": 0.0002831903037346809, + "loss": 4.387731323242187, + "step": 16200 + }, + { + "epoch": 0.1691450392769309, + "grad_norm": 3.876704216003418, + "learning_rate": 0.00028308653377193436, + "loss": 4.339099731445312, + "step": 16300 + }, + { + "epoch": 0.17018273890439672, + "grad_norm": 2.4443888664245605, + "learning_rate": 0.00028298276380918776, + "loss": 4.420601196289063, + "step": 16400 + }, + { + "epoch": 0.17122043853186256, + "grad_norm": 2.2986700534820557, + "learning_rate": 0.0002828789938464412, + "loss": 4.574692687988281, + "step": 16500 + }, + { + "epoch": 0.1722581381593284, + "grad_norm": 3.120959997177124, + "learning_rate": 0.0002827752238836946, + "loss": 4.3793856811523435, + "step": 16600 + }, + { + "epoch": 0.17329583778679422, + "grad_norm": 3.928020715713501, + "learning_rate": 0.00028267145392094805, + "loss": 4.389268188476563, + "step": 16700 + }, + { + "epoch": 0.17433353741426008, + "grad_norm": 1.5828691720962524, + "learning_rate": 0.00028256768395820144, + "loss": 4.353381652832031, + "step": 16800 + }, + { + "epoch": 0.1753712370417259, + "grad_norm": 1.0565470457077026, + "learning_rate": 0.00028246391399545484, + "loss": 4.289037170410157, + "step": 16900 + }, + { + "epoch": 0.17640893666919175, + "grad_norm": 1.7072774171829224, + "learning_rate": 0.0002823601440327083, + "loss": 4.325290832519531, + "step": 17000 + }, + { + "epoch": 0.17744663629665758, + "grad_norm": 1.0402146577835083, + "learning_rate": 0.0002822563740699617, + "loss": 4.450514221191407, + "step": 17100 + }, + { + "epoch": 0.1784843359241234, + "grad_norm": 1.4970057010650635, + "learning_rate": 0.00028215260410721513, + "loss": 4.393040161132813, + "step": 17200 + }, + { + "epoch": 0.17952203555158924, + "grad_norm": 1.266546607017517, + "learning_rate": 0.0002820488341444685, + "loss": 4.276432800292969, + "step": 17300 + }, + { + "epoch": 0.18055973517905508, + "grad_norm": 1.751590371131897, + "learning_rate": 0.0002819450641817219, + "loss": 4.40036376953125, + "step": 17400 + }, + { + "epoch": 0.1815974348065209, + "grad_norm": 1.5430057048797607, + "learning_rate": 0.00028184129421897537, + "loss": 4.279835205078125, + "step": 17500 + }, + { + "epoch": 0.18263513443398674, + "grad_norm": 4.205715179443359, + "learning_rate": 0.00028173752425622876, + "loss": 4.501398315429688, + "step": 17600 + }, + { + "epoch": 0.18367283406145257, + "grad_norm": 2.2290608882904053, + "learning_rate": 0.0002816337542934822, + "loss": 4.400292053222656, + "step": 17700 + }, + { + "epoch": 0.1847105336889184, + "grad_norm": 1.6409145593643188, + "learning_rate": 0.0002815299843307356, + "loss": 4.361965026855469, + "step": 17800 + }, + { + "epoch": 0.18574823331638424, + "grad_norm": 1.235737919807434, + "learning_rate": 0.000281426214367989, + "loss": 4.4263699340820315, + "step": 17900 + }, + { + "epoch": 0.18678593294385007, + "grad_norm": 1.8182483911514282, + "learning_rate": 0.0002813224444052424, + "loss": 4.38103759765625, + "step": 18000 + }, + { + "epoch": 0.1878236325713159, + "grad_norm": 1.725359559059143, + "learning_rate": 0.00028121867444249585, + "loss": 4.332106323242187, + "step": 18100 + }, + { + "epoch": 0.18886133219878173, + "grad_norm": 1.9186443090438843, + "learning_rate": 0.0002811149044797493, + "loss": 4.354175415039062, + "step": 18200 + }, + { + "epoch": 0.18989903182624757, + "grad_norm": 1.1907823085784912, + "learning_rate": 0.0002810111345170027, + "loss": 4.521398315429687, + "step": 18300 + }, + { + "epoch": 0.1909367314537134, + "grad_norm": 2.796095609664917, + "learning_rate": 0.0002809073645542561, + "loss": 4.280415649414063, + "step": 18400 + }, + { + "epoch": 0.19197443108117923, + "grad_norm": 2.043811798095703, + "learning_rate": 0.0002808035945915095, + "loss": 4.364379272460938, + "step": 18500 + }, + { + "epoch": 0.19301213070864506, + "grad_norm": 6.419173240661621, + "learning_rate": 0.00028069982462876293, + "loss": 4.420321044921875, + "step": 18600 + }, + { + "epoch": 0.1940498303361109, + "grad_norm": 2.0183868408203125, + "learning_rate": 0.0002805960546660163, + "loss": 4.203153381347656, + "step": 18700 + }, + { + "epoch": 0.19508752996357676, + "grad_norm": 1.1752562522888184, + "learning_rate": 0.00028049228470326977, + "loss": 4.362376098632812, + "step": 18800 + }, + { + "epoch": 0.1961252295910426, + "grad_norm": 1.7152916193008423, + "learning_rate": 0.0002803885147405232, + "loss": 4.423097229003906, + "step": 18900 + }, + { + "epoch": 0.19716292921850842, + "grad_norm": 0.8988032341003418, + "learning_rate": 0.0002802847447777766, + "loss": 4.291071166992188, + "step": 19000 + }, + { + "epoch": 0.19820062884597425, + "grad_norm": 1.2874023914337158, + "learning_rate": 0.00028018097481503, + "loss": 4.257485046386718, + "step": 19100 + }, + { + "epoch": 0.19923832847344009, + "grad_norm": 3.89581561088562, + "learning_rate": 0.0002800772048522834, + "loss": 4.355436401367188, + "step": 19200 + }, + { + "epoch": 0.20027602810090592, + "grad_norm": 1.4264250993728638, + "learning_rate": 0.00027997343488953686, + "loss": 4.268387451171875, + "step": 19300 + }, + { + "epoch": 0.20131372772837175, + "grad_norm": 2.3243231773376465, + "learning_rate": 0.0002798696649267903, + "loss": 4.248961791992188, + "step": 19400 + }, + { + "epoch": 0.20235142735583758, + "grad_norm": 1.609995722770691, + "learning_rate": 0.0002797658949640437, + "loss": 4.299253845214844, + "step": 19500 + }, + { + "epoch": 0.20338912698330341, + "grad_norm": 1.636496901512146, + "learning_rate": 0.0002796621250012971, + "loss": 4.379757690429687, + "step": 19600 + }, + { + "epoch": 0.20442682661076925, + "grad_norm": 1.742827296257019, + "learning_rate": 0.0002795583550385505, + "loss": 4.298026733398437, + "step": 19700 + }, + { + "epoch": 0.20546452623823508, + "grad_norm": 1.3360769748687744, + "learning_rate": 0.00027945458507580394, + "loss": 4.443134155273437, + "step": 19800 + }, + { + "epoch": 0.2065022258657009, + "grad_norm": 1.5279536247253418, + "learning_rate": 0.00027935081511305733, + "loss": 4.3536380004882815, + "step": 19900 + }, + { + "epoch": 0.20753992549316674, + "grad_norm": 1.2768709659576416, + "learning_rate": 0.0002792470451503108, + "loss": 4.420497741699219, + "step": 20000 + }, + { + "epoch": 0.20857762512063258, + "grad_norm": 1.1040194034576416, + "learning_rate": 0.0002791432751875642, + "loss": 4.308759155273438, + "step": 20100 + }, + { + "epoch": 0.2096153247480984, + "grad_norm": 1.5710710287094116, + "learning_rate": 0.00027903950522481757, + "loss": 4.188085021972657, + "step": 20200 + }, + { + "epoch": 0.21065302437556424, + "grad_norm": 0.9058725237846375, + "learning_rate": 0.000278935735262071, + "loss": 4.162925720214844, + "step": 20300 + }, + { + "epoch": 0.21169072400303007, + "grad_norm": 2.4681508541107178, + "learning_rate": 0.0002788319652993244, + "loss": 4.207759704589844, + "step": 20400 + }, + { + "epoch": 0.2127284236304959, + "grad_norm": 1.7522861957550049, + "learning_rate": 0.00027872819533657786, + "loss": 4.448352355957031, + "step": 20500 + }, + { + "epoch": 0.21376612325796174, + "grad_norm": 1.8361260890960693, + "learning_rate": 0.00027862442537383126, + "loss": 4.27260986328125, + "step": 20600 + }, + { + "epoch": 0.2148038228854276, + "grad_norm": 1.7720355987548828, + "learning_rate": 0.0002785206554110847, + "loss": 4.315809326171875, + "step": 20700 + }, + { + "epoch": 0.21584152251289343, + "grad_norm": 2.2454731464385986, + "learning_rate": 0.0002784168854483381, + "loss": 4.421763916015625, + "step": 20800 + }, + { + "epoch": 0.21687922214035926, + "grad_norm": 2.7393276691436768, + "learning_rate": 0.0002783131154855915, + "loss": 4.268560791015625, + "step": 20900 + }, + { + "epoch": 0.2179169217678251, + "grad_norm": 1.8933848142623901, + "learning_rate": 0.00027820934552284495, + "loss": 4.316322937011718, + "step": 21000 + }, + { + "epoch": 0.21895462139529093, + "grad_norm": 1.2294155359268188, + "learning_rate": 0.00027810557556009834, + "loss": 4.247787780761719, + "step": 21100 + }, + { + "epoch": 0.21999232102275676, + "grad_norm": 1.5950024127960205, + "learning_rate": 0.0002780018055973518, + "loss": 4.292718811035156, + "step": 21200 + }, + { + "epoch": 0.2210300206502226, + "grad_norm": 0.9710947275161743, + "learning_rate": 0.0002778980356346052, + "loss": 4.238976135253906, + "step": 21300 + }, + { + "epoch": 0.22206772027768842, + "grad_norm": 1.3599995374679565, + "learning_rate": 0.0002777942656718586, + "loss": 4.441769409179687, + "step": 21400 + }, + { + "epoch": 0.22310541990515426, + "grad_norm": 1.2248610258102417, + "learning_rate": 0.00027769049570911203, + "loss": 4.34153564453125, + "step": 21500 + }, + { + "epoch": 0.2241431195326201, + "grad_norm": 1.07679283618927, + "learning_rate": 0.0002775867257463654, + "loss": 4.307798767089844, + "step": 21600 + }, + { + "epoch": 0.22518081916008592, + "grad_norm": 2.6134791374206543, + "learning_rate": 0.0002774829557836189, + "loss": 4.170127868652344, + "step": 21700 + }, + { + "epoch": 0.22621851878755175, + "grad_norm": 3.8844735622406006, + "learning_rate": 0.00027737918582087227, + "loss": 4.2596041870117185, + "step": 21800 + }, + { + "epoch": 0.22725621841501759, + "grad_norm": 3.4798216819763184, + "learning_rate": 0.00027727541585812566, + "loss": 4.257220153808594, + "step": 21900 + }, + { + "epoch": 0.22829391804248342, + "grad_norm": 1.0172936916351318, + "learning_rate": 0.0002771716458953791, + "loss": 4.342347717285156, + "step": 22000 + }, + { + "epoch": 0.22933161766994925, + "grad_norm": 2.0007245540618896, + "learning_rate": 0.0002770678759326325, + "loss": 4.21951171875, + "step": 22100 + }, + { + "epoch": 0.23036931729741508, + "grad_norm": 1.0652577877044678, + "learning_rate": 0.00027696410596988596, + "loss": 4.309334411621093, + "step": 22200 + }, + { + "epoch": 0.23140701692488092, + "grad_norm": 1.0696879625320435, + "learning_rate": 0.00027686033600713935, + "loss": 4.333943481445313, + "step": 22300 + }, + { + "epoch": 0.23244471655234675, + "grad_norm": 1.0693758726119995, + "learning_rate": 0.00027675656604439275, + "loss": 4.325413513183594, + "step": 22400 + }, + { + "epoch": 0.23348241617981258, + "grad_norm": 1.3958321809768677, + "learning_rate": 0.00027665279608164614, + "loss": 4.1349484252929685, + "step": 22500 + }, + { + "epoch": 0.2345201158072784, + "grad_norm": 1.732444167137146, + "learning_rate": 0.0002765490261188996, + "loss": 4.191957397460937, + "step": 22600 + }, + { + "epoch": 0.23555781543474427, + "grad_norm": 1.329959750175476, + "learning_rate": 0.00027644525615615304, + "loss": 4.440416870117187, + "step": 22700 + }, + { + "epoch": 0.2365955150622101, + "grad_norm": 1.4088762998580933, + "learning_rate": 0.00027634148619340643, + "loss": 4.128535461425781, + "step": 22800 + }, + { + "epoch": 0.23763321468967594, + "grad_norm": 1.167936086654663, + "learning_rate": 0.0002762377162306599, + "loss": 4.3338143920898435, + "step": 22900 + }, + { + "epoch": 0.23867091431714177, + "grad_norm": 1.1570918560028076, + "learning_rate": 0.0002761339462679133, + "loss": 4.180432739257813, + "step": 23000 + }, + { + "epoch": 0.2397086139446076, + "grad_norm": 1.2544199228286743, + "learning_rate": 0.00027603017630516667, + "loss": 4.1538671875, + "step": 23100 + }, + { + "epoch": 0.24074631357207343, + "grad_norm": 1.844802975654602, + "learning_rate": 0.0002759264063424201, + "loss": 4.238400268554687, + "step": 23200 + }, + { + "epoch": 0.24178401319953927, + "grad_norm": 2.407107353210449, + "learning_rate": 0.0002758226363796735, + "loss": 4.1402197265625, + "step": 23300 + }, + { + "epoch": 0.2428217128270051, + "grad_norm": 1.7526997327804565, + "learning_rate": 0.00027571886641692696, + "loss": 4.253873901367188, + "step": 23400 + }, + { + "epoch": 0.24385941245447093, + "grad_norm": 2.1768147945404053, + "learning_rate": 0.00027561509645418036, + "loss": 4.146066589355469, + "step": 23500 + }, + { + "epoch": 0.24489711208193676, + "grad_norm": 1.0545059442520142, + "learning_rate": 0.00027551132649143375, + "loss": 4.199613037109375, + "step": 23600 + }, + { + "epoch": 0.2459348117094026, + "grad_norm": 1.2132643461227417, + "learning_rate": 0.00027540755652868715, + "loss": 4.202657775878906, + "step": 23700 + }, + { + "epoch": 0.24697251133686843, + "grad_norm": 2.1652746200561523, + "learning_rate": 0.0002753037865659406, + "loss": 4.301669311523438, + "step": 23800 + }, + { + "epoch": 0.24801021096433426, + "grad_norm": 1.0687705278396606, + "learning_rate": 0.00027520001660319405, + "loss": 4.310574340820312, + "step": 23900 + }, + { + "epoch": 0.2490479105918001, + "grad_norm": 2.6030638217926025, + "learning_rate": 0.00027509624664044744, + "loss": 4.220720825195312, + "step": 24000 + }, + { + "epoch": 0.25008561021926595, + "grad_norm": 0.9720291495323181, + "learning_rate": 0.00027499247667770084, + "loss": 4.376803283691406, + "step": 24100 + }, + { + "epoch": 0.25112330984673176, + "grad_norm": 1.398289680480957, + "learning_rate": 0.00027488870671495423, + "loss": 4.39901123046875, + "step": 24200 + }, + { + "epoch": 0.2521610094741976, + "grad_norm": 2.2055957317352295, + "learning_rate": 0.0002747849367522077, + "loss": 4.196527709960938, + "step": 24300 + }, + { + "epoch": 0.2531987091016634, + "grad_norm": 2.036271810531616, + "learning_rate": 0.0002746811667894611, + "loss": 4.274451599121094, + "step": 24400 + }, + { + "epoch": 0.2542364087291293, + "grad_norm": 2.6011345386505127, + "learning_rate": 0.0002745773968267145, + "loss": 4.2699462890625, + "step": 24500 + }, + { + "epoch": 0.2552741083565951, + "grad_norm": 1.9660414457321167, + "learning_rate": 0.0002744736268639679, + "loss": 4.2452325439453125, + "step": 24600 + }, + { + "epoch": 0.25631180798406095, + "grad_norm": 1.2747102975845337, + "learning_rate": 0.0002743698569012213, + "loss": 4.348042907714844, + "step": 24700 + }, + { + "epoch": 0.25734950761152675, + "grad_norm": 1.4823510646820068, + "learning_rate": 0.00027426608693847476, + "loss": 4.154461669921875, + "step": 24800 + }, + { + "epoch": 0.2583872072389926, + "grad_norm": 1.6665210723876953, + "learning_rate": 0.00027416231697572816, + "loss": 4.136954956054687, + "step": 24900 + }, + { + "epoch": 0.2594249068664584, + "grad_norm": 1.8465914726257324, + "learning_rate": 0.0002740585470129816, + "loss": 4.296747741699218, + "step": 25000 + }, + { + "epoch": 0.2604626064939243, + "grad_norm": 1.0613303184509277, + "learning_rate": 0.00027395477705023506, + "loss": 4.209448547363281, + "step": 25100 + }, + { + "epoch": 0.2615003061213901, + "grad_norm": 2.3083701133728027, + "learning_rate": 0.00027385100708748845, + "loss": 4.412258911132812, + "step": 25200 + }, + { + "epoch": 0.26253800574885594, + "grad_norm": 1.8509588241577148, + "learning_rate": 0.00027374723712474185, + "loss": 4.171485595703125, + "step": 25300 + }, + { + "epoch": 0.26357570537632175, + "grad_norm": 1.091736078262329, + "learning_rate": 0.00027364346716199524, + "loss": 4.24049560546875, + "step": 25400 + }, + { + "epoch": 0.2646134050037876, + "grad_norm": 1.201401710510254, + "learning_rate": 0.0002735396971992487, + "loss": 4.135834350585937, + "step": 25500 + }, + { + "epoch": 0.2656511046312534, + "grad_norm": 1.5545823574066162, + "learning_rate": 0.0002734359272365021, + "loss": 4.291419677734375, + "step": 25600 + }, + { + "epoch": 0.26668880425871927, + "grad_norm": 1.3560378551483154, + "learning_rate": 0.00027333215727375553, + "loss": 4.236996459960937, + "step": 25700 + }, + { + "epoch": 0.26772650388618513, + "grad_norm": 1.0210782289505005, + "learning_rate": 0.00027322838731100893, + "loss": 4.249810791015625, + "step": 25800 + }, + { + "epoch": 0.26876420351365093, + "grad_norm": 1.3093341588974, + "learning_rate": 0.0002731246173482623, + "loss": 4.195414428710937, + "step": 25900 + }, + { + "epoch": 0.2698019031411168, + "grad_norm": 1.7895358800888062, + "learning_rate": 0.00027302084738551577, + "loss": 4.180751037597656, + "step": 26000 + }, + { + "epoch": 0.2708396027685826, + "grad_norm": 11.451671600341797, + "learning_rate": 0.00027291707742276917, + "loss": 4.157826538085938, + "step": 26100 + }, + { + "epoch": 0.27187730239604846, + "grad_norm": 1.9708665609359741, + "learning_rate": 0.0002728133074600226, + "loss": 4.128204956054687, + "step": 26200 + }, + { + "epoch": 0.27291500202351426, + "grad_norm": 1.2628132104873657, + "learning_rate": 0.000272709537497276, + "loss": 4.281667785644531, + "step": 26300 + }, + { + "epoch": 0.2739527016509801, + "grad_norm": 2.2199666500091553, + "learning_rate": 0.0002726057675345294, + "loss": 4.237691650390625, + "step": 26400 + }, + { + "epoch": 0.27499040127844593, + "grad_norm": 2.815150022506714, + "learning_rate": 0.00027250199757178285, + "loss": 4.080834045410156, + "step": 26500 + }, + { + "epoch": 0.2760281009059118, + "grad_norm": 1.7167062759399414, + "learning_rate": 0.00027239822760903625, + "loss": 4.224625549316406, + "step": 26600 + }, + { + "epoch": 0.2770658005333776, + "grad_norm": 2.769949436187744, + "learning_rate": 0.0002722944576462897, + "loss": 4.3115145874023435, + "step": 26700 + }, + { + "epoch": 0.27810350016084345, + "grad_norm": 1.3523616790771484, + "learning_rate": 0.0002721906876835431, + "loss": 4.356557006835938, + "step": 26800 + }, + { + "epoch": 0.27914119978830926, + "grad_norm": 4.089077949523926, + "learning_rate": 0.00027208691772079654, + "loss": 4.286115112304688, + "step": 26900 + }, + { + "epoch": 0.2801788994157751, + "grad_norm": 1.1650248765945435, + "learning_rate": 0.00027198314775804994, + "loss": 4.335249328613282, + "step": 27000 + }, + { + "epoch": 0.2812165990432409, + "grad_norm": 1.8776350021362305, + "learning_rate": 0.00027187937779530333, + "loss": 4.274792175292969, + "step": 27100 + }, + { + "epoch": 0.2822542986707068, + "grad_norm": 3.665797710418701, + "learning_rate": 0.0002717756078325568, + "loss": 4.347820739746094, + "step": 27200 + }, + { + "epoch": 0.2832919982981726, + "grad_norm": 1.1905182600021362, + "learning_rate": 0.0002716718378698102, + "loss": 4.234444274902343, + "step": 27300 + }, + { + "epoch": 0.28432969792563845, + "grad_norm": 1.2664549350738525, + "learning_rate": 0.0002715680679070636, + "loss": 4.19026123046875, + "step": 27400 + }, + { + "epoch": 0.28536739755310425, + "grad_norm": 1.5952035188674927, + "learning_rate": 0.000271464297944317, + "loss": 4.284921569824219, + "step": 27500 + }, + { + "epoch": 0.2864050971805701, + "grad_norm": 1.5898215770721436, + "learning_rate": 0.0002713605279815704, + "loss": 4.128340759277344, + "step": 27600 + }, + { + "epoch": 0.28744279680803597, + "grad_norm": 1.701250433921814, + "learning_rate": 0.00027125675801882386, + "loss": 4.1064456176757815, + "step": 27700 + }, + { + "epoch": 0.2884804964355018, + "grad_norm": 2.2521140575408936, + "learning_rate": 0.00027115298805607726, + "loss": 4.188478698730469, + "step": 27800 + }, + { + "epoch": 0.28951819606296764, + "grad_norm": 1.428589105606079, + "learning_rate": 0.0002710492180933307, + "loss": 4.172950134277344, + "step": 27900 + }, + { + "epoch": 0.29055589569043344, + "grad_norm": 1.5243910551071167, + "learning_rate": 0.0002709454481305841, + "loss": 4.251683044433594, + "step": 28000 + }, + { + "epoch": 0.2915935953178993, + "grad_norm": 1.285276174545288, + "learning_rate": 0.0002708416781678375, + "loss": 4.291034851074219, + "step": 28100 + }, + { + "epoch": 0.2926312949453651, + "grad_norm": 1.2959215641021729, + "learning_rate": 0.0002707379082050909, + "loss": 4.223204040527344, + "step": 28200 + }, + { + "epoch": 0.29366899457283097, + "grad_norm": 1.9572069644927979, + "learning_rate": 0.00027063413824234434, + "loss": 4.1140069580078125, + "step": 28300 + }, + { + "epoch": 0.29470669420029677, + "grad_norm": 2.5625929832458496, + "learning_rate": 0.0002705303682795978, + "loss": 4.2418734741210935, + "step": 28400 + }, + { + "epoch": 0.29574439382776263, + "grad_norm": 1.657065510749817, + "learning_rate": 0.0002704265983168512, + "loss": 4.2059628295898435, + "step": 28500 + }, + { + "epoch": 0.29678209345522844, + "grad_norm": 1.4735133647918701, + "learning_rate": 0.0002703228283541046, + "loss": 4.232904663085938, + "step": 28600 + }, + { + "epoch": 0.2978197930826943, + "grad_norm": 2.643979549407959, + "learning_rate": 0.000270219058391358, + "loss": 4.151640930175781, + "step": 28700 + }, + { + "epoch": 0.2988574927101601, + "grad_norm": 1.5147004127502441, + "learning_rate": 0.0002701152884286114, + "loss": 4.171849060058594, + "step": 28800 + }, + { + "epoch": 0.29989519233762596, + "grad_norm": 1.4815659523010254, + "learning_rate": 0.00027001151846586487, + "loss": 4.120007019042969, + "step": 28900 + }, + { + "epoch": 0.30093289196509176, + "grad_norm": 3.8772029876708984, + "learning_rate": 0.00026990774850311827, + "loss": 4.113840637207031, + "step": 29000 + }, + { + "epoch": 0.3019705915925576, + "grad_norm": 1.8152740001678467, + "learning_rate": 0.0002698039785403717, + "loss": 4.143219604492187, + "step": 29100 + }, + { + "epoch": 0.30300829122002343, + "grad_norm": 1.3441669940948486, + "learning_rate": 0.0002697002085776251, + "loss": 4.151035461425781, + "step": 29200 + }, + { + "epoch": 0.3040459908474893, + "grad_norm": 2.0656609535217285, + "learning_rate": 0.0002695964386148785, + "loss": 4.229763793945312, + "step": 29300 + }, + { + "epoch": 0.3050836904749551, + "grad_norm": 2.8376095294952393, + "learning_rate": 0.0002694926686521319, + "loss": 4.2303158569335935, + "step": 29400 + }, + { + "epoch": 0.30612139010242095, + "grad_norm": 1.9161107540130615, + "learning_rate": 0.00026938889868938535, + "loss": 4.252763061523438, + "step": 29500 + }, + { + "epoch": 0.3071590897298868, + "grad_norm": 2.1317851543426514, + "learning_rate": 0.0002692851287266388, + "loss": 4.12993408203125, + "step": 29600 + }, + { + "epoch": 0.3081967893573526, + "grad_norm": 2.9762330055236816, + "learning_rate": 0.0002691813587638922, + "loss": 4.347277221679687, + "step": 29700 + }, + { + "epoch": 0.3092344889848185, + "grad_norm": 2.135929584503174, + "learning_rate": 0.0002690775888011456, + "loss": 4.052276611328125, + "step": 29800 + }, + { + "epoch": 0.3102721886122843, + "grad_norm": 1.3577543497085571, + "learning_rate": 0.000268973818838399, + "loss": 4.199589233398438, + "step": 29900 + }, + { + "epoch": 0.31130988823975014, + "grad_norm": 1.2834597826004028, + "learning_rate": 0.00026887004887565243, + "loss": 4.134565734863282, + "step": 30000 + }, + { + "epoch": 0.31234758786721595, + "grad_norm": 2.093669891357422, + "learning_rate": 0.00026876627891290583, + "loss": 4.183307495117187, + "step": 30100 + }, + { + "epoch": 0.3133852874946818, + "grad_norm": 1.1888537406921387, + "learning_rate": 0.0002686625089501593, + "loss": 4.022268371582031, + "step": 30200 + }, + { + "epoch": 0.3144229871221476, + "grad_norm": 1.4640058279037476, + "learning_rate": 0.00026855873898741267, + "loss": 4.191292724609375, + "step": 30300 + }, + { + "epoch": 0.3154606867496135, + "grad_norm": 0.9469636678695679, + "learning_rate": 0.00026845496902466607, + "loss": 4.2131259155273435, + "step": 30400 + }, + { + "epoch": 0.3164983863770793, + "grad_norm": 1.5227535963058472, + "learning_rate": 0.0002683511990619195, + "loss": 4.24783935546875, + "step": 30500 + }, + { + "epoch": 0.31753608600454514, + "grad_norm": 2.524731159210205, + "learning_rate": 0.0002682474290991729, + "loss": 4.206085205078125, + "step": 30600 + }, + { + "epoch": 0.31857378563201094, + "grad_norm": 2.7074637413024902, + "learning_rate": 0.00026814365913642636, + "loss": 3.964501953125, + "step": 30700 + }, + { + "epoch": 0.3196114852594768, + "grad_norm": 2.1479899883270264, + "learning_rate": 0.00026803988917367975, + "loss": 4.121002197265625, + "step": 30800 + }, + { + "epoch": 0.3206491848869426, + "grad_norm": 3.6871800422668457, + "learning_rate": 0.00026793611921093315, + "loss": 4.290604858398438, + "step": 30900 + }, + { + "epoch": 0.32168688451440847, + "grad_norm": 2.0092685222625732, + "learning_rate": 0.0002678323492481866, + "loss": 4.169475708007813, + "step": 31000 + }, + { + "epoch": 0.32272458414187427, + "grad_norm": 1.3000237941741943, + "learning_rate": 0.00026772857928544, + "loss": 4.096010131835937, + "step": 31100 + }, + { + "epoch": 0.32376228376934013, + "grad_norm": 2.161574125289917, + "learning_rate": 0.00026762480932269344, + "loss": 4.249325561523437, + "step": 31200 + }, + { + "epoch": 0.32479998339680594, + "grad_norm": 1.0579701662063599, + "learning_rate": 0.00026752103935994684, + "loss": 4.270779724121094, + "step": 31300 + }, + { + "epoch": 0.3258376830242718, + "grad_norm": 1.2264137268066406, + "learning_rate": 0.0002674172693972003, + "loss": 4.2367620849609375, + "step": 31400 + }, + { + "epoch": 0.3268753826517376, + "grad_norm": 3.2623612880706787, + "learning_rate": 0.0002673134994344537, + "loss": 4.160564575195313, + "step": 31500 + }, + { + "epoch": 0.32791308227920346, + "grad_norm": 2.1803345680236816, + "learning_rate": 0.0002672097294717071, + "loss": 4.203829040527344, + "step": 31600 + }, + { + "epoch": 0.3289507819066693, + "grad_norm": 1.9515228271484375, + "learning_rate": 0.0002671059595089605, + "loss": 4.315436706542969, + "step": 31700 + }, + { + "epoch": 0.3299884815341351, + "grad_norm": 3.0683810710906982, + "learning_rate": 0.0002670021895462139, + "loss": 4.155743103027344, + "step": 31800 + }, + { + "epoch": 0.331026181161601, + "grad_norm": 2.6642050743103027, + "learning_rate": 0.00026689841958346737, + "loss": 4.222473754882812, + "step": 31900 + }, + { + "epoch": 0.3320638807890668, + "grad_norm": 1.8333579301834106, + "learning_rate": 0.00026679464962072076, + "loss": 4.210680541992187, + "step": 32000 + }, + { + "epoch": 0.33310158041653265, + "grad_norm": 2.136242151260376, + "learning_rate": 0.00026669087965797416, + "loss": 4.144779357910156, + "step": 32100 + }, + { + "epoch": 0.33413928004399845, + "grad_norm": 0.9694802165031433, + "learning_rate": 0.0002665871096952276, + "loss": 4.1770632934570315, + "step": 32200 + }, + { + "epoch": 0.3351769796714643, + "grad_norm": 2.070678949356079, + "learning_rate": 0.000266483339732481, + "loss": 4.140425415039062, + "step": 32300 + }, + { + "epoch": 0.3362146792989301, + "grad_norm": 1.3420311212539673, + "learning_rate": 0.00026637956976973445, + "loss": 4.117745361328125, + "step": 32400 + }, + { + "epoch": 0.337252378926396, + "grad_norm": 1.7498325109481812, + "learning_rate": 0.00026627579980698785, + "loss": 4.090622253417969, + "step": 32500 + }, + { + "epoch": 0.3382900785538618, + "grad_norm": 5.7661848068237305, + "learning_rate": 0.00026617202984424124, + "loss": 4.212898864746093, + "step": 32600 + }, + { + "epoch": 0.33932777818132764, + "grad_norm": 1.856246829032898, + "learning_rate": 0.0002660682598814947, + "loss": 4.20351806640625, + "step": 32700 + }, + { + "epoch": 0.34036547780879345, + "grad_norm": 5.002403259277344, + "learning_rate": 0.0002659644899187481, + "loss": 4.095009765625, + "step": 32800 + }, + { + "epoch": 0.3414031774362593, + "grad_norm": 1.0896239280700684, + "learning_rate": 0.00026586071995600153, + "loss": 4.061651000976562, + "step": 32900 + }, + { + "epoch": 0.3424408770637251, + "grad_norm": 1.4536166191101074, + "learning_rate": 0.00026575694999325493, + "loss": 4.05192626953125, + "step": 33000 + }, + { + "epoch": 0.343478576691191, + "grad_norm": 3.966247081756592, + "learning_rate": 0.0002656531800305084, + "loss": 4.122060241699219, + "step": 33100 + }, + { + "epoch": 0.3445162763186568, + "grad_norm": 2.3092470169067383, + "learning_rate": 0.00026554941006776177, + "loss": 4.171341247558594, + "step": 33200 + }, + { + "epoch": 0.34555397594612264, + "grad_norm": 1.6187312602996826, + "learning_rate": 0.00026544564010501517, + "loss": 4.147681579589844, + "step": 33300 + }, + { + "epoch": 0.34659167557358844, + "grad_norm": 1.4459052085876465, + "learning_rate": 0.0002653418701422686, + "loss": 4.12395751953125, + "step": 33400 + }, + { + "epoch": 0.3476293752010543, + "grad_norm": 1.6370753049850464, + "learning_rate": 0.000265238100179522, + "loss": 4.043997192382813, + "step": 33500 + }, + { + "epoch": 0.34866707482852016, + "grad_norm": 2.5965089797973633, + "learning_rate": 0.00026513433021677546, + "loss": 4.149281311035156, + "step": 33600 + }, + { + "epoch": 0.34970477445598597, + "grad_norm": 1.4466602802276611, + "learning_rate": 0.00026503056025402885, + "loss": 4.153418884277344, + "step": 33700 + }, + { + "epoch": 0.3507424740834518, + "grad_norm": 1.1217280626296997, + "learning_rate": 0.00026492679029128225, + "loss": 4.173803405761719, + "step": 33800 + }, + { + "epoch": 0.35178017371091763, + "grad_norm": 2.853686809539795, + "learning_rate": 0.00026482302032853564, + "loss": 4.1212451171875, + "step": 33900 + }, + { + "epoch": 0.3528178733383835, + "grad_norm": 1.1508560180664062, + "learning_rate": 0.0002647192503657891, + "loss": 4.179091186523437, + "step": 34000 + }, + { + "epoch": 0.3538555729658493, + "grad_norm": 1.8668493032455444, + "learning_rate": 0.00026461548040304254, + "loss": 4.142960205078125, + "step": 34100 + }, + { + "epoch": 0.35489327259331516, + "grad_norm": 1.7272940874099731, + "learning_rate": 0.00026451171044029594, + "loss": 4.127975769042969, + "step": 34200 + }, + { + "epoch": 0.35593097222078096, + "grad_norm": 1.5529290437698364, + "learning_rate": 0.00026440794047754933, + "loss": 4.2190853881835935, + "step": 34300 + }, + { + "epoch": 0.3569686718482468, + "grad_norm": 1.506499171257019, + "learning_rate": 0.0002643041705148027, + "loss": 4.168932800292969, + "step": 34400 + }, + { + "epoch": 0.3580063714757126, + "grad_norm": 1.2258543968200684, + "learning_rate": 0.0002642004005520562, + "loss": 4.065081176757812, + "step": 34500 + }, + { + "epoch": 0.3590440711031785, + "grad_norm": 1.4408226013183594, + "learning_rate": 0.0002640966305893096, + "loss": 4.102992858886719, + "step": 34600 + }, + { + "epoch": 0.3600817707306443, + "grad_norm": 2.467862844467163, + "learning_rate": 0.000263992860626563, + "loss": 4.061658935546875, + "step": 34700 + }, + { + "epoch": 0.36111947035811015, + "grad_norm": 1.3214993476867676, + "learning_rate": 0.0002638890906638164, + "loss": 4.133033752441406, + "step": 34800 + }, + { + "epoch": 0.36215716998557596, + "grad_norm": 1.2223659753799438, + "learning_rate": 0.0002637853207010698, + "loss": 4.0944091796875, + "step": 34900 + }, + { + "epoch": 0.3631948696130418, + "grad_norm": 1.5864417552947998, + "learning_rate": 0.00026368155073832326, + "loss": 4.031897277832031, + "step": 35000 + }, + { + "epoch": 0.3642325692405076, + "grad_norm": 3.021804094314575, + "learning_rate": 0.00026357778077557665, + "loss": 4.253480224609375, + "step": 35100 + }, + { + "epoch": 0.3652702688679735, + "grad_norm": 2.419196844100952, + "learning_rate": 0.0002634740108128301, + "loss": 4.060654602050781, + "step": 35200 + }, + { + "epoch": 0.3663079684954393, + "grad_norm": 3.106058359146118, + "learning_rate": 0.00026337024085008355, + "loss": 4.167652282714844, + "step": 35300 + }, + { + "epoch": 0.36734566812290514, + "grad_norm": 2.6082842350006104, + "learning_rate": 0.00026326647088733695, + "loss": 4.126443481445312, + "step": 35400 + }, + { + "epoch": 0.368383367750371, + "grad_norm": 3.2292778491973877, + "learning_rate": 0.00026316270092459034, + "loss": 4.16947509765625, + "step": 35500 + }, + { + "epoch": 0.3694210673778368, + "grad_norm": 3.438127279281616, + "learning_rate": 0.00026305893096184374, + "loss": 4.18126220703125, + "step": 35600 + }, + { + "epoch": 0.37045876700530267, + "grad_norm": 1.1258721351623535, + "learning_rate": 0.0002629551609990972, + "loss": 4.133269348144531, + "step": 35700 + }, + { + "epoch": 0.3714964666327685, + "grad_norm": 2.0176923274993896, + "learning_rate": 0.00026285139103635063, + "loss": 4.000823059082031, + "step": 35800 + }, + { + "epoch": 0.37253416626023433, + "grad_norm": 2.162721872329712, + "learning_rate": 0.00026274762107360403, + "loss": 4.158842163085938, + "step": 35900 + }, + { + "epoch": 0.37357186588770014, + "grad_norm": 1.3159765005111694, + "learning_rate": 0.0002626438511108574, + "loss": 4.156724853515625, + "step": 36000 + }, + { + "epoch": 0.374609565515166, + "grad_norm": 1.8504067659378052, + "learning_rate": 0.0002625400811481108, + "loss": 4.074109191894531, + "step": 36100 + }, + { + "epoch": 0.3756472651426318, + "grad_norm": 1.3491618633270264, + "learning_rate": 0.00026243631118536427, + "loss": 4.117833557128907, + "step": 36200 + }, + { + "epoch": 0.37668496477009766, + "grad_norm": 1.1090528964996338, + "learning_rate": 0.00026233254122261766, + "loss": 4.0473480224609375, + "step": 36300 + }, + { + "epoch": 0.37772266439756347, + "grad_norm": 4.539895057678223, + "learning_rate": 0.0002622287712598711, + "loss": 4.0527517700195315, + "step": 36400 + }, + { + "epoch": 0.37876036402502933, + "grad_norm": 1.792636513710022, + "learning_rate": 0.0002621250012971245, + "loss": 3.9459353637695314, + "step": 36500 + }, + { + "epoch": 0.37979806365249513, + "grad_norm": 2.4098236560821533, + "learning_rate": 0.0002620212313343779, + "loss": 4.144781494140625, + "step": 36600 + }, + { + "epoch": 0.380835763279961, + "grad_norm": 1.8648608922958374, + "learning_rate": 0.00026191746137163135, + "loss": 4.104746704101562, + "step": 36700 + }, + { + "epoch": 0.3818734629074268, + "grad_norm": 2.071338653564453, + "learning_rate": 0.00026181369140888474, + "loss": 4.074734191894532, + "step": 36800 + }, + { + "epoch": 0.38291116253489266, + "grad_norm": 1.3856308460235596, + "learning_rate": 0.0002617099214461382, + "loss": 4.158983154296875, + "step": 36900 + }, + { + "epoch": 0.38394886216235846, + "grad_norm": 2.072495698928833, + "learning_rate": 0.0002616061514833916, + "loss": 4.08581787109375, + "step": 37000 + }, + { + "epoch": 0.3849865617898243, + "grad_norm": 1.3703645467758179, + "learning_rate": 0.00026150238152064504, + "loss": 4.006895446777344, + "step": 37100 + }, + { + "epoch": 0.3860242614172901, + "grad_norm": 2.7975013256073, + "learning_rate": 0.00026139861155789843, + "loss": 4.147843627929688, + "step": 37200 + }, + { + "epoch": 0.387061961044756, + "grad_norm": 3.56386661529541, + "learning_rate": 0.0002612948415951518, + "loss": 4.2793121337890625, + "step": 37300 + }, + { + "epoch": 0.3880996606722218, + "grad_norm": 2.8237593173980713, + "learning_rate": 0.0002611910716324053, + "loss": 4.13215087890625, + "step": 37400 + }, + { + "epoch": 0.38913736029968765, + "grad_norm": 1.2382421493530273, + "learning_rate": 0.00026108730166965867, + "loss": 4.071390991210937, + "step": 37500 + }, + { + "epoch": 0.3901750599271535, + "grad_norm": 1.620809555053711, + "learning_rate": 0.0002609835317069121, + "loss": 4.081386108398437, + "step": 37600 + }, + { + "epoch": 0.3912127595546193, + "grad_norm": 1.5530173778533936, + "learning_rate": 0.0002608797617441655, + "loss": 4.095469970703125, + "step": 37700 + }, + { + "epoch": 0.3922504591820852, + "grad_norm": 2.7742369174957275, + "learning_rate": 0.0002607759917814189, + "loss": 4.096160888671875, + "step": 37800 + }, + { + "epoch": 0.393288158809551, + "grad_norm": 1.0493942499160767, + "learning_rate": 0.00026067222181867236, + "loss": 4.000921936035156, + "step": 37900 + }, + { + "epoch": 0.39432585843701684, + "grad_norm": 4.1348958015441895, + "learning_rate": 0.00026056845185592575, + "loss": 3.991048583984375, + "step": 38000 + }, + { + "epoch": 0.39536355806448265, + "grad_norm": 4.481339454650879, + "learning_rate": 0.0002604646818931792, + "loss": 4.004680786132813, + "step": 38100 + }, + { + "epoch": 0.3964012576919485, + "grad_norm": 1.5849348306655884, + "learning_rate": 0.0002603609119304326, + "loss": 4.127825317382812, + "step": 38200 + }, + { + "epoch": 0.3974389573194143, + "grad_norm": 1.5340007543563843, + "learning_rate": 0.000260257141967686, + "loss": 4.126565551757812, + "step": 38300 + }, + { + "epoch": 0.39847665694688017, + "grad_norm": 1.9388331174850464, + "learning_rate": 0.00026015337200493944, + "loss": 4.147232666015625, + "step": 38400 + }, + { + "epoch": 0.399514356574346, + "grad_norm": 1.4936273097991943, + "learning_rate": 0.00026004960204219284, + "loss": 4.046693115234375, + "step": 38500 + }, + { + "epoch": 0.40055205620181183, + "grad_norm": 1.4128496646881104, + "learning_rate": 0.0002599458320794463, + "loss": 4.027592468261719, + "step": 38600 + }, + { + "epoch": 0.40158975582927764, + "grad_norm": 1.2070266008377075, + "learning_rate": 0.0002598420621166997, + "loss": 3.9974462890625, + "step": 38700 + }, + { + "epoch": 0.4026274554567435, + "grad_norm": 1.0721571445465088, + "learning_rate": 0.0002597382921539531, + "loss": 4.048193054199219, + "step": 38800 + }, + { + "epoch": 0.4036651550842093, + "grad_norm": 4.593639373779297, + "learning_rate": 0.00025963452219120647, + "loss": 3.9815548706054686, + "step": 38900 + }, + { + "epoch": 0.40470285471167516, + "grad_norm": 2.84889817237854, + "learning_rate": 0.0002595307522284599, + "loss": 4.118370666503906, + "step": 39000 + }, + { + "epoch": 0.40574055433914097, + "grad_norm": 1.6757389307022095, + "learning_rate": 0.00025942698226571337, + "loss": 4.095942077636718, + "step": 39100 + }, + { + "epoch": 0.40677825396660683, + "grad_norm": 3.5596885681152344, + "learning_rate": 0.00025932321230296676, + "loss": 4.0965576171875, + "step": 39200 + }, + { + "epoch": 0.40781595359407263, + "grad_norm": 1.0558372735977173, + "learning_rate": 0.0002592194423402202, + "loss": 4.239440307617188, + "step": 39300 + }, + { + "epoch": 0.4088536532215385, + "grad_norm": 5.334078311920166, + "learning_rate": 0.0002591156723774736, + "loss": 4.089285888671875, + "step": 39400 + }, + { + "epoch": 0.40989135284900435, + "grad_norm": 2.4086287021636963, + "learning_rate": 0.000259011902414727, + "loss": 4.103414611816406, + "step": 39500 + }, + { + "epoch": 0.41092905247647016, + "grad_norm": 4.432836055755615, + "learning_rate": 0.00025890813245198045, + "loss": 4.0577630615234375, + "step": 39600 + }, + { + "epoch": 0.411966752103936, + "grad_norm": 1.3129891157150269, + "learning_rate": 0.00025880436248923384, + "loss": 4.128912353515625, + "step": 39700 + }, + { + "epoch": 0.4130044517314018, + "grad_norm": 2.148174524307251, + "learning_rate": 0.0002587005925264873, + "loss": 4.197516174316406, + "step": 39800 + }, + { + "epoch": 0.4140421513588677, + "grad_norm": 6.447707176208496, + "learning_rate": 0.0002585968225637407, + "loss": 4.087812805175782, + "step": 39900 + }, + { + "epoch": 0.4150798509863335, + "grad_norm": 2.721989393234253, + "learning_rate": 0.0002584930526009941, + "loss": 3.9460833740234373, + "step": 40000 + }, + { + "epoch": 0.41611755061379935, + "grad_norm": 1.543135166168213, + "learning_rate": 0.0002583892826382475, + "loss": 4.02151611328125, + "step": 40100 + }, + { + "epoch": 0.41715525024126515, + "grad_norm": 1.4670268297195435, + "learning_rate": 0.0002582855126755009, + "loss": 4.18778564453125, + "step": 40200 + }, + { + "epoch": 0.418192949868731, + "grad_norm": 3.8556268215179443, + "learning_rate": 0.0002581817427127544, + "loss": 3.996910400390625, + "step": 40300 + }, + { + "epoch": 0.4192306494961968, + "grad_norm": 1.702594518661499, + "learning_rate": 0.00025807797275000777, + "loss": 4.031709594726562, + "step": 40400 + }, + { + "epoch": 0.4202683491236627, + "grad_norm": 1.2531317472457886, + "learning_rate": 0.00025797420278726117, + "loss": 4.188993835449219, + "step": 40500 + }, + { + "epoch": 0.4213060487511285, + "grad_norm": 2.5484142303466797, + "learning_rate": 0.00025787043282451456, + "loss": 4.031621398925782, + "step": 40600 + }, + { + "epoch": 0.42234374837859434, + "grad_norm": 1.823457956314087, + "learning_rate": 0.000257766662861768, + "loss": 4.001983032226563, + "step": 40700 + }, + { + "epoch": 0.42338144800606015, + "grad_norm": 1.9530704021453857, + "learning_rate": 0.0002576628928990214, + "loss": 4.030978088378906, + "step": 40800 + }, + { + "epoch": 0.424419147633526, + "grad_norm": 4.55501127243042, + "learning_rate": 0.00025755912293627485, + "loss": 4.062133178710938, + "step": 40900 + }, + { + "epoch": 0.4254568472609918, + "grad_norm": 1.9799492359161377, + "learning_rate": 0.00025745535297352825, + "loss": 3.9875259399414062, + "step": 41000 + }, + { + "epoch": 0.42649454688845767, + "grad_norm": 2.4329614639282227, + "learning_rate": 0.00025735158301078164, + "loss": 3.9634893798828124, + "step": 41100 + }, + { + "epoch": 0.4275322465159235, + "grad_norm": 1.3791182041168213, + "learning_rate": 0.0002572478130480351, + "loss": 4.171094055175781, + "step": 41200 + }, + { + "epoch": 0.42856994614338934, + "grad_norm": 1.4852691888809204, + "learning_rate": 0.0002571440430852885, + "loss": 4.059336547851562, + "step": 41300 + }, + { + "epoch": 0.4296076457708552, + "grad_norm": 2.191392183303833, + "learning_rate": 0.00025704027312254194, + "loss": 3.9574560546875, + "step": 41400 + }, + { + "epoch": 0.430645345398321, + "grad_norm": 3.4423017501831055, + "learning_rate": 0.0002569365031597954, + "loss": 3.990745849609375, + "step": 41500 + }, + { + "epoch": 0.43168304502578686, + "grad_norm": 2.979930877685547, + "learning_rate": 0.0002568327331970488, + "loss": 4.166605529785156, + "step": 41600 + }, + { + "epoch": 0.43272074465325266, + "grad_norm": 3.131230354309082, + "learning_rate": 0.0002567289632343022, + "loss": 4.026178894042968, + "step": 41700 + }, + { + "epoch": 0.4337584442807185, + "grad_norm": 1.578643798828125, + "learning_rate": 0.00025662519327155557, + "loss": 4.10739990234375, + "step": 41800 + }, + { + "epoch": 0.43479614390818433, + "grad_norm": 3.628096580505371, + "learning_rate": 0.000256521423308809, + "loss": 4.021985473632813, + "step": 41900 + }, + { + "epoch": 0.4358338435356502, + "grad_norm": 2.235994815826416, + "learning_rate": 0.0002564176533460624, + "loss": 4.138570251464844, + "step": 42000 + }, + { + "epoch": 0.436871543163116, + "grad_norm": 3.0459887981414795, + "learning_rate": 0.00025631388338331586, + "loss": 4.139791564941406, + "step": 42100 + }, + { + "epoch": 0.43790924279058185, + "grad_norm": 1.0590101480484009, + "learning_rate": 0.00025621011342056926, + "loss": 4.018776550292968, + "step": 42200 + }, + { + "epoch": 0.43894694241804766, + "grad_norm": 3.5735878944396973, + "learning_rate": 0.00025610634345782265, + "loss": 4.182121887207031, + "step": 42300 + }, + { + "epoch": 0.4399846420455135, + "grad_norm": 1.1051421165466309, + "learning_rate": 0.0002560025734950761, + "loss": 4.086949157714844, + "step": 42400 + }, + { + "epoch": 0.4410223416729793, + "grad_norm": 2.8680758476257324, + "learning_rate": 0.0002558988035323295, + "loss": 4.053037414550781, + "step": 42500 + }, + { + "epoch": 0.4420600413004452, + "grad_norm": 1.6805782318115234, + "learning_rate": 0.00025579503356958294, + "loss": 4.041470947265625, + "step": 42600 + }, + { + "epoch": 0.443097740927911, + "grad_norm": 1.7229841947555542, + "learning_rate": 0.00025569126360683634, + "loss": 4.1356103515625, + "step": 42700 + }, + { + "epoch": 0.44413544055537685, + "grad_norm": 1.4601655006408691, + "learning_rate": 0.00025558749364408973, + "loss": 4.052696533203125, + "step": 42800 + }, + { + "epoch": 0.44517314018284265, + "grad_norm": 1.552959680557251, + "learning_rate": 0.0002554837236813432, + "loss": 4.020947875976563, + "step": 42900 + }, + { + "epoch": 0.4462108398103085, + "grad_norm": 1.3446309566497803, + "learning_rate": 0.0002553799537185966, + "loss": 4.150856018066406, + "step": 43000 + }, + { + "epoch": 0.4472485394377743, + "grad_norm": 3.128110408782959, + "learning_rate": 0.00025527618375585003, + "loss": 4.118401794433594, + "step": 43100 + }, + { + "epoch": 0.4482862390652402, + "grad_norm": 1.328148603439331, + "learning_rate": 0.0002551724137931034, + "loss": 4.073428649902343, + "step": 43200 + }, + { + "epoch": 0.449323938692706, + "grad_norm": 1.5910078287124634, + "learning_rate": 0.00025506864383035687, + "loss": 4.110806579589844, + "step": 43300 + }, + { + "epoch": 0.45036163832017184, + "grad_norm": 1.2686039209365845, + "learning_rate": 0.00025496487386761027, + "loss": 4.007551574707032, + "step": 43400 + }, + { + "epoch": 0.4513993379476377, + "grad_norm": 4.290769577026367, + "learning_rate": 0.00025486110390486366, + "loss": 4.068913269042969, + "step": 43500 + }, + { + "epoch": 0.4524370375751035, + "grad_norm": 1.6915346384048462, + "learning_rate": 0.0002547573339421171, + "loss": 4.066489562988282, + "step": 43600 + }, + { + "epoch": 0.45347473720256937, + "grad_norm": 1.3425647020339966, + "learning_rate": 0.0002546535639793705, + "loss": 4.024351806640625, + "step": 43700 + }, + { + "epoch": 0.45451243683003517, + "grad_norm": 4.726262092590332, + "learning_rate": 0.00025454979401662395, + "loss": 4.055924987792968, + "step": 43800 + }, + { + "epoch": 0.45555013645750103, + "grad_norm": 1.3767929077148438, + "learning_rate": 0.00025444602405387735, + "loss": 4.1108706665039065, + "step": 43900 + }, + { + "epoch": 0.45658783608496684, + "grad_norm": 2.199096918106079, + "learning_rate": 0.00025434225409113074, + "loss": 4.032781982421875, + "step": 44000 + }, + { + "epoch": 0.4576255357124327, + "grad_norm": 1.529963731765747, + "learning_rate": 0.0002542384841283842, + "loss": 3.9078250122070313, + "step": 44100 + }, + { + "epoch": 0.4586632353398985, + "grad_norm": 2.381452798843384, + "learning_rate": 0.0002541347141656376, + "loss": 4.1637747192382815, + "step": 44200 + }, + { + "epoch": 0.45970093496736436, + "grad_norm": 1.3512217998504639, + "learning_rate": 0.00025403094420289104, + "loss": 4.1603765869140625, + "step": 44300 + }, + { + "epoch": 0.46073863459483017, + "grad_norm": 1.6877330541610718, + "learning_rate": 0.00025392717424014443, + "loss": 3.9833114624023436, + "step": 44400 + }, + { + "epoch": 0.461776334222296, + "grad_norm": 10.19050121307373, + "learning_rate": 0.0002538234042773978, + "loss": 4.087564086914062, + "step": 44500 + }, + { + "epoch": 0.46281403384976183, + "grad_norm": 2.2430684566497803, + "learning_rate": 0.0002537196343146512, + "loss": 3.943908386230469, + "step": 44600 + }, + { + "epoch": 0.4638517334772277, + "grad_norm": 1.8005903959274292, + "learning_rate": 0.00025361586435190467, + "loss": 4.026759948730469, + "step": 44700 + }, + { + "epoch": 0.4648894331046935, + "grad_norm": 1.3022342920303345, + "learning_rate": 0.0002535120943891581, + "loss": 4.106507263183594, + "step": 44800 + }, + { + "epoch": 0.46592713273215935, + "grad_norm": 1.1729425191879272, + "learning_rate": 0.0002534083244264115, + "loss": 4.0660693359375, + "step": 44900 + }, + { + "epoch": 0.46696483235962516, + "grad_norm": 1.7224327325820923, + "learning_rate": 0.0002533045544636649, + "loss": 3.9855413818359375, + "step": 45000 + }, + { + "epoch": 0.468002531987091, + "grad_norm": 1.6977527141571045, + "learning_rate": 0.0002532007845009183, + "loss": 3.813612976074219, + "step": 45100 + }, + { + "epoch": 0.4690402316145568, + "grad_norm": 2.9529614448547363, + "learning_rate": 0.00025309701453817175, + "loss": 3.995145263671875, + "step": 45200 + }, + { + "epoch": 0.4700779312420227, + "grad_norm": 3.1997270584106445, + "learning_rate": 0.0002529932445754252, + "loss": 4.031595153808594, + "step": 45300 + }, + { + "epoch": 0.47111563086948854, + "grad_norm": 5.878026008605957, + "learning_rate": 0.0002528894746126786, + "loss": 4.028975524902344, + "step": 45400 + }, + { + "epoch": 0.47215333049695435, + "grad_norm": 1.7146035432815552, + "learning_rate": 0.00025278570464993205, + "loss": 4.085393676757812, + "step": 45500 + }, + { + "epoch": 0.4731910301244202, + "grad_norm": 2.954148292541504, + "learning_rate": 0.00025268193468718544, + "loss": 4.039700622558594, + "step": 45600 + }, + { + "epoch": 0.474228729751886, + "grad_norm": 1.9127237796783447, + "learning_rate": 0.00025257816472443883, + "loss": 4.100406494140625, + "step": 45700 + }, + { + "epoch": 0.4752664293793519, + "grad_norm": 1.8794509172439575, + "learning_rate": 0.00025247439476169223, + "loss": 3.9390939331054686, + "step": 45800 + }, + { + "epoch": 0.4763041290068177, + "grad_norm": 2.165816307067871, + "learning_rate": 0.0002523706247989457, + "loss": 4.155856628417968, + "step": 45900 + }, + { + "epoch": 0.47734182863428354, + "grad_norm": 6.686591148376465, + "learning_rate": 0.00025226685483619913, + "loss": 4.097453918457031, + "step": 46000 + }, + { + "epoch": 0.47837952826174934, + "grad_norm": 2.4973371028900146, + "learning_rate": 0.0002521630848734525, + "loss": 4.200291137695313, + "step": 46100 + }, + { + "epoch": 0.4794172278892152, + "grad_norm": 2.1478147506713867, + "learning_rate": 0.0002520593149107059, + "loss": 3.899898681640625, + "step": 46200 + }, + { + "epoch": 0.480454927516681, + "grad_norm": 1.6290667057037354, + "learning_rate": 0.0002519555449479593, + "loss": 4.157419128417969, + "step": 46300 + }, + { + "epoch": 0.48149262714414687, + "grad_norm": 2.3697171211242676, + "learning_rate": 0.00025185177498521276, + "loss": 4.0068753051757815, + "step": 46400 + }, + { + "epoch": 0.48253032677161267, + "grad_norm": 3.123157501220703, + "learning_rate": 0.00025174800502246616, + "loss": 3.9923574829101565, + "step": 46500 + }, + { + "epoch": 0.48356802639907853, + "grad_norm": 3.4272193908691406, + "learning_rate": 0.0002516442350597196, + "loss": 4.144463195800781, + "step": 46600 + }, + { + "epoch": 0.48460572602654434, + "grad_norm": 2.8348467350006104, + "learning_rate": 0.000251540465096973, + "loss": 4.055748291015625, + "step": 46700 + }, + { + "epoch": 0.4856434256540102, + "grad_norm": 3.0261967182159424, + "learning_rate": 0.0002514366951342264, + "loss": 4.177880554199219, + "step": 46800 + }, + { + "epoch": 0.486681125281476, + "grad_norm": 10.726264953613281, + "learning_rate": 0.00025133292517147984, + "loss": 3.9125796508789064, + "step": 46900 + }, + { + "epoch": 0.48771882490894186, + "grad_norm": 8.811136245727539, + "learning_rate": 0.00025122915520873324, + "loss": 3.9216848754882814, + "step": 47000 + }, + { + "epoch": 0.48875652453640767, + "grad_norm": 6.8598151206970215, + "learning_rate": 0.0002511253852459867, + "loss": 3.9738433837890623, + "step": 47100 + }, + { + "epoch": 0.4897942241638735, + "grad_norm": 5.096536636352539, + "learning_rate": 0.0002510216152832401, + "loss": 3.998507080078125, + "step": 47200 + }, + { + "epoch": 0.4908319237913394, + "grad_norm": 1.4742202758789062, + "learning_rate": 0.00025091784532049353, + "loss": 4.171350402832031, + "step": 47300 + }, + { + "epoch": 0.4918696234188052, + "grad_norm": 1.88887357711792, + "learning_rate": 0.0002508140753577469, + "loss": 4.106647644042969, + "step": 47400 + }, + { + "epoch": 0.49290732304627105, + "grad_norm": 1.6502625942230225, + "learning_rate": 0.0002507103053950003, + "loss": 3.877885437011719, + "step": 47500 + }, + { + "epoch": 0.49394502267373686, + "grad_norm": 1.728053331375122, + "learning_rate": 0.00025060653543225377, + "loss": 4.064427795410157, + "step": 47600 + }, + { + "epoch": 0.4949827223012027, + "grad_norm": 4.632587432861328, + "learning_rate": 0.00025050276546950716, + "loss": 4.113824157714844, + "step": 47700 + }, + { + "epoch": 0.4960204219286685, + "grad_norm": 1.5823708772659302, + "learning_rate": 0.0002503989955067606, + "loss": 4.080696411132813, + "step": 47800 + }, + { + "epoch": 0.4970581215561344, + "grad_norm": 1.9801136255264282, + "learning_rate": 0.000250295225544014, + "loss": 3.945875549316406, + "step": 47900 + }, + { + "epoch": 0.4980958211836002, + "grad_norm": 1.3339368104934692, + "learning_rate": 0.0002501914555812674, + "loss": 3.951331787109375, + "step": 48000 + }, + { + "epoch": 0.49913352081106604, + "grad_norm": 2.1013355255126953, + "learning_rate": 0.00025008768561852085, + "loss": 4.022156372070312, + "step": 48100 + }, + { + "epoch": 0.5001712204385319, + "grad_norm": 2.7022488117218018, + "learning_rate": 0.00024998391565577425, + "loss": 3.9780624389648436, + "step": 48200 + }, + { + "epoch": 0.5012089200659977, + "grad_norm": 10.230494499206543, + "learning_rate": 0.0002498801456930277, + "loss": 4.024637145996094, + "step": 48300 + }, + { + "epoch": 0.5022466196934635, + "grad_norm": 7.242427349090576, + "learning_rate": 0.0002497763757302811, + "loss": 3.9954248046875, + "step": 48400 + }, + { + "epoch": 0.5032843193209293, + "grad_norm": 2.742445945739746, + "learning_rate": 0.0002496726057675345, + "loss": 3.9637130737304687, + "step": 48500 + }, + { + "epoch": 0.5043220189483952, + "grad_norm": 1.6320149898529053, + "learning_rate": 0.00024956883580478794, + "loss": 4.035350952148438, + "step": 48600 + }, + { + "epoch": 0.505359718575861, + "grad_norm": 2.239950180053711, + "learning_rate": 0.00024946506584204133, + "loss": 3.961440124511719, + "step": 48700 + }, + { + "epoch": 0.5063974182033268, + "grad_norm": 6.686822891235352, + "learning_rate": 0.0002493612958792948, + "loss": 4.003260498046875, + "step": 48800 + }, + { + "epoch": 0.5074351178307926, + "grad_norm": 1.9818964004516602, + "learning_rate": 0.0002492575259165482, + "loss": 4.018614501953125, + "step": 48900 + }, + { + "epoch": 0.5084728174582586, + "grad_norm": 1.5698004961013794, + "learning_rate": 0.00024915375595380157, + "loss": 4.045997314453125, + "step": 49000 + }, + { + "epoch": 0.5095105170857244, + "grad_norm": 2.3865158557891846, + "learning_rate": 0.000249049985991055, + "loss": 4.050853576660156, + "step": 49100 + }, + { + "epoch": 0.5105482167131902, + "grad_norm": 14.248946189880371, + "learning_rate": 0.0002489462160283084, + "loss": 3.991949462890625, + "step": 49200 + }, + { + "epoch": 0.5115859163406561, + "grad_norm": 1.279118537902832, + "learning_rate": 0.00024884244606556186, + "loss": 3.92796875, + "step": 49300 + }, + { + "epoch": 0.5126236159681219, + "grad_norm": 2.575704574584961, + "learning_rate": 0.00024873867610281526, + "loss": 4.12865478515625, + "step": 49400 + }, + { + "epoch": 0.5136613155955877, + "grad_norm": 2.0912930965423584, + "learning_rate": 0.0002486349061400687, + "loss": 4.042799682617187, + "step": 49500 + }, + { + "epoch": 0.5146990152230535, + "grad_norm": 2.6358580589294434, + "learning_rate": 0.0002485311361773221, + "loss": 4.069761047363281, + "step": 49600 + }, + { + "epoch": 0.5157367148505194, + "grad_norm": 2.6711385250091553, + "learning_rate": 0.0002484273662145755, + "loss": 3.9823483276367186, + "step": 49700 + }, + { + "epoch": 0.5167744144779852, + "grad_norm": 3.348376989364624, + "learning_rate": 0.00024832359625182894, + "loss": 4.119874572753906, + "step": 49800 + }, + { + "epoch": 0.517812114105451, + "grad_norm": 1.7040736675262451, + "learning_rate": 0.00024821982628908234, + "loss": 4.038002319335938, + "step": 49900 + }, + { + "epoch": 0.5188498137329168, + "grad_norm": 11.144097328186035, + "learning_rate": 0.0002481160563263358, + "loss": 3.933763122558594, + "step": 50000 + }, + { + "epoch": 0.5198875133603827, + "grad_norm": 3.1529595851898193, + "learning_rate": 0.0002480122863635892, + "loss": 3.990421142578125, + "step": 50100 + }, + { + "epoch": 0.5209252129878486, + "grad_norm": 2.3761773109436035, + "learning_rate": 0.0002479085164008426, + "loss": 3.9385421752929686, + "step": 50200 + }, + { + "epoch": 0.5219629126153144, + "grad_norm": 14.909253120422363, + "learning_rate": 0.00024780474643809597, + "loss": 3.924638671875, + "step": 50300 + }, + { + "epoch": 0.5230006122427802, + "grad_norm": 1.4870705604553223, + "learning_rate": 0.0002477009764753494, + "loss": 4.003363037109375, + "step": 50400 + }, + { + "epoch": 0.5240383118702461, + "grad_norm": 2.5456697940826416, + "learning_rate": 0.00024759720651260287, + "loss": 4.063373413085937, + "step": 50500 + }, + { + "epoch": 0.5250760114977119, + "grad_norm": 4.392611980438232, + "learning_rate": 0.00024749343654985627, + "loss": 4.108450927734375, + "step": 50600 + }, + { + "epoch": 0.5261137111251777, + "grad_norm": 2.8420300483703613, + "learning_rate": 0.00024738966658710966, + "loss": 3.9724908447265626, + "step": 50700 + }, + { + "epoch": 0.5271514107526435, + "grad_norm": 2.3819692134857178, + "learning_rate": 0.00024728589662436306, + "loss": 4.040487060546875, + "step": 50800 + }, + { + "epoch": 0.5281891103801094, + "grad_norm": 2.1021909713745117, + "learning_rate": 0.0002471821266616165, + "loss": 4.101463623046875, + "step": 50900 + }, + { + "epoch": 0.5292268100075752, + "grad_norm": 2.8605117797851562, + "learning_rate": 0.00024707835669886995, + "loss": 3.9426974487304687, + "step": 51000 + }, + { + "epoch": 0.530264509635041, + "grad_norm": 1.331457257270813, + "learning_rate": 0.00024697458673612335, + "loss": 4.005464172363281, + "step": 51100 + }, + { + "epoch": 0.5313022092625068, + "grad_norm": 2.4866714477539062, + "learning_rate": 0.00024687081677337674, + "loss": 4.089916687011719, + "step": 51200 + }, + { + "epoch": 0.5323399088899727, + "grad_norm": 6.342608451843262, + "learning_rate": 0.00024676704681063014, + "loss": 3.979620361328125, + "step": 51300 + }, + { + "epoch": 0.5333776085174385, + "grad_norm": 1.3954708576202393, + "learning_rate": 0.0002466632768478836, + "loss": 3.9805123901367185, + "step": 51400 + }, + { + "epoch": 0.5344153081449043, + "grad_norm": 24.8520450592041, + "learning_rate": 0.000246559506885137, + "loss": 4.0105502319335935, + "step": 51500 + }, + { + "epoch": 0.5354530077723703, + "grad_norm": 2.0366039276123047, + "learning_rate": 0.00024645573692239043, + "loss": 3.919516296386719, + "step": 51600 + }, + { + "epoch": 0.5364907073998361, + "grad_norm": 1.3017858266830444, + "learning_rate": 0.0002463519669596439, + "loss": 3.951867980957031, + "step": 51700 + }, + { + "epoch": 0.5375284070273019, + "grad_norm": 2.579885244369507, + "learning_rate": 0.0002462481969968973, + "loss": 3.960545959472656, + "step": 51800 + }, + { + "epoch": 0.5385661066547677, + "grad_norm": 1.5787100791931152, + "learning_rate": 0.00024614442703415067, + "loss": 4.013999938964844, + "step": 51900 + }, + { + "epoch": 0.5396038062822336, + "grad_norm": 3.9871633052825928, + "learning_rate": 0.00024604065707140406, + "loss": 3.950070495605469, + "step": 52000 + }, + { + "epoch": 0.5406415059096994, + "grad_norm": 1.572277545928955, + "learning_rate": 0.0002459368871086575, + "loss": 4.086417846679687, + "step": 52100 + }, + { + "epoch": 0.5416792055371652, + "grad_norm": 7.029146671295166, + "learning_rate": 0.0002458331171459109, + "loss": 3.8767724609375, + "step": 52200 + }, + { + "epoch": 0.542716905164631, + "grad_norm": 1.2442755699157715, + "learning_rate": 0.00024572934718316436, + "loss": 3.875315856933594, + "step": 52300 + }, + { + "epoch": 0.5437546047920969, + "grad_norm": 3.5381152629852295, + "learning_rate": 0.00024562557722041775, + "loss": 4.013727416992188, + "step": 52400 + }, + { + "epoch": 0.5447923044195627, + "grad_norm": 16.472898483276367, + "learning_rate": 0.00024552180725767115, + "loss": 4.058722839355469, + "step": 52500 + }, + { + "epoch": 0.5458300040470285, + "grad_norm": 1.4836983680725098, + "learning_rate": 0.0002454180372949246, + "loss": 4.106039123535156, + "step": 52600 + }, + { + "epoch": 0.5468677036744943, + "grad_norm": 4.735908031463623, + "learning_rate": 0.000245314267332178, + "loss": 4.109900817871094, + "step": 52700 + }, + { + "epoch": 0.5479054033019602, + "grad_norm": 1.7438913583755493, + "learning_rate": 0.00024521049736943144, + "loss": 4.098789978027344, + "step": 52800 + }, + { + "epoch": 0.548943102929426, + "grad_norm": 3.592564105987549, + "learning_rate": 0.00024510672740668483, + "loss": 3.9866278076171877, + "step": 52900 + }, + { + "epoch": 0.5499808025568919, + "grad_norm": 1.9763888120651245, + "learning_rate": 0.00024500295744393823, + "loss": 3.9620831298828123, + "step": 53000 + }, + { + "epoch": 0.5510185021843577, + "grad_norm": 1.0539793968200684, + "learning_rate": 0.0002448991874811917, + "loss": 4.006460266113281, + "step": 53100 + }, + { + "epoch": 0.5520562018118236, + "grad_norm": 2.2474358081817627, + "learning_rate": 0.00024479541751844507, + "loss": 4.067258605957031, + "step": 53200 + }, + { + "epoch": 0.5530939014392894, + "grad_norm": 1.5785913467407227, + "learning_rate": 0.0002446916475556985, + "loss": 4.057683715820312, + "step": 53300 + }, + { + "epoch": 0.5541316010667552, + "grad_norm": 2.2754416465759277, + "learning_rate": 0.0002445878775929519, + "loss": 3.9662628173828125, + "step": 53400 + }, + { + "epoch": 0.5551693006942211, + "grad_norm": 2.0118043422698975, + "learning_rate": 0.00024448410763020537, + "loss": 3.9848583984375, + "step": 53500 + }, + { + "epoch": 0.5562070003216869, + "grad_norm": 2.3987770080566406, + "learning_rate": 0.00024438033766745876, + "loss": 4.00030029296875, + "step": 53600 + }, + { + "epoch": 0.5572446999491527, + "grad_norm": 2.9198148250579834, + "learning_rate": 0.00024427656770471216, + "loss": 3.8882846069335937, + "step": 53700 + }, + { + "epoch": 0.5582823995766185, + "grad_norm": 2.0234696865081787, + "learning_rate": 0.0002441727977419656, + "loss": 3.9845794677734374, + "step": 53800 + }, + { + "epoch": 0.5593200992040844, + "grad_norm": 1.701568841934204, + "learning_rate": 0.000244069027779219, + "loss": 4.01090087890625, + "step": 53900 + }, + { + "epoch": 0.5603577988315502, + "grad_norm": 2.3093771934509277, + "learning_rate": 0.00024396525781647242, + "loss": 3.9678195190429686, + "step": 54000 + }, + { + "epoch": 0.561395498459016, + "grad_norm": 2.0182909965515137, + "learning_rate": 0.00024386148785372582, + "loss": 4.025320434570313, + "step": 54100 + }, + { + "epoch": 0.5624331980864818, + "grad_norm": 3.1341028213500977, + "learning_rate": 0.00024375771789097927, + "loss": 3.9826446533203126, + "step": 54200 + }, + { + "epoch": 0.5634708977139478, + "grad_norm": 2.025581121444702, + "learning_rate": 0.0002436539479282327, + "loss": 3.906527404785156, + "step": 54300 + }, + { + "epoch": 0.5645085973414136, + "grad_norm": 2.913895845413208, + "learning_rate": 0.00024355017796548608, + "loss": 3.970755920410156, + "step": 54400 + }, + { + "epoch": 0.5655462969688794, + "grad_norm": 1.9220850467681885, + "learning_rate": 0.0002434464080027395, + "loss": 3.943621826171875, + "step": 54500 + }, + { + "epoch": 0.5665839965963452, + "grad_norm": 1.2168983221054077, + "learning_rate": 0.0002433426380399929, + "loss": 3.9780545043945312, + "step": 54600 + }, + { + "epoch": 0.5676216962238111, + "grad_norm": 1.5367380380630493, + "learning_rate": 0.00024323886807724635, + "loss": 3.8468157958984377, + "step": 54700 + }, + { + "epoch": 0.5686593958512769, + "grad_norm": 2.7281689643859863, + "learning_rate": 0.00024313509811449977, + "loss": 3.9043319702148436, + "step": 54800 + }, + { + "epoch": 0.5696970954787427, + "grad_norm": 1.1875724792480469, + "learning_rate": 0.00024303132815175316, + "loss": 4.029020385742188, + "step": 54900 + }, + { + "epoch": 0.5707347951062085, + "grad_norm": 9.087173461914062, + "learning_rate": 0.00024292755818900659, + "loss": 3.977708740234375, + "step": 55000 + }, + { + "epoch": 0.5717724947336744, + "grad_norm": 1.94620943069458, + "learning_rate": 0.00024282378822626, + "loss": 3.9465988159179686, + "step": 55100 + }, + { + "epoch": 0.5728101943611402, + "grad_norm": 3.0396885871887207, + "learning_rate": 0.00024272001826351343, + "loss": 4.030888366699219, + "step": 55200 + }, + { + "epoch": 0.573847893988606, + "grad_norm": 1.557199239730835, + "learning_rate": 0.00024261624830076682, + "loss": 3.9756591796875, + "step": 55300 + }, + { + "epoch": 0.5748855936160719, + "grad_norm": 3.0625579357147217, + "learning_rate": 0.00024251247833802025, + "loss": 4.076784362792969, + "step": 55400 + }, + { + "epoch": 0.5759232932435377, + "grad_norm": 1.9166301488876343, + "learning_rate": 0.0002424087083752737, + "loss": 3.9604058837890626, + "step": 55500 + }, + { + "epoch": 0.5769609928710036, + "grad_norm": 1.2829216718673706, + "learning_rate": 0.0002423049384125271, + "loss": 3.841531066894531, + "step": 55600 + }, + { + "epoch": 0.5779986924984694, + "grad_norm": 2.9800634384155273, + "learning_rate": 0.0002422011684497805, + "loss": 3.915208435058594, + "step": 55700 + }, + { + "epoch": 0.5790363921259353, + "grad_norm": 4.931972026824951, + "learning_rate": 0.0002420973984870339, + "loss": 3.7610516357421875, + "step": 55800 + }, + { + "epoch": 0.5800740917534011, + "grad_norm": 3.796473264694214, + "learning_rate": 0.00024199362852428733, + "loss": 4.009695129394531, + "step": 55900 + }, + { + "epoch": 0.5811117913808669, + "grad_norm": 2.3635172843933105, + "learning_rate": 0.00024188985856154075, + "loss": 4.164959716796875, + "step": 56000 + }, + { + "epoch": 0.5821494910083327, + "grad_norm": 2.3295187950134277, + "learning_rate": 0.00024178608859879417, + "loss": 4.012393493652343, + "step": 56100 + }, + { + "epoch": 0.5831871906357986, + "grad_norm": 3.1501762866973877, + "learning_rate": 0.0002416823186360476, + "loss": 3.9226104736328127, + "step": 56200 + }, + { + "epoch": 0.5842248902632644, + "grad_norm": 2.8185627460479736, + "learning_rate": 0.000241578548673301, + "loss": 3.9830364990234375, + "step": 56300 + }, + { + "epoch": 0.5852625898907302, + "grad_norm": 2.39125657081604, + "learning_rate": 0.00024147477871055444, + "loss": 4.058615112304688, + "step": 56400 + }, + { + "epoch": 0.586300289518196, + "grad_norm": 2.658254623413086, + "learning_rate": 0.00024137100874780783, + "loss": 3.9012820434570314, + "step": 56500 + }, + { + "epoch": 0.5873379891456619, + "grad_norm": 2.873662233352661, + "learning_rate": 0.00024126723878506126, + "loss": 4.018562622070313, + "step": 56600 + }, + { + "epoch": 0.5883756887731277, + "grad_norm": 2.0522000789642334, + "learning_rate": 0.00024116346882231468, + "loss": 4.0417938232421875, + "step": 56700 + }, + { + "epoch": 0.5894133884005935, + "grad_norm": 2.688117742538452, + "learning_rate": 0.00024105969885956807, + "loss": 3.910294494628906, + "step": 56800 + }, + { + "epoch": 0.5904510880280593, + "grad_norm": 3.5324251651763916, + "learning_rate": 0.00024095592889682152, + "loss": 4.042366027832031, + "step": 56900 + }, + { + "epoch": 0.5914887876555253, + "grad_norm": 3.254483461380005, + "learning_rate": 0.00024085215893407492, + "loss": 3.875579833984375, + "step": 57000 + }, + { + "epoch": 0.5925264872829911, + "grad_norm": 1.4469491243362427, + "learning_rate": 0.00024074838897132834, + "loss": 3.8468057250976564, + "step": 57100 + }, + { + "epoch": 0.5935641869104569, + "grad_norm": 7.142496585845947, + "learning_rate": 0.00024064461900858173, + "loss": 3.9028366088867186, + "step": 57200 + }, + { + "epoch": 0.5946018865379228, + "grad_norm": 2.8328020572662354, + "learning_rate": 0.00024054084904583518, + "loss": 4.013849182128906, + "step": 57300 + }, + { + "epoch": 0.5956395861653886, + "grad_norm": 1.999799370765686, + "learning_rate": 0.0002404370790830886, + "loss": 3.9890103149414062, + "step": 57400 + }, + { + "epoch": 0.5966772857928544, + "grad_norm": 5.142120361328125, + "learning_rate": 0.000240333309120342, + "loss": 3.8782421875, + "step": 57500 + }, + { + "epoch": 0.5977149854203202, + "grad_norm": 2.6170506477355957, + "learning_rate": 0.00024022953915759542, + "loss": 3.9341799926757814, + "step": 57600 + }, + { + "epoch": 0.5987526850477861, + "grad_norm": 4.847115993499756, + "learning_rate": 0.00024012576919484882, + "loss": 4.028234252929687, + "step": 57700 + }, + { + "epoch": 0.5997903846752519, + "grad_norm": 3.093014717102051, + "learning_rate": 0.00024002199923210226, + "loss": 4.02893310546875, + "step": 57800 + }, + { + "epoch": 0.6008280843027177, + "grad_norm": 2.6559977531433105, + "learning_rate": 0.00023991822926935566, + "loss": 3.9997882080078124, + "step": 57900 + }, + { + "epoch": 0.6018657839301835, + "grad_norm": 1.5972485542297363, + "learning_rate": 0.00023981445930660908, + "loss": 3.9749560546875, + "step": 58000 + }, + { + "epoch": 0.6029034835576494, + "grad_norm": 3.777557134628296, + "learning_rate": 0.0002397106893438625, + "loss": 3.9969076538085937, + "step": 58100 + }, + { + "epoch": 0.6039411831851152, + "grad_norm": 1.8903939723968506, + "learning_rate": 0.00023960691938111593, + "loss": 4.007763977050781, + "step": 58200 + }, + { + "epoch": 0.604978882812581, + "grad_norm": 3.150963068008423, + "learning_rate": 0.00023950314941836935, + "loss": 4.019749145507813, + "step": 58300 + }, + { + "epoch": 0.6060165824400469, + "grad_norm": 1.934287190437317, + "learning_rate": 0.00023939937945562274, + "loss": 4.014994812011719, + "step": 58400 + }, + { + "epoch": 0.6070542820675128, + "grad_norm": 7.10530948638916, + "learning_rate": 0.00023929560949287616, + "loss": 4.050195617675781, + "step": 58500 + }, + { + "epoch": 0.6080919816949786, + "grad_norm": 2.367403030395508, + "learning_rate": 0.0002391918395301296, + "loss": 3.8701296997070314, + "step": 58600 + }, + { + "epoch": 0.6091296813224444, + "grad_norm": 1.9392305612564087, + "learning_rate": 0.000239088069567383, + "loss": 4.08440185546875, + "step": 58700 + }, + { + "epoch": 0.6101673809499102, + "grad_norm": 2.5947983264923096, + "learning_rate": 0.00023898429960463643, + "loss": 4.050205078125, + "step": 58800 + }, + { + "epoch": 0.6112050805773761, + "grad_norm": 2.1583032608032227, + "learning_rate": 0.00023888052964188982, + "loss": 3.958690490722656, + "step": 58900 + }, + { + "epoch": 0.6122427802048419, + "grad_norm": 1.6529427766799927, + "learning_rate": 0.00023877675967914325, + "loss": 3.9609234619140623, + "step": 59000 + }, + { + "epoch": 0.6132804798323077, + "grad_norm": 2.0239171981811523, + "learning_rate": 0.00023867298971639667, + "loss": 4.128135986328125, + "step": 59100 + }, + { + "epoch": 0.6143181794597736, + "grad_norm": 3.8679206371307373, + "learning_rate": 0.0002385692197536501, + "loss": 4.005528869628907, + "step": 59200 + }, + { + "epoch": 0.6153558790872394, + "grad_norm": 3.305494785308838, + "learning_rate": 0.0002384654497909035, + "loss": 3.9134161376953127, + "step": 59300 + }, + { + "epoch": 0.6163935787147052, + "grad_norm": 1.640649676322937, + "learning_rate": 0.0002383616798281569, + "loss": 3.92852783203125, + "step": 59400 + }, + { + "epoch": 0.617431278342171, + "grad_norm": 1.7184723615646362, + "learning_rate": 0.00023825790986541036, + "loss": 3.8771322631835936, + "step": 59500 + }, + { + "epoch": 0.618468977969637, + "grad_norm": 2.6886117458343506, + "learning_rate": 0.00023815413990266375, + "loss": 4.047822875976562, + "step": 59600 + }, + { + "epoch": 0.6195066775971028, + "grad_norm": 2.9485394954681396, + "learning_rate": 0.00023805036993991717, + "loss": 4.04974853515625, + "step": 59700 + }, + { + "epoch": 0.6205443772245686, + "grad_norm": 18.998411178588867, + "learning_rate": 0.00023794659997717057, + "loss": 3.978843994140625, + "step": 59800 + }, + { + "epoch": 0.6215820768520344, + "grad_norm": 1.6347628831863403, + "learning_rate": 0.000237842830014424, + "loss": 3.94311279296875, + "step": 59900 + }, + { + "epoch": 0.6226197764795003, + "grad_norm": 4.1301798820495605, + "learning_rate": 0.00023773906005167744, + "loss": 4.044434814453125, + "step": 60000 + }, + { + "epoch": 0.6236574761069661, + "grad_norm": 2.7278170585632324, + "learning_rate": 0.00023763529008893083, + "loss": 3.9771295166015626, + "step": 60100 + }, + { + "epoch": 0.6246951757344319, + "grad_norm": 3.4196488857269287, + "learning_rate": 0.00023753152012618426, + "loss": 3.9663619995117188, + "step": 60200 + }, + { + "epoch": 0.6257328753618977, + "grad_norm": 1.3134477138519287, + "learning_rate": 0.00023742775016343765, + "loss": 4.089789733886719, + "step": 60300 + }, + { + "epoch": 0.6267705749893636, + "grad_norm": 4.490455627441406, + "learning_rate": 0.0002373239802006911, + "loss": 3.87512939453125, + "step": 60400 + }, + { + "epoch": 0.6278082746168294, + "grad_norm": 3.0652222633361816, + "learning_rate": 0.00023722021023794452, + "loss": 3.893270263671875, + "step": 60500 + }, + { + "epoch": 0.6288459742442952, + "grad_norm": 8.751646995544434, + "learning_rate": 0.00023711644027519792, + "loss": 3.862340393066406, + "step": 60600 + }, + { + "epoch": 0.629883673871761, + "grad_norm": 2.9108734130859375, + "learning_rate": 0.00023701267031245134, + "loss": 3.9849557495117187, + "step": 60700 + }, + { + "epoch": 0.630921373499227, + "grad_norm": 2.250643253326416, + "learning_rate": 0.00023690890034970473, + "loss": 3.955241394042969, + "step": 60800 + }, + { + "epoch": 0.6319590731266927, + "grad_norm": 1.4363751411437988, + "learning_rate": 0.00023680513038695818, + "loss": 4.0179071044921875, + "step": 60900 + }, + { + "epoch": 0.6329967727541586, + "grad_norm": 1.6399027109146118, + "learning_rate": 0.00023670136042421158, + "loss": 3.911060485839844, + "step": 61000 + }, + { + "epoch": 0.6340344723816245, + "grad_norm": 2.371727228164673, + "learning_rate": 0.000236597590461465, + "loss": 3.9237380981445313, + "step": 61100 + }, + { + "epoch": 0.6350721720090903, + "grad_norm": 1.6354718208312988, + "learning_rate": 0.00023649382049871842, + "loss": 4.036581420898438, + "step": 61200 + }, + { + "epoch": 0.6361098716365561, + "grad_norm": 3.147254705429077, + "learning_rate": 0.00023639005053597184, + "loss": 4.009747619628906, + "step": 61300 + }, + { + "epoch": 0.6371475712640219, + "grad_norm": 2.9439003467559814, + "learning_rate": 0.00023628628057322526, + "loss": 3.965068664550781, + "step": 61400 + }, + { + "epoch": 0.6381852708914878, + "grad_norm": 2.8980836868286133, + "learning_rate": 0.00023618251061047866, + "loss": 3.99951171875, + "step": 61500 + }, + { + "epoch": 0.6392229705189536, + "grad_norm": 2.862438201904297, + "learning_rate": 0.00023607874064773208, + "loss": 3.8896145629882812, + "step": 61600 + }, + { + "epoch": 0.6402606701464194, + "grad_norm": 1.7125756740570068, + "learning_rate": 0.00023597497068498548, + "loss": 3.9900253295898436, + "step": 61700 + }, + { + "epoch": 0.6412983697738852, + "grad_norm": 13.891119956970215, + "learning_rate": 0.00023587120072223892, + "loss": 3.8787249755859374, + "step": 61800 + }, + { + "epoch": 0.6423360694013511, + "grad_norm": 3.5258827209472656, + "learning_rate": 0.00023576743075949235, + "loss": 3.940326843261719, + "step": 61900 + }, + { + "epoch": 0.6433737690288169, + "grad_norm": 4.297271251678467, + "learning_rate": 0.00023566366079674574, + "loss": 3.8732571411132812, + "step": 62000 + }, + { + "epoch": 0.6444114686562827, + "grad_norm": 3.574477195739746, + "learning_rate": 0.00023555989083399916, + "loss": 4.078603515625, + "step": 62100 + }, + { + "epoch": 0.6454491682837485, + "grad_norm": 3.2514758110046387, + "learning_rate": 0.00023545612087125259, + "loss": 3.956298522949219, + "step": 62200 + }, + { + "epoch": 0.6464868679112145, + "grad_norm": 2.582719326019287, + "learning_rate": 0.000235352350908506, + "loss": 3.8729116821289065, + "step": 62300 + }, + { + "epoch": 0.6475245675386803, + "grad_norm": 2.445774793624878, + "learning_rate": 0.00023524858094575943, + "loss": 4.064724426269532, + "step": 62400 + }, + { + "epoch": 0.6485622671661461, + "grad_norm": 4.912772178649902, + "learning_rate": 0.00023514481098301282, + "loss": 4.02049560546875, + "step": 62500 + }, + { + "epoch": 0.6495999667936119, + "grad_norm": 3.490936040878296, + "learning_rate": 0.00023504104102026627, + "loss": 3.912366943359375, + "step": 62600 + }, + { + "epoch": 0.6506376664210778, + "grad_norm": 2.109618902206421, + "learning_rate": 0.00023493727105751967, + "loss": 3.963838806152344, + "step": 62700 + }, + { + "epoch": 0.6516753660485436, + "grad_norm": 12.706518173217773, + "learning_rate": 0.0002348335010947731, + "loss": 3.901888732910156, + "step": 62800 + }, + { + "epoch": 0.6527130656760094, + "grad_norm": 4.266041278839111, + "learning_rate": 0.00023472973113202648, + "loss": 3.902781982421875, + "step": 62900 + }, + { + "epoch": 0.6537507653034752, + "grad_norm": 3.4900457859039307, + "learning_rate": 0.0002346259611692799, + "loss": 3.8866873168945313, + "step": 63000 + }, + { + "epoch": 0.6547884649309411, + "grad_norm": 2.4276134967803955, + "learning_rate": 0.00023452219120653336, + "loss": 3.8234634399414062, + "step": 63100 + }, + { + "epoch": 0.6558261645584069, + "grad_norm": 2.8377914428710938, + "learning_rate": 0.00023441842124378675, + "loss": 3.836332092285156, + "step": 63200 + }, + { + "epoch": 0.6568638641858727, + "grad_norm": 6.935495853424072, + "learning_rate": 0.00023431465128104017, + "loss": 4.100373229980469, + "step": 63300 + }, + { + "epoch": 0.6579015638133386, + "grad_norm": 2.90283465385437, + "learning_rate": 0.00023421088131829357, + "loss": 3.9408758544921874, + "step": 63400 + }, + { + "epoch": 0.6589392634408044, + "grad_norm": 2.8002378940582275, + "learning_rate": 0.00023410711135554702, + "loss": 3.9959124755859374, + "step": 63500 + }, + { + "epoch": 0.6599769630682703, + "grad_norm": 6.091791152954102, + "learning_rate": 0.0002340033413928004, + "loss": 3.9287460327148436, + "step": 63600 + }, + { + "epoch": 0.661014662695736, + "grad_norm": 1.2786389589309692, + "learning_rate": 0.00023389957143005383, + "loss": 4.015799560546875, + "step": 63700 + }, + { + "epoch": 0.662052362323202, + "grad_norm": 1.4586912393569946, + "learning_rate": 0.00023379580146730726, + "loss": 3.89241455078125, + "step": 63800 + }, + { + "epoch": 0.6630900619506678, + "grad_norm": 2.502657890319824, + "learning_rate": 0.00023369203150456065, + "loss": 3.9217596435546875, + "step": 63900 + }, + { + "epoch": 0.6641277615781336, + "grad_norm": 3.8019394874572754, + "learning_rate": 0.0002335882615418141, + "loss": 3.91360595703125, + "step": 64000 + }, + { + "epoch": 0.6651654612055994, + "grad_norm": 1.5058764219284058, + "learning_rate": 0.0002334844915790675, + "loss": 4.059972839355469, + "step": 64100 + }, + { + "epoch": 0.6662031608330653, + "grad_norm": 2.416229248046875, + "learning_rate": 0.00023338072161632092, + "loss": 3.9887905883789063, + "step": 64200 + }, + { + "epoch": 0.6672408604605311, + "grad_norm": 1.8767884969711304, + "learning_rate": 0.00023327695165357434, + "loss": 3.8748153686523437, + "step": 64300 + }, + { + "epoch": 0.6682785600879969, + "grad_norm": 1.7000967264175415, + "learning_rate": 0.00023317318169082776, + "loss": 3.9118023681640626, + "step": 64400 + }, + { + "epoch": 0.6693162597154627, + "grad_norm": 4.796393394470215, + "learning_rate": 0.00023306941172808118, + "loss": 3.9076058959960935, + "step": 64500 + }, + { + "epoch": 0.6703539593429286, + "grad_norm": 3.117870807647705, + "learning_rate": 0.00023296564176533458, + "loss": 3.95484375, + "step": 64600 + }, + { + "epoch": 0.6713916589703944, + "grad_norm": 1.6787638664245605, + "learning_rate": 0.000232861871802588, + "loss": 3.858246154785156, + "step": 64700 + }, + { + "epoch": 0.6724293585978602, + "grad_norm": 5.671106815338135, + "learning_rate": 0.0002327581018398414, + "loss": 3.9156753540039064, + "step": 64800 + }, + { + "epoch": 0.673467058225326, + "grad_norm": 7.058924674987793, + "learning_rate": 0.00023265433187709484, + "loss": 3.8724734497070314, + "step": 64900 + }, + { + "epoch": 0.674504757852792, + "grad_norm": 4.8587422370910645, + "learning_rate": 0.00023255056191434826, + "loss": 3.958966064453125, + "step": 65000 + }, + { + "epoch": 0.6755424574802578, + "grad_norm": 2.546802520751953, + "learning_rate": 0.00023244679195160166, + "loss": 3.9913558959960938, + "step": 65100 + }, + { + "epoch": 0.6765801571077236, + "grad_norm": 1.8444024324417114, + "learning_rate": 0.00023234302198885508, + "loss": 4.089451293945313, + "step": 65200 + }, + { + "epoch": 0.6776178567351895, + "grad_norm": 1.5202494859695435, + "learning_rate": 0.0002322392520261085, + "loss": 3.83590576171875, + "step": 65300 + }, + { + "epoch": 0.6786555563626553, + "grad_norm": 2.554324150085449, + "learning_rate": 0.00023213548206336192, + "loss": 3.9957940673828123, + "step": 65400 + }, + { + "epoch": 0.6796932559901211, + "grad_norm": 1.6007890701293945, + "learning_rate": 0.00023203171210061532, + "loss": 3.9022012329101563, + "step": 65500 + }, + { + "epoch": 0.6807309556175869, + "grad_norm": 2.593081474304199, + "learning_rate": 0.00023192794213786874, + "loss": 3.944790954589844, + "step": 65600 + }, + { + "epoch": 0.6817686552450528, + "grad_norm": 2.1474156379699707, + "learning_rate": 0.0002318241721751222, + "loss": 3.78737060546875, + "step": 65700 + }, + { + "epoch": 0.6828063548725186, + "grad_norm": 3.1960246562957764, + "learning_rate": 0.00023172040221237559, + "loss": 3.9783554077148438, + "step": 65800 + }, + { + "epoch": 0.6838440544999844, + "grad_norm": 3.8228328227996826, + "learning_rate": 0.000231616632249629, + "loss": 3.856565246582031, + "step": 65900 + }, + { + "epoch": 0.6848817541274502, + "grad_norm": 11.939492225646973, + "learning_rate": 0.0002315128622868824, + "loss": 3.8156298828125, + "step": 66000 + }, + { + "epoch": 0.6859194537549161, + "grad_norm": 1.8741025924682617, + "learning_rate": 0.00023140909232413582, + "loss": 3.9566534423828124, + "step": 66100 + }, + { + "epoch": 0.686957153382382, + "grad_norm": 1.682139277458191, + "learning_rate": 0.00023130532236138927, + "loss": 3.9164004516601563, + "step": 66200 + }, + { + "epoch": 0.6879948530098478, + "grad_norm": 1.1901954412460327, + "learning_rate": 0.00023120155239864267, + "loss": 4.0331982421875, + "step": 66300 + }, + { + "epoch": 0.6890325526373136, + "grad_norm": 2.2226786613464355, + "learning_rate": 0.0002310977824358961, + "loss": 3.901326904296875, + "step": 66400 + }, + { + "epoch": 0.6900702522647795, + "grad_norm": 2.28139328956604, + "learning_rate": 0.00023099401247314948, + "loss": 3.734437255859375, + "step": 66500 + }, + { + "epoch": 0.6911079518922453, + "grad_norm": 3.9518322944641113, + "learning_rate": 0.00023089024251040293, + "loss": 3.890718994140625, + "step": 66600 + }, + { + "epoch": 0.6921456515197111, + "grad_norm": 4.689309120178223, + "learning_rate": 0.00023078647254765633, + "loss": 3.83462646484375, + "step": 66700 + }, + { + "epoch": 0.6931833511471769, + "grad_norm": 2.5103607177734375, + "learning_rate": 0.00023068270258490975, + "loss": 3.8714788818359374, + "step": 66800 + }, + { + "epoch": 0.6942210507746428, + "grad_norm": 2.060398578643799, + "learning_rate": 0.00023057893262216317, + "loss": 3.8463949584960937, + "step": 66900 + }, + { + "epoch": 0.6952587504021086, + "grad_norm": 3.9058265686035156, + "learning_rate": 0.00023047516265941657, + "loss": 3.955802001953125, + "step": 67000 + }, + { + "epoch": 0.6962964500295744, + "grad_norm": 2.7018091678619385, + "learning_rate": 0.00023037139269667002, + "loss": 4.010853271484375, + "step": 67100 + }, + { + "epoch": 0.6973341496570403, + "grad_norm": 1.759364366531372, + "learning_rate": 0.0002302676227339234, + "loss": 3.8436270141601563, + "step": 67200 + }, + { + "epoch": 0.6983718492845061, + "grad_norm": 4.264219284057617, + "learning_rate": 0.00023016385277117683, + "loss": 3.906452941894531, + "step": 67300 + }, + { + "epoch": 0.6994095489119719, + "grad_norm": 2.064502000808716, + "learning_rate": 0.00023006008280843023, + "loss": 3.9249755859375, + "step": 67400 + }, + { + "epoch": 0.7004472485394377, + "grad_norm": 4.326413154602051, + "learning_rate": 0.00022995631284568368, + "loss": 3.9763421630859375, + "step": 67500 + }, + { + "epoch": 0.7014849481669037, + "grad_norm": 1.5424126386642456, + "learning_rate": 0.0002298525428829371, + "loss": 3.9105490112304686, + "step": 67600 + }, + { + "epoch": 0.7025226477943695, + "grad_norm": 3.1067123413085938, + "learning_rate": 0.0002297487729201905, + "loss": 4.066288146972656, + "step": 67700 + }, + { + "epoch": 0.7035603474218353, + "grad_norm": 1.3455185890197754, + "learning_rate": 0.00022964500295744392, + "loss": 3.906605224609375, + "step": 67800 + }, + { + "epoch": 0.7045980470493011, + "grad_norm": 4.567904472351074, + "learning_rate": 0.0002295412329946973, + "loss": 3.8274655151367187, + "step": 67900 + }, + { + "epoch": 0.705635746676767, + "grad_norm": 1.4911061525344849, + "learning_rate": 0.00022943746303195076, + "loss": 3.8712289428710935, + "step": 68000 + }, + { + "epoch": 0.7066734463042328, + "grad_norm": 1.8636422157287598, + "learning_rate": 0.00022933369306920418, + "loss": 3.9435845947265626, + "step": 68100 + }, + { + "epoch": 0.7077111459316986, + "grad_norm": 4.616937637329102, + "learning_rate": 0.00022922992310645758, + "loss": 4.073515319824219, + "step": 68200 + }, + { + "epoch": 0.7087488455591644, + "grad_norm": 2.339660167694092, + "learning_rate": 0.000229126153143711, + "loss": 3.752909851074219, + "step": 68300 + }, + { + "epoch": 0.7097865451866303, + "grad_norm": 2.2960572242736816, + "learning_rate": 0.00022902238318096442, + "loss": 3.841389465332031, + "step": 68400 + }, + { + "epoch": 0.7108242448140961, + "grad_norm": 1.9303183555603027, + "learning_rate": 0.00022891861321821784, + "loss": 4.007230529785156, + "step": 68500 + }, + { + "epoch": 0.7118619444415619, + "grad_norm": 3.3750216960906982, + "learning_rate": 0.00022881484325547124, + "loss": 4.0530221557617185, + "step": 68600 + }, + { + "epoch": 0.7128996440690277, + "grad_norm": 3.9443397521972656, + "learning_rate": 0.00022871107329272466, + "loss": 3.92802734375, + "step": 68700 + }, + { + "epoch": 0.7139373436964936, + "grad_norm": 2.2526562213897705, + "learning_rate": 0.0002286073033299781, + "loss": 4.117896728515625, + "step": 68800 + }, + { + "epoch": 0.7149750433239594, + "grad_norm": 3.631329298019409, + "learning_rate": 0.0002285035333672315, + "loss": 3.876401062011719, + "step": 68900 + }, + { + "epoch": 0.7160127429514253, + "grad_norm": 2.0594444274902344, + "learning_rate": 0.00022839976340448492, + "loss": 3.9595294189453125, + "step": 69000 + }, + { + "epoch": 0.7170504425788912, + "grad_norm": 6.801323413848877, + "learning_rate": 0.00022829599344173832, + "loss": 3.966697998046875, + "step": 69100 + }, + { + "epoch": 0.718088142206357, + "grad_norm": 3.579699754714966, + "learning_rate": 0.00022819222347899174, + "loss": 3.9083868408203126, + "step": 69200 + }, + { + "epoch": 0.7191258418338228, + "grad_norm": 3.9111030101776123, + "learning_rate": 0.0002280884535162452, + "loss": 4.020595092773437, + "step": 69300 + }, + { + "epoch": 0.7201635414612886, + "grad_norm": 1.5465009212493896, + "learning_rate": 0.00022798468355349858, + "loss": 4.002583618164063, + "step": 69400 + }, + { + "epoch": 0.7212012410887545, + "grad_norm": 2.5977070331573486, + "learning_rate": 0.000227880913590752, + "loss": 3.82881591796875, + "step": 69500 + }, + { + "epoch": 0.7222389407162203, + "grad_norm": 3.807143211364746, + "learning_rate": 0.0002277771436280054, + "loss": 3.8127020263671874, + "step": 69600 + }, + { + "epoch": 0.7232766403436861, + "grad_norm": 3.562692165374756, + "learning_rate": 0.00022767337366525885, + "loss": 3.861103820800781, + "step": 69700 + }, + { + "epoch": 0.7243143399711519, + "grad_norm": 4.136765003204346, + "learning_rate": 0.00022756960370251225, + "loss": 3.817465515136719, + "step": 69800 + }, + { + "epoch": 0.7253520395986178, + "grad_norm": 1.9534144401550293, + "learning_rate": 0.00022746583373976567, + "loss": 3.784884338378906, + "step": 69900 + }, + { + "epoch": 0.7263897392260836, + "grad_norm": 2.2738490104675293, + "learning_rate": 0.0002273620637770191, + "loss": 3.9553741455078124, + "step": 70000 + }, + { + "epoch": 0.7274274388535494, + "grad_norm": 8.41178035736084, + "learning_rate": 0.00022725829381427248, + "loss": 3.9581622314453124, + "step": 70100 + }, + { + "epoch": 0.7284651384810152, + "grad_norm": 2.574738025665283, + "learning_rate": 0.00022715452385152593, + "loss": 3.865647888183594, + "step": 70200 + }, + { + "epoch": 0.7295028381084812, + "grad_norm": 4.12198543548584, + "learning_rate": 0.00022705075388877933, + "loss": 3.8447744750976565, + "step": 70300 + }, + { + "epoch": 0.730540537735947, + "grad_norm": 3.4615478515625, + "learning_rate": 0.00022694698392603275, + "loss": 3.8417919921875, + "step": 70400 + }, + { + "epoch": 0.7315782373634128, + "grad_norm": 1.9662399291992188, + "learning_rate": 0.00022684321396328614, + "loss": 3.943636779785156, + "step": 70500 + }, + { + "epoch": 0.7326159369908786, + "grad_norm": 6.054515361785889, + "learning_rate": 0.0002267394440005396, + "loss": 3.9477130126953126, + "step": 70600 + }, + { + "epoch": 0.7336536366183445, + "grad_norm": 2.6368846893310547, + "learning_rate": 0.00022663567403779302, + "loss": 3.9134860229492188, + "step": 70700 + }, + { + "epoch": 0.7346913362458103, + "grad_norm": 18.437114715576172, + "learning_rate": 0.0002265319040750464, + "loss": 3.9025979614257813, + "step": 70800 + }, + { + "epoch": 0.7357290358732761, + "grad_norm": 3.9227664470672607, + "learning_rate": 0.00022642813411229983, + "loss": 3.9925546264648437, + "step": 70900 + }, + { + "epoch": 0.736766735500742, + "grad_norm": 2.9096601009368896, + "learning_rate": 0.00022632436414955323, + "loss": 3.7520477294921877, + "step": 71000 + }, + { + "epoch": 0.7378044351282078, + "grad_norm": 2.756199598312378, + "learning_rate": 0.00022622059418680668, + "loss": 3.7744400024414064, + "step": 71100 + }, + { + "epoch": 0.7388421347556736, + "grad_norm": 4.398651123046875, + "learning_rate": 0.0002261168242240601, + "loss": 3.8754537963867186, + "step": 71200 + }, + { + "epoch": 0.7398798343831394, + "grad_norm": 3.0455260276794434, + "learning_rate": 0.0002260130542613135, + "loss": 3.8303518676757813, + "step": 71300 + }, + { + "epoch": 0.7409175340106053, + "grad_norm": 1.6435341835021973, + "learning_rate": 0.00022590928429856692, + "loss": 3.868741149902344, + "step": 71400 + }, + { + "epoch": 0.7419552336380711, + "grad_norm": 2.460381507873535, + "learning_rate": 0.00022580551433582034, + "loss": 3.971143798828125, + "step": 71500 + }, + { + "epoch": 0.742992933265537, + "grad_norm": 3.793260335922241, + "learning_rate": 0.00022570174437307376, + "loss": 3.9564599609375, + "step": 71600 + }, + { + "epoch": 0.7440306328930028, + "grad_norm": 2.2400221824645996, + "learning_rate": 0.00022559797441032715, + "loss": 3.868074951171875, + "step": 71700 + }, + { + "epoch": 0.7450683325204687, + "grad_norm": 4.521097660064697, + "learning_rate": 0.00022549420444758058, + "loss": 3.9104345703125, + "step": 71800 + }, + { + "epoch": 0.7461060321479345, + "grad_norm": 2.454610824584961, + "learning_rate": 0.00022539043448483402, + "loss": 3.8415142822265627, + "step": 71900 + }, + { + "epoch": 0.7471437317754003, + "grad_norm": 1.7384246587753296, + "learning_rate": 0.00022528666452208742, + "loss": 3.9767572021484376, + "step": 72000 + }, + { + "epoch": 0.7481814314028661, + "grad_norm": 2.3506603240966797, + "learning_rate": 0.00022518289455934084, + "loss": 3.804529724121094, + "step": 72100 + }, + { + "epoch": 0.749219131030332, + "grad_norm": 8.719681739807129, + "learning_rate": 0.00022507912459659424, + "loss": 3.6692437744140625, + "step": 72200 + }, + { + "epoch": 0.7502568306577978, + "grad_norm": 2.188565254211426, + "learning_rate": 0.00022497535463384766, + "loss": 3.9966400146484373, + "step": 72300 + }, + { + "epoch": 0.7512945302852636, + "grad_norm": 2.7061383724212646, + "learning_rate": 0.00022487158467110108, + "loss": 3.7955560302734375, + "step": 72400 + }, + { + "epoch": 0.7523322299127294, + "grad_norm": 1.820816993713379, + "learning_rate": 0.0002247678147083545, + "loss": 3.800717468261719, + "step": 72500 + }, + { + "epoch": 0.7533699295401953, + "grad_norm": 2.3510568141937256, + "learning_rate": 0.00022466404474560792, + "loss": 3.8987237548828126, + "step": 72600 + }, + { + "epoch": 0.7544076291676611, + "grad_norm": 3.0852279663085938, + "learning_rate": 0.00022456027478286132, + "loss": 3.9560122680664063, + "step": 72700 + }, + { + "epoch": 0.7554453287951269, + "grad_norm": 2.3377742767333984, + "learning_rate": 0.00022445650482011477, + "loss": 3.9077328491210936, + "step": 72800 + }, + { + "epoch": 0.7564830284225929, + "grad_norm": 4.257030010223389, + "learning_rate": 0.00022435273485736816, + "loss": 3.915125732421875, + "step": 72900 + }, + { + "epoch": 0.7575207280500587, + "grad_norm": 1.8238855600357056, + "learning_rate": 0.00022424896489462158, + "loss": 3.8456768798828125, + "step": 73000 + }, + { + "epoch": 0.7585584276775245, + "grad_norm": 2.2102901935577393, + "learning_rate": 0.000224145194931875, + "loss": 3.9905462646484375, + "step": 73100 + }, + { + "epoch": 0.7595961273049903, + "grad_norm": 6.003772735595703, + "learning_rate": 0.0002240414249691284, + "loss": 3.831954040527344, + "step": 73200 + }, + { + "epoch": 0.7606338269324562, + "grad_norm": 2.209681272506714, + "learning_rate": 0.00022393765500638185, + "loss": 3.96739990234375, + "step": 73300 + }, + { + "epoch": 0.761671526559922, + "grad_norm": 5.8811235427856445, + "learning_rate": 0.00022383388504363525, + "loss": 3.8418869018554687, + "step": 73400 + }, + { + "epoch": 0.7627092261873878, + "grad_norm": 1.9358527660369873, + "learning_rate": 0.00022373011508088867, + "loss": 3.9846435546875, + "step": 73500 + }, + { + "epoch": 0.7637469258148536, + "grad_norm": 4.668230056762695, + "learning_rate": 0.00022362634511814206, + "loss": 3.87702880859375, + "step": 73600 + }, + { + "epoch": 0.7647846254423195, + "grad_norm": 2.1674551963806152, + "learning_rate": 0.0002235225751553955, + "loss": 3.9948715209960937, + "step": 73700 + }, + { + "epoch": 0.7658223250697853, + "grad_norm": 3.276775360107422, + "learning_rate": 0.00022341880519264893, + "loss": 3.876432189941406, + "step": 73800 + }, + { + "epoch": 0.7668600246972511, + "grad_norm": 2.382432222366333, + "learning_rate": 0.00022331503522990233, + "loss": 3.9535626220703124, + "step": 73900 + }, + { + "epoch": 0.7678977243247169, + "grad_norm": 2.288184404373169, + "learning_rate": 0.00022321126526715575, + "loss": 3.962213134765625, + "step": 74000 + }, + { + "epoch": 0.7689354239521828, + "grad_norm": 11.535764694213867, + "learning_rate": 0.00022310749530440914, + "loss": 3.839007568359375, + "step": 74100 + }, + { + "epoch": 0.7699731235796486, + "grad_norm": 2.520615816116333, + "learning_rate": 0.0002230037253416626, + "loss": 3.942041015625, + "step": 74200 + }, + { + "epoch": 0.7710108232071144, + "grad_norm": 5.035190582275391, + "learning_rate": 0.000222899955378916, + "loss": 3.827362365722656, + "step": 74300 + }, + { + "epoch": 0.7720485228345803, + "grad_norm": 2.1133370399475098, + "learning_rate": 0.0002227961854161694, + "loss": 3.8085946655273437, + "step": 74400 + }, + { + "epoch": 0.7730862224620462, + "grad_norm": 3.3813223838806152, + "learning_rate": 0.00022269241545342283, + "loss": 3.8528924560546876, + "step": 74500 + }, + { + "epoch": 0.774123922089512, + "grad_norm": 2.5912599563598633, + "learning_rate": 0.00022258864549067625, + "loss": 4.025367126464844, + "step": 74600 + }, + { + "epoch": 0.7751616217169778, + "grad_norm": 8.560553550720215, + "learning_rate": 0.00022248487552792968, + "loss": 3.8942611694335936, + "step": 74700 + }, + { + "epoch": 0.7761993213444436, + "grad_norm": 2.7210657596588135, + "learning_rate": 0.00022238110556518307, + "loss": 3.7450421142578123, + "step": 74800 + }, + { + "epoch": 0.7772370209719095, + "grad_norm": 3.06449031829834, + "learning_rate": 0.0002222773356024365, + "loss": 4.058497619628906, + "step": 74900 + }, + { + "epoch": 0.7782747205993753, + "grad_norm": 2.6780056953430176, + "learning_rate": 0.00022217356563968994, + "loss": 3.908025207519531, + "step": 75000 + }, + { + "epoch": 0.7793124202268411, + "grad_norm": 2.579087257385254, + "learning_rate": 0.00022206979567694334, + "loss": 3.914963684082031, + "step": 75100 + }, + { + "epoch": 0.780350119854307, + "grad_norm": 6.844696998596191, + "learning_rate": 0.00022196602571419676, + "loss": 3.8832046508789064, + "step": 75200 + }, + { + "epoch": 0.7813878194817728, + "grad_norm": 7.694204330444336, + "learning_rate": 0.00022186225575145015, + "loss": 3.9718392944335936, + "step": 75300 + }, + { + "epoch": 0.7824255191092386, + "grad_norm": 9.200462341308594, + "learning_rate": 0.00022175848578870358, + "loss": 3.859333801269531, + "step": 75400 + }, + { + "epoch": 0.7834632187367044, + "grad_norm": 4.622501850128174, + "learning_rate": 0.000221654715825957, + "loss": 3.9099847412109376, + "step": 75500 + }, + { + "epoch": 0.7845009183641704, + "grad_norm": 1.9592938423156738, + "learning_rate": 0.00022155094586321042, + "loss": 3.8727886962890623, + "step": 75600 + }, + { + "epoch": 0.7855386179916362, + "grad_norm": 4.431970119476318, + "learning_rate": 0.00022144717590046384, + "loss": 3.9126931762695314, + "step": 75700 + }, + { + "epoch": 0.786576317619102, + "grad_norm": 4.069213390350342, + "learning_rate": 0.00022134340593771724, + "loss": 3.8846563720703124, + "step": 75800 + }, + { + "epoch": 0.7876140172465678, + "grad_norm": 2.009706497192383, + "learning_rate": 0.00022123963597497068, + "loss": 3.951784362792969, + "step": 75900 + }, + { + "epoch": 0.7886517168740337, + "grad_norm": 3.475999116897583, + "learning_rate": 0.00022113586601222408, + "loss": 3.8493191528320314, + "step": 76000 + }, + { + "epoch": 0.7896894165014995, + "grad_norm": 2.45090913772583, + "learning_rate": 0.0002210320960494775, + "loss": 3.938821105957031, + "step": 76100 + }, + { + "epoch": 0.7907271161289653, + "grad_norm": 3.2572762966156006, + "learning_rate": 0.0002209283260867309, + "loss": 3.8848175048828124, + "step": 76200 + }, + { + "epoch": 0.7917648157564311, + "grad_norm": 2.2695441246032715, + "learning_rate": 0.00022082455612398432, + "loss": 3.8166204833984376, + "step": 76300 + }, + { + "epoch": 0.792802515383897, + "grad_norm": 6.520568370819092, + "learning_rate": 0.00022072078616123777, + "loss": 3.8947482299804688, + "step": 76400 + }, + { + "epoch": 0.7938402150113628, + "grad_norm": 9.233070373535156, + "learning_rate": 0.00022061701619849116, + "loss": 3.8395782470703126, + "step": 76500 + }, + { + "epoch": 0.7948779146388286, + "grad_norm": 1.5229090452194214, + "learning_rate": 0.00022051324623574458, + "loss": 3.979128723144531, + "step": 76600 + }, + { + "epoch": 0.7959156142662944, + "grad_norm": 3.9737226963043213, + "learning_rate": 0.00022040947627299798, + "loss": 3.890586242675781, + "step": 76700 + }, + { + "epoch": 0.7969533138937603, + "grad_norm": 1.9717073440551758, + "learning_rate": 0.00022030570631025143, + "loss": 3.971199951171875, + "step": 76800 + }, + { + "epoch": 0.7979910135212261, + "grad_norm": 3.3416688442230225, + "learning_rate": 0.00022020193634750485, + "loss": 3.961914367675781, + "step": 76900 + }, + { + "epoch": 0.799028713148692, + "grad_norm": 2.037693738937378, + "learning_rate": 0.00022009816638475824, + "loss": 3.8637881469726563, + "step": 77000 + }, + { + "epoch": 0.8000664127761579, + "grad_norm": 5.026768207550049, + "learning_rate": 0.00021999439642201167, + "loss": 3.9692828369140627, + "step": 77100 + }, + { + "epoch": 0.8011041124036237, + "grad_norm": 2.230590581893921, + "learning_rate": 0.00021989062645926506, + "loss": 3.852244873046875, + "step": 77200 + }, + { + "epoch": 0.8021418120310895, + "grad_norm": 2.0119717121124268, + "learning_rate": 0.0002197868564965185, + "loss": 3.9774188232421874, + "step": 77300 + }, + { + "epoch": 0.8031795116585553, + "grad_norm": 5.08432674407959, + "learning_rate": 0.0002196830865337719, + "loss": 3.8257907104492186, + "step": 77400 + }, + { + "epoch": 0.8042172112860212, + "grad_norm": 3.0086820125579834, + "learning_rate": 0.00021957931657102533, + "loss": 3.865489501953125, + "step": 77500 + }, + { + "epoch": 0.805254910913487, + "grad_norm": 4.534199237823486, + "learning_rate": 0.00021947554660827875, + "loss": 3.875529479980469, + "step": 77600 + }, + { + "epoch": 0.8062926105409528, + "grad_norm": 2.68324613571167, + "learning_rate": 0.00021937177664553217, + "loss": 3.928450927734375, + "step": 77700 + }, + { + "epoch": 0.8073303101684186, + "grad_norm": 3.7302651405334473, + "learning_rate": 0.0002192680066827856, + "loss": 3.9593939208984374, + "step": 77800 + }, + { + "epoch": 0.8083680097958845, + "grad_norm": 2.8160176277160645, + "learning_rate": 0.000219164236720039, + "loss": 4.003828735351562, + "step": 77900 + }, + { + "epoch": 0.8094057094233503, + "grad_norm": 2.314183473587036, + "learning_rate": 0.0002190604667572924, + "loss": 3.988243408203125, + "step": 78000 + }, + { + "epoch": 0.8104434090508161, + "grad_norm": 2.661289691925049, + "learning_rate": 0.0002189566967945458, + "loss": 3.9358248901367188, + "step": 78100 + }, + { + "epoch": 0.8114811086782819, + "grad_norm": 5.065707206726074, + "learning_rate": 0.00021885292683179925, + "loss": 3.7886788940429685, + "step": 78200 + }, + { + "epoch": 0.8125188083057479, + "grad_norm": 5.173181056976318, + "learning_rate": 0.00021874915686905268, + "loss": 3.790332946777344, + "step": 78300 + }, + { + "epoch": 0.8135565079332137, + "grad_norm": 2.573274850845337, + "learning_rate": 0.00021864538690630607, + "loss": 3.975767822265625, + "step": 78400 + }, + { + "epoch": 0.8145942075606795, + "grad_norm": 3.010472536087036, + "learning_rate": 0.0002185416169435595, + "loss": 3.861507568359375, + "step": 78500 + }, + { + "epoch": 0.8156319071881453, + "grad_norm": 2.632009983062744, + "learning_rate": 0.00021843784698081291, + "loss": 3.9550189208984374, + "step": 78600 + }, + { + "epoch": 0.8166696068156112, + "grad_norm": 5.590510368347168, + "learning_rate": 0.00021833407701806634, + "loss": 3.924696044921875, + "step": 78700 + }, + { + "epoch": 0.817707306443077, + "grad_norm": 4.052700042724609, + "learning_rate": 0.00021823030705531976, + "loss": 3.831592712402344, + "step": 78800 + }, + { + "epoch": 0.8187450060705428, + "grad_norm": 2.7363622188568115, + "learning_rate": 0.00021812653709257315, + "loss": 4.028314208984375, + "step": 78900 + }, + { + "epoch": 0.8197827056980087, + "grad_norm": 4.773056507110596, + "learning_rate": 0.0002180227671298266, + "loss": 3.7855130004882813, + "step": 79000 + }, + { + "epoch": 0.8208204053254745, + "grad_norm": 2.6858768463134766, + "learning_rate": 0.00021791899716708, + "loss": 3.9081768798828125, + "step": 79100 + }, + { + "epoch": 0.8218581049529403, + "grad_norm": 4.861189842224121, + "learning_rate": 0.00021781522720433342, + "loss": 3.99755126953125, + "step": 79200 + }, + { + "epoch": 0.8228958045804061, + "grad_norm": 2.1088833808898926, + "learning_rate": 0.00021771145724158681, + "loss": 3.8839871215820314, + "step": 79300 + }, + { + "epoch": 0.823933504207872, + "grad_norm": 2.911973237991333, + "learning_rate": 0.00021760768727884024, + "loss": 3.864557189941406, + "step": 79400 + }, + { + "epoch": 0.8249712038353378, + "grad_norm": 6.847414016723633, + "learning_rate": 0.00021750391731609368, + "loss": 3.868388366699219, + "step": 79500 + }, + { + "epoch": 0.8260089034628036, + "grad_norm": 2.0376992225646973, + "learning_rate": 0.00021740014735334708, + "loss": 3.9390859985351563, + "step": 79600 + }, + { + "epoch": 0.8270466030902694, + "grad_norm": 4.972707271575928, + "learning_rate": 0.0002172963773906005, + "loss": 3.8582077026367188, + "step": 79700 + }, + { + "epoch": 0.8280843027177354, + "grad_norm": 7.205460071563721, + "learning_rate": 0.0002171926074278539, + "loss": 3.839405212402344, + "step": 79800 + }, + { + "epoch": 0.8291220023452012, + "grad_norm": 12.633910179138184, + "learning_rate": 0.00021708883746510735, + "loss": 3.831856384277344, + "step": 79900 + }, + { + "epoch": 0.830159701972667, + "grad_norm": 4.479480743408203, + "learning_rate": 0.00021698506750236074, + "loss": 3.795959167480469, + "step": 80000 + }, + { + "epoch": 0.8311974016001328, + "grad_norm": 4.281702995300293, + "learning_rate": 0.00021688129753961416, + "loss": 4.039653625488281, + "step": 80100 + }, + { + "epoch": 0.8322351012275987, + "grad_norm": 3.5497429370880127, + "learning_rate": 0.00021677752757686758, + "loss": 4.000389709472656, + "step": 80200 + }, + { + "epoch": 0.8332728008550645, + "grad_norm": 2.431144952774048, + "learning_rate": 0.00021667375761412098, + "loss": 3.9792193603515624, + "step": 80300 + }, + { + "epoch": 0.8343105004825303, + "grad_norm": 13.734992980957031, + "learning_rate": 0.00021656998765137443, + "loss": 3.8038821411132813, + "step": 80400 + }, + { + "epoch": 0.8353482001099961, + "grad_norm": 1.6895164251327515, + "learning_rate": 0.00021646621768862782, + "loss": 3.7827383422851564, + "step": 80500 + }, + { + "epoch": 0.836385899737462, + "grad_norm": 3.4907968044281006, + "learning_rate": 0.00021636244772588124, + "loss": 3.882090759277344, + "step": 80600 + }, + { + "epoch": 0.8374235993649278, + "grad_norm": 2.345144510269165, + "learning_rate": 0.0002162586777631347, + "loss": 3.8400167846679687, + "step": 80700 + }, + { + "epoch": 0.8384612989923936, + "grad_norm": 3.4369494915008545, + "learning_rate": 0.0002161549078003881, + "loss": 3.7776190185546876, + "step": 80800 + }, + { + "epoch": 0.8394989986198595, + "grad_norm": 5.47845983505249, + "learning_rate": 0.0002160511378376415, + "loss": 4.044588623046875, + "step": 80900 + }, + { + "epoch": 0.8405366982473254, + "grad_norm": 1.5931683778762817, + "learning_rate": 0.0002159473678748949, + "loss": 3.9998703002929688, + "step": 81000 + }, + { + "epoch": 0.8415743978747912, + "grad_norm": 3.1940066814422607, + "learning_rate": 0.00021584359791214833, + "loss": 3.8839016723632813, + "step": 81100 + }, + { + "epoch": 0.842612097502257, + "grad_norm": 9.511052131652832, + "learning_rate": 0.00021573982794940172, + "loss": 3.9398565673828125, + "step": 81200 + }, + { + "epoch": 0.8436497971297229, + "grad_norm": 1.9886616468429565, + "learning_rate": 0.00021563605798665517, + "loss": 3.82979736328125, + "step": 81300 + }, + { + "epoch": 0.8446874967571887, + "grad_norm": 2.362103223800659, + "learning_rate": 0.0002155322880239086, + "loss": 3.8248995971679687, + "step": 81400 + }, + { + "epoch": 0.8457251963846545, + "grad_norm": 1.7605165243148804, + "learning_rate": 0.000215428518061162, + "loss": 3.7230010986328126, + "step": 81500 + }, + { + "epoch": 0.8467628960121203, + "grad_norm": 1.8303929567337036, + "learning_rate": 0.0002153247480984154, + "loss": 3.861679992675781, + "step": 81600 + }, + { + "epoch": 0.8478005956395862, + "grad_norm": 4.539703845977783, + "learning_rate": 0.00021522097813566883, + "loss": 3.8151321411132812, + "step": 81700 + }, + { + "epoch": 0.848838295267052, + "grad_norm": 1.8927255868911743, + "learning_rate": 0.00021511720817292225, + "loss": 3.999220886230469, + "step": 81800 + }, + { + "epoch": 0.8498759948945178, + "grad_norm": 3.66632080078125, + "learning_rate": 0.00021501343821017565, + "loss": 3.9149603271484374, + "step": 81900 + }, + { + "epoch": 0.8509136945219836, + "grad_norm": 6.1261887550354, + "learning_rate": 0.00021490966824742907, + "loss": 3.808494873046875, + "step": 82000 + }, + { + "epoch": 0.8519513941494495, + "grad_norm": 2.9073901176452637, + "learning_rate": 0.00021480589828468252, + "loss": 3.8501129150390625, + "step": 82100 + }, + { + "epoch": 0.8529890937769153, + "grad_norm": 1.9176596403121948, + "learning_rate": 0.00021470212832193591, + "loss": 3.9358505249023437, + "step": 82200 + }, + { + "epoch": 0.8540267934043811, + "grad_norm": 2.3072047233581543, + "learning_rate": 0.00021459835835918934, + "loss": 3.8934945678710937, + "step": 82300 + }, + { + "epoch": 0.855064493031847, + "grad_norm": 2.7599945068359375, + "learning_rate": 0.00021449458839644273, + "loss": 3.929814453125, + "step": 82400 + }, + { + "epoch": 0.8561021926593129, + "grad_norm": 2.0721237659454346, + "learning_rate": 0.00021439081843369615, + "loss": 3.86040283203125, + "step": 82500 + }, + { + "epoch": 0.8571398922867787, + "grad_norm": 5.156016826629639, + "learning_rate": 0.0002142870484709496, + "loss": 3.8864166259765627, + "step": 82600 + }, + { + "epoch": 0.8581775919142445, + "grad_norm": 4.168294906616211, + "learning_rate": 0.000214183278508203, + "loss": 4.001069030761719, + "step": 82700 + }, + { + "epoch": 0.8592152915417104, + "grad_norm": 1.7126719951629639, + "learning_rate": 0.00021407950854545642, + "loss": 3.946321716308594, + "step": 82800 + }, + { + "epoch": 0.8602529911691762, + "grad_norm": 5.809075355529785, + "learning_rate": 0.0002139757385827098, + "loss": 3.82521240234375, + "step": 82900 + }, + { + "epoch": 0.861290690796642, + "grad_norm": 5.8849921226501465, + "learning_rate": 0.00021387196861996326, + "loss": 3.7766848754882814, + "step": 83000 + }, + { + "epoch": 0.8623283904241078, + "grad_norm": 2.317793607711792, + "learning_rate": 0.00021376819865721666, + "loss": 4.01570068359375, + "step": 83100 + }, + { + "epoch": 0.8633660900515737, + "grad_norm": 19.14999008178711, + "learning_rate": 0.00021366442869447008, + "loss": 3.760934143066406, + "step": 83200 + }, + { + "epoch": 0.8644037896790395, + "grad_norm": 2.025818109512329, + "learning_rate": 0.0002135606587317235, + "loss": 3.9255300903320314, + "step": 83300 + }, + { + "epoch": 0.8654414893065053, + "grad_norm": 3.068112373352051, + "learning_rate": 0.0002134568887689769, + "loss": 3.821394348144531, + "step": 83400 + }, + { + "epoch": 0.8664791889339711, + "grad_norm": 8.730904579162598, + "learning_rate": 0.00021335311880623034, + "loss": 3.8662478637695314, + "step": 83500 + }, + { + "epoch": 0.867516888561437, + "grad_norm": 2.9956910610198975, + "learning_rate": 0.00021324934884348374, + "loss": 3.8266961669921873, + "step": 83600 + }, + { + "epoch": 0.8685545881889029, + "grad_norm": 2.774705410003662, + "learning_rate": 0.00021314557888073716, + "loss": 3.8334832763671876, + "step": 83700 + }, + { + "epoch": 0.8695922878163687, + "grad_norm": 1.9926444292068481, + "learning_rate": 0.00021304180891799056, + "loss": 3.973898620605469, + "step": 83800 + }, + { + "epoch": 0.8706299874438345, + "grad_norm": 1.8433290719985962, + "learning_rate": 0.000212938038955244, + "loss": 3.8273077392578125, + "step": 83900 + }, + { + "epoch": 0.8716676870713004, + "grad_norm": 5.3389410972595215, + "learning_rate": 0.00021283426899249743, + "loss": 3.8604061889648436, + "step": 84000 + }, + { + "epoch": 0.8727053866987662, + "grad_norm": 7.391428470611572, + "learning_rate": 0.00021273049902975082, + "loss": 3.8056671142578127, + "step": 84100 + }, + { + "epoch": 0.873743086326232, + "grad_norm": 5.367404937744141, + "learning_rate": 0.00021262672906700424, + "loss": 3.8744406127929687, + "step": 84200 + }, + { + "epoch": 0.8747807859536978, + "grad_norm": 3.1199004650115967, + "learning_rate": 0.00021252295910425764, + "loss": 3.8992080688476562, + "step": 84300 + }, + { + "epoch": 0.8758184855811637, + "grad_norm": 1.8603098392486572, + "learning_rate": 0.0002124191891415111, + "loss": 3.8311639404296876, + "step": 84400 + }, + { + "epoch": 0.8768561852086295, + "grad_norm": 2.5739691257476807, + "learning_rate": 0.0002123154191787645, + "loss": 3.754921569824219, + "step": 84500 + }, + { + "epoch": 0.8778938848360953, + "grad_norm": 3.090057134628296, + "learning_rate": 0.0002122116492160179, + "loss": 3.74908935546875, + "step": 84600 + }, + { + "epoch": 0.8789315844635612, + "grad_norm": 9.258840560913086, + "learning_rate": 0.00021210787925327133, + "loss": 3.985562744140625, + "step": 84700 + }, + { + "epoch": 0.879969284091027, + "grad_norm": 3.738255262374878, + "learning_rate": 0.00021200410929052475, + "loss": 3.9656732177734373, + "step": 84800 + }, + { + "epoch": 0.8810069837184928, + "grad_norm": 3.415017604827881, + "learning_rate": 0.00021190033932777817, + "loss": 3.958587341308594, + "step": 84900 + }, + { + "epoch": 0.8820446833459586, + "grad_norm": 6.633699893951416, + "learning_rate": 0.00021179656936503157, + "loss": 3.866285705566406, + "step": 85000 + }, + { + "epoch": 0.8830823829734246, + "grad_norm": 1.7935473918914795, + "learning_rate": 0.000211692799402285, + "loss": 3.9740695190429687, + "step": 85100 + }, + { + "epoch": 0.8841200826008904, + "grad_norm": 2.706197500228882, + "learning_rate": 0.00021158902943953844, + "loss": 3.8669891357421875, + "step": 85200 + }, + { + "epoch": 0.8851577822283562, + "grad_norm": 4.353029727935791, + "learning_rate": 0.00021148525947679183, + "loss": 3.881668701171875, + "step": 85300 + }, + { + "epoch": 0.886195481855822, + "grad_norm": 3.0080366134643555, + "learning_rate": 0.00021138148951404525, + "loss": 3.8229278564453124, + "step": 85400 + }, + { + "epoch": 0.8872331814832879, + "grad_norm": 7.4073028564453125, + "learning_rate": 0.00021127771955129865, + "loss": 4.015174560546875, + "step": 85500 + }, + { + "epoch": 0.8882708811107537, + "grad_norm": 4.174534320831299, + "learning_rate": 0.00021117394958855207, + "loss": 3.8184585571289062, + "step": 85600 + }, + { + "epoch": 0.8893085807382195, + "grad_norm": 5.683806896209717, + "learning_rate": 0.0002110701796258055, + "loss": 3.8243557739257814, + "step": 85700 + }, + { + "epoch": 0.8903462803656853, + "grad_norm": 2.076599597930908, + "learning_rate": 0.00021096640966305891, + "loss": 3.71376220703125, + "step": 85800 + }, + { + "epoch": 0.8913839799931512, + "grad_norm": 2.4622974395751953, + "learning_rate": 0.00021086263970031234, + "loss": 3.85018310546875, + "step": 85900 + }, + { + "epoch": 0.892421679620617, + "grad_norm": 2.3247082233428955, + "learning_rate": 0.00021075886973756573, + "loss": 3.9427032470703125, + "step": 86000 + }, + { + "epoch": 0.8934593792480828, + "grad_norm": 5.115243911743164, + "learning_rate": 0.00021065509977481918, + "loss": 3.6884475708007813, + "step": 86100 + }, + { + "epoch": 0.8944970788755486, + "grad_norm": 5.306711196899414, + "learning_rate": 0.00021055132981207257, + "loss": 3.8416738891601563, + "step": 86200 + }, + { + "epoch": 0.8955347785030146, + "grad_norm": 1.5796631574630737, + "learning_rate": 0.000210447559849326, + "loss": 3.874592590332031, + "step": 86300 + }, + { + "epoch": 0.8965724781304804, + "grad_norm": 1.6183887720108032, + "learning_rate": 0.00021034378988657942, + "loss": 3.840068054199219, + "step": 86400 + }, + { + "epoch": 0.8976101777579462, + "grad_norm": 3.1412158012390137, + "learning_rate": 0.0002102400199238328, + "loss": 4.0432958984375, + "step": 86500 + }, + { + "epoch": 0.898647877385412, + "grad_norm": 1.6547956466674805, + "learning_rate": 0.00021013624996108626, + "loss": 3.829620361328125, + "step": 86600 + }, + { + "epoch": 0.8996855770128779, + "grad_norm": 9.84925365447998, + "learning_rate": 0.00021003247999833966, + "loss": 3.74409912109375, + "step": 86700 + }, + { + "epoch": 0.9007232766403437, + "grad_norm": 4.718574523925781, + "learning_rate": 0.00020992871003559308, + "loss": 3.8265109252929688, + "step": 86800 + }, + { + "epoch": 0.9017609762678095, + "grad_norm": 4.692354679107666, + "learning_rate": 0.00020982494007284647, + "loss": 3.9203875732421873, + "step": 86900 + }, + { + "epoch": 0.9027986758952754, + "grad_norm": 3.620683431625366, + "learning_rate": 0.00020972117011009992, + "loss": 3.9122955322265627, + "step": 87000 + }, + { + "epoch": 0.9038363755227412, + "grad_norm": 4.431119918823242, + "learning_rate": 0.00020961740014735334, + "loss": 3.9402545166015623, + "step": 87100 + }, + { + "epoch": 0.904874075150207, + "grad_norm": 3.734344005584717, + "learning_rate": 0.00020951363018460674, + "loss": 3.8481884765625, + "step": 87200 + }, + { + "epoch": 0.9059117747776728, + "grad_norm": 3.735985279083252, + "learning_rate": 0.00020940986022186016, + "loss": 3.8412353515625, + "step": 87300 + }, + { + "epoch": 0.9069494744051387, + "grad_norm": 2.774721145629883, + "learning_rate": 0.00020930609025911356, + "loss": 3.76121337890625, + "step": 87400 + }, + { + "epoch": 0.9079871740326045, + "grad_norm": 13.096595764160156, + "learning_rate": 0.000209202320296367, + "loss": 3.9009844970703127, + "step": 87500 + }, + { + "epoch": 0.9090248736600703, + "grad_norm": 5.561835765838623, + "learning_rate": 0.0002090985503336204, + "loss": 3.7489013671875, + "step": 87600 + }, + { + "epoch": 0.9100625732875361, + "grad_norm": 5.21470832824707, + "learning_rate": 0.00020899478037087382, + "loss": 3.9491476440429687, + "step": 87700 + }, + { + "epoch": 0.9111002729150021, + "grad_norm": 3.611980438232422, + "learning_rate": 0.00020889101040812724, + "loss": 3.8744741821289064, + "step": 87800 + }, + { + "epoch": 0.9121379725424679, + "grad_norm": 3.670480489730835, + "learning_rate": 0.00020878724044538067, + "loss": 3.8484326171875, + "step": 87900 + }, + { + "epoch": 0.9131756721699337, + "grad_norm": 2.46195387840271, + "learning_rate": 0.0002086834704826341, + "loss": 3.8545870971679688, + "step": 88000 + }, + { + "epoch": 0.9142133717973995, + "grad_norm": 2.256782054901123, + "learning_rate": 0.00020857970051988748, + "loss": 3.788062744140625, + "step": 88100 + }, + { + "epoch": 0.9152510714248654, + "grad_norm": 1.5597251653671265, + "learning_rate": 0.0002084759305571409, + "loss": 3.8967153930664065, + "step": 88200 + }, + { + "epoch": 0.9162887710523312, + "grad_norm": 4.607747554779053, + "learning_rate": 0.00020837216059439435, + "loss": 3.84433837890625, + "step": 88300 + }, + { + "epoch": 0.917326470679797, + "grad_norm": 2.7213637828826904, + "learning_rate": 0.00020826839063164775, + "loss": 3.6432476806640626, + "step": 88400 + }, + { + "epoch": 0.9183641703072628, + "grad_norm": 1.6943309307098389, + "learning_rate": 0.00020816462066890117, + "loss": 3.942064208984375, + "step": 88500 + }, + { + "epoch": 0.9194018699347287, + "grad_norm": 1.9761497974395752, + "learning_rate": 0.00020806085070615457, + "loss": 3.757283020019531, + "step": 88600 + }, + { + "epoch": 0.9204395695621945, + "grad_norm": 2.720459461212158, + "learning_rate": 0.000207957080743408, + "loss": 3.723210754394531, + "step": 88700 + }, + { + "epoch": 0.9214772691896603, + "grad_norm": 2.986565589904785, + "learning_rate": 0.0002078533107806614, + "loss": 3.9739913940429688, + "step": 88800 + }, + { + "epoch": 0.9225149688171262, + "grad_norm": 2.682279348373413, + "learning_rate": 0.00020774954081791483, + "loss": 3.706415100097656, + "step": 88900 + }, + { + "epoch": 0.923552668444592, + "grad_norm": 14.281532287597656, + "learning_rate": 0.00020764577085516825, + "loss": 3.799072570800781, + "step": 89000 + }, + { + "epoch": 0.9245903680720579, + "grad_norm": 3.1239538192749023, + "learning_rate": 0.00020754200089242165, + "loss": 3.8822201538085936, + "step": 89100 + }, + { + "epoch": 0.9256280676995237, + "grad_norm": 7.4986252784729, + "learning_rate": 0.0002074382309296751, + "loss": 3.852564392089844, + "step": 89200 + }, + { + "epoch": 0.9266657673269896, + "grad_norm": 4.3345441818237305, + "learning_rate": 0.0002073344609669285, + "loss": 3.890749206542969, + "step": 89300 + }, + { + "epoch": 0.9277034669544554, + "grad_norm": 2.6886496543884277, + "learning_rate": 0.0002072306910041819, + "loss": 3.8261907958984374, + "step": 89400 + }, + { + "epoch": 0.9287411665819212, + "grad_norm": 2.2986016273498535, + "learning_rate": 0.0002071269210414353, + "loss": 3.8075076293945314, + "step": 89500 + }, + { + "epoch": 0.929778866209387, + "grad_norm": 11.309110641479492, + "learning_rate": 0.00020702315107868873, + "loss": 3.829825744628906, + "step": 89600 + }, + { + "epoch": 0.9308165658368529, + "grad_norm": 2.784146308898926, + "learning_rate": 0.00020691938111594218, + "loss": 3.7934060668945313, + "step": 89700 + }, + { + "epoch": 0.9318542654643187, + "grad_norm": 2.3935048580169678, + "learning_rate": 0.00020681561115319557, + "loss": 3.882371826171875, + "step": 89800 + }, + { + "epoch": 0.9328919650917845, + "grad_norm": 3.6735377311706543, + "learning_rate": 0.000206711841190449, + "loss": 3.842451171875, + "step": 89900 + }, + { + "epoch": 0.9339296647192503, + "grad_norm": 3.037416696548462, + "learning_rate": 0.0002066080712277024, + "loss": 3.9087152099609375, + "step": 90000 + }, + { + "epoch": 0.9349673643467162, + "grad_norm": 9.315804481506348, + "learning_rate": 0.00020650430126495584, + "loss": 3.773963623046875, + "step": 90100 + }, + { + "epoch": 0.936005063974182, + "grad_norm": 5.039952278137207, + "learning_rate": 0.00020640053130220926, + "loss": 3.7935626220703127, + "step": 90200 + }, + { + "epoch": 0.9370427636016478, + "grad_norm": 5.707028388977051, + "learning_rate": 0.00020629676133946266, + "loss": 3.775277404785156, + "step": 90300 + }, + { + "epoch": 0.9380804632291136, + "grad_norm": 3.8109843730926514, + "learning_rate": 0.00020619299137671608, + "loss": 3.779449462890625, + "step": 90400 + }, + { + "epoch": 0.9391181628565796, + "grad_norm": 2.9235146045684814, + "learning_rate": 0.00020608922141396947, + "loss": 3.8383111572265625, + "step": 90500 + }, + { + "epoch": 0.9401558624840454, + "grad_norm": 1.6856282949447632, + "learning_rate": 0.00020598545145122292, + "loss": 3.8841232299804687, + "step": 90600 + }, + { + "epoch": 0.9411935621115112, + "grad_norm": 7.263090133666992, + "learning_rate": 0.00020588168148847632, + "loss": 3.9575741577148436, + "step": 90700 + }, + { + "epoch": 0.9422312617389771, + "grad_norm": 3.6679883003234863, + "learning_rate": 0.00020577791152572974, + "loss": 3.81220947265625, + "step": 90800 + }, + { + "epoch": 0.9432689613664429, + "grad_norm": 5.708615303039551, + "learning_rate": 0.0002056741415629832, + "loss": 3.807239685058594, + "step": 90900 + }, + { + "epoch": 0.9443066609939087, + "grad_norm": 4.463714122772217, + "learning_rate": 0.00020557037160023658, + "loss": 3.841280517578125, + "step": 91000 + }, + { + "epoch": 0.9453443606213745, + "grad_norm": 10.150075912475586, + "learning_rate": 0.00020546660163749, + "loss": 3.75313232421875, + "step": 91100 + }, + { + "epoch": 0.9463820602488404, + "grad_norm": 11.987652778625488, + "learning_rate": 0.0002053628316747434, + "loss": 3.903273620605469, + "step": 91200 + }, + { + "epoch": 0.9474197598763062, + "grad_norm": 4.522410869598389, + "learning_rate": 0.00020525906171199682, + "loss": 3.760314636230469, + "step": 91300 + }, + { + "epoch": 0.948457459503772, + "grad_norm": 4.449744701385498, + "learning_rate": 0.00020515529174925022, + "loss": 3.685667724609375, + "step": 91400 + }, + { + "epoch": 0.9494951591312378, + "grad_norm": 1.8593145608901978, + "learning_rate": 0.00020505152178650367, + "loss": 3.7343402099609375, + "step": 91500 + }, + { + "epoch": 0.9505328587587037, + "grad_norm": 2.4731132984161377, + "learning_rate": 0.0002049477518237571, + "loss": 3.783785705566406, + "step": 91600 + }, + { + "epoch": 0.9515705583861696, + "grad_norm": 1.820862889289856, + "learning_rate": 0.00020484398186101048, + "loss": 3.719476318359375, + "step": 91700 + }, + { + "epoch": 0.9526082580136354, + "grad_norm": 2.214238166809082, + "learning_rate": 0.0002047402118982639, + "loss": 3.7817031860351564, + "step": 91800 + }, + { + "epoch": 0.9536459576411012, + "grad_norm": 3.6466450691223145, + "learning_rate": 0.00020463644193551733, + "loss": 3.7672024536132813, + "step": 91900 + }, + { + "epoch": 0.9546836572685671, + "grad_norm": 5.454410076141357, + "learning_rate": 0.00020453267197277075, + "loss": 3.77567626953125, + "step": 92000 + }, + { + "epoch": 0.9557213568960329, + "grad_norm": 20.138710021972656, + "learning_rate": 0.00020442890201002417, + "loss": 3.7506854248046877, + "step": 92100 + }, + { + "epoch": 0.9567590565234987, + "grad_norm": 2.0090079307556152, + "learning_rate": 0.00020432513204727756, + "loss": 3.8082257080078126, + "step": 92200 + }, + { + "epoch": 0.9577967561509645, + "grad_norm": 2.6881604194641113, + "learning_rate": 0.00020422136208453101, + "loss": 4.051754150390625, + "step": 92300 + }, + { + "epoch": 0.9588344557784304, + "grad_norm": 3.293210029602051, + "learning_rate": 0.0002041175921217844, + "loss": 3.702369384765625, + "step": 92400 + }, + { + "epoch": 0.9598721554058962, + "grad_norm": 5.354658126831055, + "learning_rate": 0.00020401382215903783, + "loss": 3.8296829223632813, + "step": 92500 + }, + { + "epoch": 0.960909855033362, + "grad_norm": 2.285318374633789, + "learning_rate": 0.00020391005219629123, + "loss": 3.8205487060546877, + "step": 92600 + }, + { + "epoch": 0.9619475546608279, + "grad_norm": 3.3139116764068604, + "learning_rate": 0.00020380628223354465, + "loss": 3.9517453002929686, + "step": 92700 + }, + { + "epoch": 0.9629852542882937, + "grad_norm": 4.242766380310059, + "learning_rate": 0.0002037025122707981, + "loss": 3.819052429199219, + "step": 92800 + }, + { + "epoch": 0.9640229539157595, + "grad_norm": 11.361218452453613, + "learning_rate": 0.0002035987423080515, + "loss": 3.8673443603515625, + "step": 92900 + }, + { + "epoch": 0.9650606535432253, + "grad_norm": 1.6263092756271362, + "learning_rate": 0.0002034949723453049, + "loss": 3.6743267822265624, + "step": 93000 + }, + { + "epoch": 0.9660983531706913, + "grad_norm": 3.191160202026367, + "learning_rate": 0.0002033912023825583, + "loss": 3.85127685546875, + "step": 93100 + }, + { + "epoch": 0.9671360527981571, + "grad_norm": 14.219719886779785, + "learning_rate": 0.00020328743241981176, + "loss": 3.8775042724609374, + "step": 93200 + }, + { + "epoch": 0.9681737524256229, + "grad_norm": 2.592212200164795, + "learning_rate": 0.00020318366245706515, + "loss": 3.784809265136719, + "step": 93300 + }, + { + "epoch": 0.9692114520530887, + "grad_norm": 2.058199644088745, + "learning_rate": 0.00020307989249431857, + "loss": 3.7654934692382813, + "step": 93400 + }, + { + "epoch": 0.9702491516805546, + "grad_norm": 3.3060290813446045, + "learning_rate": 0.000202976122531572, + "loss": 3.78427734375, + "step": 93500 + }, + { + "epoch": 0.9712868513080204, + "grad_norm": 5.642673492431641, + "learning_rate": 0.0002028723525688254, + "loss": 3.768431396484375, + "step": 93600 + }, + { + "epoch": 0.9723245509354862, + "grad_norm": 2.416527271270752, + "learning_rate": 0.00020276858260607884, + "loss": 3.9477734375, + "step": 93700 + }, + { + "epoch": 0.973362250562952, + "grad_norm": 6.023645877838135, + "learning_rate": 0.00020266481264333223, + "loss": 3.8290167236328125, + "step": 93800 + }, + { + "epoch": 0.9743999501904179, + "grad_norm": 3.252999782562256, + "learning_rate": 0.00020256104268058566, + "loss": 3.959106750488281, + "step": 93900 + }, + { + "epoch": 0.9754376498178837, + "grad_norm": 2.065927743911743, + "learning_rate": 0.0002024572727178391, + "loss": 3.868408508300781, + "step": 94000 + }, + { + "epoch": 0.9764753494453495, + "grad_norm": 3.3688645362854004, + "learning_rate": 0.0002023535027550925, + "loss": 3.91245361328125, + "step": 94100 + }, + { + "epoch": 0.9775130490728153, + "grad_norm": 3.004783868789673, + "learning_rate": 0.00020224973279234592, + "loss": 3.7105670166015625, + "step": 94200 + }, + { + "epoch": 0.9785507487002812, + "grad_norm": 2.6519381999969482, + "learning_rate": 0.00020214596282959932, + "loss": 3.8060031127929688, + "step": 94300 + }, + { + "epoch": 0.979588448327747, + "grad_norm": 2.3849129676818848, + "learning_rate": 0.00020204219286685274, + "loss": 3.7225299072265625, + "step": 94400 + }, + { + "epoch": 0.9806261479552129, + "grad_norm": 2.5238912105560303, + "learning_rate": 0.00020193842290410613, + "loss": 3.6197088623046874, + "step": 94500 + }, + { + "epoch": 0.9816638475826788, + "grad_norm": 7.388523101806641, + "learning_rate": 0.00020183465294135958, + "loss": 3.6996939086914065, + "step": 94600 + }, + { + "epoch": 0.9827015472101446, + "grad_norm": 10.3375883102417, + "learning_rate": 0.000201730882978613, + "loss": 3.7547808837890626, + "step": 94700 + }, + { + "epoch": 0.9837392468376104, + "grad_norm": 2.251610040664673, + "learning_rate": 0.0002016271130158664, + "loss": 3.794500732421875, + "step": 94800 + }, + { + "epoch": 0.9847769464650762, + "grad_norm": 3.8766162395477295, + "learning_rate": 0.00020152334305311982, + "loss": 3.7538128662109376, + "step": 94900 + }, + { + "epoch": 0.9858146460925421, + "grad_norm": 2.7171695232391357, + "learning_rate": 0.00020141957309037324, + "loss": 3.7826458740234377, + "step": 95000 + }, + { + "epoch": 0.9868523457200079, + "grad_norm": 3.8345425128936768, + "learning_rate": 0.00020131580312762667, + "loss": 3.8197344970703124, + "step": 95100 + }, + { + "epoch": 0.9878900453474737, + "grad_norm": 5.732568740844727, + "learning_rate": 0.00020121203316488006, + "loss": 3.84238525390625, + "step": 95200 + }, + { + "epoch": 0.9889277449749395, + "grad_norm": 2.933835744857788, + "learning_rate": 0.00020110826320213348, + "loss": 3.8682632446289062, + "step": 95300 + }, + { + "epoch": 0.9899654446024054, + "grad_norm": 6.234426021575928, + "learning_rate": 0.00020100449323938693, + "loss": 3.7140426635742188, + "step": 95400 + }, + { + "epoch": 0.9910031442298712, + "grad_norm": 3.3652026653289795, + "learning_rate": 0.00020090072327664033, + "loss": 3.7597830200195315, + "step": 95500 + }, + { + "epoch": 0.992040843857337, + "grad_norm": 3.030595541000366, + "learning_rate": 0.00020079695331389375, + "loss": 3.824953308105469, + "step": 95600 + }, + { + "epoch": 0.9930785434848028, + "grad_norm": 2.6781022548675537, + "learning_rate": 0.00020069318335114714, + "loss": 3.71589599609375, + "step": 95700 + }, + { + "epoch": 0.9941162431122688, + "grad_norm": 6.144374370574951, + "learning_rate": 0.00020058941338840056, + "loss": 3.856881408691406, + "step": 95800 + }, + { + "epoch": 0.9951539427397346, + "grad_norm": 11.093416213989258, + "learning_rate": 0.000200485643425654, + "loss": 3.8529815673828125, + "step": 95900 + }, + { + "epoch": 0.9961916423672004, + "grad_norm": 3.1640384197235107, + "learning_rate": 0.0002003818734629074, + "loss": 3.966211853027344, + "step": 96000 + }, + { + "epoch": 0.9972293419946662, + "grad_norm": 4.370779037475586, + "learning_rate": 0.00020027810350016083, + "loss": 3.7798886108398437, + "step": 96100 + }, + { + "epoch": 0.9982670416221321, + "grad_norm": 3.453723669052124, + "learning_rate": 0.00020017433353741422, + "loss": 3.8633013916015626, + "step": 96200 + }, + { + "epoch": 0.9993047412495979, + "grad_norm": 2.1785902976989746, + "learning_rate": 0.00020007056357466767, + "loss": 3.7897879028320314, + "step": 96300 + }, + { + "epoch": 1.0003424408770638, + "grad_norm": 7.7243971824646, + "learning_rate": 0.00019996679361192107, + "loss": 3.999345397949219, + "step": 96400 + }, + { + "epoch": 1.0013801405045295, + "grad_norm": 4.7181925773620605, + "learning_rate": 0.0001998630236491745, + "loss": 3.6450360107421873, + "step": 96500 + }, + { + "epoch": 1.0024178401319954, + "grad_norm": 5.74350643157959, + "learning_rate": 0.0001997592536864279, + "loss": 3.742356872558594, + "step": 96600 + }, + { + "epoch": 1.0034555397594613, + "grad_norm": 4.781228065490723, + "learning_rate": 0.0001996554837236813, + "loss": 3.88675048828125, + "step": 96700 + }, + { + "epoch": 1.004493239386927, + "grad_norm": 3.398968458175659, + "learning_rate": 0.00019955171376093476, + "loss": 3.604486083984375, + "step": 96800 + }, + { + "epoch": 1.005530939014393, + "grad_norm": 2.33478045463562, + "learning_rate": 0.00019944794379818815, + "loss": 3.6777334594726563, + "step": 96900 + }, + { + "epoch": 1.0065686386418586, + "grad_norm": 5.443575382232666, + "learning_rate": 0.00019934417383544157, + "loss": 3.71547119140625, + "step": 97000 + }, + { + "epoch": 1.0076063382693246, + "grad_norm": 9.512263298034668, + "learning_rate": 0.00019924040387269497, + "loss": 3.7301199340820315, + "step": 97100 + }, + { + "epoch": 1.0086440378967905, + "grad_norm": 7.4802985191345215, + "learning_rate": 0.00019913663390994842, + "loss": 3.924736328125, + "step": 97200 + }, + { + "epoch": 1.0096817375242562, + "grad_norm": 3.0878612995147705, + "learning_rate": 0.00019903286394720184, + "loss": 3.802860107421875, + "step": 97300 + }, + { + "epoch": 1.010719437151722, + "grad_norm": 3.557770252227783, + "learning_rate": 0.00019892909398445523, + "loss": 3.782970275878906, + "step": 97400 + }, + { + "epoch": 1.011757136779188, + "grad_norm": 4.309437274932861, + "learning_rate": 0.00019882532402170866, + "loss": 3.7818194580078126, + "step": 97500 + }, + { + "epoch": 1.0127948364066537, + "grad_norm": 9.057745933532715, + "learning_rate": 0.00019872155405896205, + "loss": 3.807467041015625, + "step": 97600 + }, + { + "epoch": 1.0138325360341196, + "grad_norm": 3.3481385707855225, + "learning_rate": 0.0001986177840962155, + "loss": 3.7055014038085936, + "step": 97700 + }, + { + "epoch": 1.0148702356615853, + "grad_norm": 5.001105308532715, + "learning_rate": 0.00019851401413346892, + "loss": 3.803979797363281, + "step": 97800 + }, + { + "epoch": 1.0159079352890512, + "grad_norm": 2.7995588779449463, + "learning_rate": 0.00019841024417072232, + "loss": 3.784454650878906, + "step": 97900 + }, + { + "epoch": 1.0169456349165171, + "grad_norm": 2.4021806716918945, + "learning_rate": 0.00019830647420797574, + "loss": 3.8534210205078123, + "step": 98000 + }, + { + "epoch": 1.0179833345439828, + "grad_norm": 2.6125597953796387, + "learning_rate": 0.00019820270424522916, + "loss": 3.6783572387695314, + "step": 98100 + }, + { + "epoch": 1.0190210341714487, + "grad_norm": 12.870917320251465, + "learning_rate": 0.00019809893428248258, + "loss": 3.833390808105469, + "step": 98200 + }, + { + "epoch": 1.0200587337989147, + "grad_norm": 5.185585021972656, + "learning_rate": 0.00019799516431973598, + "loss": 3.7223880004882814, + "step": 98300 + }, + { + "epoch": 1.0210964334263803, + "grad_norm": 1.9634087085723877, + "learning_rate": 0.0001978913943569894, + "loss": 3.6614044189453123, + "step": 98400 + }, + { + "epoch": 1.0221341330538463, + "grad_norm": 5.82041072845459, + "learning_rate": 0.00019778762439424285, + "loss": 3.729730224609375, + "step": 98500 + }, + { + "epoch": 1.0231718326813122, + "grad_norm": 5.905141353607178, + "learning_rate": 0.00019768385443149624, + "loss": 3.8260488891601563, + "step": 98600 + }, + { + "epoch": 1.0242095323087779, + "grad_norm": 3.5444912910461426, + "learning_rate": 0.00019758008446874966, + "loss": 3.687132568359375, + "step": 98700 + }, + { + "epoch": 1.0252472319362438, + "grad_norm": 7.397883892059326, + "learning_rate": 0.00019747631450600306, + "loss": 3.815035400390625, + "step": 98800 + }, + { + "epoch": 1.0262849315637095, + "grad_norm": 4.467862129211426, + "learning_rate": 0.00019737254454325648, + "loss": 3.645810241699219, + "step": 98900 + }, + { + "epoch": 1.0273226311911754, + "grad_norm": 7.824927806854248, + "learning_rate": 0.0001972687745805099, + "loss": 3.7502801513671873, + "step": 99000 + }, + { + "epoch": 1.0283603308186413, + "grad_norm": 9.055319786071777, + "learning_rate": 0.00019716500461776333, + "loss": 3.895949401855469, + "step": 99100 + }, + { + "epoch": 1.029398030446107, + "grad_norm": 2.499072313308716, + "learning_rate": 0.00019706123465501675, + "loss": 3.729786071777344, + "step": 99200 + }, + { + "epoch": 1.030435730073573, + "grad_norm": 2.091538667678833, + "learning_rate": 0.00019695746469227014, + "loss": 3.6661376953125, + "step": 99300 + }, + { + "epoch": 1.0314734297010388, + "grad_norm": 2.9895308017730713, + "learning_rate": 0.0001968536947295236, + "loss": 3.7620065307617185, + "step": 99400 + }, + { + "epoch": 1.0325111293285045, + "grad_norm": 3.8646888732910156, + "learning_rate": 0.00019674992476677699, + "loss": 3.8454522705078125, + "step": 99500 + }, + { + "epoch": 1.0335488289559704, + "grad_norm": 4.3288044929504395, + "learning_rate": 0.0001966461548040304, + "loss": 3.682370300292969, + "step": 99600 + }, + { + "epoch": 1.0345865285834361, + "grad_norm": 1.888063907623291, + "learning_rate": 0.00019654238484128383, + "loss": 3.7136306762695312, + "step": 99700 + }, + { + "epoch": 1.035624228210902, + "grad_norm": 2.9146947860717773, + "learning_rate": 0.00019643861487853722, + "loss": 3.7029214477539063, + "step": 99800 + }, + { + "epoch": 1.036661927838368, + "grad_norm": 3.3660199642181396, + "learning_rate": 0.00019633484491579067, + "loss": 3.669721984863281, + "step": 99900 + }, + { + "epoch": 1.0376996274658337, + "grad_norm": 3.8642494678497314, + "learning_rate": 0.00019623107495304407, + "loss": 3.718172302246094, + "step": 100000 + }, + { + "epoch": 1.0387373270932996, + "grad_norm": 19.524248123168945, + "learning_rate": 0.0001961273049902975, + "loss": 3.8097552490234374, + "step": 100100 + }, + { + "epoch": 1.0397750267207655, + "grad_norm": 2.175708293914795, + "learning_rate": 0.00019602353502755089, + "loss": 3.7663388061523437, + "step": 100200 + }, + { + "epoch": 1.0408127263482312, + "grad_norm": 2.0963635444641113, + "learning_rate": 0.00019591976506480433, + "loss": 3.7331805419921875, + "step": 100300 + }, + { + "epoch": 1.041850425975697, + "grad_norm": 4.1156134605407715, + "learning_rate": 0.00019581599510205776, + "loss": 3.7513092041015623, + "step": 100400 + }, + { + "epoch": 1.042888125603163, + "grad_norm": 1.9364126920700073, + "learning_rate": 0.00019571222513931115, + "loss": 3.7811895751953126, + "step": 100500 + }, + { + "epoch": 1.0439258252306287, + "grad_norm": 3.9929726123809814, + "learning_rate": 0.00019560845517656457, + "loss": 3.6916510009765626, + "step": 100600 + }, + { + "epoch": 1.0449635248580946, + "grad_norm": 6.161198139190674, + "learning_rate": 0.00019550468521381797, + "loss": 3.735494384765625, + "step": 100700 + }, + { + "epoch": 1.0460012244855603, + "grad_norm": 5.300504207611084, + "learning_rate": 0.00019540091525107142, + "loss": 3.6318603515625, + "step": 100800 + }, + { + "epoch": 1.0470389241130262, + "grad_norm": 6.671936988830566, + "learning_rate": 0.0001952971452883248, + "loss": 3.753620300292969, + "step": 100900 + }, + { + "epoch": 1.0480766237404922, + "grad_norm": 4.034755229949951, + "learning_rate": 0.00019519337532557823, + "loss": 3.6916033935546877, + "step": 101000 + }, + { + "epoch": 1.0491143233679578, + "grad_norm": 2.8349599838256836, + "learning_rate": 0.00019508960536283168, + "loss": 3.6846957397460938, + "step": 101100 + }, + { + "epoch": 1.0501520229954238, + "grad_norm": 4.222849369049072, + "learning_rate": 0.00019498583540008508, + "loss": 3.785768737792969, + "step": 101200 + }, + { + "epoch": 1.0511897226228897, + "grad_norm": 7.210328102111816, + "learning_rate": 0.0001948820654373385, + "loss": 3.674949035644531, + "step": 101300 + }, + { + "epoch": 1.0522274222503554, + "grad_norm": 4.031270503997803, + "learning_rate": 0.0001947782954745919, + "loss": 3.7858917236328127, + "step": 101400 + }, + { + "epoch": 1.0532651218778213, + "grad_norm": 28.53989601135254, + "learning_rate": 0.00019467452551184532, + "loss": 3.8007437133789064, + "step": 101500 + }, + { + "epoch": 1.054302821505287, + "grad_norm": 5.528784275054932, + "learning_rate": 0.00019457075554909877, + "loss": 3.624027099609375, + "step": 101600 + }, + { + "epoch": 1.055340521132753, + "grad_norm": 3.1289713382720947, + "learning_rate": 0.00019446698558635216, + "loss": 3.7536968994140625, + "step": 101700 + }, + { + "epoch": 1.0563782207602188, + "grad_norm": 2.9442858695983887, + "learning_rate": 0.00019436321562360558, + "loss": 3.569986572265625, + "step": 101800 + }, + { + "epoch": 1.0574159203876845, + "grad_norm": 4.8674726486206055, + "learning_rate": 0.00019425944566085898, + "loss": 3.8215240478515624, + "step": 101900 + }, + { + "epoch": 1.0584536200151504, + "grad_norm": 13.513835906982422, + "learning_rate": 0.0001941556756981124, + "loss": 3.6686697387695313, + "step": 102000 + }, + { + "epoch": 1.0594913196426163, + "grad_norm": 3.146784543991089, + "learning_rate": 0.00019405190573536582, + "loss": 3.643824462890625, + "step": 102100 + }, + { + "epoch": 1.060529019270082, + "grad_norm": 4.964068412780762, + "learning_rate": 0.00019394813577261924, + "loss": 3.748782043457031, + "step": 102200 + }, + { + "epoch": 1.061566718897548, + "grad_norm": 3.178044557571411, + "learning_rate": 0.00019384436580987266, + "loss": 3.7086587524414063, + "step": 102300 + }, + { + "epoch": 1.0626044185250136, + "grad_norm": 2.6959052085876465, + "learning_rate": 0.00019374059584712606, + "loss": 3.8190512084960937, + "step": 102400 + }, + { + "epoch": 1.0636421181524796, + "grad_norm": 4.595401763916016, + "learning_rate": 0.0001936368258843795, + "loss": 3.6920120239257814, + "step": 102500 + }, + { + "epoch": 1.0646798177799455, + "grad_norm": 3.383439064025879, + "learning_rate": 0.0001935330559216329, + "loss": 3.7616091918945314, + "step": 102600 + }, + { + "epoch": 1.0657175174074112, + "grad_norm": 6.921218395233154, + "learning_rate": 0.00019342928595888633, + "loss": 3.8070159912109376, + "step": 102700 + }, + { + "epoch": 1.066755217034877, + "grad_norm": 3.7757728099823, + "learning_rate": 0.00019332551599613975, + "loss": 3.64797119140625, + "step": 102800 + }, + { + "epoch": 1.067792916662343, + "grad_norm": 5.452692985534668, + "learning_rate": 0.00019322174603339314, + "loss": 3.7128118896484374, + "step": 102900 + }, + { + "epoch": 1.0688306162898087, + "grad_norm": 2.324277639389038, + "learning_rate": 0.0001931179760706466, + "loss": 3.5481451416015624, + "step": 103000 + }, + { + "epoch": 1.0698683159172746, + "grad_norm": 2.998181104660034, + "learning_rate": 0.00019301420610789999, + "loss": 3.6443612670898435, + "step": 103100 + }, + { + "epoch": 1.0709060155447405, + "grad_norm": 5.453862190246582, + "learning_rate": 0.0001929104361451534, + "loss": 3.7542648315429688, + "step": 103200 + }, + { + "epoch": 1.0719437151722062, + "grad_norm": 7.444779396057129, + "learning_rate": 0.0001928066661824068, + "loss": 3.696410827636719, + "step": 103300 + }, + { + "epoch": 1.0729814147996721, + "grad_norm": 4.7863569259643555, + "learning_rate": 0.00019270289621966025, + "loss": 3.6802603149414064, + "step": 103400 + }, + { + "epoch": 1.0740191144271378, + "grad_norm": 2.9291558265686035, + "learning_rate": 0.00019259912625691367, + "loss": 3.7929959106445312, + "step": 103500 + }, + { + "epoch": 1.0750568140546037, + "grad_norm": 3.2032582759857178, + "learning_rate": 0.00019249535629416707, + "loss": 3.6861895751953124, + "step": 103600 + }, + { + "epoch": 1.0760945136820697, + "grad_norm": 3.1435580253601074, + "learning_rate": 0.0001923915863314205, + "loss": 3.799478759765625, + "step": 103700 + }, + { + "epoch": 1.0771322133095353, + "grad_norm": 2.8310792446136475, + "learning_rate": 0.00019228781636867388, + "loss": 3.73474365234375, + "step": 103800 + }, + { + "epoch": 1.0781699129370013, + "grad_norm": 2.285276174545288, + "learning_rate": 0.00019218404640592733, + "loss": 3.6168304443359376, + "step": 103900 + }, + { + "epoch": 1.0792076125644672, + "grad_norm": 5.524131774902344, + "learning_rate": 0.00019208027644318073, + "loss": 3.710784912109375, + "step": 104000 + }, + { + "epoch": 1.0802453121919329, + "grad_norm": 3.545400619506836, + "learning_rate": 0.00019197650648043415, + "loss": 3.6640530395507813, + "step": 104100 + }, + { + "epoch": 1.0812830118193988, + "grad_norm": 3.101451873779297, + "learning_rate": 0.0001918727365176876, + "loss": 3.7735882568359376, + "step": 104200 + }, + { + "epoch": 1.0823207114468647, + "grad_norm": 2.4820311069488525, + "learning_rate": 0.000191768966554941, + "loss": 3.6366726684570314, + "step": 104300 + }, + { + "epoch": 1.0833584110743304, + "grad_norm": 26.539804458618164, + "learning_rate": 0.00019166519659219442, + "loss": 3.7211334228515627, + "step": 104400 + }, + { + "epoch": 1.0843961107017963, + "grad_norm": 3.41780161857605, + "learning_rate": 0.0001915614266294478, + "loss": 3.60020263671875, + "step": 104500 + }, + { + "epoch": 1.085433810329262, + "grad_norm": 2.689753293991089, + "learning_rate": 0.00019145765666670123, + "loss": 3.7544232177734376, + "step": 104600 + }, + { + "epoch": 1.086471509956728, + "grad_norm": 2.2958478927612305, + "learning_rate": 0.00019135388670395468, + "loss": 3.849725646972656, + "step": 104700 + }, + { + "epoch": 1.0875092095841938, + "grad_norm": 3.697185754776001, + "learning_rate": 0.00019125011674120808, + "loss": 3.813602294921875, + "step": 104800 + }, + { + "epoch": 1.0885469092116595, + "grad_norm": 2.1992783546447754, + "learning_rate": 0.0001911463467784615, + "loss": 3.6952606201171876, + "step": 104900 + }, + { + "epoch": 1.0895846088391254, + "grad_norm": 2.1027495861053467, + "learning_rate": 0.0001910425768157149, + "loss": 3.6720751953125, + "step": 105000 + }, + { + "epoch": 1.0906223084665914, + "grad_norm": 2.2862184047698975, + "learning_rate": 0.00019093880685296832, + "loss": 3.729759521484375, + "step": 105100 + }, + { + "epoch": 1.091660008094057, + "grad_norm": 2.060633659362793, + "learning_rate": 0.00019083503689022174, + "loss": 3.7085842895507812, + "step": 105200 + }, + { + "epoch": 1.092697707721523, + "grad_norm": 2.636503219604492, + "learning_rate": 0.00019073126692747516, + "loss": 3.6184716796875, + "step": 105300 + }, + { + "epoch": 1.0937354073489887, + "grad_norm": 7.98659086227417, + "learning_rate": 0.00019062749696472858, + "loss": 3.875008544921875, + "step": 105400 + }, + { + "epoch": 1.0947731069764546, + "grad_norm": 3.7854599952697754, + "learning_rate": 0.00019052372700198198, + "loss": 3.8590658569335936, + "step": 105500 + }, + { + "epoch": 1.0958108066039205, + "grad_norm": 9.304828643798828, + "learning_rate": 0.00019041995703923543, + "loss": 3.7910305786132814, + "step": 105600 + }, + { + "epoch": 1.0968485062313862, + "grad_norm": 6.323867321014404, + "learning_rate": 0.00019031618707648882, + "loss": 3.763433532714844, + "step": 105700 + }, + { + "epoch": 1.097886205858852, + "grad_norm": 5.698137283325195, + "learning_rate": 0.00019021241711374224, + "loss": 3.6159381103515624, + "step": 105800 + }, + { + "epoch": 1.098923905486318, + "grad_norm": 80.88331604003906, + "learning_rate": 0.00019010864715099564, + "loss": 3.738255920410156, + "step": 105900 + }, + { + "epoch": 1.0999616051137837, + "grad_norm": 4.7448577880859375, + "learning_rate": 0.00019000487718824906, + "loss": 3.6675250244140627, + "step": 106000 + }, + { + "epoch": 1.1009993047412496, + "grad_norm": 5.72471809387207, + "learning_rate": 0.0001899011072255025, + "loss": 3.7835205078125, + "step": 106100 + }, + { + "epoch": 1.1020370043687153, + "grad_norm": 3.3427250385284424, + "learning_rate": 0.0001897973372627559, + "loss": 3.6577874755859376, + "step": 106200 + }, + { + "epoch": 1.1030747039961812, + "grad_norm": 15.587642669677734, + "learning_rate": 0.00018969356730000932, + "loss": 3.716649169921875, + "step": 106300 + }, + { + "epoch": 1.1041124036236472, + "grad_norm": 4.485306262969971, + "learning_rate": 0.00018958979733726272, + "loss": 3.8367926025390626, + "step": 106400 + }, + { + "epoch": 1.1051501032511128, + "grad_norm": 2.82476806640625, + "learning_rate": 0.00018948602737451617, + "loss": 3.7493435668945314, + "step": 106500 + }, + { + "epoch": 1.1061878028785788, + "grad_norm": 15.561006546020508, + "learning_rate": 0.0001893822574117696, + "loss": 3.826619873046875, + "step": 106600 + }, + { + "epoch": 1.1072255025060447, + "grad_norm": 2.592461109161377, + "learning_rate": 0.00018927848744902299, + "loss": 3.7684344482421874, + "step": 106700 + }, + { + "epoch": 1.1082632021335104, + "grad_norm": 7.259844779968262, + "learning_rate": 0.0001891747174862764, + "loss": 3.758468017578125, + "step": 106800 + }, + { + "epoch": 1.1093009017609763, + "grad_norm": 5.973848342895508, + "learning_rate": 0.0001890709475235298, + "loss": 3.638338317871094, + "step": 106900 + }, + { + "epoch": 1.1103386013884422, + "grad_norm": 4.451427459716797, + "learning_rate": 0.00018896717756078325, + "loss": 3.788179626464844, + "step": 107000 + }, + { + "epoch": 1.111376301015908, + "grad_norm": 4.0467143058776855, + "learning_rate": 0.00018886340759803665, + "loss": 3.7329791259765623, + "step": 107100 + }, + { + "epoch": 1.1124140006433738, + "grad_norm": 5.440663814544678, + "learning_rate": 0.00018875963763529007, + "loss": 3.9233663940429686, + "step": 107200 + }, + { + "epoch": 1.1134517002708395, + "grad_norm": 2.327005386352539, + "learning_rate": 0.00018865586767254352, + "loss": 3.688836975097656, + "step": 107300 + }, + { + "epoch": 1.1144893998983054, + "grad_norm": 2.948439598083496, + "learning_rate": 0.0001885520977097969, + "loss": 3.623143310546875, + "step": 107400 + }, + { + "epoch": 1.1155270995257713, + "grad_norm": 8.996918678283691, + "learning_rate": 0.00018844832774705033, + "loss": 3.6873675537109376, + "step": 107500 + }, + { + "epoch": 1.116564799153237, + "grad_norm": 13.88825798034668, + "learning_rate": 0.00018834455778430373, + "loss": 3.889109802246094, + "step": 107600 + }, + { + "epoch": 1.117602498780703, + "grad_norm": 4.712568283081055, + "learning_rate": 0.00018824078782155715, + "loss": 3.8336361694335936, + "step": 107700 + }, + { + "epoch": 1.1186401984081689, + "grad_norm": 9.021018028259277, + "learning_rate": 0.00018813701785881055, + "loss": 3.818023681640625, + "step": 107800 + }, + { + "epoch": 1.1196778980356346, + "grad_norm": 4.5635294914245605, + "learning_rate": 0.000188033247896064, + "loss": 3.8210824584960936, + "step": 107900 + }, + { + "epoch": 1.1207155976631005, + "grad_norm": 6.118738651275635, + "learning_rate": 0.00018792947793331742, + "loss": 3.762948303222656, + "step": 108000 + }, + { + "epoch": 1.1217532972905664, + "grad_norm": 6.2977824211120605, + "learning_rate": 0.0001878257079705708, + "loss": 3.7840338134765625, + "step": 108100 + }, + { + "epoch": 1.122790996918032, + "grad_norm": 5.161929607391357, + "learning_rate": 0.00018772193800782423, + "loss": 3.8385690307617186, + "step": 108200 + }, + { + "epoch": 1.123828696545498, + "grad_norm": 19.5078067779541, + "learning_rate": 0.00018761816804507765, + "loss": 3.708250732421875, + "step": 108300 + }, + { + "epoch": 1.1248663961729637, + "grad_norm": 6.583184242248535, + "learning_rate": 0.00018751439808233108, + "loss": 3.6731692504882814, + "step": 108400 + }, + { + "epoch": 1.1259040958004296, + "grad_norm": 2.8113479614257812, + "learning_rate": 0.0001874106281195845, + "loss": 3.776397705078125, + "step": 108500 + }, + { + "epoch": 1.1269417954278955, + "grad_norm": 3.526796340942383, + "learning_rate": 0.0001873068581568379, + "loss": 3.713113098144531, + "step": 108600 + }, + { + "epoch": 1.1279794950553612, + "grad_norm": 4.96720027923584, + "learning_rate": 0.00018720308819409134, + "loss": 3.758629150390625, + "step": 108700 + }, + { + "epoch": 1.1290171946828271, + "grad_norm": 2.3801918029785156, + "learning_rate": 0.00018709931823134474, + "loss": 3.931161193847656, + "step": 108800 + }, + { + "epoch": 1.1300548943102928, + "grad_norm": 5.336031913757324, + "learning_rate": 0.00018699554826859816, + "loss": 3.7431265258789064, + "step": 108900 + }, + { + "epoch": 1.1310925939377587, + "grad_norm": 3.3115835189819336, + "learning_rate": 0.00018689177830585155, + "loss": 3.6016845703125, + "step": 109000 + }, + { + "epoch": 1.1321302935652247, + "grad_norm": 3.2625627517700195, + "learning_rate": 0.00018678800834310498, + "loss": 3.8173687744140623, + "step": 109100 + }, + { + "epoch": 1.1331679931926903, + "grad_norm": 3.4688777923583984, + "learning_rate": 0.00018668423838035843, + "loss": 3.7339138793945312, + "step": 109200 + }, + { + "epoch": 1.1342056928201563, + "grad_norm": 5.170476913452148, + "learning_rate": 0.00018658046841761182, + "loss": 3.8035733032226564, + "step": 109300 + }, + { + "epoch": 1.1352433924476222, + "grad_norm": 6.003453731536865, + "learning_rate": 0.00018647669845486524, + "loss": 3.7767242431640624, + "step": 109400 + }, + { + "epoch": 1.1362810920750879, + "grad_norm": 3.4862396717071533, + "learning_rate": 0.00018637292849211864, + "loss": 3.643880615234375, + "step": 109500 + }, + { + "epoch": 1.1373187917025538, + "grad_norm": 5.885380268096924, + "learning_rate": 0.00018626915852937209, + "loss": 3.7285040283203124, + "step": 109600 + }, + { + "epoch": 1.1383564913300197, + "grad_norm": 2.839015245437622, + "learning_rate": 0.00018616538856662548, + "loss": 3.7614910888671873, + "step": 109700 + }, + { + "epoch": 1.1393941909574854, + "grad_norm": 10.154685020446777, + "learning_rate": 0.0001860616186038789, + "loss": 3.635873107910156, + "step": 109800 + }, + { + "epoch": 1.1404318905849513, + "grad_norm": 11.110898971557617, + "learning_rate": 0.00018595784864113232, + "loss": 3.690367431640625, + "step": 109900 + }, + { + "epoch": 1.141469590212417, + "grad_norm": 2.4880504608154297, + "learning_rate": 0.00018585407867838572, + "loss": 3.69529541015625, + "step": 110000 + }, + { + "epoch": 1.142507289839883, + "grad_norm": 12.104265213012695, + "learning_rate": 0.00018575030871563917, + "loss": 3.81604736328125, + "step": 110100 + }, + { + "epoch": 1.1435449894673488, + "grad_norm": 4.529385089874268, + "learning_rate": 0.00018564653875289256, + "loss": 3.847531433105469, + "step": 110200 + }, + { + "epoch": 1.1445826890948145, + "grad_norm": 4.51477575302124, + "learning_rate": 0.00018554276879014598, + "loss": 3.6786367797851565, + "step": 110300 + }, + { + "epoch": 1.1456203887222804, + "grad_norm": 3.946871757507324, + "learning_rate": 0.00018543899882739943, + "loss": 3.7411343383789064, + "step": 110400 + }, + { + "epoch": 1.1466580883497464, + "grad_norm": 24.773929595947266, + "learning_rate": 0.00018533522886465283, + "loss": 3.6971206665039062, + "step": 110500 + }, + { + "epoch": 1.147695787977212, + "grad_norm": 4.848511695861816, + "learning_rate": 0.00018523145890190625, + "loss": 3.6610791015625, + "step": 110600 + }, + { + "epoch": 1.148733487604678, + "grad_norm": 3.155839681625366, + "learning_rate": 0.00018512768893915965, + "loss": 3.6824301147460936, + "step": 110700 + }, + { + "epoch": 1.1497711872321439, + "grad_norm": 3.4173624515533447, + "learning_rate": 0.00018502391897641307, + "loss": 3.729654541015625, + "step": 110800 + }, + { + "epoch": 1.1508088868596096, + "grad_norm": 3.1743650436401367, + "learning_rate": 0.00018492014901366646, + "loss": 3.752574157714844, + "step": 110900 + }, + { + "epoch": 1.1518465864870755, + "grad_norm": 5.655935287475586, + "learning_rate": 0.0001848163790509199, + "loss": 3.6886166381835936, + "step": 111000 + }, + { + "epoch": 1.1528842861145412, + "grad_norm": 2.8840067386627197, + "learning_rate": 0.00018471260908817333, + "loss": 3.8322817993164064, + "step": 111100 + }, + { + "epoch": 1.153921985742007, + "grad_norm": 4.1215057373046875, + "learning_rate": 0.00018460883912542673, + "loss": 3.634107971191406, + "step": 111200 + }, + { + "epoch": 1.154959685369473, + "grad_norm": 8.988388061523438, + "learning_rate": 0.00018450506916268018, + "loss": 3.813481750488281, + "step": 111300 + }, + { + "epoch": 1.1559973849969387, + "grad_norm": 4.154327869415283, + "learning_rate": 0.00018440129919993357, + "loss": 3.792846374511719, + "step": 111400 + }, + { + "epoch": 1.1570350846244046, + "grad_norm": 5.43167781829834, + "learning_rate": 0.000184297529237187, + "loss": 3.695276794433594, + "step": 111500 + }, + { + "epoch": 1.1580727842518705, + "grad_norm": 2.1235880851745605, + "learning_rate": 0.0001841937592744404, + "loss": 3.7109506225585935, + "step": 111600 + }, + { + "epoch": 1.1591104838793362, + "grad_norm": 3.2670278549194336, + "learning_rate": 0.0001840899893116938, + "loss": 3.779457702636719, + "step": 111700 + }, + { + "epoch": 1.1601481835068022, + "grad_norm": 4.596736431121826, + "learning_rate": 0.00018398621934894726, + "loss": 3.690837097167969, + "step": 111800 + }, + { + "epoch": 1.161185883134268, + "grad_norm": 5.063496112823486, + "learning_rate": 0.00018388244938620065, + "loss": 3.7899896240234376, + "step": 111900 + }, + { + "epoch": 1.1622235827617338, + "grad_norm": 3.2700915336608887, + "learning_rate": 0.00018377867942345408, + "loss": 3.7538375854492188, + "step": 112000 + }, + { + "epoch": 1.1632612823891997, + "grad_norm": 2.544558048248291, + "learning_rate": 0.00018367490946070747, + "loss": 3.7601394653320312, + "step": 112100 + }, + { + "epoch": 1.1642989820166654, + "grad_norm": 6.950151443481445, + "learning_rate": 0.0001835711394979609, + "loss": 3.797687683105469, + "step": 112200 + }, + { + "epoch": 1.1653366816441313, + "grad_norm": 2.161999464035034, + "learning_rate": 0.00018346736953521434, + "loss": 3.7408486938476564, + "step": 112300 + }, + { + "epoch": 1.1663743812715972, + "grad_norm": 2.824725866317749, + "learning_rate": 0.00018336359957246774, + "loss": 3.708443298339844, + "step": 112400 + }, + { + "epoch": 1.167412080899063, + "grad_norm": 11.807979583740234, + "learning_rate": 0.00018325982960972116, + "loss": 3.6650485229492187, + "step": 112500 + }, + { + "epoch": 1.1684497805265288, + "grad_norm": 12.751113891601562, + "learning_rate": 0.00018315605964697455, + "loss": 3.5273590087890625, + "step": 112600 + }, + { + "epoch": 1.1694874801539945, + "grad_norm": 3.0161349773406982, + "learning_rate": 0.000183052289684228, + "loss": 3.8431436157226564, + "step": 112700 + }, + { + "epoch": 1.1705251797814604, + "grad_norm": 8.852095603942871, + "learning_rate": 0.0001829485197214814, + "loss": 3.6667901611328126, + "step": 112800 + }, + { + "epoch": 1.1715628794089263, + "grad_norm": 16.80730438232422, + "learning_rate": 0.00018284474975873482, + "loss": 3.7361489868164064, + "step": 112900 + }, + { + "epoch": 1.172600579036392, + "grad_norm": 4.340658187866211, + "learning_rate": 0.00018274097979598824, + "loss": 3.7126028442382815, + "step": 113000 + }, + { + "epoch": 1.173638278663858, + "grad_norm": 2.2295515537261963, + "learning_rate": 0.00018263720983324164, + "loss": 3.6620779418945313, + "step": 113100 + }, + { + "epoch": 1.1746759782913239, + "grad_norm": 3.5379912853240967, + "learning_rate": 0.00018253343987049509, + "loss": 3.60224609375, + "step": 113200 + }, + { + "epoch": 1.1757136779187896, + "grad_norm": 3.174776315689087, + "learning_rate": 0.00018242966990774848, + "loss": 3.7180682373046876, + "step": 113300 + }, + { + "epoch": 1.1767513775462555, + "grad_norm": 4.343127250671387, + "learning_rate": 0.0001823258999450019, + "loss": 3.7377755737304685, + "step": 113400 + }, + { + "epoch": 1.1777890771737214, + "grad_norm": 21.170530319213867, + "learning_rate": 0.0001822221299822553, + "loss": 3.752294921875, + "step": 113500 + }, + { + "epoch": 1.178826776801187, + "grad_norm": 4.612101078033447, + "learning_rate": 0.00018211836001950875, + "loss": 3.8363751220703124, + "step": 113600 + }, + { + "epoch": 1.179864476428653, + "grad_norm": 6.276144981384277, + "learning_rate": 0.00018201459005676217, + "loss": 3.713616943359375, + "step": 113700 + }, + { + "epoch": 1.1809021760561187, + "grad_norm": 10.716604232788086, + "learning_rate": 0.00018191082009401556, + "loss": 3.629880676269531, + "step": 113800 + }, + { + "epoch": 1.1819398756835846, + "grad_norm": 2.2933573722839355, + "learning_rate": 0.00018180705013126898, + "loss": 3.8490249633789064, + "step": 113900 + }, + { + "epoch": 1.1829775753110505, + "grad_norm": 4.147966384887695, + "learning_rate": 0.00018170328016852238, + "loss": 3.5557064819335937, + "step": 114000 + }, + { + "epoch": 1.1840152749385162, + "grad_norm": 3.122669219970703, + "learning_rate": 0.00018159951020577583, + "loss": 3.73438232421875, + "step": 114100 + }, + { + "epoch": 1.1850529745659821, + "grad_norm": 9.210347175598145, + "learning_rate": 0.00018149574024302925, + "loss": 3.6972500610351564, + "step": 114200 + }, + { + "epoch": 1.186090674193448, + "grad_norm": 17.161890029907227, + "learning_rate": 0.00018139197028028265, + "loss": 3.819235534667969, + "step": 114300 + }, + { + "epoch": 1.1871283738209137, + "grad_norm": 5.225100040435791, + "learning_rate": 0.0001812882003175361, + "loss": 3.7081121826171874, + "step": 114400 + }, + { + "epoch": 1.1881660734483797, + "grad_norm": 8.891063690185547, + "learning_rate": 0.0001811844303547895, + "loss": 3.7459030151367188, + "step": 114500 + }, + { + "epoch": 1.1892037730758456, + "grad_norm": 3.465555429458618, + "learning_rate": 0.0001810806603920429, + "loss": 3.7495687866210936, + "step": 114600 + }, + { + "epoch": 1.1902414727033113, + "grad_norm": 2.962984561920166, + "learning_rate": 0.0001809768904292963, + "loss": 3.620650329589844, + "step": 114700 + }, + { + "epoch": 1.1912791723307772, + "grad_norm": 66.27200317382812, + "learning_rate": 0.00018087312046654973, + "loss": 3.8266671752929686, + "step": 114800 + }, + { + "epoch": 1.1923168719582429, + "grad_norm": 10.21193790435791, + "learning_rate": 0.00018076935050380318, + "loss": 3.7377734375, + "step": 114900 + }, + { + "epoch": 1.1933545715857088, + "grad_norm": 4.959332466125488, + "learning_rate": 0.00018066558054105657, + "loss": 3.767408752441406, + "step": 115000 + }, + { + "epoch": 1.1943922712131747, + "grad_norm": 4.304464817047119, + "learning_rate": 0.00018056181057831, + "loss": 3.793067626953125, + "step": 115100 + }, + { + "epoch": 1.1954299708406404, + "grad_norm": 4.872037887573242, + "learning_rate": 0.0001804580406155634, + "loss": 3.754971923828125, + "step": 115200 + }, + { + "epoch": 1.1964676704681063, + "grad_norm": 5.543403625488281, + "learning_rate": 0.0001803542706528168, + "loss": 3.6738140869140623, + "step": 115300 + }, + { + "epoch": 1.1975053700955722, + "grad_norm": 4.535797595977783, + "learning_rate": 0.00018025050069007023, + "loss": 3.6706658935546876, + "step": 115400 + }, + { + "epoch": 1.198543069723038, + "grad_norm": 3.987654209136963, + "learning_rate": 0.00018014673072732365, + "loss": 3.7104837036132814, + "step": 115500 + }, + { + "epoch": 1.1995807693505038, + "grad_norm": 4.604912757873535, + "learning_rate": 0.00018004296076457708, + "loss": 3.7295111083984374, + "step": 115600 + }, + { + "epoch": 1.2006184689779698, + "grad_norm": 7.51154088973999, + "learning_rate": 0.00017993919080183047, + "loss": 3.882249755859375, + "step": 115700 + }, + { + "epoch": 1.2016561686054354, + "grad_norm": 7.570425987243652, + "learning_rate": 0.00017983542083908392, + "loss": 3.7709280395507814, + "step": 115800 + }, + { + "epoch": 1.2026938682329014, + "grad_norm": 7.528663635253906, + "learning_rate": 0.00017973165087633731, + "loss": 3.744920654296875, + "step": 115900 + }, + { + "epoch": 1.203731567860367, + "grad_norm": 4.613593578338623, + "learning_rate": 0.00017962788091359074, + "loss": 3.81932373046875, + "step": 116000 + }, + { + "epoch": 1.204769267487833, + "grad_norm": 4.6101508140563965, + "learning_rate": 0.00017952411095084416, + "loss": 3.701668701171875, + "step": 116100 + }, + { + "epoch": 1.2058069671152989, + "grad_norm": 3.3336641788482666, + "learning_rate": 0.00017942034098809755, + "loss": 3.5936102294921874, + "step": 116200 + }, + { + "epoch": 1.2068446667427646, + "grad_norm": 8.796258926391602, + "learning_rate": 0.000179316571025351, + "loss": 3.6812298583984373, + "step": 116300 + }, + { + "epoch": 1.2078823663702305, + "grad_norm": 2.9002747535705566, + "learning_rate": 0.0001792128010626044, + "loss": 3.79119873046875, + "step": 116400 + }, + { + "epoch": 1.2089200659976962, + "grad_norm": 3.5677108764648438, + "learning_rate": 0.00017910903109985782, + "loss": 3.868831787109375, + "step": 116500 + }, + { + "epoch": 1.209957765625162, + "grad_norm": 10.07345199584961, + "learning_rate": 0.00017900526113711121, + "loss": 3.8205535888671873, + "step": 116600 + }, + { + "epoch": 1.210995465252628, + "grad_norm": 2.9789609909057617, + "learning_rate": 0.00017890149117436466, + "loss": 3.655535888671875, + "step": 116700 + }, + { + "epoch": 1.2120331648800937, + "grad_norm": 7.362621784210205, + "learning_rate": 0.00017879772121161808, + "loss": 3.5663858032226563, + "step": 116800 + }, + { + "epoch": 1.2130708645075596, + "grad_norm": 3.515774726867676, + "learning_rate": 0.00017869395124887148, + "loss": 3.64054443359375, + "step": 116900 + }, + { + "epoch": 1.2141085641350255, + "grad_norm": 2.5356316566467285, + "learning_rate": 0.0001785901812861249, + "loss": 3.621481628417969, + "step": 117000 + }, + { + "epoch": 1.2151462637624912, + "grad_norm": 4.910796642303467, + "learning_rate": 0.0001784864113233783, + "loss": 3.6991619873046875, + "step": 117100 + }, + { + "epoch": 1.2161839633899572, + "grad_norm": 4.202451705932617, + "learning_rate": 0.00017838264136063175, + "loss": 3.8038519287109374, + "step": 117200 + }, + { + "epoch": 1.217221663017423, + "grad_norm": 4.467262268066406, + "learning_rate": 0.00017827887139788514, + "loss": 3.771558837890625, + "step": 117300 + }, + { + "epoch": 1.2182593626448888, + "grad_norm": 3.9160234928131104, + "learning_rate": 0.00017817510143513856, + "loss": 3.7639215087890623, + "step": 117400 + }, + { + "epoch": 1.2192970622723547, + "grad_norm": 4.396745681762695, + "learning_rate": 0.000178071331472392, + "loss": 3.68260498046875, + "step": 117500 + }, + { + "epoch": 1.2203347618998204, + "grad_norm": 3.5205559730529785, + "learning_rate": 0.0001779675615096454, + "loss": 3.6396359252929686, + "step": 117600 + }, + { + "epoch": 1.2213724615272863, + "grad_norm": 3.1027088165283203, + "learning_rate": 0.00017786379154689883, + "loss": 3.5732858276367185, + "step": 117700 + }, + { + "epoch": 1.2224101611547522, + "grad_norm": 2.6304574012756348, + "learning_rate": 0.00017776002158415222, + "loss": 3.508619384765625, + "step": 117800 + }, + { + "epoch": 1.223447860782218, + "grad_norm": 2.9613137245178223, + "learning_rate": 0.00017765625162140564, + "loss": 3.65043212890625, + "step": 117900 + }, + { + "epoch": 1.2244855604096838, + "grad_norm": 3.6579976081848145, + "learning_rate": 0.0001775524816586591, + "loss": 3.805189514160156, + "step": 118000 + }, + { + "epoch": 1.2255232600371497, + "grad_norm": 2.3908674716949463, + "learning_rate": 0.0001774487116959125, + "loss": 3.608123474121094, + "step": 118100 + }, + { + "epoch": 1.2265609596646154, + "grad_norm": 3.335692882537842, + "learning_rate": 0.0001773449417331659, + "loss": 3.707095947265625, + "step": 118200 + }, + { + "epoch": 1.2275986592920813, + "grad_norm": 5.722865581512451, + "learning_rate": 0.0001772411717704193, + "loss": 3.7158029174804685, + "step": 118300 + }, + { + "epoch": 1.2286363589195473, + "grad_norm": 9.1022310256958, + "learning_rate": 0.00017713740180767273, + "loss": 3.7301669311523438, + "step": 118400 + }, + { + "epoch": 1.229674058547013, + "grad_norm": 5.698774814605713, + "learning_rate": 0.00017703363184492615, + "loss": 3.638455810546875, + "step": 118500 + }, + { + "epoch": 1.2307117581744789, + "grad_norm": 2.373983144760132, + "learning_rate": 0.00017692986188217957, + "loss": 3.6596408081054688, + "step": 118600 + }, + { + "epoch": 1.2317494578019446, + "grad_norm": 8.193933486938477, + "learning_rate": 0.000176826091919433, + "loss": 3.670250244140625, + "step": 118700 + }, + { + "epoch": 1.2327871574294105, + "grad_norm": 4.394575119018555, + "learning_rate": 0.0001767223219566864, + "loss": 3.7637249755859377, + "step": 118800 + }, + { + "epoch": 1.2338248570568764, + "grad_norm": 8.713273048400879, + "learning_rate": 0.00017661855199393984, + "loss": 3.7907025146484377, + "step": 118900 + }, + { + "epoch": 1.234862556684342, + "grad_norm": 2.0170185565948486, + "learning_rate": 0.00017651478203119323, + "loss": 3.638475036621094, + "step": 119000 + }, + { + "epoch": 1.235900256311808, + "grad_norm": 14.477542877197266, + "learning_rate": 0.00017641101206844665, + "loss": 3.6606521606445312, + "step": 119100 + }, + { + "epoch": 1.236937955939274, + "grad_norm": 3.3395235538482666, + "learning_rate": 0.00017630724210570005, + "loss": 3.5342837524414064, + "step": 119200 + }, + { + "epoch": 1.2379756555667396, + "grad_norm": 3.269758701324463, + "learning_rate": 0.00017620347214295347, + "loss": 3.5976416015625, + "step": 119300 + }, + { + "epoch": 1.2390133551942055, + "grad_norm": 7.099674224853516, + "learning_rate": 0.00017609970218020692, + "loss": 3.599384460449219, + "step": 119400 + }, + { + "epoch": 1.2400510548216714, + "grad_norm": 2.358044385910034, + "learning_rate": 0.00017599593221746031, + "loss": 3.4857781982421874, + "step": 119500 + }, + { + "epoch": 1.2410887544491371, + "grad_norm": 5.485024929046631, + "learning_rate": 0.00017589216225471374, + "loss": 3.69429931640625, + "step": 119600 + }, + { + "epoch": 1.242126454076603, + "grad_norm": 5.038040637969971, + "learning_rate": 0.00017578839229196713, + "loss": 3.599921875, + "step": 119700 + }, + { + "epoch": 1.2431641537040687, + "grad_norm": 6.716040134429932, + "learning_rate": 0.00017568462232922058, + "loss": 3.555647888183594, + "step": 119800 + }, + { + "epoch": 1.2442018533315347, + "grad_norm": 9.499709129333496, + "learning_rate": 0.000175580852366474, + "loss": 3.740644836425781, + "step": 119900 + }, + { + "epoch": 1.2452395529590006, + "grad_norm": 2.5602540969848633, + "learning_rate": 0.0001754770824037274, + "loss": 3.7783831787109374, + "step": 120000 + }, + { + "epoch": 1.2462772525864663, + "grad_norm": 5.06706428527832, + "learning_rate": 0.00017537331244098082, + "loss": 3.7457623291015625, + "step": 120100 + }, + { + "epoch": 1.2473149522139322, + "grad_norm": 4.963079452514648, + "learning_rate": 0.00017526954247823421, + "loss": 3.726761474609375, + "step": 120200 + }, + { + "epoch": 1.2483526518413979, + "grad_norm": 4.604287624359131, + "learning_rate": 0.00017516577251548766, + "loss": 3.8796881103515624, + "step": 120300 + }, + { + "epoch": 1.2493903514688638, + "grad_norm": 7.884790897369385, + "learning_rate": 0.00017506200255274106, + "loss": 3.7173165893554687, + "step": 120400 + }, + { + "epoch": 1.2504280510963297, + "grad_norm": 7.230984687805176, + "learning_rate": 0.00017495823258999448, + "loss": 3.7296737670898437, + "step": 120500 + }, + { + "epoch": 1.2514657507237956, + "grad_norm": 4.4041032791137695, + "learning_rate": 0.00017485446262724793, + "loss": 3.695928039550781, + "step": 120600 + }, + { + "epoch": 1.2525034503512613, + "grad_norm": 4.800326347351074, + "learning_rate": 0.00017475069266450132, + "loss": 3.692496032714844, + "step": 120700 + }, + { + "epoch": 1.2535411499787272, + "grad_norm": 4.20355224609375, + "learning_rate": 0.00017464692270175475, + "loss": 3.724625549316406, + "step": 120800 + }, + { + "epoch": 1.254578849606193, + "grad_norm": 8.89311408996582, + "learning_rate": 0.00017454315273900814, + "loss": 3.6060061645507813, + "step": 120900 + }, + { + "epoch": 1.2556165492336588, + "grad_norm": 3.7018239498138428, + "learning_rate": 0.00017443938277626156, + "loss": 3.7614715576171873, + "step": 121000 + }, + { + "epoch": 1.2566542488611248, + "grad_norm": 3.2457141876220703, + "learning_rate": 0.00017433561281351496, + "loss": 3.729616394042969, + "step": 121100 + }, + { + "epoch": 1.2576919484885904, + "grad_norm": 9.342671394348145, + "learning_rate": 0.0001742318428507684, + "loss": 3.717445068359375, + "step": 121200 + }, + { + "epoch": 1.2587296481160564, + "grad_norm": 3.293091058731079, + "learning_rate": 0.00017412807288802183, + "loss": 3.7832305908203123, + "step": 121300 + }, + { + "epoch": 1.259767347743522, + "grad_norm": 4.222780704498291, + "learning_rate": 0.00017402430292527522, + "loss": 3.7384588623046877, + "step": 121400 + }, + { + "epoch": 1.260805047370988, + "grad_norm": 3.0761492252349854, + "learning_rate": 0.00017392053296252867, + "loss": 3.7555526733398437, + "step": 121500 + }, + { + "epoch": 1.261842746998454, + "grad_norm": 2.887803554534912, + "learning_rate": 0.00017381676299978207, + "loss": 3.695442810058594, + "step": 121600 + }, + { + "epoch": 1.2628804466259196, + "grad_norm": 3.7166850566864014, + "learning_rate": 0.0001737129930370355, + "loss": 3.815606689453125, + "step": 121700 + }, + { + "epoch": 1.2639181462533855, + "grad_norm": 12.183484077453613, + "learning_rate": 0.0001736092230742889, + "loss": 3.637664794921875, + "step": 121800 + }, + { + "epoch": 1.2649558458808512, + "grad_norm": 3.1364870071411133, + "learning_rate": 0.0001735054531115423, + "loss": 3.6319699096679687, + "step": 121900 + }, + { + "epoch": 1.265993545508317, + "grad_norm": 4.354419708251953, + "learning_rate": 0.00017340168314879575, + "loss": 3.786130065917969, + "step": 122000 + }, + { + "epoch": 1.267031245135783, + "grad_norm": 4.645047664642334, + "learning_rate": 0.00017329791318604915, + "loss": 3.7552008056640624, + "step": 122100 + }, + { + "epoch": 1.268068944763249, + "grad_norm": 4.269083499908447, + "learning_rate": 0.00017319414322330257, + "loss": 3.7506790161132812, + "step": 122200 + }, + { + "epoch": 1.2691066443907146, + "grad_norm": 5.066195011138916, + "learning_rate": 0.00017309037326055597, + "loss": 3.788629455566406, + "step": 122300 + }, + { + "epoch": 1.2701443440181805, + "grad_norm": 5.5616021156311035, + "learning_rate": 0.0001729866032978094, + "loss": 3.6688613891601562, + "step": 122400 + }, + { + "epoch": 1.2711820436456462, + "grad_norm": 3.1797661781311035, + "learning_rate": 0.00017288283333506284, + "loss": 3.718145751953125, + "step": 122500 + }, + { + "epoch": 1.2722197432731122, + "grad_norm": 3.063791275024414, + "learning_rate": 0.00017277906337231623, + "loss": 3.66003662109375, + "step": 122600 + }, + { + "epoch": 1.273257442900578, + "grad_norm": 24.703685760498047, + "learning_rate": 0.00017267529340956965, + "loss": 3.697345886230469, + "step": 122700 + }, + { + "epoch": 1.2742951425280438, + "grad_norm": 4.573358058929443, + "learning_rate": 0.00017257152344682305, + "loss": 3.770580139160156, + "step": 122800 + }, + { + "epoch": 1.2753328421555097, + "grad_norm": 6.073929309844971, + "learning_rate": 0.0001724677534840765, + "loss": 3.570367736816406, + "step": 122900 + }, + { + "epoch": 1.2763705417829754, + "grad_norm": 4.804381847381592, + "learning_rate": 0.0001723639835213299, + "loss": 3.7930453491210936, + "step": 123000 + }, + { + "epoch": 1.2774082414104413, + "grad_norm": 7.542964935302734, + "learning_rate": 0.00017226021355858331, + "loss": 3.6680117797851564, + "step": 123100 + }, + { + "epoch": 1.2784459410379072, + "grad_norm": 7.110779285430908, + "learning_rate": 0.00017215644359583674, + "loss": 3.645113830566406, + "step": 123200 + }, + { + "epoch": 1.2794836406653731, + "grad_norm": 5.410161018371582, + "learning_rate": 0.00017205267363309013, + "loss": 3.7428775024414063, + "step": 123300 + }, + { + "epoch": 1.2805213402928388, + "grad_norm": 4.089752197265625, + "learning_rate": 0.00017194890367034358, + "loss": 3.7075924682617187, + "step": 123400 + }, + { + "epoch": 1.2815590399203047, + "grad_norm": 5.877744197845459, + "learning_rate": 0.00017184513370759697, + "loss": 3.546766662597656, + "step": 123500 + }, + { + "epoch": 1.2825967395477704, + "grad_norm": 4.295921802520752, + "learning_rate": 0.0001717413637448504, + "loss": 3.5129269409179686, + "step": 123600 + }, + { + "epoch": 1.2836344391752363, + "grad_norm": 7.998104572296143, + "learning_rate": 0.00017163759378210385, + "loss": 3.6661138916015625, + "step": 123700 + }, + { + "epoch": 1.2846721388027023, + "grad_norm": 4.939531326293945, + "learning_rate": 0.00017153382381935724, + "loss": 3.665038757324219, + "step": 123800 + }, + { + "epoch": 1.285709838430168, + "grad_norm": 6.5936384201049805, + "learning_rate": 0.00017143005385661066, + "loss": 3.6241445922851563, + "step": 123900 + }, + { + "epoch": 1.2867475380576339, + "grad_norm": 4.765341281890869, + "learning_rate": 0.00017132628389386406, + "loss": 3.651435546875, + "step": 124000 + }, + { + "epoch": 1.2877852376850996, + "grad_norm": 5.4220147132873535, + "learning_rate": 0.00017122251393111748, + "loss": 3.8530377197265624, + "step": 124100 + }, + { + "epoch": 1.2888229373125655, + "grad_norm": 5.066165447235107, + "learning_rate": 0.00017111874396837087, + "loss": 3.6765261840820314, + "step": 124200 + }, + { + "epoch": 1.2898606369400314, + "grad_norm": 2.871612787246704, + "learning_rate": 0.00017101497400562432, + "loss": 3.7530276489257814, + "step": 124300 + }, + { + "epoch": 1.2908983365674973, + "grad_norm": 3.5445234775543213, + "learning_rate": 0.00017091120404287774, + "loss": 3.65380126953125, + "step": 124400 + }, + { + "epoch": 1.291936036194963, + "grad_norm": 12.712068557739258, + "learning_rate": 0.00017080743408013114, + "loss": 3.651844787597656, + "step": 124500 + }, + { + "epoch": 1.292973735822429, + "grad_norm": 5.535710334777832, + "learning_rate": 0.0001707036641173846, + "loss": 3.648440246582031, + "step": 124600 + }, + { + "epoch": 1.2940114354498946, + "grad_norm": 6.527225017547607, + "learning_rate": 0.00017059989415463798, + "loss": 3.6168035888671874, + "step": 124700 + }, + { + "epoch": 1.2950491350773605, + "grad_norm": 3.675743579864502, + "learning_rate": 0.0001704961241918914, + "loss": 3.689391784667969, + "step": 124800 + }, + { + "epoch": 1.2960868347048264, + "grad_norm": 7.041729927062988, + "learning_rate": 0.0001703923542291448, + "loss": 3.6547369384765624, + "step": 124900 + }, + { + "epoch": 1.2971245343322921, + "grad_norm": 2.5913071632385254, + "learning_rate": 0.00017028858426639822, + "loss": 3.803846740722656, + "step": 125000 + }, + { + "epoch": 1.298162233959758, + "grad_norm": 5.099416732788086, + "learning_rate": 0.00017018481430365167, + "loss": 3.661207580566406, + "step": 125100 + }, + { + "epoch": 1.2991999335872237, + "grad_norm": 3.8206946849823, + "learning_rate": 0.00017008104434090507, + "loss": 3.552643127441406, + "step": 125200 + }, + { + "epoch": 1.3002376332146897, + "grad_norm": 3.769073247909546, + "learning_rate": 0.0001699772743781585, + "loss": 3.842325439453125, + "step": 125300 + }, + { + "epoch": 1.3012753328421556, + "grad_norm": 2.529937744140625, + "learning_rate": 0.00016987350441541188, + "loss": 3.676832275390625, + "step": 125400 + }, + { + "epoch": 1.3023130324696213, + "grad_norm": 7.345049858093262, + "learning_rate": 0.0001697697344526653, + "loss": 3.6286630249023437, + "step": 125500 + }, + { + "epoch": 1.3033507320970872, + "grad_norm": 7.380908012390137, + "learning_rate": 0.00016966596448991875, + "loss": 3.6627023315429685, + "step": 125600 + }, + { + "epoch": 1.3043884317245529, + "grad_norm": 2.8857064247131348, + "learning_rate": 0.00016956219452717215, + "loss": 3.641376953125, + "step": 125700 + }, + { + "epoch": 1.3054261313520188, + "grad_norm": 6.945189476013184, + "learning_rate": 0.00016945842456442557, + "loss": 3.606731262207031, + "step": 125800 + }, + { + "epoch": 1.3064638309794847, + "grad_norm": 6.422026634216309, + "learning_rate": 0.00016935465460167897, + "loss": 3.5785845947265624, + "step": 125900 + }, + { + "epoch": 1.3075015306069506, + "grad_norm": 8.35920524597168, + "learning_rate": 0.00016925088463893241, + "loss": 3.6259381103515627, + "step": 126000 + }, + { + "epoch": 1.3085392302344163, + "grad_norm": 8.193489074707031, + "learning_rate": 0.0001691471146761858, + "loss": 3.7568353271484374, + "step": 126100 + }, + { + "epoch": 1.3095769298618822, + "grad_norm": 5.267637252807617, + "learning_rate": 0.00016904334471343923, + "loss": 3.757891845703125, + "step": 126200 + }, + { + "epoch": 1.310614629489348, + "grad_norm": 3.3981618881225586, + "learning_rate": 0.00016893957475069265, + "loss": 3.6808877563476563, + "step": 126300 + }, + { + "epoch": 1.3116523291168138, + "grad_norm": 11.042278289794922, + "learning_rate": 0.00016883580478794605, + "loss": 3.5690008544921876, + "step": 126400 + }, + { + "epoch": 1.3126900287442798, + "grad_norm": 12.522445678710938, + "learning_rate": 0.0001687320348251995, + "loss": 3.675894775390625, + "step": 126500 + }, + { + "epoch": 1.3137277283717455, + "grad_norm": 4.374575138092041, + "learning_rate": 0.0001686282648624529, + "loss": 3.8043743896484377, + "step": 126600 + }, + { + "epoch": 1.3147654279992114, + "grad_norm": 2.7740325927734375, + "learning_rate": 0.00016852449489970631, + "loss": 3.7183938598632813, + "step": 126700 + }, + { + "epoch": 1.315803127626677, + "grad_norm": 16.38130760192871, + "learning_rate": 0.0001684207249369597, + "loss": 3.7160101318359375, + "step": 126800 + }, + { + "epoch": 1.316840827254143, + "grad_norm": 9.450004577636719, + "learning_rate": 0.00016831695497421316, + "loss": 3.6377835083007812, + "step": 126900 + }, + { + "epoch": 1.317878526881609, + "grad_norm": 8.669651985168457, + "learning_rate": 0.00016821318501146658, + "loss": 3.5026895141601564, + "step": 127000 + }, + { + "epoch": 1.3189162265090748, + "grad_norm": 4.877604007720947, + "learning_rate": 0.00016810941504871997, + "loss": 3.6808175659179687, + "step": 127100 + }, + { + "epoch": 1.3199539261365405, + "grad_norm": 9.553235054016113, + "learning_rate": 0.0001680056450859734, + "loss": 3.706498718261719, + "step": 127200 + }, + { + "epoch": 1.3209916257640064, + "grad_norm": 4.275841236114502, + "learning_rate": 0.0001679018751232268, + "loss": 3.752271728515625, + "step": 127300 + }, + { + "epoch": 1.322029325391472, + "grad_norm": 7.115382671356201, + "learning_rate": 0.00016779810516048024, + "loss": 3.721490783691406, + "step": 127400 + }, + { + "epoch": 1.323067025018938, + "grad_norm": 3.066580057144165, + "learning_rate": 0.00016769433519773366, + "loss": 3.67330322265625, + "step": 127500 + }, + { + "epoch": 1.324104724646404, + "grad_norm": 3.145909547805786, + "learning_rate": 0.00016759056523498706, + "loss": 3.7071697998046873, + "step": 127600 + }, + { + "epoch": 1.3251424242738696, + "grad_norm": 3.342615842819214, + "learning_rate": 0.0001674867952722405, + "loss": 3.68224853515625, + "step": 127700 + }, + { + "epoch": 1.3261801239013356, + "grad_norm": 4.780127048492432, + "learning_rate": 0.0001673830253094939, + "loss": 3.914273986816406, + "step": 127800 + }, + { + "epoch": 1.3272178235288012, + "grad_norm": 8.07118034362793, + "learning_rate": 0.00016727925534674732, + "loss": 3.6639437866210938, + "step": 127900 + }, + { + "epoch": 1.3282555231562672, + "grad_norm": 6.763175964355469, + "learning_rate": 0.00016717548538400072, + "loss": 3.62579345703125, + "step": 128000 + }, + { + "epoch": 1.329293222783733, + "grad_norm": 12.123154640197754, + "learning_rate": 0.00016707171542125414, + "loss": 3.721268615722656, + "step": 128100 + }, + { + "epoch": 1.330330922411199, + "grad_norm": 3.787297010421753, + "learning_rate": 0.0001669679454585076, + "loss": 3.7412783813476564, + "step": 128200 + }, + { + "epoch": 1.3313686220386647, + "grad_norm": 2.629784107208252, + "learning_rate": 0.00016686417549576098, + "loss": 3.7266500854492186, + "step": 128300 + }, + { + "epoch": 1.3324063216661306, + "grad_norm": 2.8463058471679688, + "learning_rate": 0.0001667604055330144, + "loss": 3.56947021484375, + "step": 128400 + }, + { + "epoch": 1.3334440212935963, + "grad_norm": 3.5442264080047607, + "learning_rate": 0.0001666566355702678, + "loss": 3.6988034057617187, + "step": 128500 + }, + { + "epoch": 1.3344817209210622, + "grad_norm": 3.726022243499756, + "learning_rate": 0.00016655286560752122, + "loss": 3.6229156494140624, + "step": 128600 + }, + { + "epoch": 1.3355194205485281, + "grad_norm": 5.090481758117676, + "learning_rate": 0.00016644909564477464, + "loss": 3.5555209350585937, + "step": 128700 + }, + { + "epoch": 1.3365571201759938, + "grad_norm": 5.148849964141846, + "learning_rate": 0.00016634532568202807, + "loss": 3.723890380859375, + "step": 128800 + }, + { + "epoch": 1.3375948198034597, + "grad_norm": 7.033978462219238, + "learning_rate": 0.0001662415557192815, + "loss": 3.6295504760742188, + "step": 128900 + }, + { + "epoch": 1.3386325194309254, + "grad_norm": 5.022918701171875, + "learning_rate": 0.00016613778575653488, + "loss": 3.604397888183594, + "step": 129000 + }, + { + "epoch": 1.3396702190583913, + "grad_norm": 3.9396724700927734, + "learning_rate": 0.00016603401579378833, + "loss": 3.740953369140625, + "step": 129100 + }, + { + "epoch": 1.3407079186858573, + "grad_norm": 4.96920919418335, + "learning_rate": 0.00016593024583104173, + "loss": 3.6454959106445313, + "step": 129200 + }, + { + "epoch": 1.341745618313323, + "grad_norm": 3.2997357845306396, + "learning_rate": 0.00016582647586829515, + "loss": 3.64101806640625, + "step": 129300 + }, + { + "epoch": 1.3427833179407889, + "grad_norm": 12.793081283569336, + "learning_rate": 0.00016572270590554857, + "loss": 3.537852478027344, + "step": 129400 + }, + { + "epoch": 1.3438210175682546, + "grad_norm": 7.696393013000488, + "learning_rate": 0.00016561893594280197, + "loss": 3.6636843872070313, + "step": 129500 + }, + { + "epoch": 1.3448587171957205, + "grad_norm": 4.841111183166504, + "learning_rate": 0.00016551516598005541, + "loss": 3.6766192626953127, + "step": 129600 + }, + { + "epoch": 1.3458964168231864, + "grad_norm": 2.822445869445801, + "learning_rate": 0.0001654113960173088, + "loss": 3.5910659790039063, + "step": 129700 + }, + { + "epoch": 1.3469341164506523, + "grad_norm": 7.020183086395264, + "learning_rate": 0.00016530762605456223, + "loss": 3.6770706176757812, + "step": 129800 + }, + { + "epoch": 1.347971816078118, + "grad_norm": 3.323997974395752, + "learning_rate": 0.00016520385609181563, + "loss": 3.673494567871094, + "step": 129900 + }, + { + "epoch": 1.349009515705584, + "grad_norm": 12.734125137329102, + "learning_rate": 0.00016510008612906907, + "loss": 3.645369873046875, + "step": 130000 + }, + { + "epoch": 1.3500472153330496, + "grad_norm": 6.959007740020752, + "learning_rate": 0.0001649963161663225, + "loss": 3.5545895385742186, + "step": 130100 + }, + { + "epoch": 1.3510849149605155, + "grad_norm": 5.492075443267822, + "learning_rate": 0.0001648925462035759, + "loss": 3.73270263671875, + "step": 130200 + }, + { + "epoch": 1.3521226145879814, + "grad_norm": 5.578936576843262, + "learning_rate": 0.0001647887762408293, + "loss": 3.633159484863281, + "step": 130300 + }, + { + "epoch": 1.3531603142154471, + "grad_norm": 4.073727607727051, + "learning_rate": 0.0001646850062780827, + "loss": 3.7094195556640623, + "step": 130400 + }, + { + "epoch": 1.354198013842913, + "grad_norm": 3.7967214584350586, + "learning_rate": 0.00016458123631533616, + "loss": 3.6143753051757814, + "step": 130500 + }, + { + "epoch": 1.3552357134703787, + "grad_norm": 5.993916034698486, + "learning_rate": 0.00016447746635258955, + "loss": 3.722456359863281, + "step": 130600 + }, + { + "epoch": 1.3562734130978447, + "grad_norm": 4.235459327697754, + "learning_rate": 0.00016437369638984297, + "loss": 3.7401913452148436, + "step": 130700 + }, + { + "epoch": 1.3573111127253106, + "grad_norm": 13.88862133026123, + "learning_rate": 0.00016426992642709642, + "loss": 3.746804504394531, + "step": 130800 + }, + { + "epoch": 1.3583488123527765, + "grad_norm": 5.165769100189209, + "learning_rate": 0.00016416615646434982, + "loss": 3.74326416015625, + "step": 130900 + }, + { + "epoch": 1.3593865119802422, + "grad_norm": 3.6813595294952393, + "learning_rate": 0.00016406238650160324, + "loss": 3.617030029296875, + "step": 131000 + }, + { + "epoch": 1.360424211607708, + "grad_norm": 5.9350152015686035, + "learning_rate": 0.00016395861653885663, + "loss": 3.873332214355469, + "step": 131100 + }, + { + "epoch": 1.3614619112351738, + "grad_norm": 4.220798969268799, + "learning_rate": 0.00016385484657611006, + "loss": 3.6584405517578125, + "step": 131200 + }, + { + "epoch": 1.3624996108626397, + "grad_norm": 21.21164894104004, + "learning_rate": 0.0001637510766133635, + "loss": 3.617677917480469, + "step": 131300 + }, + { + "epoch": 1.3635373104901056, + "grad_norm": 5.271477699279785, + "learning_rate": 0.0001636473066506169, + "loss": 3.5792852783203126, + "step": 131400 + }, + { + "epoch": 1.3645750101175713, + "grad_norm": 4.747986316680908, + "learning_rate": 0.00016354353668787032, + "loss": 3.6235577392578127, + "step": 131500 + }, + { + "epoch": 1.3656127097450372, + "grad_norm": 3.8399877548217773, + "learning_rate": 0.00016343976672512372, + "loss": 3.780206604003906, + "step": 131600 + }, + { + "epoch": 1.366650409372503, + "grad_norm": 7.428284645080566, + "learning_rate": 0.00016333599676237714, + "loss": 3.600271911621094, + "step": 131700 + }, + { + "epoch": 1.3676881089999688, + "grad_norm": 4.4645304679870605, + "learning_rate": 0.00016323222679963056, + "loss": 3.6348703002929685, + "step": 131800 + }, + { + "epoch": 1.3687258086274348, + "grad_norm": 4.429653167724609, + "learning_rate": 0.00016312845683688398, + "loss": 3.704706726074219, + "step": 131900 + }, + { + "epoch": 1.3697635082549007, + "grad_norm": 4.308233737945557, + "learning_rate": 0.0001630246868741374, + "loss": 3.704057312011719, + "step": 132000 + }, + { + "epoch": 1.3708012078823664, + "grad_norm": 12.334646224975586, + "learning_rate": 0.0001629209169113908, + "loss": 3.6710003662109374, + "step": 132100 + }, + { + "epoch": 1.3718389075098323, + "grad_norm": 5.286363124847412, + "learning_rate": 0.00016281714694864425, + "loss": 3.6472879028320313, + "step": 132200 + }, + { + "epoch": 1.372876607137298, + "grad_norm": 3.0022027492523193, + "learning_rate": 0.00016271337698589764, + "loss": 3.867461853027344, + "step": 132300 + }, + { + "epoch": 1.373914306764764, + "grad_norm": 3.6052401065826416, + "learning_rate": 0.00016260960702315107, + "loss": 3.465709533691406, + "step": 132400 + }, + { + "epoch": 1.3749520063922298, + "grad_norm": 4.250115871429443, + "learning_rate": 0.00016250583706040446, + "loss": 3.6189974975585937, + "step": 132500 + }, + { + "epoch": 1.3759897060196955, + "grad_norm": 4.520415306091309, + "learning_rate": 0.00016240206709765788, + "loss": 3.697256774902344, + "step": 132600 + }, + { + "epoch": 1.3770274056471614, + "grad_norm": 3.608278751373291, + "learning_rate": 0.00016229829713491133, + "loss": 3.6748687744140627, + "step": 132700 + }, + { + "epoch": 1.3780651052746271, + "grad_norm": 3.6304538249969482, + "learning_rate": 0.00016219452717216473, + "loss": 3.6889605712890625, + "step": 132800 + }, + { + "epoch": 1.379102804902093, + "grad_norm": 4.484381675720215, + "learning_rate": 0.00016209075720941815, + "loss": 3.667810974121094, + "step": 132900 + }, + { + "epoch": 1.380140504529559, + "grad_norm": 12.79962158203125, + "learning_rate": 0.00016198698724667154, + "loss": 3.901937255859375, + "step": 133000 + }, + { + "epoch": 1.3811782041570246, + "grad_norm": 3.6465935707092285, + "learning_rate": 0.000161883217283925, + "loss": 3.6334658813476564, + "step": 133100 + }, + { + "epoch": 1.3822159037844906, + "grad_norm": 2.5269343852996826, + "learning_rate": 0.00016177944732117841, + "loss": 3.6968539428710936, + "step": 133200 + }, + { + "epoch": 1.3832536034119562, + "grad_norm": 4.01210880279541, + "learning_rate": 0.0001616756773584318, + "loss": 3.4310296630859374, + "step": 133300 + }, + { + "epoch": 1.3842913030394222, + "grad_norm": 4.493933200836182, + "learning_rate": 0.00016157190739568523, + "loss": 3.719140930175781, + "step": 133400 + }, + { + "epoch": 1.385329002666888, + "grad_norm": 3.25607967376709, + "learning_rate": 0.00016146813743293863, + "loss": 3.6992584228515626, + "step": 133500 + }, + { + "epoch": 1.386366702294354, + "grad_norm": 6.134942054748535, + "learning_rate": 0.00016136436747019207, + "loss": 3.748294677734375, + "step": 133600 + }, + { + "epoch": 1.3874044019218197, + "grad_norm": 3.706012725830078, + "learning_rate": 0.00016126059750744547, + "loss": 3.586408996582031, + "step": 133700 + }, + { + "epoch": 1.3884421015492856, + "grad_norm": 5.05728816986084, + "learning_rate": 0.0001611568275446989, + "loss": 3.7400482177734373, + "step": 133800 + }, + { + "epoch": 1.3894798011767513, + "grad_norm": 4.292380332946777, + "learning_rate": 0.00016105305758195234, + "loss": 3.7132363891601563, + "step": 133900 + }, + { + "epoch": 1.3905175008042172, + "grad_norm": 9.770214080810547, + "learning_rate": 0.00016094928761920573, + "loss": 3.5888162231445313, + "step": 134000 + }, + { + "epoch": 1.3915552004316831, + "grad_norm": 9.073437690734863, + "learning_rate": 0.00016084551765645916, + "loss": 3.6239898681640623, + "step": 134100 + }, + { + "epoch": 1.3925929000591488, + "grad_norm": 5.210220813751221, + "learning_rate": 0.00016074174769371255, + "loss": 3.4854669189453125, + "step": 134200 + }, + { + "epoch": 1.3936305996866147, + "grad_norm": 5.995209693908691, + "learning_rate": 0.00016063797773096597, + "loss": 3.6248184204101563, + "step": 134300 + }, + { + "epoch": 1.3946682993140804, + "grad_norm": 8.040777206420898, + "learning_rate": 0.00016053420776821937, + "loss": 3.767200622558594, + "step": 134400 + }, + { + "epoch": 1.3957059989415463, + "grad_norm": 6.153497695922852, + "learning_rate": 0.00016043043780547282, + "loss": 3.6283489990234377, + "step": 134500 + }, + { + "epoch": 1.3967436985690123, + "grad_norm": 3.4162278175354004, + "learning_rate": 0.00016032666784272624, + "loss": 3.6065017700195314, + "step": 134600 + }, + { + "epoch": 1.3977813981964782, + "grad_norm": 3.4524638652801514, + "learning_rate": 0.00016022289787997963, + "loss": 3.6301129150390623, + "step": 134700 + }, + { + "epoch": 1.3988190978239439, + "grad_norm": 6.9367804527282715, + "learning_rate": 0.00016011912791723308, + "loss": 3.6796551513671876, + "step": 134800 + }, + { + "epoch": 1.3998567974514098, + "grad_norm": 3.629422903060913, + "learning_rate": 0.00016001535795448648, + "loss": 3.745485534667969, + "step": 134900 + }, + { + "epoch": 1.4008944970788755, + "grad_norm": 3.658010959625244, + "learning_rate": 0.0001599115879917399, + "loss": 3.6311688232421875, + "step": 135000 + }, + { + "epoch": 1.4019321967063414, + "grad_norm": 16.63618278503418, + "learning_rate": 0.00015980781802899332, + "loss": 3.6807235717773437, + "step": 135100 + }, + { + "epoch": 1.4029698963338073, + "grad_norm": 6.354872703552246, + "learning_rate": 0.00015970404806624672, + "loss": 3.5296261596679686, + "step": 135200 + }, + { + "epoch": 1.404007595961273, + "grad_norm": 7.496634483337402, + "learning_rate": 0.00015960027810350017, + "loss": 3.5905780029296874, + "step": 135300 + }, + { + "epoch": 1.405045295588739, + "grad_norm": 2.790278673171997, + "learning_rate": 0.00015949650814075356, + "loss": 3.544078369140625, + "step": 135400 + }, + { + "epoch": 1.4060829952162046, + "grad_norm": 5.150670528411865, + "learning_rate": 0.00015939273817800698, + "loss": 3.7144375610351563, + "step": 135500 + }, + { + "epoch": 1.4071206948436705, + "grad_norm": 5.606545448303223, + "learning_rate": 0.00015928896821526038, + "loss": 3.719892578125, + "step": 135600 + }, + { + "epoch": 1.4081583944711364, + "grad_norm": 15.23755931854248, + "learning_rate": 0.0001591851982525138, + "loss": 3.649613952636719, + "step": 135700 + }, + { + "epoch": 1.4091960940986021, + "grad_norm": 20.73650550842285, + "learning_rate": 0.00015908142828976725, + "loss": 3.6828762817382814, + "step": 135800 + }, + { + "epoch": 1.410233793726068, + "grad_norm": 8.400344848632812, + "learning_rate": 0.00015897765832702064, + "loss": 3.6613919067382814, + "step": 135900 + }, + { + "epoch": 1.411271493353534, + "grad_norm": 2.5724685192108154, + "learning_rate": 0.00015887388836427407, + "loss": 3.657626037597656, + "step": 136000 + }, + { + "epoch": 1.4123091929809997, + "grad_norm": 19.325956344604492, + "learning_rate": 0.00015877011840152746, + "loss": 3.8178024291992188, + "step": 136100 + }, + { + "epoch": 1.4133468926084656, + "grad_norm": 2.402404308319092, + "learning_rate": 0.0001586663484387809, + "loss": 3.59340576171875, + "step": 136200 + }, + { + "epoch": 1.4143845922359315, + "grad_norm": 6.188352108001709, + "learning_rate": 0.00015856257847603433, + "loss": 3.6710971069335936, + "step": 136300 + }, + { + "epoch": 1.4154222918633972, + "grad_norm": 4.21588659286499, + "learning_rate": 0.00015845880851328773, + "loss": 3.721273193359375, + "step": 136400 + }, + { + "epoch": 1.416459991490863, + "grad_norm": 4.4968485832214355, + "learning_rate": 0.00015835503855054115, + "loss": 3.6669491577148436, + "step": 136500 + }, + { + "epoch": 1.4174976911183288, + "grad_norm": 7.214438438415527, + "learning_rate": 0.00015825126858779454, + "loss": 3.799635925292969, + "step": 136600 + }, + { + "epoch": 1.4185353907457947, + "grad_norm": 7.262329578399658, + "learning_rate": 0.000158147498625048, + "loss": 3.807882995605469, + "step": 136700 + }, + { + "epoch": 1.4195730903732606, + "grad_norm": 3.5909628868103027, + "learning_rate": 0.00015804372866230139, + "loss": 3.7313577270507814, + "step": 136800 + }, + { + "epoch": 1.4206107900007263, + "grad_norm": 10.205459594726562, + "learning_rate": 0.0001579399586995548, + "loss": 3.675950622558594, + "step": 136900 + }, + { + "epoch": 1.4216484896281922, + "grad_norm": 5.25307559967041, + "learning_rate": 0.00015783618873680826, + "loss": 3.6014810180664063, + "step": 137000 + }, + { + "epoch": 1.422686189255658, + "grad_norm": 42.26997756958008, + "learning_rate": 0.00015773241877406165, + "loss": 3.6278192138671876, + "step": 137100 + }, + { + "epoch": 1.4237238888831238, + "grad_norm": 6.092323303222656, + "learning_rate": 0.00015762864881131507, + "loss": 3.555603332519531, + "step": 137200 + }, + { + "epoch": 1.4247615885105898, + "grad_norm": 2.74434232711792, + "learning_rate": 0.00015752487884856847, + "loss": 3.5426220703125, + "step": 137300 + }, + { + "epoch": 1.4257992881380557, + "grad_norm": 13.12152099609375, + "learning_rate": 0.0001574211088858219, + "loss": 3.7107192993164064, + "step": 137400 + }, + { + "epoch": 1.4268369877655214, + "grad_norm": 3.9462010860443115, + "learning_rate": 0.00015731733892307529, + "loss": 3.5455560302734375, + "step": 137500 + }, + { + "epoch": 1.4278746873929873, + "grad_norm": 3.7687721252441406, + "learning_rate": 0.00015721356896032873, + "loss": 3.630052490234375, + "step": 137600 + }, + { + "epoch": 1.428912387020453, + "grad_norm": 4.470894813537598, + "learning_rate": 0.00015710979899758216, + "loss": 3.627494201660156, + "step": 137700 + }, + { + "epoch": 1.429950086647919, + "grad_norm": 4.3846259117126465, + "learning_rate": 0.00015700602903483555, + "loss": 3.5804782104492188, + "step": 137800 + }, + { + "epoch": 1.4309877862753848, + "grad_norm": 3.9794013500213623, + "learning_rate": 0.000156902259072089, + "loss": 3.739950866699219, + "step": 137900 + }, + { + "epoch": 1.4320254859028505, + "grad_norm": 10.886957168579102, + "learning_rate": 0.0001567984891093424, + "loss": 3.7072845458984376, + "step": 138000 + }, + { + "epoch": 1.4330631855303164, + "grad_norm": 4.187902927398682, + "learning_rate": 0.00015669471914659582, + "loss": 3.64345703125, + "step": 138100 + }, + { + "epoch": 1.4341008851577821, + "grad_norm": 32.209293365478516, + "learning_rate": 0.00015659094918384924, + "loss": 3.6210546875, + "step": 138200 + }, + { + "epoch": 1.435138584785248, + "grad_norm": 3.12260365486145, + "learning_rate": 0.00015648717922110263, + "loss": 3.7005911254882813, + "step": 138300 + }, + { + "epoch": 1.436176284412714, + "grad_norm": 6.220150470733643, + "learning_rate": 0.00015638340925835608, + "loss": 3.7236618041992187, + "step": 138400 + }, + { + "epoch": 1.4372139840401799, + "grad_norm": 2.38154673576355, + "learning_rate": 0.00015627963929560948, + "loss": 3.633033447265625, + "step": 138500 + }, + { + "epoch": 1.4382516836676456, + "grad_norm": 7.884495258331299, + "learning_rate": 0.0001561758693328629, + "loss": 3.5666903686523437, + "step": 138600 + }, + { + "epoch": 1.4392893832951115, + "grad_norm": 3.8970346450805664, + "learning_rate": 0.0001560720993701163, + "loss": 3.6862808227539063, + "step": 138700 + }, + { + "epoch": 1.4403270829225772, + "grad_norm": 3.273268461227417, + "learning_rate": 0.00015596832940736972, + "loss": 3.6251177978515625, + "step": 138800 + }, + { + "epoch": 1.441364782550043, + "grad_norm": 3.0285887718200684, + "learning_rate": 0.00015586455944462317, + "loss": 3.61291015625, + "step": 138900 + }, + { + "epoch": 1.442402482177509, + "grad_norm": 3.4767589569091797, + "learning_rate": 0.00015576078948187656, + "loss": 3.6781646728515627, + "step": 139000 + }, + { + "epoch": 1.4434401818049747, + "grad_norm": 156.1669158935547, + "learning_rate": 0.00015565701951912998, + "loss": 3.6272451782226565, + "step": 139100 + }, + { + "epoch": 1.4444778814324406, + "grad_norm": 2.3591196537017822, + "learning_rate": 0.00015555324955638338, + "loss": 3.589447021484375, + "step": 139200 + }, + { + "epoch": 1.4455155810599063, + "grad_norm": 3.8040847778320312, + "learning_rate": 0.00015544947959363683, + "loss": 3.64208251953125, + "step": 139300 + }, + { + "epoch": 1.4465532806873722, + "grad_norm": 2.655759811401367, + "learning_rate": 0.00015534570963089022, + "loss": 3.671148376464844, + "step": 139400 + }, + { + "epoch": 1.4475909803148381, + "grad_norm": 7.29696798324585, + "learning_rate": 0.00015524193966814364, + "loss": 3.751770324707031, + "step": 139500 + }, + { + "epoch": 1.4486286799423038, + "grad_norm": 6.334928035736084, + "learning_rate": 0.00015513816970539706, + "loss": 3.7970040893554686, + "step": 139600 + }, + { + "epoch": 1.4496663795697697, + "grad_norm": 6.7520623207092285, + "learning_rate": 0.00015503439974265046, + "loss": 3.6929965209960938, + "step": 139700 + }, + { + "epoch": 1.4507040791972354, + "grad_norm": 10.428074836730957, + "learning_rate": 0.0001549306297799039, + "loss": 3.734377136230469, + "step": 139800 + }, + { + "epoch": 1.4517417788247013, + "grad_norm": 8.371795654296875, + "learning_rate": 0.0001548268598171573, + "loss": 3.6029412841796873, + "step": 139900 + }, + { + "epoch": 1.4527794784521673, + "grad_norm": 3.291740894317627, + "learning_rate": 0.00015472308985441073, + "loss": 3.6670523071289063, + "step": 140000 + }, + { + "epoch": 1.4538171780796332, + "grad_norm": 7.120608806610107, + "learning_rate": 0.00015461931989166417, + "loss": 3.638569030761719, + "step": 140100 + }, + { + "epoch": 1.4548548777070989, + "grad_norm": 6.361410617828369, + "learning_rate": 0.00015451554992891757, + "loss": 3.661440734863281, + "step": 140200 + }, + { + "epoch": 1.4558925773345648, + "grad_norm": 3.5337114334106445, + "learning_rate": 0.000154411779966171, + "loss": 3.69423828125, + "step": 140300 + }, + { + "epoch": 1.4569302769620305, + "grad_norm": 8.946898460388184, + "learning_rate": 0.00015430801000342439, + "loss": 3.636510925292969, + "step": 140400 + }, + { + "epoch": 1.4579679765894964, + "grad_norm": 3.5454866886138916, + "learning_rate": 0.0001542042400406778, + "loss": 3.833760986328125, + "step": 140500 + }, + { + "epoch": 1.4590056762169623, + "grad_norm": 20.629167556762695, + "learning_rate": 0.0001541004700779312, + "loss": 3.740248718261719, + "step": 140600 + }, + { + "epoch": 1.460043375844428, + "grad_norm": 3.0284929275512695, + "learning_rate": 0.00015399670011518465, + "loss": 3.6760980224609376, + "step": 140700 + }, + { + "epoch": 1.461081075471894, + "grad_norm": 4.971894264221191, + "learning_rate": 0.00015389293015243807, + "loss": 3.600714111328125, + "step": 140800 + }, + { + "epoch": 1.4621187750993596, + "grad_norm": 3.689394950866699, + "learning_rate": 0.00015378916018969147, + "loss": 3.5257861328125, + "step": 140900 + }, + { + "epoch": 1.4631564747268255, + "grad_norm": 4.305582523345947, + "learning_rate": 0.00015368539022694492, + "loss": 3.66658447265625, + "step": 141000 + }, + { + "epoch": 1.4641941743542914, + "grad_norm": 12.191847801208496, + "learning_rate": 0.0001535816202641983, + "loss": 3.5539178466796875, + "step": 141100 + }, + { + "epoch": 1.4652318739817574, + "grad_norm": 5.9276814460754395, + "learning_rate": 0.00015347785030145173, + "loss": 3.712036437988281, + "step": 141200 + }, + { + "epoch": 1.466269573609223, + "grad_norm": 7.3767008781433105, + "learning_rate": 0.00015337408033870513, + "loss": 3.688995361328125, + "step": 141300 + }, + { + "epoch": 1.467307273236689, + "grad_norm": 4.156796932220459, + "learning_rate": 0.00015327031037595855, + "loss": 3.5971023559570314, + "step": 141400 + }, + { + "epoch": 1.4683449728641547, + "grad_norm": 3.876843214035034, + "learning_rate": 0.000153166540413212, + "loss": 3.7138726806640623, + "step": 141500 + }, + { + "epoch": 1.4693826724916206, + "grad_norm": 2.5647096633911133, + "learning_rate": 0.0001530627704504654, + "loss": 3.575816650390625, + "step": 141600 + }, + { + "epoch": 1.4704203721190865, + "grad_norm": 6.341168403625488, + "learning_rate": 0.00015295900048771882, + "loss": 3.675234375, + "step": 141700 + }, + { + "epoch": 1.4714580717465522, + "grad_norm": 11.66984748840332, + "learning_rate": 0.0001528552305249722, + "loss": 3.5949581909179686, + "step": 141800 + }, + { + "epoch": 1.472495771374018, + "grad_norm": 2.7472872734069824, + "learning_rate": 0.00015275146056222563, + "loss": 3.4315753173828125, + "step": 141900 + }, + { + "epoch": 1.4735334710014838, + "grad_norm": 2.7182295322418213, + "learning_rate": 0.00015264769059947908, + "loss": 3.580435791015625, + "step": 142000 + }, + { + "epoch": 1.4745711706289497, + "grad_norm": 7.28167200088501, + "learning_rate": 0.00015254392063673248, + "loss": 3.6344500732421876, + "step": 142100 + }, + { + "epoch": 1.4756088702564156, + "grad_norm": 3.1541340351104736, + "learning_rate": 0.0001524401506739859, + "loss": 3.6579803466796874, + "step": 142200 + }, + { + "epoch": 1.4766465698838815, + "grad_norm": 4.42963171005249, + "learning_rate": 0.0001523363807112393, + "loss": 3.5743417358398437, + "step": 142300 + }, + { + "epoch": 1.4776842695113472, + "grad_norm": 7.278059005737305, + "learning_rate": 0.00015223261074849274, + "loss": 3.7834173583984376, + "step": 142400 + }, + { + "epoch": 1.4787219691388132, + "grad_norm": 10.52426528930664, + "learning_rate": 0.00015212884078574614, + "loss": 3.6968179321289063, + "step": 142500 + }, + { + "epoch": 1.4797596687662788, + "grad_norm": 3.5773837566375732, + "learning_rate": 0.00015202507082299956, + "loss": 3.6810809326171876, + "step": 142600 + }, + { + "epoch": 1.4807973683937448, + "grad_norm": 3.344587802886963, + "learning_rate": 0.00015192130086025298, + "loss": 3.6345669555664064, + "step": 142700 + }, + { + "epoch": 1.4818350680212107, + "grad_norm": 6.329004287719727, + "learning_rate": 0.00015181753089750638, + "loss": 3.647319641113281, + "step": 142800 + }, + { + "epoch": 1.4828727676486764, + "grad_norm": 6.577507495880127, + "learning_rate": 0.00015171376093475983, + "loss": 3.5769888305664064, + "step": 142900 + }, + { + "epoch": 1.4839104672761423, + "grad_norm": 4.545724391937256, + "learning_rate": 0.00015160999097201322, + "loss": 3.583935546875, + "step": 143000 + }, + { + "epoch": 1.484948166903608, + "grad_norm": 13.324125289916992, + "learning_rate": 0.00015150622100926664, + "loss": 3.612706604003906, + "step": 143100 + }, + { + "epoch": 1.485985866531074, + "grad_norm": 4.545955657958984, + "learning_rate": 0.00015140245104652004, + "loss": 3.4066473388671876, + "step": 143200 + }, + { + "epoch": 1.4870235661585398, + "grad_norm": 8.517041206359863, + "learning_rate": 0.00015129868108377349, + "loss": 3.6258444213867187, + "step": 143300 + }, + { + "epoch": 1.4880612657860055, + "grad_norm": 5.813758373260498, + "learning_rate": 0.0001511949111210269, + "loss": 3.686318054199219, + "step": 143400 + }, + { + "epoch": 1.4890989654134714, + "grad_norm": 6.236087322235107, + "learning_rate": 0.0001510911411582803, + "loss": 3.7458810424804687, + "step": 143500 + }, + { + "epoch": 1.4901366650409371, + "grad_norm": 5.874231815338135, + "learning_rate": 0.00015098737119553373, + "loss": 3.6481814575195313, + "step": 143600 + }, + { + "epoch": 1.491174364668403, + "grad_norm": 7.229684829711914, + "learning_rate": 0.00015088360123278712, + "loss": 3.6855035400390626, + "step": 143700 + }, + { + "epoch": 1.492212064295869, + "grad_norm": 7.212390422821045, + "learning_rate": 0.00015077983127004057, + "loss": 3.750265808105469, + "step": 143800 + }, + { + "epoch": 1.4932497639233349, + "grad_norm": 5.408252239227295, + "learning_rate": 0.000150676061307294, + "loss": 3.5695159912109373, + "step": 143900 + }, + { + "epoch": 1.4942874635508006, + "grad_norm": 8.125064849853516, + "learning_rate": 0.00015057229134454739, + "loss": 3.642791442871094, + "step": 144000 + }, + { + "epoch": 1.4953251631782665, + "grad_norm": 5.047210216522217, + "learning_rate": 0.00015046852138180083, + "loss": 3.588906555175781, + "step": 144100 + }, + { + "epoch": 1.4963628628057322, + "grad_norm": 2.775951623916626, + "learning_rate": 0.00015036475141905423, + "loss": 3.672796325683594, + "step": 144200 + }, + { + "epoch": 1.497400562433198, + "grad_norm": 7.114427089691162, + "learning_rate": 0.00015026098145630765, + "loss": 3.7460537719726563, + "step": 144300 + }, + { + "epoch": 1.498438262060664, + "grad_norm": 4.1067585945129395, + "learning_rate": 0.00015015721149356105, + "loss": 3.4047305297851564, + "step": 144400 + }, + { + "epoch": 1.4994759616881297, + "grad_norm": 6.3360276222229, + "learning_rate": 0.00015005344153081447, + "loss": 3.6055087280273437, + "step": 144500 + }, + { + "epoch": 1.5005136613155956, + "grad_norm": 3.8499081134796143, + "learning_rate": 0.0001499496715680679, + "loss": 3.6976129150390626, + "step": 144600 + }, + { + "epoch": 1.5015513609430613, + "grad_norm": 4.669349193572998, + "learning_rate": 0.0001498459016053213, + "loss": 3.6301043701171873, + "step": 144700 + }, + { + "epoch": 1.5025890605705272, + "grad_norm": 12.484715461730957, + "learning_rate": 0.0001497421316425747, + "loss": 3.6376629638671876, + "step": 144800 + }, + { + "epoch": 1.5036267601979931, + "grad_norm": 3.1881167888641357, + "learning_rate": 0.00014963836167982816, + "loss": 3.688013000488281, + "step": 144900 + }, + { + "epoch": 1.504664459825459, + "grad_norm": 3.1999073028564453, + "learning_rate": 0.00014953459171708158, + "loss": 3.767580871582031, + "step": 145000 + }, + { + "epoch": 1.5057021594529247, + "grad_norm": 2.503138303756714, + "learning_rate": 0.00014943082175433497, + "loss": 3.772780456542969, + "step": 145100 + }, + { + "epoch": 1.5067398590803904, + "grad_norm": 5.124083995819092, + "learning_rate": 0.0001493270517915884, + "loss": 3.709577941894531, + "step": 145200 + }, + { + "epoch": 1.5077775587078563, + "grad_norm": 12.24608039855957, + "learning_rate": 0.00014922328182884182, + "loss": 3.46869140625, + "step": 145300 + }, + { + "epoch": 1.5088152583353223, + "grad_norm": 11.273271560668945, + "learning_rate": 0.0001491195118660952, + "loss": 3.4797503662109377, + "step": 145400 + }, + { + "epoch": 1.5098529579627882, + "grad_norm": 60.867916107177734, + "learning_rate": 0.00014901574190334866, + "loss": 3.54853515625, + "step": 145500 + }, + { + "epoch": 1.5108906575902539, + "grad_norm": 4.276978969573975, + "learning_rate": 0.00014891197194060206, + "loss": 3.908219299316406, + "step": 145600 + }, + { + "epoch": 1.5119283572177198, + "grad_norm": 2.901015281677246, + "learning_rate": 0.00014880820197785548, + "loss": 3.4694091796875, + "step": 145700 + }, + { + "epoch": 1.5129660568451855, + "grad_norm": 2.3719887733459473, + "learning_rate": 0.0001487044320151089, + "loss": 3.7670758056640623, + "step": 145800 + }, + { + "epoch": 1.5140037564726514, + "grad_norm": 2.4967026710510254, + "learning_rate": 0.0001486006620523623, + "loss": 3.635834045410156, + "step": 145900 + }, + { + "epoch": 1.5150414561001173, + "grad_norm": 3.604675769805908, + "learning_rate": 0.00014849689208961572, + "loss": 3.5507608032226563, + "step": 146000 + }, + { + "epoch": 1.5160791557275832, + "grad_norm": 5.442782402038574, + "learning_rate": 0.00014839312212686916, + "loss": 3.5730636596679686, + "step": 146100 + }, + { + "epoch": 1.517116855355049, + "grad_norm": 3.7341339588165283, + "learning_rate": 0.00014828935216412256, + "loss": 3.569194641113281, + "step": 146200 + }, + { + "epoch": 1.5181545549825146, + "grad_norm": 12.070112228393555, + "learning_rate": 0.00014818558220137598, + "loss": 3.60053955078125, + "step": 146300 + }, + { + "epoch": 1.5191922546099805, + "grad_norm": 5.036438941955566, + "learning_rate": 0.0001480818122386294, + "loss": 3.7114804077148436, + "step": 146400 + }, + { + "epoch": 1.5202299542374464, + "grad_norm": 10.83106803894043, + "learning_rate": 0.0001479780422758828, + "loss": 3.5428836059570314, + "step": 146500 + }, + { + "epoch": 1.5212676538649124, + "grad_norm": 9.07150650024414, + "learning_rate": 0.00014787427231313622, + "loss": 3.6087515258789065, + "step": 146600 + }, + { + "epoch": 1.522305353492378, + "grad_norm": 3.6539382934570312, + "learning_rate": 0.00014777050235038964, + "loss": 3.6974029541015625, + "step": 146700 + }, + { + "epoch": 1.523343053119844, + "grad_norm": 2.5568654537200928, + "learning_rate": 0.00014766673238764306, + "loss": 3.7100448608398438, + "step": 146800 + }, + { + "epoch": 1.5243807527473097, + "grad_norm": 5.767122745513916, + "learning_rate": 0.00014756296242489649, + "loss": 3.494932861328125, + "step": 146900 + }, + { + "epoch": 1.5254184523747756, + "grad_norm": 5.006596088409424, + "learning_rate": 0.00014745919246214988, + "loss": 3.804518737792969, + "step": 147000 + }, + { + "epoch": 1.5264561520022415, + "grad_norm": 3.907433271408081, + "learning_rate": 0.0001473554224994033, + "loss": 3.6617333984375, + "step": 147100 + }, + { + "epoch": 1.5274938516297074, + "grad_norm": 6.253331184387207, + "learning_rate": 0.00014725165253665672, + "loss": 3.611311950683594, + "step": 147200 + }, + { + "epoch": 1.528531551257173, + "grad_norm": 5.735301494598389, + "learning_rate": 0.00014714788257391015, + "loss": 3.605543518066406, + "step": 147300 + }, + { + "epoch": 1.5295692508846388, + "grad_norm": 1.7375198602676392, + "learning_rate": 0.00014704411261116357, + "loss": 3.6379776000976562, + "step": 147400 + }, + { + "epoch": 1.5306069505121047, + "grad_norm": 4.913732051849365, + "learning_rate": 0.000146940342648417, + "loss": 3.757569580078125, + "step": 147500 + }, + { + "epoch": 1.5316446501395706, + "grad_norm": 3.887519598007202, + "learning_rate": 0.00014683657268567039, + "loss": 3.654621887207031, + "step": 147600 + }, + { + "epoch": 1.5326823497670365, + "grad_norm": 45.76445007324219, + "learning_rate": 0.0001467328027229238, + "loss": 3.611448059082031, + "step": 147700 + }, + { + "epoch": 1.5337200493945022, + "grad_norm": 3.629575729370117, + "learning_rate": 0.00014662903276017723, + "loss": 3.6693844604492187, + "step": 147800 + }, + { + "epoch": 1.5347577490219682, + "grad_norm": 2.453900098800659, + "learning_rate": 0.00014652526279743062, + "loss": 3.6880978393554686, + "step": 147900 + }, + { + "epoch": 1.5357954486494338, + "grad_norm": 3.411557674407959, + "learning_rate": 0.00014642149283468407, + "loss": 3.656671447753906, + "step": 148000 + }, + { + "epoch": 1.5368331482768998, + "grad_norm": 3.5617477893829346, + "learning_rate": 0.0001463177228719375, + "loss": 3.706895446777344, + "step": 148100 + }, + { + "epoch": 1.5378708479043657, + "grad_norm": 3.5422544479370117, + "learning_rate": 0.0001462139529091909, + "loss": 3.605690612792969, + "step": 148200 + }, + { + "epoch": 1.5389085475318316, + "grad_norm": 3.9814698696136475, + "learning_rate": 0.0001461101829464443, + "loss": 3.6530465698242187, + "step": 148300 + }, + { + "epoch": 1.5399462471592973, + "grad_norm": 10.028122901916504, + "learning_rate": 0.00014600641298369773, + "loss": 3.623879089355469, + "step": 148400 + }, + { + "epoch": 1.540983946786763, + "grad_norm": 3.4206697940826416, + "learning_rate": 0.00014590264302095113, + "loss": 3.517763366699219, + "step": 148500 + }, + { + "epoch": 1.542021646414229, + "grad_norm": 3.4238781929016113, + "learning_rate": 0.00014579887305820455, + "loss": 3.52829833984375, + "step": 148600 + }, + { + "epoch": 1.5430593460416948, + "grad_norm": 58.35453414916992, + "learning_rate": 0.00014569510309545797, + "loss": 3.682017517089844, + "step": 148700 + }, + { + "epoch": 1.5440970456691607, + "grad_norm": 4.933131217956543, + "learning_rate": 0.0001455913331327114, + "loss": 3.577257080078125, + "step": 148800 + }, + { + "epoch": 1.5451347452966264, + "grad_norm": 17.892318725585938, + "learning_rate": 0.00014548756316996482, + "loss": 3.710743713378906, + "step": 148900 + }, + { + "epoch": 1.5461724449240921, + "grad_norm": 6.2961249351501465, + "learning_rate": 0.0001453837932072182, + "loss": 3.6647821044921876, + "step": 149000 + }, + { + "epoch": 1.547210144551558, + "grad_norm": 4.278889179229736, + "learning_rate": 0.00014528002324447163, + "loss": 3.613748779296875, + "step": 149100 + }, + { + "epoch": 1.548247844179024, + "grad_norm": 3.2785260677337646, + "learning_rate": 0.00014517625328172505, + "loss": 3.6411376953125, + "step": 149200 + }, + { + "epoch": 1.5492855438064899, + "grad_norm": 3.227151393890381, + "learning_rate": 0.00014507248331897848, + "loss": 3.758666687011719, + "step": 149300 + }, + { + "epoch": 1.5503232434339556, + "grad_norm": 2.6391334533691406, + "learning_rate": 0.0001449687133562319, + "loss": 3.5469485473632814, + "step": 149400 + }, + { + "epoch": 1.5513609430614215, + "grad_norm": 2.5920772552490234, + "learning_rate": 0.00014486494339348532, + "loss": 3.621335754394531, + "step": 149500 + }, + { + "epoch": 1.5523986426888872, + "grad_norm": 2.864225387573242, + "learning_rate": 0.00014476117343073872, + "loss": 3.6408596801757813, + "step": 149600 + }, + { + "epoch": 1.553436342316353, + "grad_norm": 4.697976112365723, + "learning_rate": 0.00014465740346799214, + "loss": 3.6993423461914063, + "step": 149700 + }, + { + "epoch": 1.554474041943819, + "grad_norm": 4.074455738067627, + "learning_rate": 0.00014455363350524556, + "loss": 3.6419488525390626, + "step": 149800 + }, + { + "epoch": 1.555511741571285, + "grad_norm": 2.933537721633911, + "learning_rate": 0.00014444986354249898, + "loss": 3.622572326660156, + "step": 149900 + }, + { + "epoch": 1.5565494411987506, + "grad_norm": 5.856564521789551, + "learning_rate": 0.0001443460935797524, + "loss": 3.7532833862304686, + "step": 150000 + }, + { + "epoch": 1.5575871408262163, + "grad_norm": 4.24385929107666, + "learning_rate": 0.00014424232361700583, + "loss": 3.67490234375, + "step": 150100 + }, + { + "epoch": 1.5586248404536822, + "grad_norm": 5.053845405578613, + "learning_rate": 0.00014413855365425922, + "loss": 3.7350125122070312, + "step": 150200 + }, + { + "epoch": 1.5596625400811481, + "grad_norm": 3.423252582550049, + "learning_rate": 0.00014403478369151264, + "loss": 3.522652893066406, + "step": 150300 + }, + { + "epoch": 1.560700239708614, + "grad_norm": 8.40445327758789, + "learning_rate": 0.00014393101372876606, + "loss": 3.480498962402344, + "step": 150400 + }, + { + "epoch": 1.5617379393360797, + "grad_norm": 3.1955294609069824, + "learning_rate": 0.00014382724376601946, + "loss": 3.6813082885742188, + "step": 150500 + }, + { + "epoch": 1.5627756389635457, + "grad_norm": 6.0853681564331055, + "learning_rate": 0.0001437234738032729, + "loss": 3.6238223266601564, + "step": 150600 + }, + { + "epoch": 1.5638133385910113, + "grad_norm": 5.178461074829102, + "learning_rate": 0.0001436197038405263, + "loss": 3.6469857788085935, + "step": 150700 + }, + { + "epoch": 1.5648510382184773, + "grad_norm": 8.24820613861084, + "learning_rate": 0.00014351593387777972, + "loss": 3.6198629760742187, + "step": 150800 + }, + { + "epoch": 1.5658887378459432, + "grad_norm": 4.228358745574951, + "learning_rate": 0.00014341216391503315, + "loss": 3.4716970825195315, + "step": 150900 + }, + { + "epoch": 1.566926437473409, + "grad_norm": 3.555584192276001, + "learning_rate": 0.00014330839395228654, + "loss": 3.739703369140625, + "step": 151000 + }, + { + "epoch": 1.5679641371008748, + "grad_norm": 5.781318187713623, + "learning_rate": 0.00014320462398953996, + "loss": 3.5981024169921874, + "step": 151100 + }, + { + "epoch": 1.5690018367283405, + "grad_norm": 6.903919696807861, + "learning_rate": 0.0001431008540267934, + "loss": 3.5764788818359374, + "step": 151200 + }, + { + "epoch": 1.5700395363558064, + "grad_norm": 3.584331512451172, + "learning_rate": 0.0001429970840640468, + "loss": 3.6005426025390626, + "step": 151300 + }, + { + "epoch": 1.5710772359832723, + "grad_norm": 4.393853664398193, + "learning_rate": 0.00014289331410130023, + "loss": 3.78184814453125, + "step": 151400 + }, + { + "epoch": 1.5721149356107382, + "grad_norm": 2.4552299976348877, + "learning_rate": 0.00014278954413855365, + "loss": 3.7241311645507813, + "step": 151500 + }, + { + "epoch": 1.573152635238204, + "grad_norm": 6.105810642242432, + "learning_rate": 0.00014268577417580705, + "loss": 3.6668280029296874, + "step": 151600 + }, + { + "epoch": 1.5741903348656698, + "grad_norm": 5.4593939781188965, + "learning_rate": 0.00014258200421306047, + "loss": 3.6350604248046876, + "step": 151700 + }, + { + "epoch": 1.5752280344931355, + "grad_norm": 8.01681900024414, + "learning_rate": 0.0001424782342503139, + "loss": 3.636524658203125, + "step": 151800 + }, + { + "epoch": 1.5762657341206014, + "grad_norm": 27.08595848083496, + "learning_rate": 0.0001423744642875673, + "loss": 3.7312826538085937, + "step": 151900 + }, + { + "epoch": 1.5773034337480674, + "grad_norm": 3.227189064025879, + "learning_rate": 0.00014227069432482073, + "loss": 3.54576416015625, + "step": 152000 + }, + { + "epoch": 1.5783411333755333, + "grad_norm": 3.922788619995117, + "learning_rate": 0.00014216692436207413, + "loss": 3.762126770019531, + "step": 152100 + }, + { + "epoch": 1.579378833002999, + "grad_norm": 11.172755241394043, + "learning_rate": 0.00014206315439932755, + "loss": 3.6238665771484375, + "step": 152200 + }, + { + "epoch": 1.5804165326304647, + "grad_norm": 4.898155212402344, + "learning_rate": 0.00014195938443658097, + "loss": 3.5397454833984376, + "step": 152300 + }, + { + "epoch": 1.5814542322579306, + "grad_norm": 4.228941440582275, + "learning_rate": 0.0001418556144738344, + "loss": 3.482630615234375, + "step": 152400 + }, + { + "epoch": 1.5824919318853965, + "grad_norm": 3.2711164951324463, + "learning_rate": 0.00014175184451108782, + "loss": 3.55691162109375, + "step": 152500 + }, + { + "epoch": 1.5835296315128624, + "grad_norm": 4.924630641937256, + "learning_rate": 0.00014164807454834124, + "loss": 3.6941983032226564, + "step": 152600 + }, + { + "epoch": 1.584567331140328, + "grad_norm": 4.247806072235107, + "learning_rate": 0.00014154430458559463, + "loss": 3.704905700683594, + "step": 152700 + }, + { + "epoch": 1.5856050307677938, + "grad_norm": 5.901268482208252, + "learning_rate": 0.00014144053462284805, + "loss": 3.4900387573242186, + "step": 152800 + }, + { + "epoch": 1.5866427303952597, + "grad_norm": 2.9829347133636475, + "learning_rate": 0.00014133676466010148, + "loss": 3.560227966308594, + "step": 152900 + }, + { + "epoch": 1.5876804300227256, + "grad_norm": 3.3158979415893555, + "learning_rate": 0.00014123299469735487, + "loss": 3.6083251953125, + "step": 153000 + }, + { + "epoch": 1.5887181296501915, + "grad_norm": 3.4291346073150635, + "learning_rate": 0.00014112922473460832, + "loss": 3.634643859863281, + "step": 153100 + }, + { + "epoch": 1.5897558292776572, + "grad_norm": 6.855015754699707, + "learning_rate": 0.00014102545477186174, + "loss": 3.5994863891601563, + "step": 153200 + }, + { + "epoch": 1.5907935289051232, + "grad_norm": 5.0481133460998535, + "learning_rate": 0.00014092168480911514, + "loss": 3.7016021728515627, + "step": 153300 + }, + { + "epoch": 1.5918312285325888, + "grad_norm": 7.888632297515869, + "learning_rate": 0.00014081791484636856, + "loss": 3.531593017578125, + "step": 153400 + }, + { + "epoch": 1.5928689281600548, + "grad_norm": 3.533106565475464, + "learning_rate": 0.00014071414488362198, + "loss": 3.671497497558594, + "step": 153500 + }, + { + "epoch": 1.5939066277875207, + "grad_norm": 3.2950990200042725, + "learning_rate": 0.00014061037492087538, + "loss": 3.6725836181640625, + "step": 153600 + }, + { + "epoch": 1.5949443274149866, + "grad_norm": 5.21208381652832, + "learning_rate": 0.00014050660495812882, + "loss": 3.6607846069335936, + "step": 153700 + }, + { + "epoch": 1.5959820270424523, + "grad_norm": 2.718191385269165, + "learning_rate": 0.00014040283499538222, + "loss": 3.607443542480469, + "step": 153800 + }, + { + "epoch": 1.597019726669918, + "grad_norm": 3.6571433544158936, + "learning_rate": 0.00014029906503263564, + "loss": 3.675062255859375, + "step": 153900 + }, + { + "epoch": 1.598057426297384, + "grad_norm": 2.440661907196045, + "learning_rate": 0.00014019529506988906, + "loss": 3.4682305908203124, + "step": 154000 + }, + { + "epoch": 1.5990951259248498, + "grad_norm": 4.171643257141113, + "learning_rate": 0.00014009152510714246, + "loss": 3.682950134277344, + "step": 154100 + }, + { + "epoch": 1.6001328255523157, + "grad_norm": 7.624752998352051, + "learning_rate": 0.00013998775514439588, + "loss": 3.67526611328125, + "step": 154200 + }, + { + "epoch": 1.6011705251797814, + "grad_norm": 7.279924392700195, + "learning_rate": 0.0001398839851816493, + "loss": 3.6037884521484376, + "step": 154300 + }, + { + "epoch": 1.6022082248072473, + "grad_norm": 3.2470226287841797, + "learning_rate": 0.00013978021521890272, + "loss": 3.658772277832031, + "step": 154400 + }, + { + "epoch": 1.603245924434713, + "grad_norm": 5.602239608764648, + "learning_rate": 0.00013967644525615615, + "loss": 3.5984457397460936, + "step": 154500 + }, + { + "epoch": 1.604283624062179, + "grad_norm": 3.6453311443328857, + "learning_rate": 0.00013957267529340957, + "loss": 3.388334045410156, + "step": 154600 + }, + { + "epoch": 1.6053213236896449, + "grad_norm": 6.957507610321045, + "learning_rate": 0.00013946890533066296, + "loss": 3.617900695800781, + "step": 154700 + }, + { + "epoch": 1.6063590233171108, + "grad_norm": 15.978106498718262, + "learning_rate": 0.00013936513536791638, + "loss": 3.514501647949219, + "step": 154800 + }, + { + "epoch": 1.6073967229445765, + "grad_norm": 4.719081401824951, + "learning_rate": 0.0001392613654051698, + "loss": 3.6095266723632813, + "step": 154900 + }, + { + "epoch": 1.6084344225720422, + "grad_norm": 3.6483592987060547, + "learning_rate": 0.00013915759544242323, + "loss": 3.6144635009765627, + "step": 155000 + }, + { + "epoch": 1.609472122199508, + "grad_norm": 3.3481674194335938, + "learning_rate": 0.00013905382547967665, + "loss": 3.5398931884765625, + "step": 155100 + }, + { + "epoch": 1.610509821826974, + "grad_norm": 6.413243293762207, + "learning_rate": 0.00013895005551693007, + "loss": 3.6416336059570313, + "step": 155200 + }, + { + "epoch": 1.61154752145444, + "grad_norm": 7.17488431930542, + "learning_rate": 0.00013884628555418347, + "loss": 3.717559814453125, + "step": 155300 + }, + { + "epoch": 1.6125852210819056, + "grad_norm": 6.735267162322998, + "learning_rate": 0.0001387425155914369, + "loss": 3.6701150512695313, + "step": 155400 + }, + { + "epoch": 1.6136229207093713, + "grad_norm": 3.489192008972168, + "learning_rate": 0.0001386387456286903, + "loss": 3.607757263183594, + "step": 155500 + }, + { + "epoch": 1.6146606203368372, + "grad_norm": 4.3538360595703125, + "learning_rate": 0.00013853497566594373, + "loss": 3.6339166259765623, + "step": 155600 + }, + { + "epoch": 1.6156983199643031, + "grad_norm": 17.20830535888672, + "learning_rate": 0.00013843120570319715, + "loss": 3.42401611328125, + "step": 155700 + }, + { + "epoch": 1.616736019591769, + "grad_norm": 2.5314135551452637, + "learning_rate": 0.00013832743574045055, + "loss": 3.527308349609375, + "step": 155800 + }, + { + "epoch": 1.617773719219235, + "grad_norm": 4.076705455780029, + "learning_rate": 0.00013822366577770397, + "loss": 3.5527752685546874, + "step": 155900 + }, + { + "epoch": 1.6188114188467007, + "grad_norm": 3.8894543647766113, + "learning_rate": 0.0001381198958149574, + "loss": 3.68035400390625, + "step": 156000 + }, + { + "epoch": 1.6198491184741663, + "grad_norm": 17.054737091064453, + "learning_rate": 0.0001380161258522108, + "loss": 3.4780517578125, + "step": 156100 + }, + { + "epoch": 1.6208868181016323, + "grad_norm": 20.06046485900879, + "learning_rate": 0.0001379123558894642, + "loss": 3.5436282348632813, + "step": 156200 + }, + { + "epoch": 1.6219245177290982, + "grad_norm": 3.36186146736145, + "learning_rate": 0.00013780858592671766, + "loss": 3.6915762329101565, + "step": 156300 + }, + { + "epoch": 1.622962217356564, + "grad_norm": 3.333552360534668, + "learning_rate": 0.00013770481596397105, + "loss": 3.551458740234375, + "step": 156400 + }, + { + "epoch": 1.6239999169840298, + "grad_norm": 16.679468154907227, + "learning_rate": 0.00013760104600122448, + "loss": 3.5686306762695312, + "step": 156500 + }, + { + "epoch": 1.6250376166114955, + "grad_norm": 3.8986880779266357, + "learning_rate": 0.0001374972760384779, + "loss": 3.6233151245117186, + "step": 156600 + }, + { + "epoch": 1.6260753162389614, + "grad_norm": 5.065491199493408, + "learning_rate": 0.0001373935060757313, + "loss": 3.6737161254882813, + "step": 156700 + }, + { + "epoch": 1.6271130158664273, + "grad_norm": 16.096450805664062, + "learning_rate": 0.00013728973611298471, + "loss": 3.6823269653320314, + "step": 156800 + }, + { + "epoch": 1.6281507154938932, + "grad_norm": 3.939023733139038, + "learning_rate": 0.00013718596615023814, + "loss": 3.655545349121094, + "step": 156900 + }, + { + "epoch": 1.629188415121359, + "grad_norm": 5.221971035003662, + "learning_rate": 0.00013708219618749156, + "loss": 3.5299761962890623, + "step": 157000 + }, + { + "epoch": 1.6302261147488248, + "grad_norm": 4.515364646911621, + "learning_rate": 0.00013697842622474498, + "loss": 3.5957623291015626, + "step": 157100 + }, + { + "epoch": 1.6312638143762905, + "grad_norm": 2.1334664821624756, + "learning_rate": 0.00013687465626199838, + "loss": 3.5724642944335936, + "step": 157200 + }, + { + "epoch": 1.6323015140037564, + "grad_norm": 3.8212311267852783, + "learning_rate": 0.0001367708862992518, + "loss": 3.5870269775390624, + "step": 157300 + }, + { + "epoch": 1.6333392136312224, + "grad_norm": 7.132654666900635, + "learning_rate": 0.00013666711633650522, + "loss": 3.5734619140625, + "step": 157400 + }, + { + "epoch": 1.6343769132586883, + "grad_norm": 4.568203926086426, + "learning_rate": 0.00013656334637375864, + "loss": 3.6052120971679686, + "step": 157500 + }, + { + "epoch": 1.635414612886154, + "grad_norm": 6.630765438079834, + "learning_rate": 0.00013645957641101206, + "loss": 3.7074453735351565, + "step": 157600 + }, + { + "epoch": 1.6364523125136197, + "grad_norm": 9.513466835021973, + "learning_rate": 0.00013635580644826549, + "loss": 3.4421658325195312, + "step": 157700 + }, + { + "epoch": 1.6374900121410856, + "grad_norm": 3.5600993633270264, + "learning_rate": 0.00013625203648551888, + "loss": 3.472029724121094, + "step": 157800 + }, + { + "epoch": 1.6385277117685515, + "grad_norm": 3.796132802963257, + "learning_rate": 0.0001361482665227723, + "loss": 3.700109558105469, + "step": 157900 + }, + { + "epoch": 1.6395654113960174, + "grad_norm": 5.419138431549072, + "learning_rate": 0.00013604449656002572, + "loss": 3.525767517089844, + "step": 158000 + }, + { + "epoch": 1.640603111023483, + "grad_norm": 7.728092193603516, + "learning_rate": 0.00013594072659727912, + "loss": 3.4612411499023437, + "step": 158100 + }, + { + "epoch": 1.641640810650949, + "grad_norm": 5.094764232635498, + "learning_rate": 0.00013583695663453257, + "loss": 3.5728485107421877, + "step": 158200 + }, + { + "epoch": 1.6426785102784147, + "grad_norm": 7.930044174194336, + "learning_rate": 0.000135733186671786, + "loss": 3.547598571777344, + "step": 158300 + }, + { + "epoch": 1.6437162099058806, + "grad_norm": 3.853911876678467, + "learning_rate": 0.00013562941670903938, + "loss": 3.5331781005859373, + "step": 158400 + }, + { + "epoch": 1.6447539095333465, + "grad_norm": 14.153372764587402, + "learning_rate": 0.0001355256467462928, + "loss": 3.5483056640625, + "step": 158500 + }, + { + "epoch": 1.6457916091608125, + "grad_norm": 4.353669166564941, + "learning_rate": 0.00013542187678354623, + "loss": 3.5902810668945313, + "step": 158600 + }, + { + "epoch": 1.6468293087882782, + "grad_norm": 3.16603946685791, + "learning_rate": 0.00013531810682079962, + "loss": 3.5274386596679688, + "step": 158700 + }, + { + "epoch": 1.6478670084157439, + "grad_norm": 5.928895950317383, + "learning_rate": 0.00013521433685805307, + "loss": 3.662962646484375, + "step": 158800 + }, + { + "epoch": 1.6489047080432098, + "grad_norm": 4.497453689575195, + "learning_rate": 0.00013511056689530647, + "loss": 3.6771749877929687, + "step": 158900 + }, + { + "epoch": 1.6499424076706757, + "grad_norm": 6.737712383270264, + "learning_rate": 0.0001350067969325599, + "loss": 3.546751708984375, + "step": 159000 + }, + { + "epoch": 1.6509801072981416, + "grad_norm": 3.984771490097046, + "learning_rate": 0.0001349030269698133, + "loss": 3.5879977416992186, + "step": 159100 + }, + { + "epoch": 1.6520178069256073, + "grad_norm": 7.267343521118164, + "learning_rate": 0.0001347992570070667, + "loss": 3.5557431030273436, + "step": 159200 + }, + { + "epoch": 1.653055506553073, + "grad_norm": 5.349457263946533, + "learning_rate": 0.00013469548704432013, + "loss": 3.6174130249023437, + "step": 159300 + }, + { + "epoch": 1.654093206180539, + "grad_norm": 3.6522059440612793, + "learning_rate": 0.00013459171708157358, + "loss": 3.609751892089844, + "step": 159400 + }, + { + "epoch": 1.6551309058080048, + "grad_norm": 5.704461574554443, + "learning_rate": 0.00013448794711882697, + "loss": 3.5679837036132813, + "step": 159500 + }, + { + "epoch": 1.6561686054354707, + "grad_norm": 5.23817777633667, + "learning_rate": 0.0001343841771560804, + "loss": 3.5738253784179688, + "step": 159600 + }, + { + "epoch": 1.6572063050629366, + "grad_norm": 12.301040649414062, + "learning_rate": 0.00013428040719333382, + "loss": 3.587038879394531, + "step": 159700 + }, + { + "epoch": 1.6582440046904023, + "grad_norm": 6.761283874511719, + "learning_rate": 0.0001341766372305872, + "loss": 3.521001281738281, + "step": 159800 + }, + { + "epoch": 1.659281704317868, + "grad_norm": 5.411608695983887, + "learning_rate": 0.00013407286726784063, + "loss": 3.473619384765625, + "step": 159900 + }, + { + "epoch": 1.660319403945334, + "grad_norm": 14.189502716064453, + "learning_rate": 0.00013396909730509405, + "loss": 3.5413604736328126, + "step": 160000 + }, + { + "epoch": 1.6613571035727999, + "grad_norm": 3.0541956424713135, + "learning_rate": 0.00013386532734234748, + "loss": 3.5548626708984377, + "step": 160100 + }, + { + "epoch": 1.6623948032002658, + "grad_norm": 3.2475764751434326, + "learning_rate": 0.0001337615573796009, + "loss": 3.5887530517578123, + "step": 160200 + }, + { + "epoch": 1.6634325028277315, + "grad_norm": 4.810506343841553, + "learning_rate": 0.00013365778741685432, + "loss": 3.6068450927734377, + "step": 160300 + }, + { + "epoch": 1.6644702024551972, + "grad_norm": 11.347721099853516, + "learning_rate": 0.00013355401745410771, + "loss": 3.663785705566406, + "step": 160400 + }, + { + "epoch": 1.665507902082663, + "grad_norm": 2.9197380542755127, + "learning_rate": 0.00013345024749136114, + "loss": 3.6435916137695314, + "step": 160500 + }, + { + "epoch": 1.666545601710129, + "grad_norm": 5.3932037353515625, + "learning_rate": 0.00013334647752861456, + "loss": 3.6256121826171874, + "step": 160600 + }, + { + "epoch": 1.667583301337595, + "grad_norm": 3.6826651096343994, + "learning_rate": 0.00013324270756586798, + "loss": 3.60268798828125, + "step": 160700 + }, + { + "epoch": 1.6686210009650606, + "grad_norm": 4.883547782897949, + "learning_rate": 0.0001331389376031214, + "loss": 3.508822326660156, + "step": 160800 + }, + { + "epoch": 1.6696587005925265, + "grad_norm": 3.1789474487304688, + "learning_rate": 0.0001330351676403748, + "loss": 3.5955624389648437, + "step": 160900 + }, + { + "epoch": 1.6706964002199922, + "grad_norm": 3.8428354263305664, + "learning_rate": 0.00013293139767762822, + "loss": 3.6681442260742188, + "step": 161000 + }, + { + "epoch": 1.6717340998474581, + "grad_norm": 5.440670490264893, + "learning_rate": 0.00013282762771488164, + "loss": 3.65127197265625, + "step": 161100 + }, + { + "epoch": 1.672771799474924, + "grad_norm": 4.737522125244141, + "learning_rate": 0.00013272385775213504, + "loss": 3.757344055175781, + "step": 161200 + }, + { + "epoch": 1.67380949910239, + "grad_norm": 5.953054428100586, + "learning_rate": 0.00013262008778938848, + "loss": 3.690797119140625, + "step": 161300 + }, + { + "epoch": 1.6748471987298557, + "grad_norm": 8.720730781555176, + "learning_rate": 0.0001325163178266419, + "loss": 3.790602722167969, + "step": 161400 + }, + { + "epoch": 1.6758848983573214, + "grad_norm": 3.9143240451812744, + "learning_rate": 0.0001324125478638953, + "loss": 3.439073486328125, + "step": 161500 + }, + { + "epoch": 1.6769225979847873, + "grad_norm": 4.572363376617432, + "learning_rate": 0.00013230877790114872, + "loss": 3.5498342895507813, + "step": 161600 + }, + { + "epoch": 1.6779602976122532, + "grad_norm": 9.166924476623535, + "learning_rate": 0.00013220500793840215, + "loss": 3.479727478027344, + "step": 161700 + }, + { + "epoch": 1.678997997239719, + "grad_norm": 2.0057218074798584, + "learning_rate": 0.00013210123797565554, + "loss": 3.72489990234375, + "step": 161800 + }, + { + "epoch": 1.6800356968671848, + "grad_norm": 4.892455101013184, + "learning_rate": 0.000131997468012909, + "loss": 3.6359210205078125, + "step": 161900 + }, + { + "epoch": 1.6810733964946507, + "grad_norm": 8.374796867370605, + "learning_rate": 0.00013189369805016238, + "loss": 3.5657424926757812, + "step": 162000 + }, + { + "epoch": 1.6821110961221164, + "grad_norm": 3.702462911605835, + "learning_rate": 0.0001317899280874158, + "loss": 3.6002679443359376, + "step": 162100 + }, + { + "epoch": 1.6831487957495823, + "grad_norm": 6.6382856369018555, + "learning_rate": 0.00013168615812466923, + "loss": 3.5055661010742187, + "step": 162200 + }, + { + "epoch": 1.6841864953770482, + "grad_norm": 4.067321300506592, + "learning_rate": 0.00013158238816192262, + "loss": 3.6370770263671877, + "step": 162300 + }, + { + "epoch": 1.6852241950045141, + "grad_norm": 6.839338779449463, + "learning_rate": 0.00013147861819917604, + "loss": 3.68888671875, + "step": 162400 + }, + { + "epoch": 1.6862618946319798, + "grad_norm": 4.304868221282959, + "learning_rate": 0.00013137484823642947, + "loss": 3.5517013549804686, + "step": 162500 + }, + { + "epoch": 1.6872995942594455, + "grad_norm": 6.149030685424805, + "learning_rate": 0.0001312710782736829, + "loss": 3.535697326660156, + "step": 162600 + }, + { + "epoch": 1.6883372938869114, + "grad_norm": 3.3684825897216797, + "learning_rate": 0.0001311673083109363, + "loss": 3.4286175537109376, + "step": 162700 + }, + { + "epoch": 1.6893749935143774, + "grad_norm": 3.4294440746307373, + "learning_rate": 0.00013106353834818973, + "loss": 3.443184509277344, + "step": 162800 + }, + { + "epoch": 1.6904126931418433, + "grad_norm": 4.177918434143066, + "learning_rate": 0.00013095976838544313, + "loss": 3.6785324096679686, + "step": 162900 + }, + { + "epoch": 1.691450392769309, + "grad_norm": 3.914222478866577, + "learning_rate": 0.00013085599842269655, + "loss": 3.6343704223632813, + "step": 163000 + }, + { + "epoch": 1.6924880923967747, + "grad_norm": 10.268918991088867, + "learning_rate": 0.00013075222845994997, + "loss": 3.625147399902344, + "step": 163100 + }, + { + "epoch": 1.6935257920242406, + "grad_norm": 3.8632876873016357, + "learning_rate": 0.0001306484584972034, + "loss": 3.62834228515625, + "step": 163200 + }, + { + "epoch": 1.6945634916517065, + "grad_norm": 3.8029658794403076, + "learning_rate": 0.00013054468853445681, + "loss": 3.4555462646484374, + "step": 163300 + }, + { + "epoch": 1.6956011912791724, + "grad_norm": 3.983098030090332, + "learning_rate": 0.00013044091857171024, + "loss": 3.6773056030273437, + "step": 163400 + }, + { + "epoch": 1.6966388909066383, + "grad_norm": 3.1625497341156006, + "learning_rate": 0.00013033714860896363, + "loss": 3.525480041503906, + "step": 163500 + }, + { + "epoch": 1.697676590534104, + "grad_norm": 6.201349258422852, + "learning_rate": 0.00013023337864621705, + "loss": 3.626365051269531, + "step": 163600 + }, + { + "epoch": 1.6987142901615697, + "grad_norm": 4.032458782196045, + "learning_rate": 0.00013012960868347048, + "loss": 3.5092694091796877, + "step": 163700 + }, + { + "epoch": 1.6997519897890356, + "grad_norm": 3.9698915481567383, + "learning_rate": 0.0001300258387207239, + "loss": 3.273734436035156, + "step": 163800 + }, + { + "epoch": 1.7007896894165015, + "grad_norm": 9.877572059631348, + "learning_rate": 0.00012992206875797732, + "loss": 3.576407775878906, + "step": 163900 + }, + { + "epoch": 1.7018273890439675, + "grad_norm": 14.561692237854004, + "learning_rate": 0.00012981829879523071, + "loss": 3.6638983154296874, + "step": 164000 + }, + { + "epoch": 1.7028650886714332, + "grad_norm": 2.6718385219573975, + "learning_rate": 0.00012971452883248414, + "loss": 3.671317138671875, + "step": 164100 + }, + { + "epoch": 1.7039027882988989, + "grad_norm": 3.6662535667419434, + "learning_rate": 0.00012961075886973756, + "loss": 3.648578796386719, + "step": 164200 + }, + { + "epoch": 1.7049404879263648, + "grad_norm": 4.04230260848999, + "learning_rate": 0.00012950698890699095, + "loss": 3.4332769775390624, + "step": 164300 + }, + { + "epoch": 1.7059781875538307, + "grad_norm": 9.336248397827148, + "learning_rate": 0.00012940321894424437, + "loss": 3.6213333129882814, + "step": 164400 + }, + { + "epoch": 1.7070158871812966, + "grad_norm": 5.882486820220947, + "learning_rate": 0.00012929944898149782, + "loss": 3.525044250488281, + "step": 164500 + }, + { + "epoch": 1.7080535868087623, + "grad_norm": 6.984238624572754, + "learning_rate": 0.00012919567901875122, + "loss": 3.6717626953125, + "step": 164600 + }, + { + "epoch": 1.7090912864362282, + "grad_norm": 19.616052627563477, + "learning_rate": 0.00012909190905600464, + "loss": 3.5099832153320314, + "step": 164700 + }, + { + "epoch": 1.710128986063694, + "grad_norm": 8.419858932495117, + "learning_rate": 0.00012898813909325806, + "loss": 3.624603576660156, + "step": 164800 + }, + { + "epoch": 1.7111666856911598, + "grad_norm": 3.145763397216797, + "learning_rate": 0.00012888436913051146, + "loss": 3.5627670288085938, + "step": 164900 + }, + { + "epoch": 1.7122043853186257, + "grad_norm": 2.620919704437256, + "learning_rate": 0.00012878059916776488, + "loss": 3.556968994140625, + "step": 165000 + }, + { + "epoch": 1.7132420849460916, + "grad_norm": 3.6687073707580566, + "learning_rate": 0.0001286768292050183, + "loss": 3.590003662109375, + "step": 165100 + }, + { + "epoch": 1.7142797845735573, + "grad_norm": 3.51960825920105, + "learning_rate": 0.00012857305924227172, + "loss": 3.443156433105469, + "step": 165200 + }, + { + "epoch": 1.715317484201023, + "grad_norm": 7.178112030029297, + "learning_rate": 0.00012846928927952514, + "loss": 3.5516900634765625, + "step": 165300 + }, + { + "epoch": 1.716355183828489, + "grad_norm": 3.60011887550354, + "learning_rate": 0.00012836551931677857, + "loss": 3.5771609497070314, + "step": 165400 + }, + { + "epoch": 1.7173928834559549, + "grad_norm": 5.902312278747559, + "learning_rate": 0.00012826174935403196, + "loss": 3.590467529296875, + "step": 165500 + }, + { + "epoch": 1.7184305830834208, + "grad_norm": 2.6880180835723877, + "learning_rate": 0.00012815797939128538, + "loss": 3.6772579956054687, + "step": 165600 + }, + { + "epoch": 1.7194682827108865, + "grad_norm": 4.136773109436035, + "learning_rate": 0.0001280542094285388, + "loss": 3.7336956787109377, + "step": 165700 + }, + { + "epoch": 1.7205059823383524, + "grad_norm": 5.155696392059326, + "learning_rate": 0.00012795043946579223, + "loss": 3.4659573364257814, + "step": 165800 + }, + { + "epoch": 1.721543681965818, + "grad_norm": 5.531459331512451, + "learning_rate": 0.00012784666950304565, + "loss": 3.4835992431640626, + "step": 165900 + }, + { + "epoch": 1.722581381593284, + "grad_norm": 6.343237400054932, + "learning_rate": 0.00012774289954029904, + "loss": 3.5382821655273435, + "step": 166000 + }, + { + "epoch": 1.72361908122075, + "grad_norm": 2.731682538986206, + "learning_rate": 0.00012763912957755247, + "loss": 3.426122131347656, + "step": 166100 + }, + { + "epoch": 1.7246567808482158, + "grad_norm": 5.487903594970703, + "learning_rate": 0.0001275353596148059, + "loss": 3.5763626098632812, + "step": 166200 + }, + { + "epoch": 1.7256944804756815, + "grad_norm": 6.798583984375, + "learning_rate": 0.00012743158965205928, + "loss": 3.439584045410156, + "step": 166300 + }, + { + "epoch": 1.7267321801031472, + "grad_norm": 18.596773147583008, + "learning_rate": 0.00012732781968931273, + "loss": 3.4846591186523437, + "step": 166400 + }, + { + "epoch": 1.7277698797306131, + "grad_norm": 9.826458930969238, + "learning_rate": 0.00012722404972656615, + "loss": 3.410422668457031, + "step": 166500 + }, + { + "epoch": 1.728807579358079, + "grad_norm": 5.076817035675049, + "learning_rate": 0.00012712027976381955, + "loss": 3.5888720703125, + "step": 166600 + }, + { + "epoch": 1.729845278985545, + "grad_norm": 2.289203405380249, + "learning_rate": 0.00012701650980107297, + "loss": 3.6262445068359375, + "step": 166700 + }, + { + "epoch": 1.7308829786130107, + "grad_norm": 2.4246132373809814, + "learning_rate": 0.0001269127398383264, + "loss": 3.5331646728515627, + "step": 166800 + }, + { + "epoch": 1.7319206782404764, + "grad_norm": 20.16929054260254, + "learning_rate": 0.0001268089698755798, + "loss": 3.3934396362304686, + "step": 166900 + }, + { + "epoch": 1.7329583778679423, + "grad_norm": 4.409317970275879, + "learning_rate": 0.00012670519991283324, + "loss": 3.46904052734375, + "step": 167000 + }, + { + "epoch": 1.7339960774954082, + "grad_norm": 3.533935308456421, + "learning_rate": 0.00012660142995008663, + "loss": 3.6115313720703126, + "step": 167100 + }, + { + "epoch": 1.735033777122874, + "grad_norm": 3.760765790939331, + "learning_rate": 0.00012649765998734005, + "loss": 3.7661947631835937, + "step": 167200 + }, + { + "epoch": 1.7360714767503398, + "grad_norm": 3.174926996231079, + "learning_rate": 0.00012639389002459348, + "loss": 3.4038616943359377, + "step": 167300 + }, + { + "epoch": 1.7371091763778057, + "grad_norm": 4.701259136199951, + "learning_rate": 0.00012629012006184687, + "loss": 3.575841064453125, + "step": 167400 + }, + { + "epoch": 1.7381468760052714, + "grad_norm": 4.684348106384277, + "learning_rate": 0.0001261863500991003, + "loss": 3.650244140625, + "step": 167500 + }, + { + "epoch": 1.7391845756327373, + "grad_norm": 5.04356575012207, + "learning_rate": 0.00012608258013635374, + "loss": 3.512914733886719, + "step": 167600 + }, + { + "epoch": 1.7402222752602032, + "grad_norm": 4.33563232421875, + "learning_rate": 0.00012597881017360714, + "loss": 3.462794189453125, + "step": 167700 + }, + { + "epoch": 1.7412599748876691, + "grad_norm": 3.108952522277832, + "learning_rate": 0.00012587504021086056, + "loss": 3.6481967163085938, + "step": 167800 + }, + { + "epoch": 1.7422976745151348, + "grad_norm": 7.204711437225342, + "learning_rate": 0.00012577127024811398, + "loss": 3.3575787353515625, + "step": 167900 + }, + { + "epoch": 1.7433353741426005, + "grad_norm": 9.035337448120117, + "learning_rate": 0.00012566750028536737, + "loss": 3.5675091552734375, + "step": 168000 + }, + { + "epoch": 1.7443730737700665, + "grad_norm": 5.063663005828857, + "learning_rate": 0.0001255637303226208, + "loss": 3.48505615234375, + "step": 168100 + }, + { + "epoch": 1.7454107733975324, + "grad_norm": 3.2425074577331543, + "learning_rate": 0.00012545996035987422, + "loss": 3.6897207641601564, + "step": 168200 + }, + { + "epoch": 1.7464484730249983, + "grad_norm": 5.356579303741455, + "learning_rate": 0.00012535619039712764, + "loss": 3.5673727416992187, + "step": 168300 + }, + { + "epoch": 1.747486172652464, + "grad_norm": 4.124982833862305, + "learning_rate": 0.00012525242043438106, + "loss": 3.512673034667969, + "step": 168400 + }, + { + "epoch": 1.74852387227993, + "grad_norm": 4.768991470336914, + "learning_rate": 0.00012514865047163448, + "loss": 3.5959738159179686, + "step": 168500 + }, + { + "epoch": 1.7495615719073956, + "grad_norm": 9.657281875610352, + "learning_rate": 0.00012504488050888788, + "loss": 3.528682861328125, + "step": 168600 + }, + { + "epoch": 1.7505992715348615, + "grad_norm": 2.538902759552002, + "learning_rate": 0.0001249411105461413, + "loss": 3.4649612426757814, + "step": 168700 + }, + { + "epoch": 1.7516369711623274, + "grad_norm": 4.286279201507568, + "learning_rate": 0.00012483734058339472, + "loss": 3.5286309814453123, + "step": 168800 + }, + { + "epoch": 1.7526746707897933, + "grad_norm": 15.081319808959961, + "learning_rate": 0.00012473357062064814, + "loss": 3.492412414550781, + "step": 168900 + }, + { + "epoch": 1.753712370417259, + "grad_norm": 2.91190767288208, + "learning_rate": 0.00012462980065790157, + "loss": 3.4919317626953124, + "step": 169000 + }, + { + "epoch": 1.7547500700447247, + "grad_norm": 3.788306713104248, + "learning_rate": 0.00012452603069515496, + "loss": 3.587347412109375, + "step": 169100 + }, + { + "epoch": 1.7557877696721906, + "grad_norm": 4.830081462860107, + "learning_rate": 0.00012442226073240838, + "loss": 3.6080587768554686, + "step": 169200 + }, + { + "epoch": 1.7568254692996566, + "grad_norm": 4.777892112731934, + "learning_rate": 0.0001243184907696618, + "loss": 3.653542175292969, + "step": 169300 + }, + { + "epoch": 1.7578631689271225, + "grad_norm": 8.966485977172852, + "learning_rate": 0.0001242147208069152, + "loss": 3.55691650390625, + "step": 169400 + }, + { + "epoch": 1.7589008685545882, + "grad_norm": 1.9701244831085205, + "learning_rate": 0.00012411095084416865, + "loss": 3.587906799316406, + "step": 169500 + }, + { + "epoch": 1.759938568182054, + "grad_norm": 12.719783782958984, + "learning_rate": 0.00012400718088142207, + "loss": 3.413060302734375, + "step": 169600 + }, + { + "epoch": 1.7609762678095198, + "grad_norm": 3.8632144927978516, + "learning_rate": 0.00012390341091867547, + "loss": 3.6044146728515627, + "step": 169700 + }, + { + "epoch": 1.7620139674369857, + "grad_norm": 5.806576251983643, + "learning_rate": 0.0001237996409559289, + "loss": 3.59072509765625, + "step": 169800 + }, + { + "epoch": 1.7630516670644516, + "grad_norm": 7.052939414978027, + "learning_rate": 0.0001236958709931823, + "loss": 3.4161257934570313, + "step": 169900 + }, + { + "epoch": 1.7640893666919175, + "grad_norm": 4.090539455413818, + "learning_rate": 0.0001235921010304357, + "loss": 3.4862603759765625, + "step": 170000 + }, + { + "epoch": 1.7651270663193832, + "grad_norm": 8.032806396484375, + "learning_rate": 0.00012348833106768913, + "loss": 3.5226229858398437, + "step": 170100 + }, + { + "epoch": 1.766164765946849, + "grad_norm": 7.900229454040527, + "learning_rate": 0.00012338456110494255, + "loss": 3.428408203125, + "step": 170200 + }, + { + "epoch": 1.7672024655743148, + "grad_norm": 3.3465304374694824, + "learning_rate": 0.00012328079114219597, + "loss": 3.4806304931640626, + "step": 170300 + }, + { + "epoch": 1.7682401652017807, + "grad_norm": 2.737323522567749, + "learning_rate": 0.0001231770211794494, + "loss": 3.5239492797851564, + "step": 170400 + }, + { + "epoch": 1.7692778648292466, + "grad_norm": 5.74827766418457, + "learning_rate": 0.00012307325121670281, + "loss": 3.5097976684570313, + "step": 170500 + }, + { + "epoch": 1.7703155644567123, + "grad_norm": 6.033031463623047, + "learning_rate": 0.0001229694812539562, + "loss": 3.4570046997070314, + "step": 170600 + }, + { + "epoch": 1.771353264084178, + "grad_norm": 8.032061576843262, + "learning_rate": 0.00012286571129120963, + "loss": 3.560968017578125, + "step": 170700 + }, + { + "epoch": 1.772390963711644, + "grad_norm": 4.955009460449219, + "learning_rate": 0.00012276194132846305, + "loss": 3.54818115234375, + "step": 170800 + }, + { + "epoch": 1.7734286633391099, + "grad_norm": 10.685212135314941, + "learning_rate": 0.00012265817136571647, + "loss": 3.5968731689453124, + "step": 170900 + }, + { + "epoch": 1.7744663629665758, + "grad_norm": 6.002890110015869, + "learning_rate": 0.0001225544014029699, + "loss": 3.6380169677734373, + "step": 171000 + }, + { + "epoch": 1.7755040625940415, + "grad_norm": 2.442901849746704, + "learning_rate": 0.0001224506314402233, + "loss": 3.546981201171875, + "step": 171100 + }, + { + "epoch": 1.7765417622215074, + "grad_norm": 7.106812000274658, + "learning_rate": 0.0001223468614774767, + "loss": 3.4353497314453123, + "step": 171200 + }, + { + "epoch": 1.777579461848973, + "grad_norm": 4.951285362243652, + "learning_rate": 0.00012224309151473014, + "loss": 3.5619387817382813, + "step": 171300 + }, + { + "epoch": 1.778617161476439, + "grad_norm": 4.533148765563965, + "learning_rate": 0.00012213932155198356, + "loss": 3.4085040283203125, + "step": 171400 + }, + { + "epoch": 1.779654861103905, + "grad_norm": 3.1281020641326904, + "learning_rate": 0.00012203555158923698, + "loss": 3.5755316162109376, + "step": 171500 + }, + { + "epoch": 1.7806925607313708, + "grad_norm": 3.2438437938690186, + "learning_rate": 0.00012193178162649039, + "loss": 3.419034118652344, + "step": 171600 + }, + { + "epoch": 1.7817302603588365, + "grad_norm": 6.113760948181152, + "learning_rate": 0.0001218280116637438, + "loss": 3.4608731079101562, + "step": 171700 + }, + { + "epoch": 1.7827679599863022, + "grad_norm": 3.805856227874756, + "learning_rate": 0.00012172424170099722, + "loss": 3.542497253417969, + "step": 171800 + }, + { + "epoch": 1.7838056596137681, + "grad_norm": 11.923066139221191, + "learning_rate": 0.00012162047173825063, + "loss": 3.5580120849609376, + "step": 171900 + }, + { + "epoch": 1.784843359241234, + "grad_norm": 7.653703212738037, + "learning_rate": 0.00012151670177550405, + "loss": 3.464120178222656, + "step": 172000 + }, + { + "epoch": 1.7858810588687, + "grad_norm": 4.955140113830566, + "learning_rate": 0.00012141293181275747, + "loss": 3.5985858154296877, + "step": 172100 + }, + { + "epoch": 1.7869187584961657, + "grad_norm": 2.7006173133850098, + "learning_rate": 0.00012130916185001089, + "loss": 3.608409423828125, + "step": 172200 + }, + { + "epoch": 1.7879564581236316, + "grad_norm": 10.799352645874023, + "learning_rate": 0.0001212053918872643, + "loss": 3.5314166259765627, + "step": 172300 + }, + { + "epoch": 1.7889941577510973, + "grad_norm": 2.7497682571411133, + "learning_rate": 0.00012110162192451772, + "loss": 3.5095343017578124, + "step": 172400 + }, + { + "epoch": 1.7900318573785632, + "grad_norm": 3.47670316696167, + "learning_rate": 0.00012099785196177113, + "loss": 3.508272705078125, + "step": 172500 + }, + { + "epoch": 1.791069557006029, + "grad_norm": 5.199550151824951, + "learning_rate": 0.00012089408199902454, + "loss": 3.503916015625, + "step": 172600 + }, + { + "epoch": 1.792107256633495, + "grad_norm": 5.3487043380737305, + "learning_rate": 0.00012079031203627797, + "loss": 3.63627685546875, + "step": 172700 + }, + { + "epoch": 1.7931449562609607, + "grad_norm": 4.6182074546813965, + "learning_rate": 0.00012068654207353138, + "loss": 3.517708740234375, + "step": 172800 + }, + { + "epoch": 1.7941826558884264, + "grad_norm": 2.607217788696289, + "learning_rate": 0.0001205827721107848, + "loss": 3.555519714355469, + "step": 172900 + }, + { + "epoch": 1.7952203555158923, + "grad_norm": 9.180208206176758, + "learning_rate": 0.00012047900214803821, + "loss": 3.5748587036132813, + "step": 173000 + }, + { + "epoch": 1.7962580551433582, + "grad_norm": 5.080584526062012, + "learning_rate": 0.00012037523218529164, + "loss": 3.5299716186523438, + "step": 173100 + }, + { + "epoch": 1.7972957547708241, + "grad_norm": 2.5319409370422363, + "learning_rate": 0.00012027146222254504, + "loss": 3.5544561767578124, + "step": 173200 + }, + { + "epoch": 1.7983334543982898, + "grad_norm": 4.81158447265625, + "learning_rate": 0.00012016769225979848, + "loss": 3.6039208984375, + "step": 173300 + }, + { + "epoch": 1.7993711540257558, + "grad_norm": Infinity, + "learning_rate": 0.00012006392229705189, + "loss": 3.439290771484375, + "step": 173400 + }, + { + "epoch": 1.8004088536532215, + "grad_norm": 2.6214425563812256, + "learning_rate": 0.00011996015233430531, + "loss": 3.670186767578125, + "step": 173500 + }, + { + "epoch": 1.8014465532806874, + "grad_norm": 2.7172493934631348, + "learning_rate": 0.00011985638237155872, + "loss": 3.5838592529296873, + "step": 173600 + }, + { + "epoch": 1.8024842529081533, + "grad_norm": 8.898774147033691, + "learning_rate": 0.00011975261240881213, + "loss": 3.4800985717773436, + "step": 173700 + }, + { + "epoch": 1.8035219525356192, + "grad_norm": 3.5623104572296143, + "learning_rate": 0.00011964884244606555, + "loss": 3.511365966796875, + "step": 173800 + }, + { + "epoch": 1.804559652163085, + "grad_norm": 8.46833610534668, + "learning_rate": 0.00011954507248331896, + "loss": 3.7088421630859374, + "step": 173900 + }, + { + "epoch": 1.8055973517905506, + "grad_norm": 5.097702980041504, + "learning_rate": 0.00011944130252057239, + "loss": 3.6202734375, + "step": 174000 + }, + { + "epoch": 1.8066350514180165, + "grad_norm": 2.758472204208374, + "learning_rate": 0.0001193375325578258, + "loss": 3.561451721191406, + "step": 174100 + }, + { + "epoch": 1.8076727510454824, + "grad_norm": 10.48659610748291, + "learning_rate": 0.00011923376259507922, + "loss": 3.5661395263671873, + "step": 174200 + }, + { + "epoch": 1.8087104506729483, + "grad_norm": 4.996297836303711, + "learning_rate": 0.00011912999263233263, + "loss": 3.680464782714844, + "step": 174300 + }, + { + "epoch": 1.809748150300414, + "grad_norm": 3.927097797393799, + "learning_rate": 0.00011902622266958605, + "loss": 3.4924087524414062, + "step": 174400 + }, + { + "epoch": 1.8107858499278797, + "grad_norm": 9.367024421691895, + "learning_rate": 0.00011892245270683946, + "loss": 3.4610064697265623, + "step": 174500 + }, + { + "epoch": 1.8118235495553456, + "grad_norm": 2.7783424854278564, + "learning_rate": 0.0001188186827440929, + "loss": 3.411673583984375, + "step": 174600 + }, + { + "epoch": 1.8128612491828116, + "grad_norm": 8.61545181274414, + "learning_rate": 0.0001187149127813463, + "loss": 3.5328875732421876, + "step": 174700 + }, + { + "epoch": 1.8138989488102775, + "grad_norm": 7.4906182289123535, + "learning_rate": 0.00011861114281859971, + "loss": 3.376343078613281, + "step": 174800 + }, + { + "epoch": 1.8149366484377432, + "grad_norm": 1.9939513206481934, + "learning_rate": 0.00011850737285585314, + "loss": 3.428880615234375, + "step": 174900 + }, + { + "epoch": 1.815974348065209, + "grad_norm": 6.011395454406738, + "learning_rate": 0.00011840360289310654, + "loss": 3.5798504638671873, + "step": 175000 + }, + { + "epoch": 1.8170120476926748, + "grad_norm": 2.0973944664001465, + "learning_rate": 0.00011829983293035997, + "loss": 3.5833367919921875, + "step": 175100 + }, + { + "epoch": 1.8180497473201407, + "grad_norm": 4.992910861968994, + "learning_rate": 0.00011819606296761339, + "loss": 3.6261285400390624, + "step": 175200 + }, + { + "epoch": 1.8190874469476066, + "grad_norm": 89.73089599609375, + "learning_rate": 0.00011809229300486681, + "loss": 3.390103454589844, + "step": 175300 + }, + { + "epoch": 1.8201251465750725, + "grad_norm": 4.343557834625244, + "learning_rate": 0.00011798852304212022, + "loss": 3.6147576904296876, + "step": 175400 + }, + { + "epoch": 1.8211628462025382, + "grad_norm": Infinity, + "learning_rate": 0.00011788475307937364, + "loss": 3.5382080078125, + "step": 175500 + }, + { + "epoch": 1.822200545830004, + "grad_norm": 8.41909408569336, + "learning_rate": 0.00011778098311662705, + "loss": 3.5090155029296874, + "step": 175600 + }, + { + "epoch": 1.8232382454574698, + "grad_norm": 7.508602619171143, + "learning_rate": 0.00011767721315388046, + "loss": 3.540501708984375, + "step": 175700 + }, + { + "epoch": 1.8242759450849357, + "grad_norm": 2.713555335998535, + "learning_rate": 0.00011757344319113388, + "loss": 3.5669735717773436, + "step": 175800 + }, + { + "epoch": 1.8253136447124017, + "grad_norm": 9.780903816223145, + "learning_rate": 0.0001174696732283873, + "loss": 3.542893981933594, + "step": 175900 + }, + { + "epoch": 1.8263513443398673, + "grad_norm": 2.6435556411743164, + "learning_rate": 0.00011736590326564072, + "loss": 3.6134707641601564, + "step": 176000 + }, + { + "epoch": 1.8273890439673333, + "grad_norm": 3.3884384632110596, + "learning_rate": 0.00011726213330289413, + "loss": 3.574878845214844, + "step": 176100 + }, + { + "epoch": 1.828426743594799, + "grad_norm": 4.323862552642822, + "learning_rate": 0.00011715836334014755, + "loss": 3.5432839965820313, + "step": 176200 + }, + { + "epoch": 1.8294644432222649, + "grad_norm": 6.794419765472412, + "learning_rate": 0.00011705459337740096, + "loss": 3.4334552001953127, + "step": 176300 + }, + { + "epoch": 1.8305021428497308, + "grad_norm": 3.3329992294311523, + "learning_rate": 0.00011695082341465438, + "loss": 3.511024169921875, + "step": 176400 + }, + { + "epoch": 1.8315398424771967, + "grad_norm": 6.582189083099365, + "learning_rate": 0.0001168470534519078, + "loss": 3.4057382202148436, + "step": 176500 + }, + { + "epoch": 1.8325775421046624, + "grad_norm": 3.5420665740966797, + "learning_rate": 0.00011674328348916123, + "loss": 3.530198974609375, + "step": 176600 + }, + { + "epoch": 1.833615241732128, + "grad_norm": 3.2835450172424316, + "learning_rate": 0.00011663951352641463, + "loss": 3.3689605712890627, + "step": 176700 + }, + { + "epoch": 1.834652941359594, + "grad_norm": 4.352384567260742, + "learning_rate": 0.00011653574356366804, + "loss": 3.5228622436523436, + "step": 176800 + }, + { + "epoch": 1.83569064098706, + "grad_norm": 6.940867900848389, + "learning_rate": 0.00011643197360092147, + "loss": 3.422699279785156, + "step": 176900 + }, + { + "epoch": 1.8367283406145258, + "grad_norm": 9.627628326416016, + "learning_rate": 0.00011632820363817487, + "loss": 3.4203256225585936, + "step": 177000 + }, + { + "epoch": 1.8377660402419915, + "grad_norm": 7.819676399230957, + "learning_rate": 0.00011622443367542831, + "loss": 3.569815673828125, + "step": 177100 + }, + { + "epoch": 1.8388037398694572, + "grad_norm": 3.4782094955444336, + "learning_rate": 0.00011612066371268172, + "loss": 3.5252569580078124, + "step": 177200 + }, + { + "epoch": 1.8398414394969231, + "grad_norm": 9.448952674865723, + "learning_rate": 0.00011601689374993514, + "loss": 3.43080322265625, + "step": 177300 + }, + { + "epoch": 1.840879139124389, + "grad_norm": 5.754225730895996, + "learning_rate": 0.00011591312378718855, + "loss": 3.45312744140625, + "step": 177400 + }, + { + "epoch": 1.841916838751855, + "grad_norm": 2.9918229579925537, + "learning_rate": 0.00011580935382444197, + "loss": 3.548991394042969, + "step": 177500 + }, + { + "epoch": 1.8429545383793209, + "grad_norm": 4.406205177307129, + "learning_rate": 0.00011570558386169538, + "loss": 3.5221047973632813, + "step": 177600 + }, + { + "epoch": 1.8439922380067866, + "grad_norm": 3.79978346824646, + "learning_rate": 0.00011560181389894879, + "loss": 3.4272702026367186, + "step": 177700 + }, + { + "epoch": 1.8450299376342523, + "grad_norm": 8.362844467163086, + "learning_rate": 0.00011549804393620222, + "loss": 3.496314697265625, + "step": 177800 + }, + { + "epoch": 1.8460676372617182, + "grad_norm": 4.00974702835083, + "learning_rate": 0.00011539427397345563, + "loss": 3.4739456176757812, + "step": 177900 + }, + { + "epoch": 1.847105336889184, + "grad_norm": 4.4382853507995605, + "learning_rate": 0.00011529050401070905, + "loss": 3.637906799316406, + "step": 178000 + }, + { + "epoch": 1.84814303651665, + "grad_norm": 3.4561121463775635, + "learning_rate": 0.00011518673404796246, + "loss": 3.4582669067382814, + "step": 178100 + }, + { + "epoch": 1.8491807361441157, + "grad_norm": 9.542756080627441, + "learning_rate": 0.00011508296408521588, + "loss": 3.5665469360351563, + "step": 178200 + }, + { + "epoch": 1.8502184357715814, + "grad_norm": 5.516635894775391, + "learning_rate": 0.00011497919412246929, + "loss": 3.5199371337890626, + "step": 178300 + }, + { + "epoch": 1.8512561353990473, + "grad_norm": 10.64023494720459, + "learning_rate": 0.00011487542415972273, + "loss": 3.605532531738281, + "step": 178400 + }, + { + "epoch": 1.8522938350265132, + "grad_norm": 3.7197024822235107, + "learning_rate": 0.00011477165419697613, + "loss": 3.5585647583007813, + "step": 178500 + }, + { + "epoch": 1.8533315346539792, + "grad_norm": 8.84176254272461, + "learning_rate": 0.00011466788423422956, + "loss": 3.469338684082031, + "step": 178600 + }, + { + "epoch": 1.8543692342814448, + "grad_norm": 13.789299011230469, + "learning_rate": 0.00011456411427148297, + "loss": 3.654618835449219, + "step": 178700 + }, + { + "epoch": 1.8554069339089108, + "grad_norm": 3.7758259773254395, + "learning_rate": 0.00011446034430873637, + "loss": 3.511930236816406, + "step": 178800 + }, + { + "epoch": 1.8564446335363765, + "grad_norm": 4.542521953582764, + "learning_rate": 0.0001143565743459898, + "loss": 3.572850341796875, + "step": 178900 + }, + { + "epoch": 1.8574823331638424, + "grad_norm": 7.155478477478027, + "learning_rate": 0.00011425280438324322, + "loss": 3.6194467163085937, + "step": 179000 + }, + { + "epoch": 1.8585200327913083, + "grad_norm": 5.109609603881836, + "learning_rate": 0.00011414903442049664, + "loss": 3.585841064453125, + "step": 179100 + }, + { + "epoch": 1.8595577324187742, + "grad_norm": 4.251883506774902, + "learning_rate": 0.00011404526445775005, + "loss": 3.4581594848632813, + "step": 179200 + }, + { + "epoch": 1.86059543204624, + "grad_norm": 22.98354148864746, + "learning_rate": 0.00011394149449500347, + "loss": 3.47680419921875, + "step": 179300 + }, + { + "epoch": 1.8616331316737056, + "grad_norm": 4.897403240203857, + "learning_rate": 0.00011383772453225688, + "loss": 3.5364599609375, + "step": 179400 + }, + { + "epoch": 1.8626708313011715, + "grad_norm": 11.166070938110352, + "learning_rate": 0.0001137339545695103, + "loss": 3.4703445434570312, + "step": 179500 + }, + { + "epoch": 1.8637085309286374, + "grad_norm": 3.64528226852417, + "learning_rate": 0.00011363018460676372, + "loss": 3.612529296875, + "step": 179600 + }, + { + "epoch": 1.8647462305561033, + "grad_norm": 3.4828524589538574, + "learning_rate": 0.00011352641464401714, + "loss": 3.622635803222656, + "step": 179700 + }, + { + "epoch": 1.865783930183569, + "grad_norm": 4.965012550354004, + "learning_rate": 0.00011342264468127055, + "loss": 3.420509033203125, + "step": 179800 + }, + { + "epoch": 1.866821629811035, + "grad_norm": 6.657770156860352, + "learning_rate": 0.00011331887471852396, + "loss": 3.57205810546875, + "step": 179900 + }, + { + "epoch": 1.8678593294385006, + "grad_norm": 6.785094738006592, + "learning_rate": 0.00011321510475577738, + "loss": 3.613439025878906, + "step": 180000 + }, + { + "epoch": 1.8688970290659666, + "grad_norm": 3.2131218910217285, + "learning_rate": 0.00011311133479303079, + "loss": 3.721015625, + "step": 180100 + }, + { + "epoch": 1.8699347286934325, + "grad_norm": 3.327937364578247, + "learning_rate": 0.00011300756483028421, + "loss": 3.47718017578125, + "step": 180200 + }, + { + "epoch": 1.8709724283208984, + "grad_norm": 8.65044116973877, + "learning_rate": 0.00011290379486753763, + "loss": 3.6089404296875, + "step": 180300 + }, + { + "epoch": 1.872010127948364, + "grad_norm": 2.0018603801727295, + "learning_rate": 0.00011280002490479106, + "loss": 3.3825112915039064, + "step": 180400 + }, + { + "epoch": 1.8730478275758298, + "grad_norm": 2.7814066410064697, + "learning_rate": 0.00011269625494204446, + "loss": 3.4428082275390626, + "step": 180500 + }, + { + "epoch": 1.8740855272032957, + "grad_norm": 2.5407564640045166, + "learning_rate": 0.00011259248497929789, + "loss": 3.6512811279296873, + "step": 180600 + }, + { + "epoch": 1.8751232268307616, + "grad_norm": 3.6118102073669434, + "learning_rate": 0.0001124887150165513, + "loss": 3.4491305541992188, + "step": 180700 + }, + { + "epoch": 1.8761609264582275, + "grad_norm": 4.681710720062256, + "learning_rate": 0.0001123849450538047, + "loss": 3.5399176025390626, + "step": 180800 + }, + { + "epoch": 1.8771986260856932, + "grad_norm": 5.6345062255859375, + "learning_rate": 0.00011228117509105814, + "loss": 3.580292053222656, + "step": 180900 + }, + { + "epoch": 1.878236325713159, + "grad_norm": 4.881344318389893, + "learning_rate": 0.00011217740512831155, + "loss": 3.5553582763671874, + "step": 181000 + }, + { + "epoch": 1.8792740253406248, + "grad_norm": 3.3916895389556885, + "learning_rate": 0.00011207363516556497, + "loss": 3.468414001464844, + "step": 181100 + }, + { + "epoch": 1.8803117249680907, + "grad_norm": 4.611287593841553, + "learning_rate": 0.00011196986520281838, + "loss": 3.420959167480469, + "step": 181200 + }, + { + "epoch": 1.8813494245955567, + "grad_norm": 3.4268012046813965, + "learning_rate": 0.0001118660952400718, + "loss": 3.614518737792969, + "step": 181300 + }, + { + "epoch": 1.8823871242230226, + "grad_norm": 9.675979614257812, + "learning_rate": 0.00011176232527732521, + "loss": 3.460643310546875, + "step": 181400 + }, + { + "epoch": 1.8834248238504883, + "grad_norm": 4.765254497528076, + "learning_rate": 0.00011165855531457864, + "loss": 3.5331201171875, + "step": 181500 + }, + { + "epoch": 1.884462523477954, + "grad_norm": 12.958268165588379, + "learning_rate": 0.00011155478535183205, + "loss": 3.458702392578125, + "step": 181600 + }, + { + "epoch": 1.8855002231054199, + "grad_norm": 3.9760847091674805, + "learning_rate": 0.00011145101538908547, + "loss": 3.5144024658203126, + "step": 181700 + }, + { + "epoch": 1.8865379227328858, + "grad_norm": 3.063124656677246, + "learning_rate": 0.00011134724542633888, + "loss": 3.3591217041015624, + "step": 181800 + }, + { + "epoch": 1.8875756223603517, + "grad_norm": 14.115145683288574, + "learning_rate": 0.00011124347546359229, + "loss": 3.5416494750976564, + "step": 181900 + }, + { + "epoch": 1.8886133219878174, + "grad_norm": 2.602299213409424, + "learning_rate": 0.00011113970550084571, + "loss": 3.4190499877929685, + "step": 182000 + }, + { + "epoch": 1.889651021615283, + "grad_norm": 6.7280168533325195, + "learning_rate": 0.00011103593553809912, + "loss": 3.3795068359375, + "step": 182100 + }, + { + "epoch": 1.890688721242749, + "grad_norm": 6.911862850189209, + "learning_rate": 0.00011093216557535256, + "loss": 3.5166439819335937, + "step": 182200 + }, + { + "epoch": 1.891726420870215, + "grad_norm": 6.751010894775391, + "learning_rate": 0.00011082839561260596, + "loss": 3.4338143920898436, + "step": 182300 + }, + { + "epoch": 1.8927641204976808, + "grad_norm": 4.327939510345459, + "learning_rate": 0.00011072462564985939, + "loss": 3.4822421264648438, + "step": 182400 + }, + { + "epoch": 1.8938018201251465, + "grad_norm": 2.485795259475708, + "learning_rate": 0.0001106208556871128, + "loss": 3.464154052734375, + "step": 182500 + }, + { + "epoch": 1.8948395197526124, + "grad_norm": 104.476318359375, + "learning_rate": 0.00011051708572436622, + "loss": 3.4480935668945314, + "step": 182600 + }, + { + "epoch": 1.8958772193800781, + "grad_norm": 2.829188346862793, + "learning_rate": 0.00011041331576161963, + "loss": 3.593952331542969, + "step": 182700 + }, + { + "epoch": 1.896914919007544, + "grad_norm": 4.845984935760498, + "learning_rate": 0.00011030954579887306, + "loss": 3.244365234375, + "step": 182800 + }, + { + "epoch": 1.89795261863501, + "grad_norm": 2.055333375930786, + "learning_rate": 0.00011020577583612647, + "loss": 3.5465518188476564, + "step": 182900 + }, + { + "epoch": 1.8989903182624759, + "grad_norm": 19.445037841796875, + "learning_rate": 0.00011010200587337988, + "loss": 3.5760122680664064, + "step": 183000 + }, + { + "epoch": 1.9000280178899416, + "grad_norm": 3.0907251834869385, + "learning_rate": 0.0001099982359106333, + "loss": 3.524999084472656, + "step": 183100 + }, + { + "epoch": 1.9010657175174073, + "grad_norm": 1.9697469472885132, + "learning_rate": 0.00010989446594788671, + "loss": 3.4634637451171875, + "step": 183200 + }, + { + "epoch": 1.9021034171448732, + "grad_norm": 6.751926898956299, + "learning_rate": 0.00010979069598514013, + "loss": 3.4596926879882814, + "step": 183300 + }, + { + "epoch": 1.903141116772339, + "grad_norm": 2.561213493347168, + "learning_rate": 0.00010968692602239355, + "loss": 3.5389044189453127, + "step": 183400 + }, + { + "epoch": 1.904178816399805, + "grad_norm": 6.130541801452637, + "learning_rate": 0.00010958315605964697, + "loss": 3.4779763793945313, + "step": 183500 + }, + { + "epoch": 1.9052165160272707, + "grad_norm": 3.2996444702148438, + "learning_rate": 0.00010947938609690038, + "loss": 3.4853436279296877, + "step": 183600 + }, + { + "epoch": 1.9062542156547366, + "grad_norm": 4.535896301269531, + "learning_rate": 0.0001093756161341538, + "loss": 3.4235238647460937, + "step": 183700 + }, + { + "epoch": 1.9072919152822023, + "grad_norm": 4.082485675811768, + "learning_rate": 0.00010927184617140721, + "loss": 3.4645541381835936, + "step": 183800 + }, + { + "epoch": 1.9083296149096682, + "grad_norm": 5.501161098480225, + "learning_rate": 0.00010916807620866062, + "loss": 3.555899658203125, + "step": 183900 + }, + { + "epoch": 1.9093673145371342, + "grad_norm": 7.624723434448242, + "learning_rate": 0.00010906430624591404, + "loss": 3.4653219604492187, + "step": 184000 + }, + { + "epoch": 1.9104050141646, + "grad_norm": 3.386392116546631, + "learning_rate": 0.00010896053628316746, + "loss": 3.5530450439453123, + "step": 184100 + }, + { + "epoch": 1.9114427137920658, + "grad_norm": 4.087791442871094, + "learning_rate": 0.00010885676632042089, + "loss": 3.470418701171875, + "step": 184200 + }, + { + "epoch": 1.9124804134195315, + "grad_norm": 4.145429611206055, + "learning_rate": 0.0001087529963576743, + "loss": 3.416697692871094, + "step": 184300 + }, + { + "epoch": 1.9135181130469974, + "grad_norm": 4.366927623748779, + "learning_rate": 0.00010864922639492772, + "loss": 3.4999765014648436, + "step": 184400 + }, + { + "epoch": 1.9145558126744633, + "grad_norm": 4.084202289581299, + "learning_rate": 0.00010854545643218113, + "loss": 3.435041809082031, + "step": 184500 + }, + { + "epoch": 1.9155935123019292, + "grad_norm": 9.935702323913574, + "learning_rate": 0.00010844168646943455, + "loss": 3.54100341796875, + "step": 184600 + }, + { + "epoch": 1.916631211929395, + "grad_norm": 6.931925296783447, + "learning_rate": 0.00010833791650668797, + "loss": 3.5136874389648436, + "step": 184700 + }, + { + "epoch": 1.9176689115568606, + "grad_norm": 3.0231878757476807, + "learning_rate": 0.00010823414654394139, + "loss": 3.6150555419921875, + "step": 184800 + }, + { + "epoch": 1.9187066111843265, + "grad_norm": 3.3393242359161377, + "learning_rate": 0.0001081303765811948, + "loss": 3.479617004394531, + "step": 184900 + }, + { + "epoch": 1.9197443108117924, + "grad_norm": 1.9449257850646973, + "learning_rate": 0.00010802660661844821, + "loss": 3.4772000122070312, + "step": 185000 + }, + { + "epoch": 1.9207820104392583, + "grad_norm": 5.924251079559326, + "learning_rate": 0.00010792283665570163, + "loss": 3.558631591796875, + "step": 185100 + }, + { + "epoch": 1.9218197100667243, + "grad_norm": 3.7242231369018555, + "learning_rate": 0.00010781906669295504, + "loss": 3.4901129150390626, + "step": 185200 + }, + { + "epoch": 1.92285740969419, + "grad_norm": 4.291270732879639, + "learning_rate": 0.00010771529673020847, + "loss": 3.4830392456054686, + "step": 185300 + }, + { + "epoch": 1.9238951093216556, + "grad_norm": 8.315948486328125, + "learning_rate": 0.00010761152676746188, + "loss": 3.654394226074219, + "step": 185400 + }, + { + "epoch": 1.9249328089491216, + "grad_norm": 3.3864219188690186, + "learning_rate": 0.0001075077568047153, + "loss": 3.4916171264648437, + "step": 185500 + }, + { + "epoch": 1.9259705085765875, + "grad_norm": 2.4446215629577637, + "learning_rate": 0.00010740398684196871, + "loss": 3.5801641845703127, + "step": 185600 + }, + { + "epoch": 1.9270082082040534, + "grad_norm": 4.319270133972168, + "learning_rate": 0.00010730021687922213, + "loss": 3.485596008300781, + "step": 185700 + }, + { + "epoch": 1.928045907831519, + "grad_norm": 12.243918418884277, + "learning_rate": 0.00010719644691647554, + "loss": 3.297283020019531, + "step": 185800 + }, + { + "epoch": 1.9290836074589848, + "grad_norm": 3.614396333694458, + "learning_rate": 0.00010709267695372895, + "loss": 3.4672842407226563, + "step": 185900 + }, + { + "epoch": 1.9301213070864507, + "grad_norm": 7.824878692626953, + "learning_rate": 0.00010698890699098239, + "loss": 3.5030999755859376, + "step": 186000 + }, + { + "epoch": 1.9311590067139166, + "grad_norm": 11.845438003540039, + "learning_rate": 0.0001068851370282358, + "loss": 3.5722430419921873, + "step": 186100 + }, + { + "epoch": 1.9321967063413825, + "grad_norm": 8.008241653442383, + "learning_rate": 0.00010678136706548922, + "loss": 3.4848983764648436, + "step": 186200 + }, + { + "epoch": 1.9332344059688482, + "grad_norm": 38.26485824584961, + "learning_rate": 0.00010667759710274262, + "loss": 3.4654171752929686, + "step": 186300 + }, + { + "epoch": 1.9342721055963141, + "grad_norm": 3.587207317352295, + "learning_rate": 0.00010657382713999605, + "loss": 3.443753967285156, + "step": 186400 + }, + { + "epoch": 1.9353098052237798, + "grad_norm": 7.548192024230957, + "learning_rate": 0.00010647005717724946, + "loss": 3.555989074707031, + "step": 186500 + }, + { + "epoch": 1.9363475048512457, + "grad_norm": 5.652491092681885, + "learning_rate": 0.00010636628721450289, + "loss": 3.5138848876953124, + "step": 186600 + }, + { + "epoch": 1.9373852044787117, + "grad_norm": 4.181760311126709, + "learning_rate": 0.0001062625172517563, + "loss": 3.4649755859375, + "step": 186700 + }, + { + "epoch": 1.9384229041061776, + "grad_norm": 39.51677703857422, + "learning_rate": 0.00010615874728900972, + "loss": 3.4170611572265623, + "step": 186800 + }, + { + "epoch": 1.9394606037336433, + "grad_norm": 5.663796901702881, + "learning_rate": 0.00010605497732626313, + "loss": 3.6423403930664064, + "step": 186900 + }, + { + "epoch": 1.940498303361109, + "grad_norm": 49.58971405029297, + "learning_rate": 0.00010595120736351654, + "loss": 3.556903076171875, + "step": 187000 + }, + { + "epoch": 1.9415360029885749, + "grad_norm": 4.037705421447754, + "learning_rate": 0.00010584743740076996, + "loss": 3.581287536621094, + "step": 187100 + }, + { + "epoch": 1.9425737026160408, + "grad_norm": 2.6354784965515137, + "learning_rate": 0.00010574366743802338, + "loss": 3.4927523803710936, + "step": 187200 + }, + { + "epoch": 1.9436114022435067, + "grad_norm": 3.8889167308807373, + "learning_rate": 0.0001056398974752768, + "loss": 3.485701904296875, + "step": 187300 + }, + { + "epoch": 1.9446491018709724, + "grad_norm": 6.694062232971191, + "learning_rate": 0.00010553612751253021, + "loss": 3.3910641479492187, + "step": 187400 + }, + { + "epoch": 1.9456868014984383, + "grad_norm": 5.231113910675049, + "learning_rate": 0.00010543235754978363, + "loss": 3.5116064453125, + "step": 187500 + }, + { + "epoch": 1.946724501125904, + "grad_norm": 13.281269073486328, + "learning_rate": 0.00010532858758703704, + "loss": 3.5454452514648436, + "step": 187600 + }, + { + "epoch": 1.94776220075337, + "grad_norm": 5.362813472747803, + "learning_rate": 0.00010522481762429046, + "loss": 3.5717642211914065, + "step": 187700 + }, + { + "epoch": 1.9487999003808358, + "grad_norm": 3.0265583992004395, + "learning_rate": 0.00010512104766154387, + "loss": 3.529712829589844, + "step": 187800 + }, + { + "epoch": 1.9498376000083018, + "grad_norm": 2.4003071784973145, + "learning_rate": 0.00010501727769879731, + "loss": 3.5179287719726564, + "step": 187900 + }, + { + "epoch": 1.9508752996357674, + "grad_norm": 3.5519869327545166, + "learning_rate": 0.00010491350773605072, + "loss": 3.3665447998046876, + "step": 188000 + }, + { + "epoch": 1.9519129992632331, + "grad_norm": 1.9300223588943481, + "learning_rate": 0.00010480973777330412, + "loss": 3.5477023315429688, + "step": 188100 + }, + { + "epoch": 1.952950698890699, + "grad_norm": 3.3745410442352295, + "learning_rate": 0.00010470596781055755, + "loss": 3.5283209228515626, + "step": 188200 + }, + { + "epoch": 1.953988398518165, + "grad_norm": 18.314775466918945, + "learning_rate": 0.00010460219784781096, + "loss": 3.4730484008789064, + "step": 188300 + }, + { + "epoch": 1.9550260981456309, + "grad_norm": 4.006529331207275, + "learning_rate": 0.00010449842788506438, + "loss": 3.4675115966796874, + "step": 188400 + }, + { + "epoch": 1.9560637977730966, + "grad_norm": 4.9441094398498535, + "learning_rate": 0.0001043946579223178, + "loss": 3.404721984863281, + "step": 188500 + }, + { + "epoch": 1.9571014974005623, + "grad_norm": 3.18265962600708, + "learning_rate": 0.00010429088795957122, + "loss": 3.667085876464844, + "step": 188600 + }, + { + "epoch": 1.9581391970280282, + "grad_norm": 3.0164151191711426, + "learning_rate": 0.00010418711799682463, + "loss": 3.5224847412109375, + "step": 188700 + }, + { + "epoch": 1.959176896655494, + "grad_norm": 5.3650007247924805, + "learning_rate": 0.00010408334803407805, + "loss": 3.4098544311523438, + "step": 188800 + }, + { + "epoch": 1.96021459628296, + "grad_norm": 6.3775224685668945, + "learning_rate": 0.00010397957807133146, + "loss": 3.649906005859375, + "step": 188900 + }, + { + "epoch": 1.9612522959104257, + "grad_norm": 18.32954978942871, + "learning_rate": 0.00010387580810858487, + "loss": 3.642203674316406, + "step": 189000 + }, + { + "epoch": 1.9622899955378916, + "grad_norm": 3.267017126083374, + "learning_rate": 0.0001037720381458383, + "loss": 3.522268981933594, + "step": 189100 + }, + { + "epoch": 1.9633276951653573, + "grad_norm": 3.3189854621887207, + "learning_rate": 0.00010366826818309171, + "loss": 3.525494384765625, + "step": 189200 + }, + { + "epoch": 1.9643653947928232, + "grad_norm": 20.459917068481445, + "learning_rate": 0.00010356449822034513, + "loss": 3.4846673583984376, + "step": 189300 + }, + { + "epoch": 1.9654030944202892, + "grad_norm": 10.600302696228027, + "learning_rate": 0.00010346072825759854, + "loss": 3.4710623168945314, + "step": 189400 + }, + { + "epoch": 1.966440794047755, + "grad_norm": 5.836012363433838, + "learning_rate": 0.00010335695829485196, + "loss": 3.395472412109375, + "step": 189500 + }, + { + "epoch": 1.9674784936752208, + "grad_norm": 1.8093000650405884, + "learning_rate": 0.00010325318833210537, + "loss": 3.4295391845703125, + "step": 189600 + }, + { + "epoch": 1.9685161933026865, + "grad_norm": 3.580705165863037, + "learning_rate": 0.0001031494183693588, + "loss": 3.571369934082031, + "step": 189700 + }, + { + "epoch": 1.9695538929301524, + "grad_norm": 4.870438575744629, + "learning_rate": 0.00010304564840661222, + "loss": 3.520045166015625, + "step": 189800 + }, + { + "epoch": 1.9705915925576183, + "grad_norm": 3.781505823135376, + "learning_rate": 0.00010294187844386564, + "loss": 3.5424517822265624, + "step": 189900 + }, + { + "epoch": 1.9716292921850842, + "grad_norm": 3.340085983276367, + "learning_rate": 0.00010283810848111905, + "loss": 3.518573913574219, + "step": 190000 + }, + { + "epoch": 1.97266699181255, + "grad_norm": 5.02490234375, + "learning_rate": 0.00010273433851837245, + "loss": 3.3679263305664064, + "step": 190100 + }, + { + "epoch": 1.9737046914400158, + "grad_norm": 4.117876052856445, + "learning_rate": 0.00010263056855562588, + "loss": 3.5929489135742188, + "step": 190200 + }, + { + "epoch": 1.9747423910674815, + "grad_norm": 3.8365478515625, + "learning_rate": 0.00010252679859287929, + "loss": 3.40560302734375, + "step": 190300 + }, + { + "epoch": 1.9757800906949474, + "grad_norm": 7.205904006958008, + "learning_rate": 0.00010242302863013272, + "loss": 3.38099609375, + "step": 190400 + }, + { + "epoch": 1.9768177903224133, + "grad_norm": 2.767961025238037, + "learning_rate": 0.00010231925866738613, + "loss": 3.4381674194335936, + "step": 190500 + }, + { + "epoch": 1.9778554899498793, + "grad_norm": 4.335025310516357, + "learning_rate": 0.00010221548870463955, + "loss": 3.3964199829101562, + "step": 190600 + }, + { + "epoch": 1.978893189577345, + "grad_norm": 4.294001579284668, + "learning_rate": 0.00010211171874189296, + "loss": 3.411571350097656, + "step": 190700 + }, + { + "epoch": 1.9799308892048106, + "grad_norm": 3.6443490982055664, + "learning_rate": 0.00010200794877914638, + "loss": 3.4534707641601563, + "step": 190800 + }, + { + "epoch": 1.9809685888322766, + "grad_norm": 4.729245662689209, + "learning_rate": 0.00010190417881639979, + "loss": 3.577586669921875, + "step": 190900 + }, + { + "epoch": 1.9820062884597425, + "grad_norm": 3.587510108947754, + "learning_rate": 0.00010180040885365323, + "loss": 3.4148577880859374, + "step": 191000 + }, + { + "epoch": 1.9830439880872084, + "grad_norm": 13.635988235473633, + "learning_rate": 0.00010169663889090663, + "loss": 3.531971435546875, + "step": 191100 + }, + { + "epoch": 1.984081687714674, + "grad_norm": 4.0034356117248535, + "learning_rate": 0.00010159286892816004, + "loss": 3.464627685546875, + "step": 191200 + }, + { + "epoch": 1.98511938734214, + "grad_norm": 4.326283931732178, + "learning_rate": 0.00010148909896541346, + "loss": 3.4689093017578125, + "step": 191300 + }, + { + "epoch": 1.9861570869696057, + "grad_norm": 10.159041404724121, + "learning_rate": 0.00010138532900266687, + "loss": 3.4093603515625, + "step": 191400 + }, + { + "epoch": 1.9871947865970716, + "grad_norm": 6.295145511627197, + "learning_rate": 0.0001012815590399203, + "loss": 3.4013311767578127, + "step": 191500 + }, + { + "epoch": 1.9882324862245375, + "grad_norm": 2.6228549480438232, + "learning_rate": 0.0001011777890771737, + "loss": 3.4039892578125, + "step": 191600 + }, + { + "epoch": 1.9892701858520034, + "grad_norm": 2.0637784004211426, + "learning_rate": 0.00010107401911442714, + "loss": 3.4192919921875, + "step": 191700 + }, + { + "epoch": 1.9903078854794691, + "grad_norm": 4.193583011627197, + "learning_rate": 0.00010097024915168055, + "loss": 3.5069757080078126, + "step": 191800 + }, + { + "epoch": 1.9913455851069348, + "grad_norm": 3.6812117099761963, + "learning_rate": 0.00010086647918893397, + "loss": 3.421480712890625, + "step": 191900 + }, + { + "epoch": 1.9923832847344007, + "grad_norm": 33.859195709228516, + "learning_rate": 0.00010076270922618738, + "loss": 3.506886291503906, + "step": 192000 + }, + { + "epoch": 1.9934209843618667, + "grad_norm": 3.308947801589966, + "learning_rate": 0.00010065893926344079, + "loss": 3.424991455078125, + "step": 192100 + }, + { + "epoch": 1.9944586839893326, + "grad_norm": 4.380412578582764, + "learning_rate": 0.00010055516930069421, + "loss": 3.4896340942382813, + "step": 192200 + }, + { + "epoch": 1.9954963836167983, + "grad_norm": 3.492359161376953, + "learning_rate": 0.00010045139933794763, + "loss": 3.403392333984375, + "step": 192300 + }, + { + "epoch": 1.996534083244264, + "grad_norm": 8.865891456604004, + "learning_rate": 0.00010034762937520105, + "loss": 3.60391845703125, + "step": 192400 + }, + { + "epoch": 1.9975717828717299, + "grad_norm": 1.982731819152832, + "learning_rate": 0.00010024385941245446, + "loss": 3.5614895629882812, + "step": 192500 + }, + { + "epoch": 1.9986094824991958, + "grad_norm": 2.9287161827087402, + "learning_rate": 0.00010014008944970788, + "loss": 3.5097760009765624, + "step": 192600 + }, + { + "epoch": 1.9996471821266617, + "grad_norm": 1.8267062902450562, + "learning_rate": 0.00010003631948696129, + "loss": 3.4958160400390623, + "step": 192700 + } + ], + "logging_steps": 100, + "max_steps": 289101, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2645192822135194e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}