{ "best_global_step": 252642, "best_metric": 0.8840214713296911, "best_model_checkpoint": "./results/twitter-roberta-finetune_2025-05-02_15-36-08/checkpoint-252642", "epoch": 4.142858839235883, "eval_steps": 500, "global_step": 421070, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001696378740303075, "grad_norm": 11.637032508850098, "learning_rate": 1.99966411700942e-05, "loss": 0.563, "step": 100 }, { "epoch": 0.000339275748060615, "grad_norm": 4.5099005699157715, "learning_rate": 1.9993248412613596e-05, "loss": 0.4422, "step": 200 }, { "epoch": 0.0005089136220909226, "grad_norm": 5.470180034637451, "learning_rate": 1.998985565513299e-05, "loss": 0.4013, "step": 300 }, { "epoch": 0.00067855149612123, "grad_norm": 6.416843414306641, "learning_rate": 1.9986462897652383e-05, "loss": 0.4121, "step": 400 }, { "epoch": 0.0008481893701515375, "grad_norm": 5.793498516082764, "learning_rate": 1.9983070140171776e-05, "loss": 0.4107, "step": 500 }, { "epoch": 0.0010178272441818451, "grad_norm": 6.405412673950195, "learning_rate": 1.997967738269117e-05, "loss": 0.3963, "step": 600 }, { "epoch": 0.0011874651182121525, "grad_norm": 5.807279586791992, "learning_rate": 1.9976284625210563e-05, "loss": 0.4108, "step": 700 }, { "epoch": 0.00135710299224246, "grad_norm": 6.905944347381592, "learning_rate": 1.997289186772996e-05, "loss": 0.3873, "step": 800 }, { "epoch": 0.0015267408662727674, "grad_norm": 6.016286373138428, "learning_rate": 1.9969499110249354e-05, "loss": 0.4169, "step": 900 }, { "epoch": 0.001696378740303075, "grad_norm": 5.017392158508301, "learning_rate": 1.9966106352768746e-05, "loss": 0.4026, "step": 1000 }, { "epoch": 0.0018660166143333826, "grad_norm": 4.776495456695557, "learning_rate": 1.996271359528814e-05, "loss": 0.383, "step": 1100 }, { "epoch": 0.0020356544883636902, "grad_norm": 5.569226264953613, "learning_rate": 1.9959320837807534e-05, "loss": 0.383, "step": 1200 }, { "epoch": 0.0022052923623939974, "grad_norm": 7.324199199676514, "learning_rate": 1.9955928080326926e-05, "loss": 0.3794, "step": 1300 }, { "epoch": 0.002374930236424305, "grad_norm": 5.1971516609191895, "learning_rate": 1.995253532284632e-05, "loss": 0.3868, "step": 1400 }, { "epoch": 0.0025445681104546126, "grad_norm": 9.394813537597656, "learning_rate": 1.9949142565365717e-05, "loss": 0.3795, "step": 1500 }, { "epoch": 0.00271420598448492, "grad_norm": 6.607308387756348, "learning_rate": 1.994574980788511e-05, "loss": 0.3569, "step": 1600 }, { "epoch": 0.0028838438585152277, "grad_norm": 4.745519161224365, "learning_rate": 1.9942357050404505e-05, "loss": 0.345, "step": 1700 }, { "epoch": 0.003053481732545535, "grad_norm": 7.081218719482422, "learning_rate": 1.9938964292923897e-05, "loss": 0.3853, "step": 1800 }, { "epoch": 0.0032231196065758425, "grad_norm": 7.3311004638671875, "learning_rate": 1.993557153544329e-05, "loss": 0.388, "step": 1900 }, { "epoch": 0.00339275748060615, "grad_norm": 10.336955070495605, "learning_rate": 1.9932178777962684e-05, "loss": 0.3709, "step": 2000 }, { "epoch": 0.0035623953546364577, "grad_norm": 6.447458267211914, "learning_rate": 1.992878602048208e-05, "loss": 0.3512, "step": 2100 }, { "epoch": 0.0037320332286667653, "grad_norm": 6.393381118774414, "learning_rate": 1.9925393263001472e-05, "loss": 0.3607, "step": 2200 }, { "epoch": 0.0039016711026970724, "grad_norm": 6.43084192276001, "learning_rate": 1.9922000505520867e-05, "loss": 0.3733, "step": 2300 }, { "epoch": 0.0040713089767273804, "grad_norm": 11.860773086547852, "learning_rate": 1.991860774804026e-05, "loss": 0.3544, "step": 2400 }, { "epoch": 0.004240946850757687, "grad_norm": 10.267533302307129, "learning_rate": 1.9915214990559655e-05, "loss": 0.3762, "step": 2500 }, { "epoch": 0.004410584724787995, "grad_norm": 5.229025840759277, "learning_rate": 1.9911822233079047e-05, "loss": 0.3838, "step": 2600 }, { "epoch": 0.004580222598818302, "grad_norm": 5.431205749511719, "learning_rate": 1.9908429475598442e-05, "loss": 0.3331, "step": 2700 }, { "epoch": 0.00474986047284861, "grad_norm": 10.326428413391113, "learning_rate": 1.9905036718117835e-05, "loss": 0.3726, "step": 2800 }, { "epoch": 0.0049194983468789175, "grad_norm": 6.539595603942871, "learning_rate": 1.990164396063723e-05, "loss": 0.3714, "step": 2900 }, { "epoch": 0.005089136220909225, "grad_norm": 7.394728660583496, "learning_rate": 1.9898251203156622e-05, "loss": 0.3616, "step": 3000 }, { "epoch": 0.005258774094939533, "grad_norm": 11.455352783203125, "learning_rate": 1.9894858445676018e-05, "loss": 0.344, "step": 3100 }, { "epoch": 0.00542841196896984, "grad_norm": 4.819492816925049, "learning_rate": 1.989146568819541e-05, "loss": 0.3531, "step": 3200 }, { "epoch": 0.005598049843000148, "grad_norm": 8.217123031616211, "learning_rate": 1.9888072930714805e-05, "loss": 0.3514, "step": 3300 }, { "epoch": 0.0057676877170304555, "grad_norm": 9.882905006408691, "learning_rate": 1.9884680173234197e-05, "loss": 0.3512, "step": 3400 }, { "epoch": 0.005937325591060762, "grad_norm": 10.056175231933594, "learning_rate": 1.9881287415753593e-05, "loss": 0.3824, "step": 3500 }, { "epoch": 0.00610696346509107, "grad_norm": 7.156865119934082, "learning_rate": 1.987789465827299e-05, "loss": 0.3351, "step": 3600 }, { "epoch": 0.006276601339121377, "grad_norm": 4.398000240325928, "learning_rate": 1.987450190079238e-05, "loss": 0.3599, "step": 3700 }, { "epoch": 0.006446239213151685, "grad_norm": 6.533860206604004, "learning_rate": 1.9871109143311773e-05, "loss": 0.3438, "step": 3800 }, { "epoch": 0.006615877087181993, "grad_norm": 5.555047035217285, "learning_rate": 1.9867716385831168e-05, "loss": 0.3615, "step": 3900 }, { "epoch": 0.0067855149612123, "grad_norm": 5.322057723999023, "learning_rate": 1.986432362835056e-05, "loss": 0.3557, "step": 4000 }, { "epoch": 0.006955152835242608, "grad_norm": 7.200858116149902, "learning_rate": 1.9860930870869956e-05, "loss": 0.358, "step": 4100 }, { "epoch": 0.007124790709272915, "grad_norm": 4.7612128257751465, "learning_rate": 1.985753811338935e-05, "loss": 0.3245, "step": 4200 }, { "epoch": 0.007294428583303223, "grad_norm": 9.209895133972168, "learning_rate": 1.9854145355908743e-05, "loss": 0.3491, "step": 4300 }, { "epoch": 0.0074640664573335305, "grad_norm": 2.504087209701538, "learning_rate": 1.985075259842814e-05, "loss": 0.3551, "step": 4400 }, { "epoch": 0.007633704331363837, "grad_norm": 4.516504287719727, "learning_rate": 1.984735984094753e-05, "loss": 0.3547, "step": 4500 }, { "epoch": 0.007803342205394145, "grad_norm": 8.417220115661621, "learning_rate": 1.9843967083466923e-05, "loss": 0.3258, "step": 4600 }, { "epoch": 0.007972980079424452, "grad_norm": 4.931686878204346, "learning_rate": 1.984057432598632e-05, "loss": 0.3685, "step": 4700 }, { "epoch": 0.008142617953454761, "grad_norm": 5.266425132751465, "learning_rate": 1.9837181568505714e-05, "loss": 0.3135, "step": 4800 }, { "epoch": 0.008312255827485068, "grad_norm": 5.692936420440674, "learning_rate": 1.9833788811025106e-05, "loss": 0.3268, "step": 4900 }, { "epoch": 0.008481893701515374, "grad_norm": 4.730175971984863, "learning_rate": 1.98303960535445e-05, "loss": 0.3564, "step": 5000 }, { "epoch": 0.008651531575545683, "grad_norm": 5.576816082000732, "learning_rate": 1.9827003296063894e-05, "loss": 0.3475, "step": 5100 }, { "epoch": 0.00882116944957599, "grad_norm": 5.672577857971191, "learning_rate": 1.982361053858329e-05, "loss": 0.3356, "step": 5200 }, { "epoch": 0.008990807323606298, "grad_norm": 7.343998908996582, "learning_rate": 1.982021778110268e-05, "loss": 0.3388, "step": 5300 }, { "epoch": 0.009160445197636605, "grad_norm": 9.448531150817871, "learning_rate": 1.9816825023622073e-05, "loss": 0.3477, "step": 5400 }, { "epoch": 0.009330083071666913, "grad_norm": 6.377099514007568, "learning_rate": 1.981343226614147e-05, "loss": 0.3211, "step": 5500 }, { "epoch": 0.00949972094569722, "grad_norm": 4.412994384765625, "learning_rate": 1.9810039508660864e-05, "loss": 0.3503, "step": 5600 }, { "epoch": 0.009669358819727528, "grad_norm": 7.966634273529053, "learning_rate": 1.9806646751180256e-05, "loss": 0.3119, "step": 5700 }, { "epoch": 0.009838996693757835, "grad_norm": 4.002699851989746, "learning_rate": 1.9803253993699652e-05, "loss": 0.3428, "step": 5800 }, { "epoch": 0.010008634567788144, "grad_norm": 16.6043701171875, "learning_rate": 1.9799861236219044e-05, "loss": 0.3251, "step": 5900 }, { "epoch": 0.01017827244181845, "grad_norm": 6.069911956787109, "learning_rate": 1.9796468478738436e-05, "loss": 0.3369, "step": 6000 }, { "epoch": 0.010347910315848757, "grad_norm": 7.189563751220703, "learning_rate": 1.979307572125783e-05, "loss": 0.3499, "step": 6100 }, { "epoch": 0.010517548189879065, "grad_norm": 2.9475955963134766, "learning_rate": 1.9789682963777227e-05, "loss": 0.3295, "step": 6200 }, { "epoch": 0.010687186063909372, "grad_norm": 4.195947170257568, "learning_rate": 1.9786290206296622e-05, "loss": 0.3545, "step": 6300 }, { "epoch": 0.01085682393793968, "grad_norm": 7.09829044342041, "learning_rate": 1.9782897448816015e-05, "loss": 0.3262, "step": 6400 }, { "epoch": 0.011026461811969987, "grad_norm": 7.042442798614502, "learning_rate": 1.9779504691335407e-05, "loss": 0.3127, "step": 6500 }, { "epoch": 0.011196099686000296, "grad_norm": 3.9595746994018555, "learning_rate": 1.9776111933854802e-05, "loss": 0.3484, "step": 6600 }, { "epoch": 0.011365737560030603, "grad_norm": 5.4327168464660645, "learning_rate": 1.9772719176374194e-05, "loss": 0.3539, "step": 6700 }, { "epoch": 0.011535375434060911, "grad_norm": 6.782555103302002, "learning_rate": 1.976932641889359e-05, "loss": 0.3279, "step": 6800 }, { "epoch": 0.011705013308091218, "grad_norm": 4.201333999633789, "learning_rate": 1.9765933661412985e-05, "loss": 0.3142, "step": 6900 }, { "epoch": 0.011874651182121524, "grad_norm": 11.040885925292969, "learning_rate": 1.9762540903932377e-05, "loss": 0.3248, "step": 7000 }, { "epoch": 0.012044289056151833, "grad_norm": 8.307191848754883, "learning_rate": 1.9759148146451773e-05, "loss": 0.3371, "step": 7100 }, { "epoch": 0.01221392693018214, "grad_norm": 4.398335933685303, "learning_rate": 1.9755755388971165e-05, "loss": 0.3268, "step": 7200 }, { "epoch": 0.012383564804212448, "grad_norm": 3.7130706310272217, "learning_rate": 1.9752362631490557e-05, "loss": 0.3242, "step": 7300 }, { "epoch": 0.012553202678242755, "grad_norm": 4.7666521072387695, "learning_rate": 1.9748969874009953e-05, "loss": 0.3255, "step": 7400 }, { "epoch": 0.012722840552273063, "grad_norm": 9.544007301330566, "learning_rate": 1.9745577116529348e-05, "loss": 0.3511, "step": 7500 }, { "epoch": 0.01289247842630337, "grad_norm": 4.180122375488281, "learning_rate": 1.974218435904874e-05, "loss": 0.326, "step": 7600 }, { "epoch": 0.013062116300333678, "grad_norm": 6.223621368408203, "learning_rate": 1.9738791601568136e-05, "loss": 0.3341, "step": 7700 }, { "epoch": 0.013231754174363985, "grad_norm": 6.588081359863281, "learning_rate": 1.9735398844087528e-05, "loss": 0.3423, "step": 7800 }, { "epoch": 0.013401392048394292, "grad_norm": 8.111326217651367, "learning_rate": 1.973200608660692e-05, "loss": 0.3438, "step": 7900 }, { "epoch": 0.0135710299224246, "grad_norm": 4.608439922332764, "learning_rate": 1.9728613329126315e-05, "loss": 0.3257, "step": 8000 }, { "epoch": 0.013740667796454907, "grad_norm": 3.966379404067993, "learning_rate": 1.9725220571645707e-05, "loss": 0.3177, "step": 8100 }, { "epoch": 0.013910305670485216, "grad_norm": 3.49314022064209, "learning_rate": 1.9721827814165103e-05, "loss": 0.3291, "step": 8200 }, { "epoch": 0.014079943544515522, "grad_norm": 7.768034934997559, "learning_rate": 1.97184350566845e-05, "loss": 0.3193, "step": 8300 }, { "epoch": 0.01424958141854583, "grad_norm": 4.5671000480651855, "learning_rate": 1.971504229920389e-05, "loss": 0.3319, "step": 8400 }, { "epoch": 0.014419219292576137, "grad_norm": 5.005865097045898, "learning_rate": 1.9711649541723286e-05, "loss": 0.3279, "step": 8500 }, { "epoch": 0.014588857166606446, "grad_norm": 3.8997530937194824, "learning_rate": 1.9708256784242678e-05, "loss": 0.3291, "step": 8600 }, { "epoch": 0.014758495040636753, "grad_norm": 5.730672836303711, "learning_rate": 1.970486402676207e-05, "loss": 0.3187, "step": 8700 }, { "epoch": 0.014928132914667061, "grad_norm": 6.161813259124756, "learning_rate": 1.9701471269281466e-05, "loss": 0.337, "step": 8800 }, { "epoch": 0.015097770788697368, "grad_norm": 3.074876308441162, "learning_rate": 1.969807851180086e-05, "loss": 0.3317, "step": 8900 }, { "epoch": 0.015267408662727674, "grad_norm": 7.350127220153809, "learning_rate": 1.9694685754320257e-05, "loss": 0.3036, "step": 9000 }, { "epoch": 0.015437046536757983, "grad_norm": 8.203046798706055, "learning_rate": 1.969129299683965e-05, "loss": 0.3291, "step": 9100 }, { "epoch": 0.01560668441078829, "grad_norm": 3.161381959915161, "learning_rate": 1.968790023935904e-05, "loss": 0.3352, "step": 9200 }, { "epoch": 0.015776322284818598, "grad_norm": 5.06301736831665, "learning_rate": 1.9684507481878436e-05, "loss": 0.3171, "step": 9300 }, { "epoch": 0.015945960158848905, "grad_norm": 7.062092304229736, "learning_rate": 1.968111472439783e-05, "loss": 0.34, "step": 9400 }, { "epoch": 0.01611559803287921, "grad_norm": 7.699665546417236, "learning_rate": 1.9677721966917224e-05, "loss": 0.3005, "step": 9500 }, { "epoch": 0.016285235906909522, "grad_norm": 3.865088701248169, "learning_rate": 1.967432920943662e-05, "loss": 0.3329, "step": 9600 }, { "epoch": 0.01645487378093983, "grad_norm": 4.331723690032959, "learning_rate": 1.967093645195601e-05, "loss": 0.3071, "step": 9700 }, { "epoch": 0.016624511654970135, "grad_norm": 3.1663691997528076, "learning_rate": 1.9667543694475404e-05, "loss": 0.3253, "step": 9800 }, { "epoch": 0.016794149529000442, "grad_norm": 6.1092329025268555, "learning_rate": 1.96641509369948e-05, "loss": 0.3199, "step": 9900 }, { "epoch": 0.01696378740303075, "grad_norm": 7.909847736358643, "learning_rate": 1.966075817951419e-05, "loss": 0.3289, "step": 10000 }, { "epoch": 0.01713342527706106, "grad_norm": 4.816249370574951, "learning_rate": 1.9657365422033587e-05, "loss": 0.3056, "step": 10100 }, { "epoch": 0.017303063151091366, "grad_norm": 6.455096244812012, "learning_rate": 1.965397266455298e-05, "loss": 0.3254, "step": 10200 }, { "epoch": 0.017472701025121672, "grad_norm": 5.755128860473633, "learning_rate": 1.9650579907072374e-05, "loss": 0.3266, "step": 10300 }, { "epoch": 0.01764233889915198, "grad_norm": 3.7598836421966553, "learning_rate": 1.964718714959177e-05, "loss": 0.3395, "step": 10400 }, { "epoch": 0.01781197677318229, "grad_norm": 12.385309219360352, "learning_rate": 1.9643794392111162e-05, "loss": 0.3264, "step": 10500 }, { "epoch": 0.017981614647212596, "grad_norm": 6.684805870056152, "learning_rate": 1.9640401634630554e-05, "loss": 0.3228, "step": 10600 }, { "epoch": 0.018151252521242903, "grad_norm": 3.029668092727661, "learning_rate": 1.963700887714995e-05, "loss": 0.3212, "step": 10700 }, { "epoch": 0.01832089039527321, "grad_norm": 4.2430877685546875, "learning_rate": 1.963361611966934e-05, "loss": 0.3185, "step": 10800 }, { "epoch": 0.018490528269303516, "grad_norm": 5.194127559661865, "learning_rate": 1.9630223362188737e-05, "loss": 0.3373, "step": 10900 }, { "epoch": 0.018660166143333826, "grad_norm": 3.005768060684204, "learning_rate": 1.9626830604708133e-05, "loss": 0.3211, "step": 11000 }, { "epoch": 0.018829804017364133, "grad_norm": 6.31130313873291, "learning_rate": 1.9623437847227525e-05, "loss": 0.3288, "step": 11100 }, { "epoch": 0.01899944189139444, "grad_norm": 7.13035774230957, "learning_rate": 1.962004508974692e-05, "loss": 0.332, "step": 11200 }, { "epoch": 0.019169079765424746, "grad_norm": 6.853993892669678, "learning_rate": 1.9616652332266312e-05, "loss": 0.3204, "step": 11300 }, { "epoch": 0.019338717639455057, "grad_norm": 4.164268970489502, "learning_rate": 1.9613259574785704e-05, "loss": 0.3331, "step": 11400 }, { "epoch": 0.019508355513485363, "grad_norm": 5.3462395668029785, "learning_rate": 1.96098668173051e-05, "loss": 0.3207, "step": 11500 }, { "epoch": 0.01967799338751567, "grad_norm": 3.625352621078491, "learning_rate": 1.9606474059824495e-05, "loss": 0.3228, "step": 11600 }, { "epoch": 0.019847631261545977, "grad_norm": 3.939159393310547, "learning_rate": 1.9603081302343887e-05, "loss": 0.3163, "step": 11700 }, { "epoch": 0.020017269135576287, "grad_norm": 6.6302690505981445, "learning_rate": 1.9599688544863283e-05, "loss": 0.306, "step": 11800 }, { "epoch": 0.020186907009606594, "grad_norm": 5.077898025512695, "learning_rate": 1.9596295787382675e-05, "loss": 0.3248, "step": 11900 }, { "epoch": 0.0203565448836369, "grad_norm": 5.365169525146484, "learning_rate": 1.959290302990207e-05, "loss": 0.3445, "step": 12000 }, { "epoch": 0.020526182757667207, "grad_norm": 3.2206056118011475, "learning_rate": 1.9589510272421463e-05, "loss": 0.3327, "step": 12100 }, { "epoch": 0.020695820631697514, "grad_norm": 3.9134984016418457, "learning_rate": 1.9586117514940858e-05, "loss": 0.3187, "step": 12200 }, { "epoch": 0.020865458505727824, "grad_norm": 7.120047092437744, "learning_rate": 1.958272475746025e-05, "loss": 0.3156, "step": 12300 }, { "epoch": 0.02103509637975813, "grad_norm": 10.738414764404297, "learning_rate": 1.9579331999979646e-05, "loss": 0.3307, "step": 12400 }, { "epoch": 0.021204734253788438, "grad_norm": 4.907688617706299, "learning_rate": 1.9575939242499038e-05, "loss": 0.337, "step": 12500 }, { "epoch": 0.021374372127818744, "grad_norm": 10.297996520996094, "learning_rate": 1.9572546485018433e-05, "loss": 0.3093, "step": 12600 }, { "epoch": 0.021544010001849054, "grad_norm": 7.308147430419922, "learning_rate": 1.9569153727537825e-05, "loss": 0.3049, "step": 12700 }, { "epoch": 0.02171364787587936, "grad_norm": 4.288365364074707, "learning_rate": 1.956576097005722e-05, "loss": 0.3118, "step": 12800 }, { "epoch": 0.021883285749909668, "grad_norm": 6.546408176422119, "learning_rate": 1.9562368212576613e-05, "loss": 0.3134, "step": 12900 }, { "epoch": 0.022052923623939975, "grad_norm": 4.836111545562744, "learning_rate": 1.955897545509601e-05, "loss": 0.3214, "step": 13000 }, { "epoch": 0.02222256149797028, "grad_norm": 7.453960418701172, "learning_rate": 1.9555582697615404e-05, "loss": 0.2982, "step": 13100 }, { "epoch": 0.02239219937200059, "grad_norm": 4.263779640197754, "learning_rate": 1.9552189940134796e-05, "loss": 0.3225, "step": 13200 }, { "epoch": 0.0225618372460309, "grad_norm": 6.579064846038818, "learning_rate": 1.9548797182654188e-05, "loss": 0.3173, "step": 13300 }, { "epoch": 0.022731475120061205, "grad_norm": 3.298647403717041, "learning_rate": 1.9545404425173584e-05, "loss": 0.3189, "step": 13400 }, { "epoch": 0.022901112994091512, "grad_norm": 5.8416948318481445, "learning_rate": 1.9542011667692976e-05, "loss": 0.3149, "step": 13500 }, { "epoch": 0.023070750868121822, "grad_norm": 4.548083782196045, "learning_rate": 1.953861891021237e-05, "loss": 0.3053, "step": 13600 }, { "epoch": 0.02324038874215213, "grad_norm": 6.395272731781006, "learning_rate": 1.9535226152731767e-05, "loss": 0.3208, "step": 13700 }, { "epoch": 0.023410026616182435, "grad_norm": 10.71978759765625, "learning_rate": 1.953183339525116e-05, "loss": 0.3212, "step": 13800 }, { "epoch": 0.023579664490212742, "grad_norm": 9.526215553283691, "learning_rate": 1.9528440637770554e-05, "loss": 0.3327, "step": 13900 }, { "epoch": 0.02374930236424305, "grad_norm": 4.366086483001709, "learning_rate": 1.9525047880289946e-05, "loss": 0.317, "step": 14000 }, { "epoch": 0.02391894023827336, "grad_norm": 2.759147882461548, "learning_rate": 1.952165512280934e-05, "loss": 0.3144, "step": 14100 }, { "epoch": 0.024088578112303666, "grad_norm": 3.282360553741455, "learning_rate": 1.9518262365328734e-05, "loss": 0.3011, "step": 14200 }, { "epoch": 0.024258215986333972, "grad_norm": 3.6273326873779297, "learning_rate": 1.951486960784813e-05, "loss": 0.3227, "step": 14300 }, { "epoch": 0.02442785386036428, "grad_norm": 5.695254325866699, "learning_rate": 1.951147685036752e-05, "loss": 0.3107, "step": 14400 }, { "epoch": 0.02459749173439459, "grad_norm": 3.841158866882324, "learning_rate": 1.9508084092886917e-05, "loss": 0.3188, "step": 14500 }, { "epoch": 0.024767129608424896, "grad_norm": 5.710983753204346, "learning_rate": 1.950469133540631e-05, "loss": 0.3007, "step": 14600 }, { "epoch": 0.024936767482455203, "grad_norm": 10.147504806518555, "learning_rate": 1.9501298577925705e-05, "loss": 0.3178, "step": 14700 }, { "epoch": 0.02510640535648551, "grad_norm": 3.2488319873809814, "learning_rate": 1.9497905820445097e-05, "loss": 0.3037, "step": 14800 }, { "epoch": 0.025276043230515816, "grad_norm": 4.885373592376709, "learning_rate": 1.9494513062964492e-05, "loss": 0.3325, "step": 14900 }, { "epoch": 0.025445681104546126, "grad_norm": 5.5563201904296875, "learning_rate": 1.9491120305483884e-05, "loss": 0.3269, "step": 15000 }, { "epoch": 0.025615318978576433, "grad_norm": 4.241575241088867, "learning_rate": 1.948772754800328e-05, "loss": 0.2949, "step": 15100 }, { "epoch": 0.02578495685260674, "grad_norm": 5.927247047424316, "learning_rate": 1.9484334790522672e-05, "loss": 0.3169, "step": 15200 }, { "epoch": 0.025954594726637047, "grad_norm": 4.195557594299316, "learning_rate": 1.9480942033042067e-05, "loss": 0.3109, "step": 15300 }, { "epoch": 0.026124232600667357, "grad_norm": 3.0681588649749756, "learning_rate": 1.947754927556146e-05, "loss": 0.3148, "step": 15400 }, { "epoch": 0.026293870474697664, "grad_norm": 7.699918746948242, "learning_rate": 1.9474156518080855e-05, "loss": 0.3165, "step": 15500 }, { "epoch": 0.02646350834872797, "grad_norm": 6.032403469085693, "learning_rate": 1.9470763760600247e-05, "loss": 0.3219, "step": 15600 }, { "epoch": 0.026633146222758277, "grad_norm": 8.282868385314941, "learning_rate": 1.9467371003119643e-05, "loss": 0.3063, "step": 15700 }, { "epoch": 0.026802784096788584, "grad_norm": 6.992432117462158, "learning_rate": 1.9463978245639038e-05, "loss": 0.3338, "step": 15800 }, { "epoch": 0.026972421970818894, "grad_norm": 5.107656002044678, "learning_rate": 1.946058548815843e-05, "loss": 0.3084, "step": 15900 }, { "epoch": 0.0271420598448492, "grad_norm": 5.038749694824219, "learning_rate": 1.9457192730677822e-05, "loss": 0.3089, "step": 16000 }, { "epoch": 0.027311697718879507, "grad_norm": 4.838282585144043, "learning_rate": 1.9453799973197218e-05, "loss": 0.3131, "step": 16100 }, { "epoch": 0.027481335592909814, "grad_norm": 8.015998840332031, "learning_rate": 1.945040721571661e-05, "loss": 0.3116, "step": 16200 }, { "epoch": 0.027650973466940124, "grad_norm": 5.232460021972656, "learning_rate": 1.9447014458236005e-05, "loss": 0.3136, "step": 16300 }, { "epoch": 0.02782061134097043, "grad_norm": 38.65550231933594, "learning_rate": 1.94436217007554e-05, "loss": 0.3188, "step": 16400 }, { "epoch": 0.027990249215000738, "grad_norm": 4.862700939178467, "learning_rate": 1.9440228943274793e-05, "loss": 0.3204, "step": 16500 }, { "epoch": 0.028159887089031044, "grad_norm": 6.910534381866455, "learning_rate": 1.943683618579419e-05, "loss": 0.3087, "step": 16600 }, { "epoch": 0.028329524963061355, "grad_norm": 3.5463767051696777, "learning_rate": 1.943344342831358e-05, "loss": 0.3021, "step": 16700 }, { "epoch": 0.02849916283709166, "grad_norm": 5.984306335449219, "learning_rate": 1.9430050670832973e-05, "loss": 0.3022, "step": 16800 }, { "epoch": 0.028668800711121968, "grad_norm": 4.216242790222168, "learning_rate": 1.9426657913352368e-05, "loss": 0.2981, "step": 16900 }, { "epoch": 0.028838438585152275, "grad_norm": 11.04842472076416, "learning_rate": 1.9423265155871764e-05, "loss": 0.307, "step": 17000 }, { "epoch": 0.02900807645918258, "grad_norm": 3.32411527633667, "learning_rate": 1.9419872398391156e-05, "loss": 0.3295, "step": 17100 }, { "epoch": 0.02917771433321289, "grad_norm": 5.97471809387207, "learning_rate": 1.941647964091055e-05, "loss": 0.3126, "step": 17200 }, { "epoch": 0.0293473522072432, "grad_norm": 4.056634902954102, "learning_rate": 1.9413086883429943e-05, "loss": 0.3082, "step": 17300 }, { "epoch": 0.029516990081273505, "grad_norm": 3.8590571880340576, "learning_rate": 1.9409694125949336e-05, "loss": 0.3106, "step": 17400 }, { "epoch": 0.029686627955303812, "grad_norm": 3.3209075927734375, "learning_rate": 1.940630136846873e-05, "loss": 0.3162, "step": 17500 }, { "epoch": 0.029856265829334122, "grad_norm": 3.8605172634124756, "learning_rate": 1.9402908610988126e-05, "loss": 0.3265, "step": 17600 }, { "epoch": 0.03002590370336443, "grad_norm": 7.332713603973389, "learning_rate": 1.939951585350752e-05, "loss": 0.3323, "step": 17700 }, { "epoch": 0.030195541577394736, "grad_norm": 6.4308762550354, "learning_rate": 1.9396123096026914e-05, "loss": 0.3127, "step": 17800 }, { "epoch": 0.030365179451425042, "grad_norm": 6.543829917907715, "learning_rate": 1.9392730338546306e-05, "loss": 0.3017, "step": 17900 }, { "epoch": 0.03053481732545535, "grad_norm": 11.294573783874512, "learning_rate": 1.93893375810657e-05, "loss": 0.3131, "step": 18000 }, { "epoch": 0.03070445519948566, "grad_norm": 7.962887763977051, "learning_rate": 1.9385944823585094e-05, "loss": 0.3098, "step": 18100 }, { "epoch": 0.030874093073515966, "grad_norm": 6.312958240509033, "learning_rate": 1.9382552066104486e-05, "loss": 0.3278, "step": 18200 }, { "epoch": 0.031043730947546273, "grad_norm": 4.289338111877441, "learning_rate": 1.937915930862388e-05, "loss": 0.3299, "step": 18300 }, { "epoch": 0.03121336882157658, "grad_norm": 5.444443702697754, "learning_rate": 1.9375766551143277e-05, "loss": 0.3003, "step": 18400 }, { "epoch": 0.03138300669560689, "grad_norm": 7.047294616699219, "learning_rate": 1.937237379366267e-05, "loss": 0.3228, "step": 18500 }, { "epoch": 0.031552644569637196, "grad_norm": 4.711514949798584, "learning_rate": 1.9368981036182064e-05, "loss": 0.296, "step": 18600 }, { "epoch": 0.0317222824436675, "grad_norm": 4.039498329162598, "learning_rate": 1.9365588278701457e-05, "loss": 0.3105, "step": 18700 }, { "epoch": 0.03189192031769781, "grad_norm": 5.736058235168457, "learning_rate": 1.9362195521220852e-05, "loss": 0.2998, "step": 18800 }, { "epoch": 0.032061558191728116, "grad_norm": 8.330560684204102, "learning_rate": 1.9358802763740244e-05, "loss": 0.2975, "step": 18900 }, { "epoch": 0.03223119606575842, "grad_norm": 2.6496951580047607, "learning_rate": 1.935541000625964e-05, "loss": 0.3092, "step": 19000 }, { "epoch": 0.03240083393978873, "grad_norm": 9.622672080993652, "learning_rate": 1.9352017248779035e-05, "loss": 0.2943, "step": 19100 }, { "epoch": 0.032570471813819044, "grad_norm": 7.332023620605469, "learning_rate": 1.9348624491298427e-05, "loss": 0.3054, "step": 19200 }, { "epoch": 0.03274010968784935, "grad_norm": 5.8805341720581055, "learning_rate": 1.934523173381782e-05, "loss": 0.3301, "step": 19300 }, { "epoch": 0.03290974756187966, "grad_norm": 5.076361656188965, "learning_rate": 1.9341838976337215e-05, "loss": 0.2861, "step": 19400 }, { "epoch": 0.033079385435909964, "grad_norm": 5.866313934326172, "learning_rate": 1.9338446218856607e-05, "loss": 0.3032, "step": 19500 }, { "epoch": 0.03324902330994027, "grad_norm": 4.01392126083374, "learning_rate": 1.9335053461376002e-05, "loss": 0.3095, "step": 19600 }, { "epoch": 0.03341866118397058, "grad_norm": 4.2507123947143555, "learning_rate": 1.9331660703895398e-05, "loss": 0.3098, "step": 19700 }, { "epoch": 0.033588299058000884, "grad_norm": 4.716035842895508, "learning_rate": 1.932826794641479e-05, "loss": 0.2889, "step": 19800 }, { "epoch": 0.03375793693203119, "grad_norm": 4.572210788726807, "learning_rate": 1.9324875188934185e-05, "loss": 0.3213, "step": 19900 }, { "epoch": 0.0339275748060615, "grad_norm": 7.21707010269165, "learning_rate": 1.9321482431453578e-05, "loss": 0.3115, "step": 20000 }, { "epoch": 0.03409721268009181, "grad_norm": 3.0901918411254883, "learning_rate": 1.931808967397297e-05, "loss": 0.2965, "step": 20100 }, { "epoch": 0.03426685055412212, "grad_norm": 2.76302170753479, "learning_rate": 1.9314696916492365e-05, "loss": 0.3028, "step": 20200 }, { "epoch": 0.034436488428152424, "grad_norm": 5.851570129394531, "learning_rate": 1.9311304159011757e-05, "loss": 0.3077, "step": 20300 }, { "epoch": 0.03460612630218273, "grad_norm": 4.35638427734375, "learning_rate": 1.9307911401531153e-05, "loss": 0.2943, "step": 20400 }, { "epoch": 0.03477576417621304, "grad_norm": 5.771326065063477, "learning_rate": 1.9304518644050548e-05, "loss": 0.2978, "step": 20500 }, { "epoch": 0.034945402050243345, "grad_norm": 7.85276460647583, "learning_rate": 1.930112588656994e-05, "loss": 0.3018, "step": 20600 }, { "epoch": 0.03511503992427365, "grad_norm": 4.6048126220703125, "learning_rate": 1.9297733129089336e-05, "loss": 0.3229, "step": 20700 }, { "epoch": 0.03528467779830396, "grad_norm": 5.740513324737549, "learning_rate": 1.9294340371608728e-05, "loss": 0.3066, "step": 20800 }, { "epoch": 0.035454315672334265, "grad_norm": 3.8959693908691406, "learning_rate": 1.929094761412812e-05, "loss": 0.3238, "step": 20900 }, { "epoch": 0.03562395354636458, "grad_norm": 4.749213695526123, "learning_rate": 1.9287554856647516e-05, "loss": 0.312, "step": 21000 }, { "epoch": 0.035793591420394885, "grad_norm": 3.0183956623077393, "learning_rate": 1.928416209916691e-05, "loss": 0.2993, "step": 21100 }, { "epoch": 0.03596322929442519, "grad_norm": 5.403594493865967, "learning_rate": 1.9280769341686303e-05, "loss": 0.3131, "step": 21200 }, { "epoch": 0.0361328671684555, "grad_norm": 5.854811191558838, "learning_rate": 1.92773765842057e-05, "loss": 0.3178, "step": 21300 }, { "epoch": 0.036302505042485805, "grad_norm": 7.962645053863525, "learning_rate": 1.927398382672509e-05, "loss": 0.3017, "step": 21400 }, { "epoch": 0.03647214291651611, "grad_norm": 5.824877738952637, "learning_rate": 1.9270591069244486e-05, "loss": 0.296, "step": 21500 }, { "epoch": 0.03664178079054642, "grad_norm": 5.648046493530273, "learning_rate": 1.9267198311763878e-05, "loss": 0.2917, "step": 21600 }, { "epoch": 0.036811418664576726, "grad_norm": 3.8472001552581787, "learning_rate": 1.9263805554283274e-05, "loss": 0.313, "step": 21700 }, { "epoch": 0.03698105653860703, "grad_norm": 4.848174571990967, "learning_rate": 1.926041279680267e-05, "loss": 0.3077, "step": 21800 }, { "epoch": 0.037150694412637346, "grad_norm": 5.035398006439209, "learning_rate": 1.925702003932206e-05, "loss": 0.3155, "step": 21900 }, { "epoch": 0.03732033228666765, "grad_norm": 4.550133228302002, "learning_rate": 1.9253627281841453e-05, "loss": 0.3185, "step": 22000 }, { "epoch": 0.03748997016069796, "grad_norm": 8.38109016418457, "learning_rate": 1.925023452436085e-05, "loss": 0.3028, "step": 22100 }, { "epoch": 0.037659608034728266, "grad_norm": 6.193725109100342, "learning_rate": 1.924684176688024e-05, "loss": 0.3012, "step": 22200 }, { "epoch": 0.03782924590875857, "grad_norm": 3.7424495220184326, "learning_rate": 1.9243449009399637e-05, "loss": 0.2925, "step": 22300 }, { "epoch": 0.03799888378278888, "grad_norm": 5.127978324890137, "learning_rate": 1.9240056251919032e-05, "loss": 0.2948, "step": 22400 }, { "epoch": 0.038168521656819186, "grad_norm": 10.110824584960938, "learning_rate": 1.9236663494438424e-05, "loss": 0.29, "step": 22500 }, { "epoch": 0.03833815953084949, "grad_norm": 5.516129016876221, "learning_rate": 1.923327073695782e-05, "loss": 0.3207, "step": 22600 }, { "epoch": 0.0385077974048798, "grad_norm": 4.244800090789795, "learning_rate": 1.9229877979477212e-05, "loss": 0.3054, "step": 22700 }, { "epoch": 0.03867743527891011, "grad_norm": 2.9317479133605957, "learning_rate": 1.9226485221996604e-05, "loss": 0.3049, "step": 22800 }, { "epoch": 0.03884707315294042, "grad_norm": 6.208774089813232, "learning_rate": 1.9223092464516e-05, "loss": 0.2983, "step": 22900 }, { "epoch": 0.03901671102697073, "grad_norm": 5.8420538902282715, "learning_rate": 1.921969970703539e-05, "loss": 0.2871, "step": 23000 }, { "epoch": 0.039186348901001034, "grad_norm": 7.940545082092285, "learning_rate": 1.9216306949554787e-05, "loss": 0.2914, "step": 23100 }, { "epoch": 0.03935598677503134, "grad_norm": 6.439417362213135, "learning_rate": 1.9212914192074182e-05, "loss": 0.3024, "step": 23200 }, { "epoch": 0.03952562464906165, "grad_norm": 5.705142021179199, "learning_rate": 1.9209521434593575e-05, "loss": 0.3234, "step": 23300 }, { "epoch": 0.039695262523091954, "grad_norm": 6.3700785636901855, "learning_rate": 1.920612867711297e-05, "loss": 0.3037, "step": 23400 }, { "epoch": 0.03986490039712226, "grad_norm": 6.309447288513184, "learning_rate": 1.9202735919632362e-05, "loss": 0.3038, "step": 23500 }, { "epoch": 0.040034538271152574, "grad_norm": 6.414402484893799, "learning_rate": 1.9199343162151754e-05, "loss": 0.2905, "step": 23600 }, { "epoch": 0.04020417614518288, "grad_norm": 5.034140586853027, "learning_rate": 1.919595040467115e-05, "loss": 0.287, "step": 23700 }, { "epoch": 0.04037381401921319, "grad_norm": 10.819372177124023, "learning_rate": 1.9192557647190545e-05, "loss": 0.3114, "step": 23800 }, { "epoch": 0.040543451893243494, "grad_norm": 3.805555582046509, "learning_rate": 1.9189164889709937e-05, "loss": 0.2879, "step": 23900 }, { "epoch": 0.0407130897672738, "grad_norm": 5.425251483917236, "learning_rate": 1.9185772132229333e-05, "loss": 0.2888, "step": 24000 }, { "epoch": 0.04088272764130411, "grad_norm": 5.254638671875, "learning_rate": 1.9182379374748725e-05, "loss": 0.3056, "step": 24100 }, { "epoch": 0.041052365515334414, "grad_norm": 5.436036586761475, "learning_rate": 1.9178986617268117e-05, "loss": 0.2902, "step": 24200 }, { "epoch": 0.04122200338936472, "grad_norm": 4.741951942443848, "learning_rate": 1.9175593859787512e-05, "loss": 0.2832, "step": 24300 }, { "epoch": 0.04139164126339503, "grad_norm": 4.86545467376709, "learning_rate": 1.9172201102306908e-05, "loss": 0.3024, "step": 24400 }, { "epoch": 0.04156127913742534, "grad_norm": 4.902988910675049, "learning_rate": 1.9168808344826303e-05, "loss": 0.303, "step": 24500 }, { "epoch": 0.04173091701145565, "grad_norm": 6.969100475311279, "learning_rate": 1.9165415587345696e-05, "loss": 0.309, "step": 24600 }, { "epoch": 0.041900554885485955, "grad_norm": 6.115233421325684, "learning_rate": 1.9162022829865088e-05, "loss": 0.3049, "step": 24700 }, { "epoch": 0.04207019275951626, "grad_norm": 5.473730087280273, "learning_rate": 1.9158630072384483e-05, "loss": 0.3179, "step": 24800 }, { "epoch": 0.04223983063354657, "grad_norm": 8.555501937866211, "learning_rate": 1.9155237314903875e-05, "loss": 0.288, "step": 24900 }, { "epoch": 0.042409468507576875, "grad_norm": 5.673954963684082, "learning_rate": 1.915184455742327e-05, "loss": 0.2964, "step": 25000 }, { "epoch": 0.04257910638160718, "grad_norm": 4.686615467071533, "learning_rate": 1.9148451799942663e-05, "loss": 0.29, "step": 25100 }, { "epoch": 0.04274874425563749, "grad_norm": 2.369277000427246, "learning_rate": 1.914505904246206e-05, "loss": 0.3122, "step": 25200 }, { "epoch": 0.042918382129667795, "grad_norm": 3.499999523162842, "learning_rate": 1.9141666284981454e-05, "loss": 0.3031, "step": 25300 }, { "epoch": 0.04308802000369811, "grad_norm": 4.138624668121338, "learning_rate": 1.9138273527500846e-05, "loss": 0.2964, "step": 25400 }, { "epoch": 0.043257657877728416, "grad_norm": 4.156997203826904, "learning_rate": 1.9134880770020238e-05, "loss": 0.3183, "step": 25500 }, { "epoch": 0.04342729575175872, "grad_norm": 2.943659543991089, "learning_rate": 1.9131488012539634e-05, "loss": 0.2905, "step": 25600 }, { "epoch": 0.04359693362578903, "grad_norm": 4.041491985321045, "learning_rate": 1.9128095255059026e-05, "loss": 0.2893, "step": 25700 }, { "epoch": 0.043766571499819336, "grad_norm": 4.2331223487854, "learning_rate": 1.912470249757842e-05, "loss": 0.3001, "step": 25800 }, { "epoch": 0.04393620937384964, "grad_norm": 4.752974510192871, "learning_rate": 1.9121309740097817e-05, "loss": 0.294, "step": 25900 }, { "epoch": 0.04410584724787995, "grad_norm": 6.499593257904053, "learning_rate": 1.911791698261721e-05, "loss": 0.3109, "step": 26000 }, { "epoch": 0.044275485121910256, "grad_norm": 3.4831254482269287, "learning_rate": 1.91145242251366e-05, "loss": 0.2813, "step": 26100 }, { "epoch": 0.04444512299594056, "grad_norm": 8.836137771606445, "learning_rate": 1.9111131467655996e-05, "loss": 0.3274, "step": 26200 }, { "epoch": 0.044614760869970876, "grad_norm": 4.002375602722168, "learning_rate": 1.910773871017539e-05, "loss": 0.3076, "step": 26300 }, { "epoch": 0.04478439874400118, "grad_norm": 5.142512798309326, "learning_rate": 1.9104345952694784e-05, "loss": 0.3117, "step": 26400 }, { "epoch": 0.04495403661803149, "grad_norm": 3.2780263423919678, "learning_rate": 1.910095319521418e-05, "loss": 0.2894, "step": 26500 }, { "epoch": 0.0451236744920618, "grad_norm": 2.8566198348999023, "learning_rate": 1.909756043773357e-05, "loss": 0.2964, "step": 26600 }, { "epoch": 0.0452933123660921, "grad_norm": 3.860313653945923, "learning_rate": 1.9094167680252967e-05, "loss": 0.2886, "step": 26700 }, { "epoch": 0.04546295024012241, "grad_norm": 4.558690071105957, "learning_rate": 1.909077492277236e-05, "loss": 0.3129, "step": 26800 }, { "epoch": 0.04563258811415272, "grad_norm": 6.047035217285156, "learning_rate": 1.908738216529175e-05, "loss": 0.3062, "step": 26900 }, { "epoch": 0.045802225988183023, "grad_norm": 1.9963480234146118, "learning_rate": 1.9083989407811147e-05, "loss": 0.2896, "step": 27000 }, { "epoch": 0.04597186386221333, "grad_norm": 5.530661106109619, "learning_rate": 1.9080596650330542e-05, "loss": 0.299, "step": 27100 }, { "epoch": 0.046141501736243644, "grad_norm": 3.0919837951660156, "learning_rate": 1.9077203892849934e-05, "loss": 0.3035, "step": 27200 }, { "epoch": 0.04631113961027395, "grad_norm": 5.472060203552246, "learning_rate": 1.907381113536933e-05, "loss": 0.2966, "step": 27300 }, { "epoch": 0.04648077748430426, "grad_norm": 7.818851470947266, "learning_rate": 1.9070418377888722e-05, "loss": 0.3027, "step": 27400 }, { "epoch": 0.046650415358334564, "grad_norm": 5.152708530426025, "learning_rate": 1.9067025620408117e-05, "loss": 0.3233, "step": 27500 }, { "epoch": 0.04682005323236487, "grad_norm": 6.95484733581543, "learning_rate": 1.906363286292751e-05, "loss": 0.2968, "step": 27600 }, { "epoch": 0.04698969110639518, "grad_norm": 6.102654457092285, "learning_rate": 1.9060240105446905e-05, "loss": 0.3076, "step": 27700 }, { "epoch": 0.047159328980425484, "grad_norm": 4.877326965332031, "learning_rate": 1.9056847347966297e-05, "loss": 0.3123, "step": 27800 }, { "epoch": 0.04732896685445579, "grad_norm": 8.426436424255371, "learning_rate": 1.9053454590485693e-05, "loss": 0.2992, "step": 27900 }, { "epoch": 0.0474986047284861, "grad_norm": 4.027080535888672, "learning_rate": 1.9050061833005085e-05, "loss": 0.3141, "step": 28000 }, { "epoch": 0.04766824260251641, "grad_norm": 6.852077484130859, "learning_rate": 1.904666907552448e-05, "loss": 0.2885, "step": 28100 }, { "epoch": 0.04783788047654672, "grad_norm": 7.646932601928711, "learning_rate": 1.9043276318043872e-05, "loss": 0.3142, "step": 28200 }, { "epoch": 0.048007518350577025, "grad_norm": 6.5580267906188965, "learning_rate": 1.9039883560563268e-05, "loss": 0.2968, "step": 28300 }, { "epoch": 0.04817715622460733, "grad_norm": 3.009244441986084, "learning_rate": 1.903649080308266e-05, "loss": 0.3085, "step": 28400 }, { "epoch": 0.04834679409863764, "grad_norm": 4.528927326202393, "learning_rate": 1.9033098045602055e-05, "loss": 0.2866, "step": 28500 }, { "epoch": 0.048516431972667945, "grad_norm": 6.178229808807373, "learning_rate": 1.902970528812145e-05, "loss": 0.286, "step": 28600 }, { "epoch": 0.04868606984669825, "grad_norm": 4.707374572753906, "learning_rate": 1.9026312530640843e-05, "loss": 0.3083, "step": 28700 }, { "epoch": 0.04885570772072856, "grad_norm": 5.818880081176758, "learning_rate": 1.9022919773160235e-05, "loss": 0.2957, "step": 28800 }, { "epoch": 0.049025345594758865, "grad_norm": 5.475292682647705, "learning_rate": 1.901952701567963e-05, "loss": 0.2929, "step": 28900 }, { "epoch": 0.04919498346878918, "grad_norm": 8.808664321899414, "learning_rate": 1.9016134258199023e-05, "loss": 0.3247, "step": 29000 }, { "epoch": 0.049364621342819485, "grad_norm": 5.795950412750244, "learning_rate": 1.9012741500718418e-05, "loss": 0.301, "step": 29100 }, { "epoch": 0.04953425921684979, "grad_norm": 7.253478527069092, "learning_rate": 1.9009348743237814e-05, "loss": 0.2907, "step": 29200 }, { "epoch": 0.0497038970908801, "grad_norm": 5.410602569580078, "learning_rate": 1.9005955985757206e-05, "loss": 0.2796, "step": 29300 }, { "epoch": 0.049873534964910406, "grad_norm": 4.739598274230957, "learning_rate": 1.90025632282766e-05, "loss": 0.2985, "step": 29400 }, { "epoch": 0.05004317283894071, "grad_norm": 4.493743896484375, "learning_rate": 1.8999170470795993e-05, "loss": 0.3068, "step": 29500 }, { "epoch": 0.05021281071297102, "grad_norm": 7.443539142608643, "learning_rate": 1.8995777713315385e-05, "loss": 0.2968, "step": 29600 }, { "epoch": 0.050382448587001326, "grad_norm": 5.894056797027588, "learning_rate": 1.899238495583478e-05, "loss": 0.2882, "step": 29700 }, { "epoch": 0.05055208646103163, "grad_norm": 4.181162357330322, "learning_rate": 1.8988992198354176e-05, "loss": 0.3132, "step": 29800 }, { "epoch": 0.050721724335061946, "grad_norm": 3.641421318054199, "learning_rate": 1.898559944087357e-05, "loss": 0.3146, "step": 29900 }, { "epoch": 0.05089136220909225, "grad_norm": 4.3490214347839355, "learning_rate": 1.8982206683392964e-05, "loss": 0.2928, "step": 30000 }, { "epoch": 0.05106100008312256, "grad_norm": 6.657039642333984, "learning_rate": 1.8978813925912356e-05, "loss": 0.2929, "step": 30100 }, { "epoch": 0.051230637957152866, "grad_norm": 7.719155311584473, "learning_rate": 1.897542116843175e-05, "loss": 0.2998, "step": 30200 }, { "epoch": 0.05140027583118317, "grad_norm": 3.2369911670684814, "learning_rate": 1.8972028410951144e-05, "loss": 0.2987, "step": 30300 }, { "epoch": 0.05156991370521348, "grad_norm": 7.848201751708984, "learning_rate": 1.8968635653470536e-05, "loss": 0.3012, "step": 30400 }, { "epoch": 0.05173955157924379, "grad_norm": 6.199644565582275, "learning_rate": 1.896524289598993e-05, "loss": 0.297, "step": 30500 }, { "epoch": 0.05190918945327409, "grad_norm": 4.747411727905273, "learning_rate": 1.8961850138509327e-05, "loss": 0.2779, "step": 30600 }, { "epoch": 0.0520788273273044, "grad_norm": 6.8643717765808105, "learning_rate": 1.895845738102872e-05, "loss": 0.308, "step": 30700 }, { "epoch": 0.052248465201334714, "grad_norm": 2.6329238414764404, "learning_rate": 1.8955064623548114e-05, "loss": 0.3045, "step": 30800 }, { "epoch": 0.05241810307536502, "grad_norm": 11.194560050964355, "learning_rate": 1.8951671866067506e-05, "loss": 0.2957, "step": 30900 }, { "epoch": 0.05258774094939533, "grad_norm": 5.225248336791992, "learning_rate": 1.89482791085869e-05, "loss": 0.2994, "step": 31000 }, { "epoch": 0.052757378823425634, "grad_norm": 6.687589645385742, "learning_rate": 1.8944886351106294e-05, "loss": 0.2952, "step": 31100 }, { "epoch": 0.05292701669745594, "grad_norm": 3.9322807788848877, "learning_rate": 1.894149359362569e-05, "loss": 0.2936, "step": 31200 }, { "epoch": 0.05309665457148625, "grad_norm": 4.411961555480957, "learning_rate": 1.8938100836145085e-05, "loss": 0.295, "step": 31300 }, { "epoch": 0.053266292445516554, "grad_norm": 2.752481698989868, "learning_rate": 1.8934708078664477e-05, "loss": 0.3172, "step": 31400 }, { "epoch": 0.05343593031954686, "grad_norm": 5.8485846519470215, "learning_rate": 1.893131532118387e-05, "loss": 0.2818, "step": 31500 }, { "epoch": 0.05360556819357717, "grad_norm": 5.358094215393066, "learning_rate": 1.8927922563703265e-05, "loss": 0.3064, "step": 31600 }, { "epoch": 0.05377520606760748, "grad_norm": 5.126290321350098, "learning_rate": 1.8924529806222657e-05, "loss": 0.307, "step": 31700 }, { "epoch": 0.05394484394163779, "grad_norm": 2.81768798828125, "learning_rate": 1.8921137048742052e-05, "loss": 0.2922, "step": 31800 }, { "epoch": 0.054114481815668095, "grad_norm": 3.7206032276153564, "learning_rate": 1.8917744291261448e-05, "loss": 0.3153, "step": 31900 }, { "epoch": 0.0542841196896984, "grad_norm": 4.555099964141846, "learning_rate": 1.891435153378084e-05, "loss": 0.2961, "step": 32000 }, { "epoch": 0.05445375756372871, "grad_norm": 4.590855121612549, "learning_rate": 1.8910958776300235e-05, "loss": 0.3125, "step": 32100 }, { "epoch": 0.054623395437759015, "grad_norm": 3.2159693241119385, "learning_rate": 1.8907566018819627e-05, "loss": 0.291, "step": 32200 }, { "epoch": 0.05479303331178932, "grad_norm": 6.064620018005371, "learning_rate": 1.890417326133902e-05, "loss": 0.2949, "step": 32300 }, { "epoch": 0.05496267118581963, "grad_norm": 3.5063529014587402, "learning_rate": 1.8900780503858415e-05, "loss": 0.2996, "step": 32400 }, { "epoch": 0.055132309059849935, "grad_norm": 4.376612663269043, "learning_rate": 1.889738774637781e-05, "loss": 0.3107, "step": 32500 }, { "epoch": 0.05530194693388025, "grad_norm": 7.111763000488281, "learning_rate": 1.8893994988897203e-05, "loss": 0.3067, "step": 32600 }, { "epoch": 0.055471584807910555, "grad_norm": 5.143043041229248, "learning_rate": 1.8890602231416598e-05, "loss": 0.2838, "step": 32700 }, { "epoch": 0.05564122268194086, "grad_norm": 3.3701210021972656, "learning_rate": 1.888720947393599e-05, "loss": 0.3101, "step": 32800 }, { "epoch": 0.05581086055597117, "grad_norm": 5.7372050285339355, "learning_rate": 1.8883816716455382e-05, "loss": 0.292, "step": 32900 }, { "epoch": 0.055980498430001475, "grad_norm": 6.06300163269043, "learning_rate": 1.8880423958974778e-05, "loss": 0.2881, "step": 33000 }, { "epoch": 0.05615013630403178, "grad_norm": 3.6867401599884033, "learning_rate": 1.887703120149417e-05, "loss": 0.2922, "step": 33100 }, { "epoch": 0.05631977417806209, "grad_norm": 3.9717016220092773, "learning_rate": 1.8873638444013565e-05, "loss": 0.291, "step": 33200 }, { "epoch": 0.056489412052092396, "grad_norm": 3.8224620819091797, "learning_rate": 1.887024568653296e-05, "loss": 0.294, "step": 33300 }, { "epoch": 0.05665904992612271, "grad_norm": 8.859092712402344, "learning_rate": 1.8866852929052353e-05, "loss": 0.3096, "step": 33400 }, { "epoch": 0.056828687800153016, "grad_norm": 5.533217430114746, "learning_rate": 1.886346017157175e-05, "loss": 0.2987, "step": 33500 }, { "epoch": 0.05699832567418332, "grad_norm": 3.636748790740967, "learning_rate": 1.886006741409114e-05, "loss": 0.2821, "step": 33600 }, { "epoch": 0.05716796354821363, "grad_norm": 4.531048774719238, "learning_rate": 1.8856674656610533e-05, "loss": 0.2922, "step": 33700 }, { "epoch": 0.057337601422243936, "grad_norm": 6.113272666931152, "learning_rate": 1.8853281899129928e-05, "loss": 0.2885, "step": 33800 }, { "epoch": 0.05750723929627424, "grad_norm": 4.762470722198486, "learning_rate": 1.8849889141649324e-05, "loss": 0.2989, "step": 33900 }, { "epoch": 0.05767687717030455, "grad_norm": 5.889090538024902, "learning_rate": 1.884649638416872e-05, "loss": 0.3087, "step": 34000 }, { "epoch": 0.057846515044334856, "grad_norm": 4.705977439880371, "learning_rate": 1.884310362668811e-05, "loss": 0.2935, "step": 34100 }, { "epoch": 0.05801615291836516, "grad_norm": 3.448176383972168, "learning_rate": 1.8839710869207503e-05, "loss": 0.3036, "step": 34200 }, { "epoch": 0.05818579079239548, "grad_norm": 2.747753381729126, "learning_rate": 1.88363181117269e-05, "loss": 0.2931, "step": 34300 }, { "epoch": 0.05835542866642578, "grad_norm": 2.9823110103607178, "learning_rate": 1.883292535424629e-05, "loss": 0.2776, "step": 34400 }, { "epoch": 0.05852506654045609, "grad_norm": 3.056541919708252, "learning_rate": 1.8829532596765686e-05, "loss": 0.2881, "step": 34500 }, { "epoch": 0.0586947044144864, "grad_norm": 5.15024471282959, "learning_rate": 1.8826139839285082e-05, "loss": 0.3185, "step": 34600 }, { "epoch": 0.058864342288516704, "grad_norm": 4.166646480560303, "learning_rate": 1.8822747081804474e-05, "loss": 0.2957, "step": 34700 }, { "epoch": 0.05903398016254701, "grad_norm": 5.343931674957275, "learning_rate": 1.8819354324323866e-05, "loss": 0.2987, "step": 34800 }, { "epoch": 0.05920361803657732, "grad_norm": 6.172517776489258, "learning_rate": 1.881596156684326e-05, "loss": 0.2976, "step": 34900 }, { "epoch": 0.059373255910607624, "grad_norm": 6.409415245056152, "learning_rate": 1.8812568809362654e-05, "loss": 0.2976, "step": 35000 }, { "epoch": 0.05954289378463793, "grad_norm": 7.1977620124816895, "learning_rate": 1.880917605188205e-05, "loss": 0.2956, "step": 35100 }, { "epoch": 0.059712531658668244, "grad_norm": 3.613752603530884, "learning_rate": 1.880578329440144e-05, "loss": 0.2776, "step": 35200 }, { "epoch": 0.05988216953269855, "grad_norm": 3.701549768447876, "learning_rate": 1.8802390536920837e-05, "loss": 0.2769, "step": 35300 }, { "epoch": 0.06005180740672886, "grad_norm": 4.736776351928711, "learning_rate": 1.8798997779440232e-05, "loss": 0.2904, "step": 35400 }, { "epoch": 0.060221445280759164, "grad_norm": 2.5571329593658447, "learning_rate": 1.8795605021959624e-05, "loss": 0.2906, "step": 35500 }, { "epoch": 0.06039108315478947, "grad_norm": 6.1576690673828125, "learning_rate": 1.8792212264479016e-05, "loss": 0.2829, "step": 35600 }, { "epoch": 0.06056072102881978, "grad_norm": 2.4765450954437256, "learning_rate": 1.8788819506998412e-05, "loss": 0.27, "step": 35700 }, { "epoch": 0.060730358902850085, "grad_norm": 3.6273481845855713, "learning_rate": 1.8785426749517804e-05, "loss": 0.2937, "step": 35800 }, { "epoch": 0.06089999677688039, "grad_norm": 7.852478981018066, "learning_rate": 1.87820339920372e-05, "loss": 0.291, "step": 35900 }, { "epoch": 0.0610696346509107, "grad_norm": 4.768692493438721, "learning_rate": 1.8778641234556595e-05, "loss": 0.3041, "step": 36000 }, { "epoch": 0.06123927252494101, "grad_norm": 3.2328109741210938, "learning_rate": 1.8775248477075987e-05, "loss": 0.2792, "step": 36100 }, { "epoch": 0.06140891039897132, "grad_norm": 5.355317115783691, "learning_rate": 1.8771855719595383e-05, "loss": 0.2956, "step": 36200 }, { "epoch": 0.061578548273001625, "grad_norm": 6.411366939544678, "learning_rate": 1.8768462962114775e-05, "loss": 0.3132, "step": 36300 }, { "epoch": 0.06174818614703193, "grad_norm": 4.190431594848633, "learning_rate": 1.8765070204634167e-05, "loss": 0.2789, "step": 36400 }, { "epoch": 0.06191782402106224, "grad_norm": 2.7207765579223633, "learning_rate": 1.8761677447153562e-05, "loss": 0.2857, "step": 36500 }, { "epoch": 0.062087461895092545, "grad_norm": 5.2573137283325195, "learning_rate": 1.8758284689672958e-05, "loss": 0.2889, "step": 36600 }, { "epoch": 0.06225709976912285, "grad_norm": 4.658709526062012, "learning_rate": 1.875489193219235e-05, "loss": 0.2853, "step": 36700 }, { "epoch": 0.06242673764315316, "grad_norm": 5.727476596832275, "learning_rate": 1.8751499174711745e-05, "loss": 0.2853, "step": 36800 }, { "epoch": 0.06259637551718347, "grad_norm": 6.89375114440918, "learning_rate": 1.8748106417231138e-05, "loss": 0.3091, "step": 36900 }, { "epoch": 0.06276601339121378, "grad_norm": 4.810854434967041, "learning_rate": 1.8744713659750533e-05, "loss": 0.3154, "step": 37000 }, { "epoch": 0.06293565126524409, "grad_norm": 9.32029914855957, "learning_rate": 1.8741320902269925e-05, "loss": 0.2803, "step": 37100 }, { "epoch": 0.06310528913927439, "grad_norm": 4.184632301330566, "learning_rate": 1.873792814478932e-05, "loss": 0.3114, "step": 37200 }, { "epoch": 0.0632749270133047, "grad_norm": 6.274867057800293, "learning_rate": 1.8734535387308713e-05, "loss": 0.271, "step": 37300 }, { "epoch": 0.063444564887335, "grad_norm": 4.28775691986084, "learning_rate": 1.8731142629828108e-05, "loss": 0.3055, "step": 37400 }, { "epoch": 0.06361420276136531, "grad_norm": 5.982944488525391, "learning_rate": 1.87277498723475e-05, "loss": 0.2956, "step": 37500 }, { "epoch": 0.06378384063539562, "grad_norm": 2.2512760162353516, "learning_rate": 1.8724357114866896e-05, "loss": 0.2952, "step": 37600 }, { "epoch": 0.06395347850942593, "grad_norm": 4.006387710571289, "learning_rate": 1.8720964357386288e-05, "loss": 0.2847, "step": 37700 }, { "epoch": 0.06412311638345623, "grad_norm": 6.234874725341797, "learning_rate": 1.8717571599905683e-05, "loss": 0.2938, "step": 37800 }, { "epoch": 0.06429275425748654, "grad_norm": 5.851118564605713, "learning_rate": 1.8714178842425075e-05, "loss": 0.3098, "step": 37900 }, { "epoch": 0.06446239213151685, "grad_norm": 5.330598831176758, "learning_rate": 1.871078608494447e-05, "loss": 0.2798, "step": 38000 }, { "epoch": 0.06463203000554715, "grad_norm": 4.804149627685547, "learning_rate": 1.8707393327463866e-05, "loss": 0.2965, "step": 38100 }, { "epoch": 0.06480166787957746, "grad_norm": 5.601982116699219, "learning_rate": 1.870400056998326e-05, "loss": 0.3072, "step": 38200 }, { "epoch": 0.06497130575360778, "grad_norm": 4.98843240737915, "learning_rate": 1.870060781250265e-05, "loss": 0.2786, "step": 38300 }, { "epoch": 0.06514094362763809, "grad_norm": 10.58345890045166, "learning_rate": 1.8697215055022046e-05, "loss": 0.294, "step": 38400 }, { "epoch": 0.0653105815016684, "grad_norm": 16.098262786865234, "learning_rate": 1.8693822297541438e-05, "loss": 0.3128, "step": 38500 }, { "epoch": 0.0654802193756987, "grad_norm": 5.0345377922058105, "learning_rate": 1.8690429540060834e-05, "loss": 0.3039, "step": 38600 }, { "epoch": 0.06564985724972901, "grad_norm": 5.1594767570495605, "learning_rate": 1.868703678258023e-05, "loss": 0.2939, "step": 38700 }, { "epoch": 0.06581949512375931, "grad_norm": 3.062312126159668, "learning_rate": 1.868364402509962e-05, "loss": 0.2879, "step": 38800 }, { "epoch": 0.06598913299778962, "grad_norm": 7.5915422439575195, "learning_rate": 1.8680251267619017e-05, "loss": 0.2937, "step": 38900 }, { "epoch": 0.06615877087181993, "grad_norm": 5.999471187591553, "learning_rate": 1.867685851013841e-05, "loss": 0.3035, "step": 39000 }, { "epoch": 0.06632840874585023, "grad_norm": 6.339089393615723, "learning_rate": 1.86734657526578e-05, "loss": 0.2844, "step": 39100 }, { "epoch": 0.06649804661988054, "grad_norm": 3.792302131652832, "learning_rate": 1.8670072995177196e-05, "loss": 0.2869, "step": 39200 }, { "epoch": 0.06666768449391085, "grad_norm": 8.209175109863281, "learning_rate": 1.8666680237696592e-05, "loss": 0.3063, "step": 39300 }, { "epoch": 0.06683732236794115, "grad_norm": 5.1575822830200195, "learning_rate": 1.8663287480215984e-05, "loss": 0.2908, "step": 39400 }, { "epoch": 0.06700696024197146, "grad_norm": 6.011523246765137, "learning_rate": 1.865989472273538e-05, "loss": 0.2881, "step": 39500 }, { "epoch": 0.06717659811600177, "grad_norm": 4.13815450668335, "learning_rate": 1.865650196525477e-05, "loss": 0.275, "step": 39600 }, { "epoch": 0.06734623599003207, "grad_norm": 5.636247634887695, "learning_rate": 1.8653109207774167e-05, "loss": 0.2911, "step": 39700 }, { "epoch": 0.06751587386406238, "grad_norm": 4.772889614105225, "learning_rate": 1.864971645029356e-05, "loss": 0.2988, "step": 39800 }, { "epoch": 0.06768551173809269, "grad_norm": 5.828875541687012, "learning_rate": 1.8646323692812955e-05, "loss": 0.2909, "step": 39900 }, { "epoch": 0.067855149612123, "grad_norm": 3.6134467124938965, "learning_rate": 1.8642930935332347e-05, "loss": 0.2864, "step": 40000 }, { "epoch": 0.06802478748615332, "grad_norm": 6.878890514373779, "learning_rate": 1.8639538177851742e-05, "loss": 0.3, "step": 40100 }, { "epoch": 0.06819442536018362, "grad_norm": 3.2541019916534424, "learning_rate": 1.8636145420371134e-05, "loss": 0.2724, "step": 40200 }, { "epoch": 0.06836406323421393, "grad_norm": 6.304093360900879, "learning_rate": 1.863275266289053e-05, "loss": 0.3064, "step": 40300 }, { "epoch": 0.06853370110824424, "grad_norm": 3.7863245010375977, "learning_rate": 1.8629359905409922e-05, "loss": 0.2919, "step": 40400 }, { "epoch": 0.06870333898227454, "grad_norm": 5.97468900680542, "learning_rate": 1.8625967147929314e-05, "loss": 0.2768, "step": 40500 }, { "epoch": 0.06887297685630485, "grad_norm": 12.00987720489502, "learning_rate": 1.862257439044871e-05, "loss": 0.2869, "step": 40600 }, { "epoch": 0.06904261473033516, "grad_norm": 3.671211004257202, "learning_rate": 1.8619181632968105e-05, "loss": 0.291, "step": 40700 }, { "epoch": 0.06921225260436546, "grad_norm": 4.041658401489258, "learning_rate": 1.86157888754875e-05, "loss": 0.299, "step": 40800 }, { "epoch": 0.06938189047839577, "grad_norm": 3.3238372802734375, "learning_rate": 1.8612396118006893e-05, "loss": 0.2872, "step": 40900 }, { "epoch": 0.06955152835242608, "grad_norm": 4.288298606872559, "learning_rate": 1.8609003360526285e-05, "loss": 0.2844, "step": 41000 }, { "epoch": 0.06972116622645638, "grad_norm": 6.197023391723633, "learning_rate": 1.860561060304568e-05, "loss": 0.2782, "step": 41100 }, { "epoch": 0.06989080410048669, "grad_norm": 3.3757781982421875, "learning_rate": 1.8602217845565072e-05, "loss": 0.2944, "step": 41200 }, { "epoch": 0.070060441974517, "grad_norm": 6.845643043518066, "learning_rate": 1.8598825088084468e-05, "loss": 0.2934, "step": 41300 }, { "epoch": 0.0702300798485473, "grad_norm": 4.8647847175598145, "learning_rate": 1.8595432330603863e-05, "loss": 0.2822, "step": 41400 }, { "epoch": 0.07039971772257761, "grad_norm": 5.153517723083496, "learning_rate": 1.8592039573123255e-05, "loss": 0.2968, "step": 41500 }, { "epoch": 0.07056935559660792, "grad_norm": 6.134669780731201, "learning_rate": 1.8588646815642648e-05, "loss": 0.2985, "step": 41600 }, { "epoch": 0.07073899347063822, "grad_norm": 5.656508445739746, "learning_rate": 1.8585254058162043e-05, "loss": 0.2911, "step": 41700 }, { "epoch": 0.07090863134466853, "grad_norm": 5.582975387573242, "learning_rate": 1.8581861300681435e-05, "loss": 0.282, "step": 41800 }, { "epoch": 0.07107826921869885, "grad_norm": 2.391340970993042, "learning_rate": 1.857846854320083e-05, "loss": 0.2953, "step": 41900 }, { "epoch": 0.07124790709272916, "grad_norm": 6.318803787231445, "learning_rate": 1.8575075785720226e-05, "loss": 0.2951, "step": 42000 }, { "epoch": 0.07141754496675946, "grad_norm": 7.733858108520508, "learning_rate": 1.8571683028239618e-05, "loss": 0.3182, "step": 42100 }, { "epoch": 0.07158718284078977, "grad_norm": 3.91455340385437, "learning_rate": 1.8568290270759014e-05, "loss": 0.3056, "step": 42200 }, { "epoch": 0.07175682071482008, "grad_norm": 4.476867198944092, "learning_rate": 1.8564897513278406e-05, "loss": 0.2907, "step": 42300 }, { "epoch": 0.07192645858885038, "grad_norm": 5.4174933433532715, "learning_rate": 1.8561504755797798e-05, "loss": 0.2838, "step": 42400 }, { "epoch": 0.07209609646288069, "grad_norm": 5.010890007019043, "learning_rate": 1.8558111998317193e-05, "loss": 0.2651, "step": 42500 }, { "epoch": 0.072265734336911, "grad_norm": 5.307522773742676, "learning_rate": 1.855471924083659e-05, "loss": 0.2961, "step": 42600 }, { "epoch": 0.0724353722109413, "grad_norm": 5.147155284881592, "learning_rate": 1.855132648335598e-05, "loss": 0.2932, "step": 42700 }, { "epoch": 0.07260501008497161, "grad_norm": 7.3125505447387695, "learning_rate": 1.8547933725875377e-05, "loss": 0.2872, "step": 42800 }, { "epoch": 0.07277464795900192, "grad_norm": 5.117211818695068, "learning_rate": 1.854454096839477e-05, "loss": 0.2944, "step": 42900 }, { "epoch": 0.07294428583303222, "grad_norm": 3.474435806274414, "learning_rate": 1.8541148210914164e-05, "loss": 0.2796, "step": 43000 }, { "epoch": 0.07311392370706253, "grad_norm": 5.133522033691406, "learning_rate": 1.8537755453433556e-05, "loss": 0.3016, "step": 43100 }, { "epoch": 0.07328356158109284, "grad_norm": 3.8202648162841797, "learning_rate": 1.853436269595295e-05, "loss": 0.2865, "step": 43200 }, { "epoch": 0.07345319945512314, "grad_norm": 5.032595634460449, "learning_rate": 1.8530969938472344e-05, "loss": 0.3123, "step": 43300 }, { "epoch": 0.07362283732915345, "grad_norm": 5.874632358551025, "learning_rate": 1.852757718099174e-05, "loss": 0.2777, "step": 43400 }, { "epoch": 0.07379247520318376, "grad_norm": 9.481407165527344, "learning_rate": 1.852418442351113e-05, "loss": 0.2865, "step": 43500 }, { "epoch": 0.07396211307721406, "grad_norm": 5.955962181091309, "learning_rate": 1.8520791666030527e-05, "loss": 0.2796, "step": 43600 }, { "epoch": 0.07413175095124439, "grad_norm": 6.955836296081543, "learning_rate": 1.851739890854992e-05, "loss": 0.2853, "step": 43700 }, { "epoch": 0.07430138882527469, "grad_norm": 3.5871200561523438, "learning_rate": 1.8514006151069314e-05, "loss": 0.2832, "step": 43800 }, { "epoch": 0.074471026699305, "grad_norm": 6.962113857269287, "learning_rate": 1.8510613393588707e-05, "loss": 0.2991, "step": 43900 }, { "epoch": 0.0746406645733353, "grad_norm": 7.3473124504089355, "learning_rate": 1.8507220636108102e-05, "loss": 0.2975, "step": 44000 }, { "epoch": 0.07481030244736561, "grad_norm": 4.764953136444092, "learning_rate": 1.8503827878627498e-05, "loss": 0.2877, "step": 44100 }, { "epoch": 0.07497994032139592, "grad_norm": 8.341894149780273, "learning_rate": 1.850043512114689e-05, "loss": 0.2815, "step": 44200 }, { "epoch": 0.07514957819542623, "grad_norm": 6.1239519119262695, "learning_rate": 1.8497042363666282e-05, "loss": 0.2965, "step": 44300 }, { "epoch": 0.07531921606945653, "grad_norm": 3.8017592430114746, "learning_rate": 1.8493649606185677e-05, "loss": 0.2861, "step": 44400 }, { "epoch": 0.07548885394348684, "grad_norm": 9.024279594421387, "learning_rate": 1.849025684870507e-05, "loss": 0.2752, "step": 44500 }, { "epoch": 0.07565849181751715, "grad_norm": 2.7965548038482666, "learning_rate": 1.8486864091224465e-05, "loss": 0.2778, "step": 44600 }, { "epoch": 0.07582812969154745, "grad_norm": 4.206871509552002, "learning_rate": 1.848347133374386e-05, "loss": 0.2836, "step": 44700 }, { "epoch": 0.07599776756557776, "grad_norm": 5.971789360046387, "learning_rate": 1.8480078576263252e-05, "loss": 0.2886, "step": 44800 }, { "epoch": 0.07616740543960807, "grad_norm": 10.429966926574707, "learning_rate": 1.8476685818782648e-05, "loss": 0.2853, "step": 44900 }, { "epoch": 0.07633704331363837, "grad_norm": 3.5841941833496094, "learning_rate": 1.847329306130204e-05, "loss": 0.2823, "step": 45000 }, { "epoch": 0.07650668118766868, "grad_norm": 3.297451972961426, "learning_rate": 1.8469900303821432e-05, "loss": 0.2662, "step": 45100 }, { "epoch": 0.07667631906169899, "grad_norm": 5.221707820892334, "learning_rate": 1.8466507546340828e-05, "loss": 0.2922, "step": 45200 }, { "epoch": 0.07684595693572929, "grad_norm": 4.059057712554932, "learning_rate": 1.846311478886022e-05, "loss": 0.3083, "step": 45300 }, { "epoch": 0.0770155948097596, "grad_norm": 3.36012601852417, "learning_rate": 1.8459722031379615e-05, "loss": 0.2707, "step": 45400 }, { "epoch": 0.07718523268378992, "grad_norm": 4.261825084686279, "learning_rate": 1.845632927389901e-05, "loss": 0.2906, "step": 45500 }, { "epoch": 0.07735487055782023, "grad_norm": 3.491938352584839, "learning_rate": 1.8452936516418403e-05, "loss": 0.2898, "step": 45600 }, { "epoch": 0.07752450843185053, "grad_norm": 4.772907733917236, "learning_rate": 1.8449543758937798e-05, "loss": 0.2909, "step": 45700 }, { "epoch": 0.07769414630588084, "grad_norm": 3.6635468006134033, "learning_rate": 1.844615100145719e-05, "loss": 0.3018, "step": 45800 }, { "epoch": 0.07786378417991115, "grad_norm": 6.157771587371826, "learning_rate": 1.8442758243976582e-05, "loss": 0.2951, "step": 45900 }, { "epoch": 0.07803342205394145, "grad_norm": 3.9932868480682373, "learning_rate": 1.8439365486495978e-05, "loss": 0.3024, "step": 46000 }, { "epoch": 0.07820305992797176, "grad_norm": 4.041582107543945, "learning_rate": 1.8435972729015373e-05, "loss": 0.2993, "step": 46100 }, { "epoch": 0.07837269780200207, "grad_norm": 6.411997318267822, "learning_rate": 1.8432579971534766e-05, "loss": 0.2933, "step": 46200 }, { "epoch": 0.07854233567603237, "grad_norm": 5.640613555908203, "learning_rate": 1.842918721405416e-05, "loss": 0.3073, "step": 46300 }, { "epoch": 0.07871197355006268, "grad_norm": 6.077905654907227, "learning_rate": 1.8425794456573553e-05, "loss": 0.2918, "step": 46400 }, { "epoch": 0.07888161142409299, "grad_norm": 6.582813262939453, "learning_rate": 1.842240169909295e-05, "loss": 0.2719, "step": 46500 }, { "epoch": 0.0790512492981233, "grad_norm": 2.708589792251587, "learning_rate": 1.841900894161234e-05, "loss": 0.2921, "step": 46600 }, { "epoch": 0.0792208871721536, "grad_norm": 4.0252275466918945, "learning_rate": 1.8415616184131736e-05, "loss": 0.2919, "step": 46700 }, { "epoch": 0.07939052504618391, "grad_norm": 5.024626731872559, "learning_rate": 1.8412223426651132e-05, "loss": 0.2872, "step": 46800 }, { "epoch": 0.07956016292021421, "grad_norm": 3.968055009841919, "learning_rate": 1.8408830669170524e-05, "loss": 0.2802, "step": 46900 }, { "epoch": 0.07972980079424452, "grad_norm": 5.665511131286621, "learning_rate": 1.8405437911689916e-05, "loss": 0.2859, "step": 47000 }, { "epoch": 0.07989943866827483, "grad_norm": 3.060563087463379, "learning_rate": 1.840204515420931e-05, "loss": 0.2839, "step": 47100 }, { "epoch": 0.08006907654230515, "grad_norm": 4.720939636230469, "learning_rate": 1.8398652396728704e-05, "loss": 0.2975, "step": 47200 }, { "epoch": 0.08023871441633545, "grad_norm": 5.813382625579834, "learning_rate": 1.83952596392481e-05, "loss": 0.2893, "step": 47300 }, { "epoch": 0.08040835229036576, "grad_norm": 3.5375208854675293, "learning_rate": 1.839186688176749e-05, "loss": 0.2596, "step": 47400 }, { "epoch": 0.08057799016439607, "grad_norm": 5.778880596160889, "learning_rate": 1.8388474124286887e-05, "loss": 0.2917, "step": 47500 }, { "epoch": 0.08074762803842638, "grad_norm": 3.478928565979004, "learning_rate": 1.8385081366806282e-05, "loss": 0.3093, "step": 47600 }, { "epoch": 0.08091726591245668, "grad_norm": 7.4397358894348145, "learning_rate": 1.8381688609325674e-05, "loss": 0.2839, "step": 47700 }, { "epoch": 0.08108690378648699, "grad_norm": 4.262002944946289, "learning_rate": 1.8378295851845066e-05, "loss": 0.3002, "step": 47800 }, { "epoch": 0.0812565416605173, "grad_norm": 6.081060886383057, "learning_rate": 1.8374903094364462e-05, "loss": 0.2828, "step": 47900 }, { "epoch": 0.0814261795345476, "grad_norm": 6.453099727630615, "learning_rate": 1.8371510336883854e-05, "loss": 0.2813, "step": 48000 }, { "epoch": 0.08159581740857791, "grad_norm": 6.578404903411865, "learning_rate": 1.836811757940325e-05, "loss": 0.2886, "step": 48100 }, { "epoch": 0.08176545528260822, "grad_norm": 4.056455135345459, "learning_rate": 1.8364724821922645e-05, "loss": 0.2835, "step": 48200 }, { "epoch": 0.08193509315663852, "grad_norm": 4.4009575843811035, "learning_rate": 1.8361332064442037e-05, "loss": 0.2852, "step": 48300 }, { "epoch": 0.08210473103066883, "grad_norm": 5.6173248291015625, "learning_rate": 1.8357939306961432e-05, "loss": 0.2819, "step": 48400 }, { "epoch": 0.08227436890469914, "grad_norm": 4.746309757232666, "learning_rate": 1.8354546549480825e-05, "loss": 0.2852, "step": 48500 }, { "epoch": 0.08244400677872944, "grad_norm": 5.073624134063721, "learning_rate": 1.8351153792000217e-05, "loss": 0.3005, "step": 48600 }, { "epoch": 0.08261364465275975, "grad_norm": 6.9145917892456055, "learning_rate": 1.8347761034519612e-05, "loss": 0.3004, "step": 48700 }, { "epoch": 0.08278328252679006, "grad_norm": 4.952718734741211, "learning_rate": 1.8344368277039008e-05, "loss": 0.2671, "step": 48800 }, { "epoch": 0.08295292040082036, "grad_norm": 8.070359230041504, "learning_rate": 1.83409755195584e-05, "loss": 0.274, "step": 48900 }, { "epoch": 0.08312255827485068, "grad_norm": 6.285434246063232, "learning_rate": 1.8337582762077795e-05, "loss": 0.2761, "step": 49000 }, { "epoch": 0.08329219614888099, "grad_norm": 2.584946632385254, "learning_rate": 1.8334190004597187e-05, "loss": 0.2871, "step": 49100 }, { "epoch": 0.0834618340229113, "grad_norm": 6.557065486907959, "learning_rate": 1.833079724711658e-05, "loss": 0.2844, "step": 49200 }, { "epoch": 0.0836314718969416, "grad_norm": 3.196906805038452, "learning_rate": 1.8327404489635975e-05, "loss": 0.3038, "step": 49300 }, { "epoch": 0.08380110977097191, "grad_norm": 4.319844722747803, "learning_rate": 1.832401173215537e-05, "loss": 0.2987, "step": 49400 }, { "epoch": 0.08397074764500222, "grad_norm": 6.0317277908325195, "learning_rate": 1.8320618974674766e-05, "loss": 0.2681, "step": 49500 }, { "epoch": 0.08414038551903252, "grad_norm": 2.986727714538574, "learning_rate": 1.8317226217194158e-05, "loss": 0.2817, "step": 49600 }, { "epoch": 0.08431002339306283, "grad_norm": 8.08899974822998, "learning_rate": 1.831383345971355e-05, "loss": 0.2731, "step": 49700 }, { "epoch": 0.08447966126709314, "grad_norm": 3.815591812133789, "learning_rate": 1.8310440702232946e-05, "loss": 0.2686, "step": 49800 }, { "epoch": 0.08464929914112344, "grad_norm": 4.29105806350708, "learning_rate": 1.8307047944752338e-05, "loss": 0.2935, "step": 49900 }, { "epoch": 0.08481893701515375, "grad_norm": 4.556109428405762, "learning_rate": 1.8303655187271733e-05, "loss": 0.2679, "step": 50000 }, { "epoch": 0.08498857488918406, "grad_norm": 6.7415080070495605, "learning_rate": 1.8300262429791125e-05, "loss": 0.2846, "step": 50100 }, { "epoch": 0.08515821276321436, "grad_norm": 3.9190878868103027, "learning_rate": 1.829686967231052e-05, "loss": 0.2819, "step": 50200 }, { "epoch": 0.08532785063724467, "grad_norm": 4.90328311920166, "learning_rate": 1.8293476914829916e-05, "loss": 0.2876, "step": 50300 }, { "epoch": 0.08549748851127498, "grad_norm": 4.983177661895752, "learning_rate": 1.829008415734931e-05, "loss": 0.2821, "step": 50400 }, { "epoch": 0.08566712638530528, "grad_norm": 6.555051803588867, "learning_rate": 1.82866913998687e-05, "loss": 0.3031, "step": 50500 }, { "epoch": 0.08583676425933559, "grad_norm": 4.579891204833984, "learning_rate": 1.8283298642388096e-05, "loss": 0.2741, "step": 50600 }, { "epoch": 0.0860064021333659, "grad_norm": 3.9871301651000977, "learning_rate": 1.8279905884907488e-05, "loss": 0.2825, "step": 50700 }, { "epoch": 0.08617604000739622, "grad_norm": 5.2083330154418945, "learning_rate": 1.8276513127426884e-05, "loss": 0.2716, "step": 50800 }, { "epoch": 0.08634567788142652, "grad_norm": 4.516756057739258, "learning_rate": 1.827312036994628e-05, "loss": 0.2903, "step": 50900 }, { "epoch": 0.08651531575545683, "grad_norm": 3.328808307647705, "learning_rate": 1.826972761246567e-05, "loss": 0.3059, "step": 51000 }, { "epoch": 0.08668495362948714, "grad_norm": 3.5385241508483887, "learning_rate": 1.8266334854985063e-05, "loss": 0.2793, "step": 51100 }, { "epoch": 0.08685459150351744, "grad_norm": 5.077141761779785, "learning_rate": 1.826294209750446e-05, "loss": 0.2845, "step": 51200 }, { "epoch": 0.08702422937754775, "grad_norm": 3.4782626628875732, "learning_rate": 1.825954934002385e-05, "loss": 0.3144, "step": 51300 }, { "epoch": 0.08719386725157806, "grad_norm": 5.997594356536865, "learning_rate": 1.8256156582543246e-05, "loss": 0.2803, "step": 51400 }, { "epoch": 0.08736350512560837, "grad_norm": 6.201345443725586, "learning_rate": 1.8252763825062642e-05, "loss": 0.2777, "step": 51500 }, { "epoch": 0.08753314299963867, "grad_norm": 6.14375114440918, "learning_rate": 1.8249371067582034e-05, "loss": 0.2755, "step": 51600 }, { "epoch": 0.08770278087366898, "grad_norm": 8.308558464050293, "learning_rate": 1.824597831010143e-05, "loss": 0.2943, "step": 51700 }, { "epoch": 0.08787241874769929, "grad_norm": 5.419277191162109, "learning_rate": 1.824258555262082e-05, "loss": 0.2773, "step": 51800 }, { "epoch": 0.08804205662172959, "grad_norm": 5.484429836273193, "learning_rate": 1.8239192795140214e-05, "loss": 0.2887, "step": 51900 }, { "epoch": 0.0882116944957599, "grad_norm": 3.9766979217529297, "learning_rate": 1.823580003765961e-05, "loss": 0.2774, "step": 52000 }, { "epoch": 0.0883813323697902, "grad_norm": 6.129132270812988, "learning_rate": 1.8232407280179005e-05, "loss": 0.2887, "step": 52100 }, { "epoch": 0.08855097024382051, "grad_norm": 3.909789800643921, "learning_rate": 1.8229014522698397e-05, "loss": 0.2883, "step": 52200 }, { "epoch": 0.08872060811785082, "grad_norm": 3.396498203277588, "learning_rate": 1.8225621765217792e-05, "loss": 0.3066, "step": 52300 }, { "epoch": 0.08889024599188113, "grad_norm": 4.789741039276123, "learning_rate": 1.8222229007737184e-05, "loss": 0.277, "step": 52400 }, { "epoch": 0.08905988386591143, "grad_norm": 5.656152725219727, "learning_rate": 1.821883625025658e-05, "loss": 0.286, "step": 52500 }, { "epoch": 0.08922952173994175, "grad_norm": 5.650546073913574, "learning_rate": 1.8215443492775972e-05, "loss": 0.2917, "step": 52600 }, { "epoch": 0.08939915961397206, "grad_norm": 6.8344526290893555, "learning_rate": 1.8212050735295367e-05, "loss": 0.274, "step": 52700 }, { "epoch": 0.08956879748800237, "grad_norm": 7.120495796203613, "learning_rate": 1.820865797781476e-05, "loss": 0.2614, "step": 52800 }, { "epoch": 0.08973843536203267, "grad_norm": 4.785940170288086, "learning_rate": 1.8205265220334155e-05, "loss": 0.3197, "step": 52900 }, { "epoch": 0.08990807323606298, "grad_norm": 4.830707550048828, "learning_rate": 1.8201872462853547e-05, "loss": 0.2833, "step": 53000 }, { "epoch": 0.09007771111009329, "grad_norm": 3.985246181488037, "learning_rate": 1.8198479705372943e-05, "loss": 0.293, "step": 53100 }, { "epoch": 0.0902473489841236, "grad_norm": 4.136169910430908, "learning_rate": 1.8195086947892335e-05, "loss": 0.2961, "step": 53200 }, { "epoch": 0.0904169868581539, "grad_norm": 5.470151901245117, "learning_rate": 1.819169419041173e-05, "loss": 0.2925, "step": 53300 }, { "epoch": 0.0905866247321842, "grad_norm": 5.676898956298828, "learning_rate": 1.8188301432931122e-05, "loss": 0.2865, "step": 53400 }, { "epoch": 0.09075626260621451, "grad_norm": 3.1361026763916016, "learning_rate": 1.8184908675450518e-05, "loss": 0.2751, "step": 53500 }, { "epoch": 0.09092590048024482, "grad_norm": 1.8005691766738892, "learning_rate": 1.8181515917969913e-05, "loss": 0.2965, "step": 53600 }, { "epoch": 0.09109553835427513, "grad_norm": 15.075016021728516, "learning_rate": 1.8178123160489305e-05, "loss": 0.2888, "step": 53700 }, { "epoch": 0.09126517622830543, "grad_norm": 4.634654998779297, "learning_rate": 1.8174730403008697e-05, "loss": 0.2863, "step": 53800 }, { "epoch": 0.09143481410233574, "grad_norm": 4.154201507568359, "learning_rate": 1.8171337645528093e-05, "loss": 0.2772, "step": 53900 }, { "epoch": 0.09160445197636605, "grad_norm": 4.881275177001953, "learning_rate": 1.8167944888047485e-05, "loss": 0.2808, "step": 54000 }, { "epoch": 0.09177408985039635, "grad_norm": 4.617827415466309, "learning_rate": 1.816455213056688e-05, "loss": 0.276, "step": 54100 }, { "epoch": 0.09194372772442666, "grad_norm": 3.384646415710449, "learning_rate": 1.8161159373086276e-05, "loss": 0.2814, "step": 54200 }, { "epoch": 0.09211336559845697, "grad_norm": 5.730099678039551, "learning_rate": 1.8157766615605668e-05, "loss": 0.293, "step": 54300 }, { "epoch": 0.09228300347248729, "grad_norm": 5.290204048156738, "learning_rate": 1.8154373858125064e-05, "loss": 0.2852, "step": 54400 }, { "epoch": 0.0924526413465176, "grad_norm": 7.329472541809082, "learning_rate": 1.8150981100644456e-05, "loss": 0.2931, "step": 54500 }, { "epoch": 0.0926222792205479, "grad_norm": 6.152427673339844, "learning_rate": 1.8147588343163848e-05, "loss": 0.2883, "step": 54600 }, { "epoch": 0.09279191709457821, "grad_norm": 3.917278528213501, "learning_rate": 1.8144195585683243e-05, "loss": 0.2809, "step": 54700 }, { "epoch": 0.09296155496860851, "grad_norm": 6.414370059967041, "learning_rate": 1.814080282820264e-05, "loss": 0.2804, "step": 54800 }, { "epoch": 0.09313119284263882, "grad_norm": 5.811496257781982, "learning_rate": 1.813741007072203e-05, "loss": 0.2873, "step": 54900 }, { "epoch": 0.09330083071666913, "grad_norm": 8.334771156311035, "learning_rate": 1.8134017313241426e-05, "loss": 0.2997, "step": 55000 }, { "epoch": 0.09347046859069943, "grad_norm": 7.446629047393799, "learning_rate": 1.813062455576082e-05, "loss": 0.2979, "step": 55100 }, { "epoch": 0.09364010646472974, "grad_norm": 6.15883731842041, "learning_rate": 1.8127231798280214e-05, "loss": 0.278, "step": 55200 }, { "epoch": 0.09380974433876005, "grad_norm": 11.939590454101562, "learning_rate": 1.8123839040799606e-05, "loss": 0.2836, "step": 55300 }, { "epoch": 0.09397938221279035, "grad_norm": 4.8299407958984375, "learning_rate": 1.8120446283318998e-05, "loss": 0.2744, "step": 55400 }, { "epoch": 0.09414902008682066, "grad_norm": 8.276083946228027, "learning_rate": 1.8117053525838394e-05, "loss": 0.2747, "step": 55500 }, { "epoch": 0.09431865796085097, "grad_norm": 3.7651185989379883, "learning_rate": 1.811366076835779e-05, "loss": 0.2793, "step": 55600 }, { "epoch": 0.09448829583488128, "grad_norm": 4.879868984222412, "learning_rate": 1.811026801087718e-05, "loss": 0.2871, "step": 55700 }, { "epoch": 0.09465793370891158, "grad_norm": 6.085641860961914, "learning_rate": 1.8106875253396577e-05, "loss": 0.2786, "step": 55800 }, { "epoch": 0.09482757158294189, "grad_norm": 4.736376762390137, "learning_rate": 1.810348249591597e-05, "loss": 0.2937, "step": 55900 }, { "epoch": 0.0949972094569722, "grad_norm": 3.436492681503296, "learning_rate": 1.810008973843536e-05, "loss": 0.275, "step": 56000 }, { "epoch": 0.0951668473310025, "grad_norm": 4.0492072105407715, "learning_rate": 1.8096696980954756e-05, "loss": 0.2846, "step": 56100 }, { "epoch": 0.09533648520503282, "grad_norm": 4.218546390533447, "learning_rate": 1.8093304223474152e-05, "loss": 0.3023, "step": 56200 }, { "epoch": 0.09550612307906313, "grad_norm": 2.415118932723999, "learning_rate": 1.8089911465993547e-05, "loss": 0.2893, "step": 56300 }, { "epoch": 0.09567576095309344, "grad_norm": 6.746696472167969, "learning_rate": 1.808651870851294e-05, "loss": 0.2787, "step": 56400 }, { "epoch": 0.09584539882712374, "grad_norm": 6.815089225769043, "learning_rate": 1.808312595103233e-05, "loss": 0.2724, "step": 56500 }, { "epoch": 0.09601503670115405, "grad_norm": 4.8559250831604, "learning_rate": 1.8079733193551727e-05, "loss": 0.2818, "step": 56600 }, { "epoch": 0.09618467457518436, "grad_norm": 5.159215927124023, "learning_rate": 1.807634043607112e-05, "loss": 0.2686, "step": 56700 }, { "epoch": 0.09635431244921466, "grad_norm": 6.286808490753174, "learning_rate": 1.8072947678590515e-05, "loss": 0.2942, "step": 56800 }, { "epoch": 0.09652395032324497, "grad_norm": 9.618817329406738, "learning_rate": 1.806955492110991e-05, "loss": 0.3073, "step": 56900 }, { "epoch": 0.09669358819727528, "grad_norm": 7.026090621948242, "learning_rate": 1.8066162163629302e-05, "loss": 0.2834, "step": 57000 }, { "epoch": 0.09686322607130558, "grad_norm": 7.144518852233887, "learning_rate": 1.8062769406148698e-05, "loss": 0.2941, "step": 57100 }, { "epoch": 0.09703286394533589, "grad_norm": 3.5571937561035156, "learning_rate": 1.805937664866809e-05, "loss": 0.2766, "step": 57200 }, { "epoch": 0.0972025018193662, "grad_norm": 2.6627674102783203, "learning_rate": 1.8055983891187482e-05, "loss": 0.2935, "step": 57300 }, { "epoch": 0.0973721396933965, "grad_norm": 4.254127025604248, "learning_rate": 1.8052591133706877e-05, "loss": 0.2753, "step": 57400 }, { "epoch": 0.09754177756742681, "grad_norm": 5.155171871185303, "learning_rate": 1.804919837622627e-05, "loss": 0.2826, "step": 57500 }, { "epoch": 0.09771141544145712, "grad_norm": 5.0977654457092285, "learning_rate": 1.8045805618745665e-05, "loss": 0.2823, "step": 57600 }, { "epoch": 0.09788105331548742, "grad_norm": 2.921365261077881, "learning_rate": 1.804241286126506e-05, "loss": 0.2872, "step": 57700 }, { "epoch": 0.09805069118951773, "grad_norm": 5.515784740447998, "learning_rate": 1.8039020103784453e-05, "loss": 0.2865, "step": 57800 }, { "epoch": 0.09822032906354805, "grad_norm": 4.5168776512146, "learning_rate": 1.8035627346303845e-05, "loss": 0.2919, "step": 57900 }, { "epoch": 0.09838996693757836, "grad_norm": 2.7508063316345215, "learning_rate": 1.803223458882324e-05, "loss": 0.2631, "step": 58000 }, { "epoch": 0.09855960481160866, "grad_norm": 3.908681631088257, "learning_rate": 1.8028841831342632e-05, "loss": 0.2829, "step": 58100 }, { "epoch": 0.09872924268563897, "grad_norm": 4.71787166595459, "learning_rate": 1.8025449073862028e-05, "loss": 0.2784, "step": 58200 }, { "epoch": 0.09889888055966928, "grad_norm": 3.3572702407836914, "learning_rate": 1.8022056316381423e-05, "loss": 0.2823, "step": 58300 }, { "epoch": 0.09906851843369958, "grad_norm": 6.225683689117432, "learning_rate": 1.8018663558900815e-05, "loss": 0.2779, "step": 58400 }, { "epoch": 0.09923815630772989, "grad_norm": 4.050034046173096, "learning_rate": 1.801527080142021e-05, "loss": 0.2575, "step": 58500 }, { "epoch": 0.0994077941817602, "grad_norm": 4.063872337341309, "learning_rate": 1.8011878043939603e-05, "loss": 0.2857, "step": 58600 }, { "epoch": 0.0995774320557905, "grad_norm": 4.027149677276611, "learning_rate": 1.8008485286458995e-05, "loss": 0.2852, "step": 58700 }, { "epoch": 0.09974706992982081, "grad_norm": 1.8193323612213135, "learning_rate": 1.800509252897839e-05, "loss": 0.2891, "step": 58800 }, { "epoch": 0.09991670780385112, "grad_norm": 4.918915748596191, "learning_rate": 1.8001699771497786e-05, "loss": 0.2981, "step": 58900 }, { "epoch": 0.10008634567788142, "grad_norm": 7.531496047973633, "learning_rate": 1.799830701401718e-05, "loss": 0.272, "step": 59000 }, { "epoch": 0.10025598355191173, "grad_norm": 4.3996381759643555, "learning_rate": 1.7994914256536574e-05, "loss": 0.2884, "step": 59100 }, { "epoch": 0.10042562142594204, "grad_norm": 4.291267395019531, "learning_rate": 1.7991521499055966e-05, "loss": 0.2743, "step": 59200 }, { "epoch": 0.10059525929997234, "grad_norm": 5.597812652587891, "learning_rate": 1.798812874157536e-05, "loss": 0.2909, "step": 59300 }, { "epoch": 0.10076489717400265, "grad_norm": 5.40132999420166, "learning_rate": 1.7984735984094753e-05, "loss": 0.2729, "step": 59400 }, { "epoch": 0.10093453504803296, "grad_norm": 4.744465351104736, "learning_rate": 1.798134322661415e-05, "loss": 0.3006, "step": 59500 }, { "epoch": 0.10110417292206327, "grad_norm": 4.616202354431152, "learning_rate": 1.7977950469133544e-05, "loss": 0.2809, "step": 59600 }, { "epoch": 0.10127381079609359, "grad_norm": 3.3520281314849854, "learning_rate": 1.7974557711652936e-05, "loss": 0.2905, "step": 59700 }, { "epoch": 0.10144344867012389, "grad_norm": 7.124616622924805, "learning_rate": 1.797116495417233e-05, "loss": 0.2875, "step": 59800 }, { "epoch": 0.1016130865441542, "grad_norm": 9.273002624511719, "learning_rate": 1.7967772196691724e-05, "loss": 0.2796, "step": 59900 }, { "epoch": 0.1017827244181845, "grad_norm": 4.855507850646973, "learning_rate": 1.7964379439211116e-05, "loss": 0.2805, "step": 60000 }, { "epoch": 0.10195236229221481, "grad_norm": 7.667230129241943, "learning_rate": 1.796098668173051e-05, "loss": 0.2651, "step": 60100 }, { "epoch": 0.10212200016624512, "grad_norm": 3.4390757083892822, "learning_rate": 1.7957593924249904e-05, "loss": 0.2802, "step": 60200 }, { "epoch": 0.10229163804027543, "grad_norm": 6.194849014282227, "learning_rate": 1.79542011667693e-05, "loss": 0.2884, "step": 60300 }, { "epoch": 0.10246127591430573, "grad_norm": 4.031449794769287, "learning_rate": 1.7950808409288695e-05, "loss": 0.2844, "step": 60400 }, { "epoch": 0.10263091378833604, "grad_norm": 8.769570350646973, "learning_rate": 1.7947415651808087e-05, "loss": 0.2647, "step": 60500 }, { "epoch": 0.10280055166236635, "grad_norm": 2.8384315967559814, "learning_rate": 1.794402289432748e-05, "loss": 0.2837, "step": 60600 }, { "epoch": 0.10297018953639665, "grad_norm": 3.9961097240448, "learning_rate": 1.7940630136846874e-05, "loss": 0.2774, "step": 60700 }, { "epoch": 0.10313982741042696, "grad_norm": 6.84942102432251, "learning_rate": 1.7937237379366267e-05, "loss": 0.2663, "step": 60800 }, { "epoch": 0.10330946528445727, "grad_norm": 3.410268783569336, "learning_rate": 1.7933844621885662e-05, "loss": 0.2884, "step": 60900 }, { "epoch": 0.10347910315848757, "grad_norm": 5.257760524749756, "learning_rate": 1.7930451864405057e-05, "loss": 0.2739, "step": 61000 }, { "epoch": 0.10364874103251788, "grad_norm": 4.672818183898926, "learning_rate": 1.792705910692445e-05, "loss": 0.2938, "step": 61100 }, { "epoch": 0.10381837890654819, "grad_norm": 3.6532907485961914, "learning_rate": 1.7923666349443845e-05, "loss": 0.308, "step": 61200 }, { "epoch": 0.1039880167805785, "grad_norm": 4.358036518096924, "learning_rate": 1.7920273591963237e-05, "loss": 0.2688, "step": 61300 }, { "epoch": 0.1041576546546088, "grad_norm": 3.6627635955810547, "learning_rate": 1.791688083448263e-05, "loss": 0.2937, "step": 61400 }, { "epoch": 0.10432729252863912, "grad_norm": 4.015224456787109, "learning_rate": 1.7913488077002025e-05, "loss": 0.2829, "step": 61500 }, { "epoch": 0.10449693040266943, "grad_norm": 5.906428337097168, "learning_rate": 1.791009531952142e-05, "loss": 0.2848, "step": 61600 }, { "epoch": 0.10466656827669973, "grad_norm": 4.195382595062256, "learning_rate": 1.7906702562040812e-05, "loss": 0.2764, "step": 61700 }, { "epoch": 0.10483620615073004, "grad_norm": 5.231374740600586, "learning_rate": 1.7903309804560208e-05, "loss": 0.2725, "step": 61800 }, { "epoch": 0.10500584402476035, "grad_norm": 4.967025279998779, "learning_rate": 1.78999170470796e-05, "loss": 0.2703, "step": 61900 }, { "epoch": 0.10517548189879065, "grad_norm": 3.840085506439209, "learning_rate": 1.7896524289598995e-05, "loss": 0.3005, "step": 62000 }, { "epoch": 0.10534511977282096, "grad_norm": 6.131738662719727, "learning_rate": 1.7893131532118388e-05, "loss": 0.2926, "step": 62100 }, { "epoch": 0.10551475764685127, "grad_norm": 4.975151538848877, "learning_rate": 1.7889738774637783e-05, "loss": 0.2677, "step": 62200 }, { "epoch": 0.10568439552088157, "grad_norm": 3.6437010765075684, "learning_rate": 1.7886346017157175e-05, "loss": 0.2942, "step": 62300 }, { "epoch": 0.10585403339491188, "grad_norm": 6.80495023727417, "learning_rate": 1.788295325967657e-05, "loss": 0.2802, "step": 62400 }, { "epoch": 0.10602367126894219, "grad_norm": 7.215899467468262, "learning_rate": 1.7879560502195963e-05, "loss": 0.2833, "step": 62500 }, { "epoch": 0.1061933091429725, "grad_norm": 4.794147491455078, "learning_rate": 1.7876167744715358e-05, "loss": 0.2663, "step": 62600 }, { "epoch": 0.1063629470170028, "grad_norm": 3.0074660778045654, "learning_rate": 1.787277498723475e-05, "loss": 0.268, "step": 62700 }, { "epoch": 0.10653258489103311, "grad_norm": 3.389073610305786, "learning_rate": 1.7869382229754146e-05, "loss": 0.2932, "step": 62800 }, { "epoch": 0.10670222276506341, "grad_norm": 5.789154052734375, "learning_rate": 1.7865989472273538e-05, "loss": 0.2869, "step": 62900 }, { "epoch": 0.10687186063909372, "grad_norm": 5.939857006072998, "learning_rate": 1.7862596714792933e-05, "loss": 0.2829, "step": 63000 }, { "epoch": 0.10704149851312403, "grad_norm": 7.591058254241943, "learning_rate": 1.785920395731233e-05, "loss": 0.2665, "step": 63100 }, { "epoch": 0.10721113638715433, "grad_norm": 1.716158390045166, "learning_rate": 1.785581119983172e-05, "loss": 0.2783, "step": 63200 }, { "epoch": 0.10738077426118466, "grad_norm": 4.78238582611084, "learning_rate": 1.7852418442351113e-05, "loss": 0.2952, "step": 63300 }, { "epoch": 0.10755041213521496, "grad_norm": 4.116053104400635, "learning_rate": 1.784902568487051e-05, "loss": 0.2692, "step": 63400 }, { "epoch": 0.10772005000924527, "grad_norm": 2.9127182960510254, "learning_rate": 1.78456329273899e-05, "loss": 0.2834, "step": 63500 }, { "epoch": 0.10788968788327558, "grad_norm": 5.452341556549072, "learning_rate": 1.7842240169909296e-05, "loss": 0.2659, "step": 63600 }, { "epoch": 0.10805932575730588, "grad_norm": 4.589710712432861, "learning_rate": 1.783884741242869e-05, "loss": 0.2633, "step": 63700 }, { "epoch": 0.10822896363133619, "grad_norm": 2.202305793762207, "learning_rate": 1.7835454654948084e-05, "loss": 0.2813, "step": 63800 }, { "epoch": 0.1083986015053665, "grad_norm": 5.467067718505859, "learning_rate": 1.783206189746748e-05, "loss": 0.2845, "step": 63900 }, { "epoch": 0.1085682393793968, "grad_norm": 4.6034159660339355, "learning_rate": 1.782866913998687e-05, "loss": 0.2881, "step": 64000 }, { "epoch": 0.10873787725342711, "grad_norm": 3.389188051223755, "learning_rate": 1.7825276382506263e-05, "loss": 0.2599, "step": 64100 }, { "epoch": 0.10890751512745742, "grad_norm": 6.8698201179504395, "learning_rate": 1.782188362502566e-05, "loss": 0.2899, "step": 64200 }, { "epoch": 0.10907715300148772, "grad_norm": 3.2493276596069336, "learning_rate": 1.7818490867545054e-05, "loss": 0.2864, "step": 64300 }, { "epoch": 0.10924679087551803, "grad_norm": 3.7744410037994385, "learning_rate": 1.7815098110064447e-05, "loss": 0.2764, "step": 64400 }, { "epoch": 0.10941642874954834, "grad_norm": 3.881692409515381, "learning_rate": 1.7811705352583842e-05, "loss": 0.2863, "step": 64500 }, { "epoch": 0.10958606662357864, "grad_norm": 6.059319019317627, "learning_rate": 1.7808312595103234e-05, "loss": 0.2645, "step": 64600 }, { "epoch": 0.10975570449760895, "grad_norm": 5.593514919281006, "learning_rate": 1.780491983762263e-05, "loss": 0.2825, "step": 64700 }, { "epoch": 0.10992534237163926, "grad_norm": 3.6071717739105225, "learning_rate": 1.7801527080142022e-05, "loss": 0.2713, "step": 64800 }, { "epoch": 0.11009498024566956, "grad_norm": 5.902979850769043, "learning_rate": 1.7798134322661417e-05, "loss": 0.2776, "step": 64900 }, { "epoch": 0.11026461811969987, "grad_norm": 5.50481653213501, "learning_rate": 1.779474156518081e-05, "loss": 0.2853, "step": 65000 }, { "epoch": 0.11043425599373019, "grad_norm": 4.222114086151123, "learning_rate": 1.7791348807700205e-05, "loss": 0.283, "step": 65100 }, { "epoch": 0.1106038938677605, "grad_norm": 5.482635974884033, "learning_rate": 1.7787956050219597e-05, "loss": 0.2716, "step": 65200 }, { "epoch": 0.1107735317417908, "grad_norm": 5.70487642288208, "learning_rate": 1.7784563292738992e-05, "loss": 0.2777, "step": 65300 }, { "epoch": 0.11094316961582111, "grad_norm": 4.768909931182861, "learning_rate": 1.7781170535258384e-05, "loss": 0.2771, "step": 65400 }, { "epoch": 0.11111280748985142, "grad_norm": 5.423692226409912, "learning_rate": 1.7777777777777777e-05, "loss": 0.2838, "step": 65500 }, { "epoch": 0.11128244536388172, "grad_norm": 4.154362678527832, "learning_rate": 1.7774385020297172e-05, "loss": 0.2893, "step": 65600 }, { "epoch": 0.11145208323791203, "grad_norm": 4.7547078132629395, "learning_rate": 1.7770992262816568e-05, "loss": 0.2841, "step": 65700 }, { "epoch": 0.11162172111194234, "grad_norm": 6.757493019104004, "learning_rate": 1.7767599505335963e-05, "loss": 0.2821, "step": 65800 }, { "epoch": 0.11179135898597264, "grad_norm": 4.125169277191162, "learning_rate": 1.7764206747855355e-05, "loss": 0.2829, "step": 65900 }, { "epoch": 0.11196099686000295, "grad_norm": 5.13736629486084, "learning_rate": 1.7760813990374747e-05, "loss": 0.2985, "step": 66000 }, { "epoch": 0.11213063473403326, "grad_norm": 2.8427257537841797, "learning_rate": 1.7757421232894143e-05, "loss": 0.2727, "step": 66100 }, { "epoch": 0.11230027260806356, "grad_norm": 3.1068708896636963, "learning_rate": 1.7754028475413535e-05, "loss": 0.2907, "step": 66200 }, { "epoch": 0.11246991048209387, "grad_norm": 4.969484806060791, "learning_rate": 1.775063571793293e-05, "loss": 0.2821, "step": 66300 }, { "epoch": 0.11263954835612418, "grad_norm": 4.892472267150879, "learning_rate": 1.7747242960452326e-05, "loss": 0.2889, "step": 66400 }, { "epoch": 0.11280918623015448, "grad_norm": 6.956464767456055, "learning_rate": 1.7743850202971718e-05, "loss": 0.2734, "step": 66500 }, { "epoch": 0.11297882410418479, "grad_norm": 5.509664058685303, "learning_rate": 1.774045744549111e-05, "loss": 0.2921, "step": 66600 }, { "epoch": 0.1131484619782151, "grad_norm": 6.236106872558594, "learning_rate": 1.7737064688010506e-05, "loss": 0.3021, "step": 66700 }, { "epoch": 0.11331809985224542, "grad_norm": 7.449003219604492, "learning_rate": 1.7733671930529898e-05, "loss": 0.2833, "step": 66800 }, { "epoch": 0.11348773772627573, "grad_norm": 4.284242630004883, "learning_rate": 1.7730279173049293e-05, "loss": 0.2767, "step": 66900 }, { "epoch": 0.11365737560030603, "grad_norm": 2.3094496726989746, "learning_rate": 1.772688641556869e-05, "loss": 0.2792, "step": 67000 }, { "epoch": 0.11382701347433634, "grad_norm": 6.687010765075684, "learning_rate": 1.772349365808808e-05, "loss": 0.2617, "step": 67100 }, { "epoch": 0.11399665134836665, "grad_norm": 4.366420745849609, "learning_rate": 1.7720100900607476e-05, "loss": 0.277, "step": 67200 }, { "epoch": 0.11416628922239695, "grad_norm": 5.828210353851318, "learning_rate": 1.7716708143126868e-05, "loss": 0.2735, "step": 67300 }, { "epoch": 0.11433592709642726, "grad_norm": 2.8380126953125, "learning_rate": 1.771331538564626e-05, "loss": 0.2818, "step": 67400 }, { "epoch": 0.11450556497045757, "grad_norm": 7.806057929992676, "learning_rate": 1.7709922628165656e-05, "loss": 0.2837, "step": 67500 }, { "epoch": 0.11467520284448787, "grad_norm": 3.1440675258636475, "learning_rate": 1.770652987068505e-05, "loss": 0.2895, "step": 67600 }, { "epoch": 0.11484484071851818, "grad_norm": 3.1279137134552, "learning_rate": 1.7703137113204443e-05, "loss": 0.2809, "step": 67700 }, { "epoch": 0.11501447859254849, "grad_norm": 3.8702638149261475, "learning_rate": 1.769974435572384e-05, "loss": 0.292, "step": 67800 }, { "epoch": 0.11518411646657879, "grad_norm": 5.378298282623291, "learning_rate": 1.769635159824323e-05, "loss": 0.2775, "step": 67900 }, { "epoch": 0.1153537543406091, "grad_norm": 5.249693870544434, "learning_rate": 1.7692958840762627e-05, "loss": 0.295, "step": 68000 }, { "epoch": 0.1155233922146394, "grad_norm": 4.038222789764404, "learning_rate": 1.768956608328202e-05, "loss": 0.2844, "step": 68100 }, { "epoch": 0.11569303008866971, "grad_norm": 7.189091205596924, "learning_rate": 1.768617332580141e-05, "loss": 0.2743, "step": 68200 }, { "epoch": 0.11586266796270002, "grad_norm": 5.0678486824035645, "learning_rate": 1.7682780568320806e-05, "loss": 0.2706, "step": 68300 }, { "epoch": 0.11603230583673033, "grad_norm": 9.859848022460938, "learning_rate": 1.7679387810840202e-05, "loss": 0.2458, "step": 68400 }, { "epoch": 0.11620194371076063, "grad_norm": 2.7910306453704834, "learning_rate": 1.7675995053359594e-05, "loss": 0.2821, "step": 68500 }, { "epoch": 0.11637158158479095, "grad_norm": 5.158913612365723, "learning_rate": 1.767260229587899e-05, "loss": 0.2745, "step": 68600 }, { "epoch": 0.11654121945882126, "grad_norm": 5.154831886291504, "learning_rate": 1.766920953839838e-05, "loss": 0.2772, "step": 68700 }, { "epoch": 0.11671085733285157, "grad_norm": 5.805807113647461, "learning_rate": 1.7665816780917777e-05, "loss": 0.2774, "step": 68800 }, { "epoch": 0.11688049520688187, "grad_norm": 4.757166862487793, "learning_rate": 1.766242402343717e-05, "loss": 0.2722, "step": 68900 }, { "epoch": 0.11705013308091218, "grad_norm": 4.450891971588135, "learning_rate": 1.7659031265956565e-05, "loss": 0.2931, "step": 69000 }, { "epoch": 0.11721977095494249, "grad_norm": 5.723304748535156, "learning_rate": 1.765563850847596e-05, "loss": 0.2698, "step": 69100 }, { "epoch": 0.1173894088289728, "grad_norm": 4.780496120452881, "learning_rate": 1.7652245750995352e-05, "loss": 0.2779, "step": 69200 }, { "epoch": 0.1175590467030031, "grad_norm": 10.823837280273438, "learning_rate": 1.7648852993514744e-05, "loss": 0.2724, "step": 69300 }, { "epoch": 0.11772868457703341, "grad_norm": 9.890966415405273, "learning_rate": 1.764546023603414e-05, "loss": 0.2557, "step": 69400 }, { "epoch": 0.11789832245106371, "grad_norm": 4.8737287521362305, "learning_rate": 1.7642067478553532e-05, "loss": 0.273, "step": 69500 }, { "epoch": 0.11806796032509402, "grad_norm": 5.231362819671631, "learning_rate": 1.7638674721072927e-05, "loss": 0.2746, "step": 69600 }, { "epoch": 0.11823759819912433, "grad_norm": 4.479855537414551, "learning_rate": 1.7635281963592323e-05, "loss": 0.2965, "step": 69700 }, { "epoch": 0.11840723607315463, "grad_norm": 2.6694321632385254, "learning_rate": 1.7631889206111715e-05, "loss": 0.2836, "step": 69800 }, { "epoch": 0.11857687394718494, "grad_norm": 4.519138813018799, "learning_rate": 1.762849644863111e-05, "loss": 0.2871, "step": 69900 }, { "epoch": 0.11874651182121525, "grad_norm": 2.5001416206359863, "learning_rate": 1.7625103691150502e-05, "loss": 0.2807, "step": 70000 }, { "epoch": 0.11891614969524555, "grad_norm": 4.422288417816162, "learning_rate": 1.7621710933669895e-05, "loss": 0.2762, "step": 70100 }, { "epoch": 0.11908578756927586, "grad_norm": 4.70911979675293, "learning_rate": 1.761831817618929e-05, "loss": 0.2765, "step": 70200 }, { "epoch": 0.11925542544330617, "grad_norm": 5.276724338531494, "learning_rate": 1.7614925418708682e-05, "loss": 0.278, "step": 70300 }, { "epoch": 0.11942506331733649, "grad_norm": 3.9933576583862305, "learning_rate": 1.7611532661228078e-05, "loss": 0.2591, "step": 70400 }, { "epoch": 0.1195947011913668, "grad_norm": 5.400506496429443, "learning_rate": 1.7608139903747473e-05, "loss": 0.2609, "step": 70500 }, { "epoch": 0.1197643390653971, "grad_norm": 3.4139840602874756, "learning_rate": 1.7604747146266865e-05, "loss": 0.2993, "step": 70600 }, { "epoch": 0.11993397693942741, "grad_norm": 5.219585418701172, "learning_rate": 1.760135438878626e-05, "loss": 0.27, "step": 70700 }, { "epoch": 0.12010361481345772, "grad_norm": 5.228090763092041, "learning_rate": 1.7597961631305653e-05, "loss": 0.2628, "step": 70800 }, { "epoch": 0.12027325268748802, "grad_norm": 4.593550682067871, "learning_rate": 1.7594568873825045e-05, "loss": 0.2762, "step": 70900 }, { "epoch": 0.12044289056151833, "grad_norm": 5.032214164733887, "learning_rate": 1.759117611634444e-05, "loss": 0.2738, "step": 71000 }, { "epoch": 0.12061252843554864, "grad_norm": 2.1336002349853516, "learning_rate": 1.7587783358863836e-05, "loss": 0.2901, "step": 71100 }, { "epoch": 0.12078216630957894, "grad_norm": 4.663808345794678, "learning_rate": 1.7584390601383228e-05, "loss": 0.2941, "step": 71200 }, { "epoch": 0.12095180418360925, "grad_norm": 3.9439005851745605, "learning_rate": 1.7580997843902624e-05, "loss": 0.2802, "step": 71300 }, { "epoch": 0.12112144205763956, "grad_norm": 3.9431374073028564, "learning_rate": 1.7577605086422016e-05, "loss": 0.2897, "step": 71400 }, { "epoch": 0.12129107993166986, "grad_norm": 4.900402545928955, "learning_rate": 1.757421232894141e-05, "loss": 0.2922, "step": 71500 }, { "epoch": 0.12146071780570017, "grad_norm": 5.03627347946167, "learning_rate": 1.7570819571460803e-05, "loss": 0.2808, "step": 71600 }, { "epoch": 0.12163035567973048, "grad_norm": 3.5433876514434814, "learning_rate": 1.75674268139802e-05, "loss": 0.2741, "step": 71700 }, { "epoch": 0.12179999355376078, "grad_norm": 5.870200157165527, "learning_rate": 1.7564034056499594e-05, "loss": 0.2741, "step": 71800 }, { "epoch": 0.12196963142779109, "grad_norm": 4.990780353546143, "learning_rate": 1.7560641299018986e-05, "loss": 0.284, "step": 71900 }, { "epoch": 0.1221392693018214, "grad_norm": 6.303432941436768, "learning_rate": 1.755724854153838e-05, "loss": 0.279, "step": 72000 }, { "epoch": 0.1223089071758517, "grad_norm": 3.6554417610168457, "learning_rate": 1.7553855784057774e-05, "loss": 0.2742, "step": 72100 }, { "epoch": 0.12247854504988202, "grad_norm": 6.174529552459717, "learning_rate": 1.7550463026577166e-05, "loss": 0.2946, "step": 72200 }, { "epoch": 0.12264818292391233, "grad_norm": 3.8030030727386475, "learning_rate": 1.754707026909656e-05, "loss": 0.2725, "step": 72300 }, { "epoch": 0.12281782079794264, "grad_norm": 5.218021392822266, "learning_rate": 1.7543677511615954e-05, "loss": 0.2641, "step": 72400 }, { "epoch": 0.12298745867197294, "grad_norm": 4.7851786613464355, "learning_rate": 1.754028475413535e-05, "loss": 0.2811, "step": 72500 }, { "epoch": 0.12315709654600325, "grad_norm": 4.218992710113525, "learning_rate": 1.7536891996654745e-05, "loss": 0.287, "step": 72600 }, { "epoch": 0.12332673442003356, "grad_norm": 8.209569931030273, "learning_rate": 1.7533499239174137e-05, "loss": 0.283, "step": 72700 }, { "epoch": 0.12349637229406386, "grad_norm": 6.67339563369751, "learning_rate": 1.753010648169353e-05, "loss": 0.2742, "step": 72800 }, { "epoch": 0.12366601016809417, "grad_norm": 3.892375946044922, "learning_rate": 1.7526713724212924e-05, "loss": 0.2795, "step": 72900 }, { "epoch": 0.12383564804212448, "grad_norm": 5.586345672607422, "learning_rate": 1.7523320966732316e-05, "loss": 0.2774, "step": 73000 }, { "epoch": 0.12400528591615478, "grad_norm": 4.74489164352417, "learning_rate": 1.7519928209251712e-05, "loss": 0.2667, "step": 73100 }, { "epoch": 0.12417492379018509, "grad_norm": 12.517495155334473, "learning_rate": 1.7516535451771107e-05, "loss": 0.2684, "step": 73200 }, { "epoch": 0.1243445616642154, "grad_norm": 5.093175411224365, "learning_rate": 1.75131426942905e-05, "loss": 0.2815, "step": 73300 }, { "epoch": 0.1245141995382457, "grad_norm": 5.885904312133789, "learning_rate": 1.7509749936809895e-05, "loss": 0.28, "step": 73400 }, { "epoch": 0.12468383741227601, "grad_norm": 4.632839679718018, "learning_rate": 1.7506357179329287e-05, "loss": 0.2579, "step": 73500 }, { "epoch": 0.12485347528630632, "grad_norm": 5.857240676879883, "learning_rate": 1.750296442184868e-05, "loss": 0.2858, "step": 73600 }, { "epoch": 0.12502311316033662, "grad_norm": 3.141610860824585, "learning_rate": 1.7499571664368075e-05, "loss": 0.2842, "step": 73700 }, { "epoch": 0.12519275103436694, "grad_norm": 6.757940769195557, "learning_rate": 1.749617890688747e-05, "loss": 0.2912, "step": 73800 }, { "epoch": 0.12536238890839724, "grad_norm": 4.926107406616211, "learning_rate": 1.7492786149406862e-05, "loss": 0.2786, "step": 73900 }, { "epoch": 0.12553202678242756, "grad_norm": 5.700347900390625, "learning_rate": 1.7489393391926258e-05, "loss": 0.2801, "step": 74000 }, { "epoch": 0.12570166465645785, "grad_norm": 11.877630233764648, "learning_rate": 1.748600063444565e-05, "loss": 0.2633, "step": 74100 }, { "epoch": 0.12587130253048817, "grad_norm": 6.468028545379639, "learning_rate": 1.7482607876965042e-05, "loss": 0.2637, "step": 74200 }, { "epoch": 0.12604094040451846, "grad_norm": 5.010571002960205, "learning_rate": 1.7479215119484437e-05, "loss": 0.2705, "step": 74300 }, { "epoch": 0.12621057827854879, "grad_norm": 3.2688815593719482, "learning_rate": 1.7475822362003833e-05, "loss": 0.2733, "step": 74400 }, { "epoch": 0.12638021615257908, "grad_norm": 2.26993465423584, "learning_rate": 1.747242960452323e-05, "loss": 0.2703, "step": 74500 }, { "epoch": 0.1265498540266094, "grad_norm": 6.457152843475342, "learning_rate": 1.746903684704262e-05, "loss": 0.2737, "step": 74600 }, { "epoch": 0.1267194919006397, "grad_norm": 2.6338016986846924, "learning_rate": 1.7465644089562013e-05, "loss": 0.2938, "step": 74700 }, { "epoch": 0.12688912977467, "grad_norm": 6.921136379241943, "learning_rate": 1.7462251332081408e-05, "loss": 0.2716, "step": 74800 }, { "epoch": 0.12705876764870033, "grad_norm": 9.977437019348145, "learning_rate": 1.74588585746008e-05, "loss": 0.261, "step": 74900 }, { "epoch": 0.12722840552273063, "grad_norm": 6.114049911499023, "learning_rate": 1.7455465817120196e-05, "loss": 0.2824, "step": 75000 }, { "epoch": 0.12739804339676095, "grad_norm": 9.968103408813477, "learning_rate": 1.7452073059639588e-05, "loss": 0.2899, "step": 75100 }, { "epoch": 0.12756768127079124, "grad_norm": 6.186594009399414, "learning_rate": 1.7448680302158983e-05, "loss": 0.2733, "step": 75200 }, { "epoch": 0.12773731914482156, "grad_norm": 4.024326324462891, "learning_rate": 1.744528754467838e-05, "loss": 0.2965, "step": 75300 }, { "epoch": 0.12790695701885185, "grad_norm": 5.572375297546387, "learning_rate": 1.744189478719777e-05, "loss": 0.2883, "step": 75400 }, { "epoch": 0.12807659489288217, "grad_norm": 3.497793674468994, "learning_rate": 1.7438502029717163e-05, "loss": 0.285, "step": 75500 }, { "epoch": 0.12824623276691247, "grad_norm": 3.2272682189941406, "learning_rate": 1.743510927223656e-05, "loss": 0.284, "step": 75600 }, { "epoch": 0.1284158706409428, "grad_norm": 5.746441841125488, "learning_rate": 1.743171651475595e-05, "loss": 0.2549, "step": 75700 }, { "epoch": 0.12858550851497308, "grad_norm": 5.194096565246582, "learning_rate": 1.7428323757275346e-05, "loss": 0.2639, "step": 75800 }, { "epoch": 0.1287551463890034, "grad_norm": 3.1811935901641846, "learning_rate": 1.742493099979474e-05, "loss": 0.2776, "step": 75900 }, { "epoch": 0.1289247842630337, "grad_norm": 8.586530685424805, "learning_rate": 1.7421538242314134e-05, "loss": 0.2626, "step": 76000 }, { "epoch": 0.129094422137064, "grad_norm": 8.150378227233887, "learning_rate": 1.7418145484833526e-05, "loss": 0.2906, "step": 76100 }, { "epoch": 0.1292640600110943, "grad_norm": 7.653988838195801, "learning_rate": 1.741475272735292e-05, "loss": 0.2761, "step": 76200 }, { "epoch": 0.12943369788512463, "grad_norm": 3.3724448680877686, "learning_rate": 1.7411359969872313e-05, "loss": 0.2947, "step": 76300 }, { "epoch": 0.12960333575915492, "grad_norm": 3.9929888248443604, "learning_rate": 1.740796721239171e-05, "loss": 0.2895, "step": 76400 }, { "epoch": 0.12977297363318524, "grad_norm": 5.627993106842041, "learning_rate": 1.7404574454911104e-05, "loss": 0.263, "step": 76500 }, { "epoch": 0.12994261150721556, "grad_norm": 5.319175720214844, "learning_rate": 1.7401181697430496e-05, "loss": 0.2864, "step": 76600 }, { "epoch": 0.13011224938124585, "grad_norm": 2.819857120513916, "learning_rate": 1.7397788939949892e-05, "loss": 0.2631, "step": 76700 }, { "epoch": 0.13028188725527617, "grad_norm": 4.879464626312256, "learning_rate": 1.7394396182469284e-05, "loss": 0.2683, "step": 76800 }, { "epoch": 0.13045152512930647, "grad_norm": 4.682009696960449, "learning_rate": 1.7391003424988676e-05, "loss": 0.2848, "step": 76900 }, { "epoch": 0.1306211630033368, "grad_norm": 3.2094027996063232, "learning_rate": 1.738761066750807e-05, "loss": 0.2691, "step": 77000 }, { "epoch": 0.13079080087736708, "grad_norm": 4.905536651611328, "learning_rate": 1.7384217910027467e-05, "loss": 0.2752, "step": 77100 }, { "epoch": 0.1309604387513974, "grad_norm": 6.259279727935791, "learning_rate": 1.738082515254686e-05, "loss": 0.2826, "step": 77200 }, { "epoch": 0.1311300766254277, "grad_norm": 4.83374547958374, "learning_rate": 1.7377432395066255e-05, "loss": 0.2748, "step": 77300 }, { "epoch": 0.13129971449945801, "grad_norm": 5.76043701171875, "learning_rate": 1.7374039637585647e-05, "loss": 0.2677, "step": 77400 }, { "epoch": 0.1314693523734883, "grad_norm": 4.85543155670166, "learning_rate": 1.7370646880105042e-05, "loss": 0.2868, "step": 77500 }, { "epoch": 0.13163899024751863, "grad_norm": 5.413662433624268, "learning_rate": 1.7367254122624434e-05, "loss": 0.2848, "step": 77600 }, { "epoch": 0.13180862812154892, "grad_norm": 5.461532115936279, "learning_rate": 1.736386136514383e-05, "loss": 0.2712, "step": 77700 }, { "epoch": 0.13197826599557924, "grad_norm": 4.653634548187256, "learning_rate": 1.7360468607663222e-05, "loss": 0.2858, "step": 77800 }, { "epoch": 0.13214790386960953, "grad_norm": 6.804802894592285, "learning_rate": 1.7357075850182617e-05, "loss": 0.2614, "step": 77900 }, { "epoch": 0.13231754174363985, "grad_norm": 5.264156341552734, "learning_rate": 1.735368309270201e-05, "loss": 0.2689, "step": 78000 }, { "epoch": 0.13248717961767015, "grad_norm": 4.920713901519775, "learning_rate": 1.7350290335221405e-05, "loss": 0.2505, "step": 78100 }, { "epoch": 0.13265681749170047, "grad_norm": 5.65202522277832, "learning_rate": 1.7346897577740797e-05, "loss": 0.2631, "step": 78200 }, { "epoch": 0.13282645536573076, "grad_norm": 4.741088390350342, "learning_rate": 1.7343504820260193e-05, "loss": 0.2626, "step": 78300 }, { "epoch": 0.13299609323976108, "grad_norm": 3.497368574142456, "learning_rate": 1.7340112062779585e-05, "loss": 0.2697, "step": 78400 }, { "epoch": 0.1331657311137914, "grad_norm": 9.68507194519043, "learning_rate": 1.733671930529898e-05, "loss": 0.294, "step": 78500 }, { "epoch": 0.1333353689878217, "grad_norm": 3.866812229156494, "learning_rate": 1.7333326547818376e-05, "loss": 0.2746, "step": 78600 }, { "epoch": 0.13350500686185202, "grad_norm": 4.106080055236816, "learning_rate": 1.7329933790337768e-05, "loss": 0.2753, "step": 78700 }, { "epoch": 0.1336746447358823, "grad_norm": 5.8751020431518555, "learning_rate": 1.732654103285716e-05, "loss": 0.2861, "step": 78800 }, { "epoch": 0.13384428260991263, "grad_norm": 7.257327556610107, "learning_rate": 1.7323148275376555e-05, "loss": 0.246, "step": 78900 }, { "epoch": 0.13401392048394292, "grad_norm": 5.08812952041626, "learning_rate": 1.7319755517895947e-05, "loss": 0.2911, "step": 79000 }, { "epoch": 0.13418355835797324, "grad_norm": 7.233123302459717, "learning_rate": 1.7316362760415343e-05, "loss": 0.2651, "step": 79100 }, { "epoch": 0.13435319623200354, "grad_norm": 2.523315906524658, "learning_rate": 1.731297000293474e-05, "loss": 0.2733, "step": 79200 }, { "epoch": 0.13452283410603386, "grad_norm": 5.521599769592285, "learning_rate": 1.730957724545413e-05, "loss": 0.2625, "step": 79300 }, { "epoch": 0.13469247198006415, "grad_norm": 3.740551233291626, "learning_rate": 1.7306184487973526e-05, "loss": 0.2706, "step": 79400 }, { "epoch": 0.13486210985409447, "grad_norm": 7.3445258140563965, "learning_rate": 1.7302791730492918e-05, "loss": 0.2786, "step": 79500 }, { "epoch": 0.13503174772812476, "grad_norm": 6.98891544342041, "learning_rate": 1.729939897301231e-05, "loss": 0.2638, "step": 79600 }, { "epoch": 0.13520138560215508, "grad_norm": 4.596282005310059, "learning_rate": 1.7296006215531706e-05, "loss": 0.265, "step": 79700 }, { "epoch": 0.13537102347618538, "grad_norm": 8.018917083740234, "learning_rate": 1.72926134580511e-05, "loss": 0.2588, "step": 79800 }, { "epoch": 0.1355406613502157, "grad_norm": 4.779946804046631, "learning_rate": 1.7289220700570493e-05, "loss": 0.2582, "step": 79900 }, { "epoch": 0.135710299224246, "grad_norm": 4.044360160827637, "learning_rate": 1.728582794308989e-05, "loss": 0.2911, "step": 80000 }, { "epoch": 0.1358799370982763, "grad_norm": 3.90630841255188, "learning_rate": 1.728243518560928e-05, "loss": 0.2671, "step": 80100 }, { "epoch": 0.13604957497230663, "grad_norm": 5.7952399253845215, "learning_rate": 1.7279042428128676e-05, "loss": 0.2818, "step": 80200 }, { "epoch": 0.13621921284633692, "grad_norm": 4.849979877471924, "learning_rate": 1.727564967064807e-05, "loss": 0.2682, "step": 80300 }, { "epoch": 0.13638885072036724, "grad_norm": 4.687153339385986, "learning_rate": 1.727225691316746e-05, "loss": 0.2692, "step": 80400 }, { "epoch": 0.13655848859439754, "grad_norm": 6.676980972290039, "learning_rate": 1.7268864155686856e-05, "loss": 0.2681, "step": 80500 }, { "epoch": 0.13672812646842786, "grad_norm": 2.6825499534606934, "learning_rate": 1.726547139820625e-05, "loss": 0.2852, "step": 80600 }, { "epoch": 0.13689776434245815, "grad_norm": 2.707706928253174, "learning_rate": 1.7262078640725644e-05, "loss": 0.2936, "step": 80700 }, { "epoch": 0.13706740221648847, "grad_norm": 4.585887908935547, "learning_rate": 1.725868588324504e-05, "loss": 0.2802, "step": 80800 }, { "epoch": 0.13723704009051876, "grad_norm": 4.5451788902282715, "learning_rate": 1.725529312576443e-05, "loss": 0.2835, "step": 80900 }, { "epoch": 0.13740667796454908, "grad_norm": 6.709548473358154, "learning_rate": 1.7251900368283823e-05, "loss": 0.2672, "step": 81000 }, { "epoch": 0.13757631583857938, "grad_norm": 13.95259952545166, "learning_rate": 1.724850761080322e-05, "loss": 0.2701, "step": 81100 }, { "epoch": 0.1377459537126097, "grad_norm": 4.640286922454834, "learning_rate": 1.7245114853322614e-05, "loss": 0.272, "step": 81200 }, { "epoch": 0.13791559158664, "grad_norm": 10.755087852478027, "learning_rate": 1.724172209584201e-05, "loss": 0.2763, "step": 81300 }, { "epoch": 0.1380852294606703, "grad_norm": 4.162614345550537, "learning_rate": 1.7238329338361402e-05, "loss": 0.2908, "step": 81400 }, { "epoch": 0.1382548673347006, "grad_norm": 4.862986087799072, "learning_rate": 1.7234936580880794e-05, "loss": 0.2793, "step": 81500 }, { "epoch": 0.13842450520873092, "grad_norm": 6.359107971191406, "learning_rate": 1.723154382340019e-05, "loss": 0.2695, "step": 81600 }, { "epoch": 0.13859414308276122, "grad_norm": 5.341662406921387, "learning_rate": 1.722815106591958e-05, "loss": 0.2659, "step": 81700 }, { "epoch": 0.13876378095679154, "grad_norm": 9.095802307128906, "learning_rate": 1.7224758308438977e-05, "loss": 0.2858, "step": 81800 }, { "epoch": 0.13893341883082183, "grad_norm": 4.926386833190918, "learning_rate": 1.7221365550958373e-05, "loss": 0.2873, "step": 81900 }, { "epoch": 0.13910305670485215, "grad_norm": 4.403234958648682, "learning_rate": 1.7217972793477765e-05, "loss": 0.2748, "step": 82000 }, { "epoch": 0.13927269457888247, "grad_norm": 4.266883373260498, "learning_rate": 1.721458003599716e-05, "loss": 0.2767, "step": 82100 }, { "epoch": 0.13944233245291277, "grad_norm": 5.451406478881836, "learning_rate": 1.7211187278516552e-05, "loss": 0.2759, "step": 82200 }, { "epoch": 0.13961197032694309, "grad_norm": 3.695340871810913, "learning_rate": 1.7207794521035944e-05, "loss": 0.2754, "step": 82300 }, { "epoch": 0.13978160820097338, "grad_norm": 6.690242767333984, "learning_rate": 1.720440176355534e-05, "loss": 0.2594, "step": 82400 }, { "epoch": 0.1399512460750037, "grad_norm": 4.937933444976807, "learning_rate": 1.7201009006074732e-05, "loss": 0.278, "step": 82500 }, { "epoch": 0.140120883949034, "grad_norm": 3.9675967693328857, "learning_rate": 1.7197616248594128e-05, "loss": 0.253, "step": 82600 }, { "epoch": 0.1402905218230643, "grad_norm": 3.752835273742676, "learning_rate": 1.7194223491113523e-05, "loss": 0.2846, "step": 82700 }, { "epoch": 0.1404601596970946, "grad_norm": 6.412888050079346, "learning_rate": 1.7190830733632915e-05, "loss": 0.27, "step": 82800 }, { "epoch": 0.14062979757112493, "grad_norm": 6.23638391494751, "learning_rate": 1.7187437976152307e-05, "loss": 0.2881, "step": 82900 }, { "epoch": 0.14079943544515522, "grad_norm": 7.154056072235107, "learning_rate": 1.7184045218671703e-05, "loss": 0.2851, "step": 83000 }, { "epoch": 0.14096907331918554, "grad_norm": 3.4265105724334717, "learning_rate": 1.7180652461191095e-05, "loss": 0.278, "step": 83100 }, { "epoch": 0.14113871119321583, "grad_norm": 7.618716716766357, "learning_rate": 1.717725970371049e-05, "loss": 0.2572, "step": 83200 }, { "epoch": 0.14130834906724615, "grad_norm": 7.375784873962402, "learning_rate": 1.7173866946229886e-05, "loss": 0.2716, "step": 83300 }, { "epoch": 0.14147798694127645, "grad_norm": 2.397731065750122, "learning_rate": 1.7170474188749278e-05, "loss": 0.2774, "step": 83400 }, { "epoch": 0.14164762481530677, "grad_norm": 6.272594451904297, "learning_rate": 1.7167081431268673e-05, "loss": 0.2702, "step": 83500 }, { "epoch": 0.14181726268933706, "grad_norm": 5.030258655548096, "learning_rate": 1.7163688673788065e-05, "loss": 0.2779, "step": 83600 }, { "epoch": 0.14198690056336738, "grad_norm": 2.350693941116333, "learning_rate": 1.7160295916307458e-05, "loss": 0.2767, "step": 83700 }, { "epoch": 0.1421565384373977, "grad_norm": 5.603451728820801, "learning_rate": 1.7156903158826853e-05, "loss": 0.2713, "step": 83800 }, { "epoch": 0.142326176311428, "grad_norm": 6.381509304046631, "learning_rate": 1.715351040134625e-05, "loss": 0.2742, "step": 83900 }, { "epoch": 0.1424958141854583, "grad_norm": 4.501616477966309, "learning_rate": 1.7150117643865644e-05, "loss": 0.2696, "step": 84000 }, { "epoch": 0.1426654520594886, "grad_norm": 2.4367287158966064, "learning_rate": 1.7146724886385036e-05, "loss": 0.2475, "step": 84100 }, { "epoch": 0.14283508993351893, "grad_norm": 2.6129348278045654, "learning_rate": 1.7143332128904428e-05, "loss": 0.2702, "step": 84200 }, { "epoch": 0.14285883923588316, "eval_accuracy": 0.8308939885228379, "eval_f1": 0.88231516340423, "eval_loss": 0.42492440342903137, "eval_runtime": 386.6108, "eval_samples_per_second": 866.313, "eval_steps_per_second": 27.074, "step": 84214 }, { "epoch": 1.000145888571666, "grad_norm": 4.562740802764893, "learning_rate": 1.7139939371423824e-05, "loss": 0.2557, "step": 84300 }, { "epoch": 1.0003155264456964, "grad_norm": 5.575155258178711, "learning_rate": 1.7136546613943216e-05, "loss": 0.2372, "step": 84400 }, { "epoch": 1.0004851643197268, "grad_norm": 3.6639022827148438, "learning_rate": 1.713315385646261e-05, "loss": 0.2323, "step": 84500 }, { "epoch": 1.000654802193757, "grad_norm": 5.135401725769043, "learning_rate": 1.7129761098982007e-05, "loss": 0.2311, "step": 84600 }, { "epoch": 1.0008244400677873, "grad_norm": 4.939180374145508, "learning_rate": 1.71263683415014e-05, "loss": 0.2356, "step": 84700 }, { "epoch": 1.0009940779418176, "grad_norm": 5.0626702308654785, "learning_rate": 1.712297558402079e-05, "loss": 0.2417, "step": 84800 }, { "epoch": 1.001163715815848, "grad_norm": 2.469388246536255, "learning_rate": 1.7119582826540186e-05, "loss": 0.2244, "step": 84900 }, { "epoch": 1.0013333536898783, "grad_norm": 3.8802292346954346, "learning_rate": 1.711619006905958e-05, "loss": 0.2511, "step": 85000 }, { "epoch": 1.0015029915639084, "grad_norm": 4.77537202835083, "learning_rate": 1.7112797311578974e-05, "loss": 0.2383, "step": 85100 }, { "epoch": 1.0016726294379388, "grad_norm": 4.837684631347656, "learning_rate": 1.7109404554098366e-05, "loss": 0.2534, "step": 85200 }, { "epoch": 1.0018422673119691, "grad_norm": 4.591492652893066, "learning_rate": 1.710601179661776e-05, "loss": 0.2379, "step": 85300 }, { "epoch": 1.0020119051859995, "grad_norm": 5.931984901428223, "learning_rate": 1.7102619039137157e-05, "loss": 0.232, "step": 85400 }, { "epoch": 1.0021815430600298, "grad_norm": 4.321887016296387, "learning_rate": 1.709922628165655e-05, "loss": 0.2385, "step": 85500 }, { "epoch": 1.0023511809340602, "grad_norm": 7.285331726074219, "learning_rate": 1.709583352417594e-05, "loss": 0.2548, "step": 85600 }, { "epoch": 1.0025208188080903, "grad_norm": 2.686384439468384, "learning_rate": 1.7092440766695337e-05, "loss": 0.2237, "step": 85700 }, { "epoch": 1.0026904566821206, "grad_norm": 4.071341037750244, "learning_rate": 1.708904800921473e-05, "loss": 0.2375, "step": 85800 }, { "epoch": 1.002860094556151, "grad_norm": 3.7597484588623047, "learning_rate": 1.7085655251734124e-05, "loss": 0.2282, "step": 85900 }, { "epoch": 1.0030297324301813, "grad_norm": 8.57940673828125, "learning_rate": 1.708226249425352e-05, "loss": 0.232, "step": 86000 }, { "epoch": 1.0031993703042117, "grad_norm": 4.077773571014404, "learning_rate": 1.7078869736772912e-05, "loss": 0.2388, "step": 86100 }, { "epoch": 1.0033690081782418, "grad_norm": 6.936248302459717, "learning_rate": 1.7075476979292308e-05, "loss": 0.234, "step": 86200 }, { "epoch": 1.0035386460522722, "grad_norm": 4.2187724113464355, "learning_rate": 1.70720842218117e-05, "loss": 0.2408, "step": 86300 }, { "epoch": 1.0037082839263025, "grad_norm": 14.114175796508789, "learning_rate": 1.7068691464331092e-05, "loss": 0.2381, "step": 86400 }, { "epoch": 1.0038779218003329, "grad_norm": 2.516184091567993, "learning_rate": 1.7065298706850487e-05, "loss": 0.2371, "step": 86500 }, { "epoch": 1.0040475596743632, "grad_norm": 5.548521995544434, "learning_rate": 1.7061905949369883e-05, "loss": 0.2452, "step": 86600 }, { "epoch": 1.0042171975483933, "grad_norm": 5.038306713104248, "learning_rate": 1.7058513191889275e-05, "loss": 0.2414, "step": 86700 }, { "epoch": 1.0043868354224237, "grad_norm": 3.5572407245635986, "learning_rate": 1.705512043440867e-05, "loss": 0.2553, "step": 86800 }, { "epoch": 1.004556473296454, "grad_norm": 6.9616241455078125, "learning_rate": 1.7051727676928062e-05, "loss": 0.2332, "step": 86900 }, { "epoch": 1.0047261111704844, "grad_norm": 6.995264530181885, "learning_rate": 1.7048334919447458e-05, "loss": 0.2291, "step": 87000 }, { "epoch": 1.0048957490445147, "grad_norm": 16.793272018432617, "learning_rate": 1.704494216196685e-05, "loss": 0.2343, "step": 87100 }, { "epoch": 1.005065386918545, "grad_norm": 6.167327880859375, "learning_rate": 1.7041549404486245e-05, "loss": 0.2382, "step": 87200 }, { "epoch": 1.0052350247925752, "grad_norm": 7.1610188484191895, "learning_rate": 1.7038156647005638e-05, "loss": 0.2551, "step": 87300 }, { "epoch": 1.0054046626666056, "grad_norm": 3.357177257537842, "learning_rate": 1.7034763889525033e-05, "loss": 0.228, "step": 87400 }, { "epoch": 1.005574300540636, "grad_norm": 5.3651204109191895, "learning_rate": 1.7031371132044425e-05, "loss": 0.2368, "step": 87500 }, { "epoch": 1.0057439384146662, "grad_norm": 3.190673351287842, "learning_rate": 1.702797837456382e-05, "loss": 0.2517, "step": 87600 }, { "epoch": 1.0059135762886966, "grad_norm": 14.544881820678711, "learning_rate": 1.7024585617083213e-05, "loss": 0.2378, "step": 87700 }, { "epoch": 1.0060832141627267, "grad_norm": 4.180810928344727, "learning_rate": 1.7021192859602608e-05, "loss": 0.2514, "step": 87800 }, { "epoch": 1.006252852036757, "grad_norm": 8.01595687866211, "learning_rate": 1.7017800102122e-05, "loss": 0.2292, "step": 87900 }, { "epoch": 1.0064224899107874, "grad_norm": 5.707816123962402, "learning_rate": 1.7014407344641396e-05, "loss": 0.2189, "step": 88000 }, { "epoch": 1.0065921277848178, "grad_norm": 7.515061378479004, "learning_rate": 1.701101458716079e-05, "loss": 0.2178, "step": 88100 }, { "epoch": 1.0067617656588481, "grad_norm": 8.485763549804688, "learning_rate": 1.7007621829680183e-05, "loss": 0.2357, "step": 88200 }, { "epoch": 1.0069314035328785, "grad_norm": 4.0967698097229, "learning_rate": 1.7004229072199576e-05, "loss": 0.2322, "step": 88300 }, { "epoch": 1.0071010414069086, "grad_norm": 6.336276531219482, "learning_rate": 1.700083631471897e-05, "loss": 0.2487, "step": 88400 }, { "epoch": 1.007270679280939, "grad_norm": 8.16193675994873, "learning_rate": 1.6997443557238363e-05, "loss": 0.2492, "step": 88500 }, { "epoch": 1.0074403171549693, "grad_norm": 6.0609450340271, "learning_rate": 1.699405079975776e-05, "loss": 0.2478, "step": 88600 }, { "epoch": 1.0076099550289996, "grad_norm": 4.050893306732178, "learning_rate": 1.6990658042277154e-05, "loss": 0.2566, "step": 88700 }, { "epoch": 1.00777959290303, "grad_norm": 5.582334518432617, "learning_rate": 1.6987265284796546e-05, "loss": 0.2394, "step": 88800 }, { "epoch": 1.0079492307770601, "grad_norm": 5.63973331451416, "learning_rate": 1.6983872527315942e-05, "loss": 0.2346, "step": 88900 }, { "epoch": 1.0081188686510905, "grad_norm": 1.8452056646347046, "learning_rate": 1.6980479769835334e-05, "loss": 0.2521, "step": 89000 }, { "epoch": 1.0082885065251208, "grad_norm": 5.2966718673706055, "learning_rate": 1.6977087012354726e-05, "loss": 0.2155, "step": 89100 }, { "epoch": 1.0084581443991512, "grad_norm": 4.985481262207031, "learning_rate": 1.697369425487412e-05, "loss": 0.2372, "step": 89200 }, { "epoch": 1.0086277822731815, "grad_norm": 4.114748001098633, "learning_rate": 1.6970301497393517e-05, "loss": 0.249, "step": 89300 }, { "epoch": 1.0087974201472119, "grad_norm": 10.858028411865234, "learning_rate": 1.696690873991291e-05, "loss": 0.2339, "step": 89400 }, { "epoch": 1.008967058021242, "grad_norm": 6.717566013336182, "learning_rate": 1.6963515982432304e-05, "loss": 0.2328, "step": 89500 }, { "epoch": 1.0091366958952723, "grad_norm": 9.19790267944336, "learning_rate": 1.6960123224951697e-05, "loss": 0.2351, "step": 89600 }, { "epoch": 1.0093063337693027, "grad_norm": 4.8032917976379395, "learning_rate": 1.695673046747109e-05, "loss": 0.2313, "step": 89700 }, { "epoch": 1.009475971643333, "grad_norm": 5.8459978103637695, "learning_rate": 1.6953337709990484e-05, "loss": 0.2226, "step": 89800 }, { "epoch": 1.0096456095173634, "grad_norm": 11.240850448608398, "learning_rate": 1.694994495250988e-05, "loss": 0.2336, "step": 89900 }, { "epoch": 1.0098152473913935, "grad_norm": 5.3205156326293945, "learning_rate": 1.6946552195029272e-05, "loss": 0.2404, "step": 90000 }, { "epoch": 1.0099848852654238, "grad_norm": 3.0975213050842285, "learning_rate": 1.6943159437548667e-05, "loss": 0.2343, "step": 90100 }, { "epoch": 1.0101545231394542, "grad_norm": 6.121389865875244, "learning_rate": 1.693976668006806e-05, "loss": 0.2531, "step": 90200 }, { "epoch": 1.0103241610134845, "grad_norm": 6.276196479797363, "learning_rate": 1.6936373922587455e-05, "loss": 0.2305, "step": 90300 }, { "epoch": 1.010493798887515, "grad_norm": 5.9128265380859375, "learning_rate": 1.6932981165106847e-05, "loss": 0.2289, "step": 90400 }, { "epoch": 1.010663436761545, "grad_norm": 3.1484572887420654, "learning_rate": 1.692958840762624e-05, "loss": 0.2408, "step": 90500 }, { "epoch": 1.0108330746355754, "grad_norm": 6.842945098876953, "learning_rate": 1.6926195650145635e-05, "loss": 0.2342, "step": 90600 }, { "epoch": 1.0110027125096057, "grad_norm": 7.0585103034973145, "learning_rate": 1.692280289266503e-05, "loss": 0.2404, "step": 90700 }, { "epoch": 1.011172350383636, "grad_norm": 8.305352210998535, "learning_rate": 1.6919410135184426e-05, "loss": 0.2524, "step": 90800 }, { "epoch": 1.0113419882576664, "grad_norm": 2.6868128776550293, "learning_rate": 1.6916017377703818e-05, "loss": 0.2176, "step": 90900 }, { "epoch": 1.0115116261316968, "grad_norm": 8.186285018920898, "learning_rate": 1.691262462022321e-05, "loss": 0.256, "step": 91000 }, { "epoch": 1.0116812640057269, "grad_norm": 4.990221977233887, "learning_rate": 1.6909231862742605e-05, "loss": 0.2519, "step": 91100 }, { "epoch": 1.0118509018797572, "grad_norm": 5.034881114959717, "learning_rate": 1.6905839105261997e-05, "loss": 0.2487, "step": 91200 }, { "epoch": 1.0120205397537876, "grad_norm": 7.174300670623779, "learning_rate": 1.6902446347781393e-05, "loss": 0.239, "step": 91300 }, { "epoch": 1.012190177627818, "grad_norm": 6.00614595413208, "learning_rate": 1.6899053590300788e-05, "loss": 0.2297, "step": 91400 }, { "epoch": 1.0123598155018483, "grad_norm": 4.848287105560303, "learning_rate": 1.689566083282018e-05, "loss": 0.2395, "step": 91500 }, { "epoch": 1.0125294533758784, "grad_norm": 4.888648986816406, "learning_rate": 1.6892268075339572e-05, "loss": 0.234, "step": 91600 }, { "epoch": 1.0126990912499088, "grad_norm": 9.51119327545166, "learning_rate": 1.6888875317858968e-05, "loss": 0.2406, "step": 91700 }, { "epoch": 1.012868729123939, "grad_norm": 4.312281131744385, "learning_rate": 1.688548256037836e-05, "loss": 0.2288, "step": 91800 }, { "epoch": 1.0130383669979695, "grad_norm": 5.4920759201049805, "learning_rate": 1.6882089802897756e-05, "loss": 0.237, "step": 91900 }, { "epoch": 1.0132080048719998, "grad_norm": 12.064587593078613, "learning_rate": 1.687869704541715e-05, "loss": 0.2302, "step": 92000 }, { "epoch": 1.0133776427460301, "grad_norm": 5.533562183380127, "learning_rate": 1.6875304287936543e-05, "loss": 0.243, "step": 92100 }, { "epoch": 1.0135472806200603, "grad_norm": 15.711512565612793, "learning_rate": 1.687191153045594e-05, "loss": 0.2381, "step": 92200 }, { "epoch": 1.0137169184940906, "grad_norm": 3.557494640350342, "learning_rate": 1.686851877297533e-05, "loss": 0.2414, "step": 92300 }, { "epoch": 1.013886556368121, "grad_norm": 3.412943124771118, "learning_rate": 1.6865126015494723e-05, "loss": 0.2469, "step": 92400 }, { "epoch": 1.0140561942421513, "grad_norm": 6.815277099609375, "learning_rate": 1.686173325801412e-05, "loss": 0.2507, "step": 92500 }, { "epoch": 1.0142258321161817, "grad_norm": 2.3679540157318115, "learning_rate": 1.685834050053351e-05, "loss": 0.2362, "step": 92600 }, { "epoch": 1.0143954699902118, "grad_norm": 4.69119930267334, "learning_rate": 1.6854947743052906e-05, "loss": 0.2279, "step": 92700 }, { "epoch": 1.0145651078642421, "grad_norm": 5.341691493988037, "learning_rate": 1.68515549855723e-05, "loss": 0.2426, "step": 92800 }, { "epoch": 1.0147347457382725, "grad_norm": 7.450440406799316, "learning_rate": 1.6848162228091694e-05, "loss": 0.2525, "step": 92900 }, { "epoch": 1.0149043836123028, "grad_norm": 5.462919235229492, "learning_rate": 1.684476947061109e-05, "loss": 0.2271, "step": 93000 }, { "epoch": 1.0150740214863332, "grad_norm": 6.152981281280518, "learning_rate": 1.684137671313048e-05, "loss": 0.2274, "step": 93100 }, { "epoch": 1.0152436593603635, "grad_norm": 7.593695640563965, "learning_rate": 1.6837983955649873e-05, "loss": 0.2306, "step": 93200 }, { "epoch": 1.0154132972343937, "grad_norm": 5.243957996368408, "learning_rate": 1.683459119816927e-05, "loss": 0.2542, "step": 93300 }, { "epoch": 1.015582935108424, "grad_norm": 7.50337028503418, "learning_rate": 1.6831198440688664e-05, "loss": 0.238, "step": 93400 }, { "epoch": 1.0157525729824544, "grad_norm": 4.456302165985107, "learning_rate": 1.6827805683208056e-05, "loss": 0.2503, "step": 93500 }, { "epoch": 1.0159222108564847, "grad_norm": 6.896182537078857, "learning_rate": 1.6824412925727452e-05, "loss": 0.2503, "step": 93600 }, { "epoch": 1.016091848730515, "grad_norm": 5.7001776695251465, "learning_rate": 1.6821020168246844e-05, "loss": 0.2561, "step": 93700 }, { "epoch": 1.0162614866045452, "grad_norm": 5.09548282623291, "learning_rate": 1.681762741076624e-05, "loss": 0.2303, "step": 93800 }, { "epoch": 1.0164311244785755, "grad_norm": 4.899608612060547, "learning_rate": 1.681423465328563e-05, "loss": 0.25, "step": 93900 }, { "epoch": 1.0166007623526059, "grad_norm": 7.474002838134766, "learning_rate": 1.6810841895805027e-05, "loss": 0.244, "step": 94000 }, { "epoch": 1.0167704002266362, "grad_norm": 7.662692070007324, "learning_rate": 1.6807449138324422e-05, "loss": 0.2306, "step": 94100 }, { "epoch": 1.0169400381006666, "grad_norm": 4.851945877075195, "learning_rate": 1.6804056380843815e-05, "loss": 0.2311, "step": 94200 }, { "epoch": 1.0171096759746967, "grad_norm": 8.194671630859375, "learning_rate": 1.6800663623363207e-05, "loss": 0.2421, "step": 94300 }, { "epoch": 1.017279313848727, "grad_norm": 11.076744079589844, "learning_rate": 1.6797270865882602e-05, "loss": 0.2503, "step": 94400 }, { "epoch": 1.0174489517227574, "grad_norm": 4.780698299407959, "learning_rate": 1.6793878108401994e-05, "loss": 0.2377, "step": 94500 }, { "epoch": 1.0176185895967877, "grad_norm": 8.586421012878418, "learning_rate": 1.679048535092139e-05, "loss": 0.2587, "step": 94600 }, { "epoch": 1.017788227470818, "grad_norm": 3.729369878768921, "learning_rate": 1.6787092593440785e-05, "loss": 0.2431, "step": 94700 }, { "epoch": 1.0179578653448484, "grad_norm": 5.711459159851074, "learning_rate": 1.6783699835960177e-05, "loss": 0.2403, "step": 94800 }, { "epoch": 1.0181275032188786, "grad_norm": 4.214360237121582, "learning_rate": 1.6780307078479573e-05, "loss": 0.2277, "step": 94900 }, { "epoch": 1.018297141092909, "grad_norm": 22.601207733154297, "learning_rate": 1.6776914320998965e-05, "loss": 0.2476, "step": 95000 }, { "epoch": 1.0184667789669393, "grad_norm": 7.827741622924805, "learning_rate": 1.6773521563518357e-05, "loss": 0.2322, "step": 95100 }, { "epoch": 1.0186364168409696, "grad_norm": 2.3004703521728516, "learning_rate": 1.6770128806037753e-05, "loss": 0.2403, "step": 95200 }, { "epoch": 1.018806054715, "grad_norm": 7.989618301391602, "learning_rate": 1.6766736048557145e-05, "loss": 0.2273, "step": 95300 }, { "epoch": 1.01897569258903, "grad_norm": 3.7908883094787598, "learning_rate": 1.676334329107654e-05, "loss": 0.2452, "step": 95400 }, { "epoch": 1.0191453304630604, "grad_norm": 9.209786415100098, "learning_rate": 1.6759950533595936e-05, "loss": 0.2295, "step": 95500 }, { "epoch": 1.0193149683370908, "grad_norm": 7.700103759765625, "learning_rate": 1.6756557776115328e-05, "loss": 0.2307, "step": 95600 }, { "epoch": 1.0194846062111211, "grad_norm": 2.9111480712890625, "learning_rate": 1.6753165018634723e-05, "loss": 0.2255, "step": 95700 }, { "epoch": 1.0196542440851515, "grad_norm": 4.09812068939209, "learning_rate": 1.6749772261154115e-05, "loss": 0.2404, "step": 95800 }, { "epoch": 1.0198238819591818, "grad_norm": 5.172861576080322, "learning_rate": 1.6746379503673507e-05, "loss": 0.2517, "step": 95900 }, { "epoch": 1.019993519833212, "grad_norm": 4.188581943511963, "learning_rate": 1.6742986746192903e-05, "loss": 0.2259, "step": 96000 }, { "epoch": 1.0201631577072423, "grad_norm": 10.418130874633789, "learning_rate": 1.67395939887123e-05, "loss": 0.2304, "step": 96100 }, { "epoch": 1.0203327955812727, "grad_norm": 7.788801670074463, "learning_rate": 1.673620123123169e-05, "loss": 0.2243, "step": 96200 }, { "epoch": 1.020502433455303, "grad_norm": 5.755122184753418, "learning_rate": 1.6732808473751086e-05, "loss": 0.2373, "step": 96300 }, { "epoch": 1.0206720713293334, "grad_norm": 6.444185733795166, "learning_rate": 1.6729415716270478e-05, "loss": 0.2246, "step": 96400 }, { "epoch": 1.0208417092033635, "grad_norm": 7.518077373504639, "learning_rate": 1.6726022958789874e-05, "loss": 0.2344, "step": 96500 }, { "epoch": 1.0210113470773938, "grad_norm": 4.169599533081055, "learning_rate": 1.6722630201309266e-05, "loss": 0.236, "step": 96600 }, { "epoch": 1.0211809849514242, "grad_norm": 5.564794540405273, "learning_rate": 1.671923744382866e-05, "loss": 0.2414, "step": 96700 }, { "epoch": 1.0213506228254545, "grad_norm": 5.838275909423828, "learning_rate": 1.6715844686348057e-05, "loss": 0.2289, "step": 96800 }, { "epoch": 1.0215202606994849, "grad_norm": 9.894466400146484, "learning_rate": 1.671245192886745e-05, "loss": 0.246, "step": 96900 }, { "epoch": 1.0216898985735152, "grad_norm": 2.841754674911499, "learning_rate": 1.670905917138684e-05, "loss": 0.2281, "step": 97000 }, { "epoch": 1.0218595364475453, "grad_norm": 5.338644504547119, "learning_rate": 1.6705666413906236e-05, "loss": 0.2446, "step": 97100 }, { "epoch": 1.0220291743215757, "grad_norm": 4.759637832641602, "learning_rate": 1.670227365642563e-05, "loss": 0.2327, "step": 97200 }, { "epoch": 1.022198812195606, "grad_norm": 4.555543899536133, "learning_rate": 1.6698880898945024e-05, "loss": 0.2363, "step": 97300 }, { "epoch": 1.0223684500696364, "grad_norm": 3.3954598903656006, "learning_rate": 1.6695488141464416e-05, "loss": 0.254, "step": 97400 }, { "epoch": 1.0225380879436667, "grad_norm": 4.4154253005981445, "learning_rate": 1.669209538398381e-05, "loss": 0.2469, "step": 97500 }, { "epoch": 1.0227077258176969, "grad_norm": 5.648190975189209, "learning_rate": 1.6688702626503207e-05, "loss": 0.2417, "step": 97600 }, { "epoch": 1.0228773636917272, "grad_norm": 7.520351409912109, "learning_rate": 1.66853098690226e-05, "loss": 0.2547, "step": 97700 }, { "epoch": 1.0230470015657576, "grad_norm": 6.792571067810059, "learning_rate": 1.668191711154199e-05, "loss": 0.2529, "step": 97800 }, { "epoch": 1.023216639439788, "grad_norm": 7.716655254364014, "learning_rate": 1.6678524354061387e-05, "loss": 0.223, "step": 97900 }, { "epoch": 1.0233862773138183, "grad_norm": 6.751737594604492, "learning_rate": 1.667513159658078e-05, "loss": 0.2601, "step": 98000 }, { "epoch": 1.0235559151878486, "grad_norm": 7.170324802398682, "learning_rate": 1.6671738839100174e-05, "loss": 0.2349, "step": 98100 }, { "epoch": 1.0237255530618787, "grad_norm": 4.83284854888916, "learning_rate": 1.666834608161957e-05, "loss": 0.231, "step": 98200 }, { "epoch": 1.023895190935909, "grad_norm": 2.1090004444122314, "learning_rate": 1.6664953324138962e-05, "loss": 0.2333, "step": 98300 }, { "epoch": 1.0240648288099394, "grad_norm": 4.874654293060303, "learning_rate": 1.6661560566658357e-05, "loss": 0.257, "step": 98400 }, { "epoch": 1.0242344666839698, "grad_norm": 3.7142999172210693, "learning_rate": 1.665816780917775e-05, "loss": 0.2422, "step": 98500 }, { "epoch": 1.0244041045580001, "grad_norm": 4.001662731170654, "learning_rate": 1.665477505169714e-05, "loss": 0.2297, "step": 98600 }, { "epoch": 1.0245737424320303, "grad_norm": 5.9061665534973145, "learning_rate": 1.6651382294216537e-05, "loss": 0.2438, "step": 98700 }, { "epoch": 1.0247433803060606, "grad_norm": 6.545547008514404, "learning_rate": 1.6647989536735933e-05, "loss": 0.2315, "step": 98800 }, { "epoch": 1.024913018180091, "grad_norm": 4.94508695602417, "learning_rate": 1.6644596779255325e-05, "loss": 0.248, "step": 98900 }, { "epoch": 1.0250826560541213, "grad_norm": 7.421692371368408, "learning_rate": 1.664120402177472e-05, "loss": 0.2334, "step": 99000 }, { "epoch": 1.0252522939281516, "grad_norm": 3.3852603435516357, "learning_rate": 1.6637811264294112e-05, "loss": 0.2499, "step": 99100 }, { "epoch": 1.0254219318021818, "grad_norm": 9.762946128845215, "learning_rate": 1.6634418506813504e-05, "loss": 0.2437, "step": 99200 }, { "epoch": 1.0255915696762121, "grad_norm": 8.178512573242188, "learning_rate": 1.66310257493329e-05, "loss": 0.251, "step": 99300 }, { "epoch": 1.0257612075502425, "grad_norm": 4.333343505859375, "learning_rate": 1.6627632991852295e-05, "loss": 0.2427, "step": 99400 }, { "epoch": 1.0259308454242728, "grad_norm": 5.504798412322998, "learning_rate": 1.6624240234371687e-05, "loss": 0.2449, "step": 99500 }, { "epoch": 1.0261004832983032, "grad_norm": 4.2991228103637695, "learning_rate": 1.6620847476891083e-05, "loss": 0.2478, "step": 99600 }, { "epoch": 1.0262701211723335, "grad_norm": 9.136775970458984, "learning_rate": 1.6617454719410475e-05, "loss": 0.24, "step": 99700 }, { "epoch": 1.0264397590463636, "grad_norm": 5.274001598358154, "learning_rate": 1.661406196192987e-05, "loss": 0.2425, "step": 99800 }, { "epoch": 1.026609396920394, "grad_norm": 9.508216857910156, "learning_rate": 1.6610669204449263e-05, "loss": 0.2341, "step": 99900 }, { "epoch": 1.0267790347944243, "grad_norm": 6.6127095222473145, "learning_rate": 1.6607276446968658e-05, "loss": 0.2351, "step": 100000 }, { "epoch": 1.0269486726684547, "grad_norm": 6.332643508911133, "learning_rate": 1.660388368948805e-05, "loss": 0.2434, "step": 100100 }, { "epoch": 1.027118310542485, "grad_norm": 5.462932109832764, "learning_rate": 1.6600490932007446e-05, "loss": 0.2348, "step": 100200 }, { "epoch": 1.0272879484165152, "grad_norm": 7.142257213592529, "learning_rate": 1.659709817452684e-05, "loss": 0.2402, "step": 100300 }, { "epoch": 1.0274575862905455, "grad_norm": 7.38620138168335, "learning_rate": 1.6593705417046233e-05, "loss": 0.2369, "step": 100400 }, { "epoch": 1.0276272241645759, "grad_norm": 7.193697929382324, "learning_rate": 1.6590312659565625e-05, "loss": 0.2414, "step": 100500 }, { "epoch": 1.0277968620386062, "grad_norm": 3.521663188934326, "learning_rate": 1.658691990208502e-05, "loss": 0.2349, "step": 100600 }, { "epoch": 1.0279664999126366, "grad_norm": 9.68940544128418, "learning_rate": 1.6583527144604413e-05, "loss": 0.2419, "step": 100700 }, { "epoch": 1.028136137786667, "grad_norm": 6.040586471557617, "learning_rate": 1.658013438712381e-05, "loss": 0.2368, "step": 100800 }, { "epoch": 1.028305775660697, "grad_norm": 7.243034839630127, "learning_rate": 1.6576741629643204e-05, "loss": 0.2205, "step": 100900 }, { "epoch": 1.0284754135347274, "grad_norm": 4.8003740310668945, "learning_rate": 1.6573348872162596e-05, "loss": 0.2485, "step": 101000 }, { "epoch": 1.0286450514087577, "grad_norm": 6.9113078117370605, "learning_rate": 1.6569956114681988e-05, "loss": 0.2314, "step": 101100 }, { "epoch": 1.028814689282788, "grad_norm": 6.181374549865723, "learning_rate": 1.6566563357201384e-05, "loss": 0.2356, "step": 101200 }, { "epoch": 1.0289843271568184, "grad_norm": 4.686517238616943, "learning_rate": 1.6563170599720776e-05, "loss": 0.2313, "step": 101300 }, { "epoch": 1.0291539650308485, "grad_norm": 4.899759769439697, "learning_rate": 1.655977784224017e-05, "loss": 0.2662, "step": 101400 }, { "epoch": 1.029323602904879, "grad_norm": 8.956764221191406, "learning_rate": 1.6556385084759567e-05, "loss": 0.23, "step": 101500 }, { "epoch": 1.0294932407789092, "grad_norm": 5.137114524841309, "learning_rate": 1.655299232727896e-05, "loss": 0.2433, "step": 101600 }, { "epoch": 1.0296628786529396, "grad_norm": 7.45181941986084, "learning_rate": 1.6549599569798354e-05, "loss": 0.2262, "step": 101700 }, { "epoch": 1.02983251652697, "grad_norm": 5.2147626876831055, "learning_rate": 1.6546206812317746e-05, "loss": 0.2367, "step": 101800 }, { "epoch": 1.0300021544010003, "grad_norm": 6.573465347290039, "learning_rate": 1.654281405483714e-05, "loss": 0.2347, "step": 101900 }, { "epoch": 1.0301717922750304, "grad_norm": 3.152808427810669, "learning_rate": 1.6539421297356534e-05, "loss": 0.2401, "step": 102000 }, { "epoch": 1.0303414301490608, "grad_norm": 5.311460018157959, "learning_rate": 1.653602853987593e-05, "loss": 0.2229, "step": 102100 }, { "epoch": 1.0305110680230911, "grad_norm": 4.631628513336182, "learning_rate": 1.653263578239532e-05, "loss": 0.2513, "step": 102200 }, { "epoch": 1.0306807058971215, "grad_norm": 4.328015327453613, "learning_rate": 1.6529243024914717e-05, "loss": 0.2556, "step": 102300 }, { "epoch": 1.0308503437711518, "grad_norm": 6.033789157867432, "learning_rate": 1.652585026743411e-05, "loss": 0.236, "step": 102400 }, { "epoch": 1.031019981645182, "grad_norm": 4.288794994354248, "learning_rate": 1.6522457509953505e-05, "loss": 0.2279, "step": 102500 }, { "epoch": 1.0311896195192123, "grad_norm": 9.08422565460205, "learning_rate": 1.6519064752472897e-05, "loss": 0.2569, "step": 102600 }, { "epoch": 1.0313592573932426, "grad_norm": 10.209990501403809, "learning_rate": 1.651567199499229e-05, "loss": 0.2184, "step": 102700 }, { "epoch": 1.031528895267273, "grad_norm": 6.483861923217773, "learning_rate": 1.6512279237511684e-05, "loss": 0.2253, "step": 102800 }, { "epoch": 1.0316985331413033, "grad_norm": 6.499017238616943, "learning_rate": 1.650888648003108e-05, "loss": 0.2501, "step": 102900 }, { "epoch": 1.0318681710153337, "grad_norm": 5.954804420471191, "learning_rate": 1.6505493722550472e-05, "loss": 0.2482, "step": 103000 }, { "epoch": 1.0320378088893638, "grad_norm": 5.099503993988037, "learning_rate": 1.6502100965069867e-05, "loss": 0.2407, "step": 103100 }, { "epoch": 1.0322074467633942, "grad_norm": 5.3625006675720215, "learning_rate": 1.649870820758926e-05, "loss": 0.246, "step": 103200 }, { "epoch": 1.0323770846374245, "grad_norm": 9.005240440368652, "learning_rate": 1.6495315450108655e-05, "loss": 0.2265, "step": 103300 }, { "epoch": 1.0325467225114549, "grad_norm": 3.712984800338745, "learning_rate": 1.6491922692628047e-05, "loss": 0.239, "step": 103400 }, { "epoch": 1.0327163603854852, "grad_norm": 6.8334059715271, "learning_rate": 1.6488529935147443e-05, "loss": 0.2465, "step": 103500 }, { "epoch": 1.0328859982595153, "grad_norm": 7.515334129333496, "learning_rate": 1.6485137177666838e-05, "loss": 0.2357, "step": 103600 }, { "epoch": 1.0330556361335457, "grad_norm": 4.470841407775879, "learning_rate": 1.648174442018623e-05, "loss": 0.2292, "step": 103700 }, { "epoch": 1.033225274007576, "grad_norm": 5.828950881958008, "learning_rate": 1.6478351662705622e-05, "loss": 0.2335, "step": 103800 }, { "epoch": 1.0333949118816064, "grad_norm": 5.611102104187012, "learning_rate": 1.6474958905225018e-05, "loss": 0.234, "step": 103900 }, { "epoch": 1.0335645497556367, "grad_norm": 4.702258110046387, "learning_rate": 1.647156614774441e-05, "loss": 0.2423, "step": 104000 }, { "epoch": 1.0337341876296668, "grad_norm": 5.1954121589660645, "learning_rate": 1.6468173390263805e-05, "loss": 0.2243, "step": 104100 }, { "epoch": 1.0339038255036972, "grad_norm": 6.094088077545166, "learning_rate": 1.64647806327832e-05, "loss": 0.2418, "step": 104200 }, { "epoch": 1.0340734633777275, "grad_norm": 8.037471771240234, "learning_rate": 1.6461387875302593e-05, "loss": 0.2499, "step": 104300 }, { "epoch": 1.034243101251758, "grad_norm": 3.6217806339263916, "learning_rate": 1.645799511782199e-05, "loss": 0.2347, "step": 104400 }, { "epoch": 1.0344127391257882, "grad_norm": 5.280145168304443, "learning_rate": 1.645460236034138e-05, "loss": 0.2507, "step": 104500 }, { "epoch": 1.0345823769998186, "grad_norm": 3.1634342670440674, "learning_rate": 1.6451209602860773e-05, "loss": 0.2459, "step": 104600 }, { "epoch": 1.0347520148738487, "grad_norm": 8.158108711242676, "learning_rate": 1.6447816845380168e-05, "loss": 0.2576, "step": 104700 }, { "epoch": 1.034921652747879, "grad_norm": 1.9419981241226196, "learning_rate": 1.6444424087899564e-05, "loss": 0.2268, "step": 104800 }, { "epoch": 1.0350912906219094, "grad_norm": 4.109727382659912, "learning_rate": 1.6441031330418956e-05, "loss": 0.2501, "step": 104900 }, { "epoch": 1.0352609284959398, "grad_norm": 3.487384557723999, "learning_rate": 1.643763857293835e-05, "loss": 0.257, "step": 105000 }, { "epoch": 1.03543056636997, "grad_norm": 3.6563103199005127, "learning_rate": 1.6434245815457743e-05, "loss": 0.2612, "step": 105100 }, { "epoch": 1.0356002042440002, "grad_norm": 6.818143367767334, "learning_rate": 1.643085305797714e-05, "loss": 0.24, "step": 105200 }, { "epoch": 1.0357698421180306, "grad_norm": 4.253990650177002, "learning_rate": 1.642746030049653e-05, "loss": 0.2376, "step": 105300 }, { "epoch": 1.035939479992061, "grad_norm": 5.8974995613098145, "learning_rate": 1.6424067543015923e-05, "loss": 0.2486, "step": 105400 }, { "epoch": 1.0361091178660913, "grad_norm": 5.250041961669922, "learning_rate": 1.642067478553532e-05, "loss": 0.2325, "step": 105500 }, { "epoch": 1.0362787557401216, "grad_norm": 5.304795742034912, "learning_rate": 1.6417282028054714e-05, "loss": 0.2323, "step": 105600 }, { "epoch": 1.036448393614152, "grad_norm": 5.146512985229492, "learning_rate": 1.6413889270574106e-05, "loss": 0.256, "step": 105700 }, { "epoch": 1.036618031488182, "grad_norm": 3.7205147743225098, "learning_rate": 1.64104965130935e-05, "loss": 0.2248, "step": 105800 }, { "epoch": 1.0367876693622124, "grad_norm": 14.026883125305176, "learning_rate": 1.6407103755612894e-05, "loss": 0.2381, "step": 105900 }, { "epoch": 1.0369573072362428, "grad_norm": 5.053284645080566, "learning_rate": 1.6403710998132286e-05, "loss": 0.2354, "step": 106000 }, { "epoch": 1.0371269451102731, "grad_norm": 5.882918357849121, "learning_rate": 1.640031824065168e-05, "loss": 0.2523, "step": 106100 }, { "epoch": 1.0372965829843035, "grad_norm": 10.665180206298828, "learning_rate": 1.6396925483171077e-05, "loss": 0.2496, "step": 106200 }, { "epoch": 1.0374662208583336, "grad_norm": 2.5990731716156006, "learning_rate": 1.6393532725690472e-05, "loss": 0.2293, "step": 106300 }, { "epoch": 1.037635858732364, "grad_norm": 19.67258071899414, "learning_rate": 1.6390139968209864e-05, "loss": 0.2141, "step": 106400 }, { "epoch": 1.0378054966063943, "grad_norm": 5.72860050201416, "learning_rate": 1.6386747210729257e-05, "loss": 0.239, "step": 106500 }, { "epoch": 1.0379751344804247, "grad_norm": 4.353442668914795, "learning_rate": 1.6383354453248652e-05, "loss": 0.2487, "step": 106600 }, { "epoch": 1.038144772354455, "grad_norm": 6.5341339111328125, "learning_rate": 1.6379961695768044e-05, "loss": 0.2371, "step": 106700 }, { "epoch": 1.0383144102284851, "grad_norm": 4.595044136047363, "learning_rate": 1.637656893828744e-05, "loss": 0.2396, "step": 106800 }, { "epoch": 1.0384840481025155, "grad_norm": 4.236234188079834, "learning_rate": 1.6373176180806835e-05, "loss": 0.2239, "step": 106900 }, { "epoch": 1.0386536859765458, "grad_norm": 4.848886013031006, "learning_rate": 1.6369783423326227e-05, "loss": 0.2457, "step": 107000 }, { "epoch": 1.0388233238505762, "grad_norm": 5.014443397521973, "learning_rate": 1.6366390665845623e-05, "loss": 0.2525, "step": 107100 }, { "epoch": 1.0389929617246065, "grad_norm": 4.325504779815674, "learning_rate": 1.6362997908365015e-05, "loss": 0.2465, "step": 107200 }, { "epoch": 1.0391625995986369, "grad_norm": 6.722569942474365, "learning_rate": 1.6359605150884407e-05, "loss": 0.2386, "step": 107300 }, { "epoch": 1.039332237472667, "grad_norm": 3.6529064178466797, "learning_rate": 1.6356212393403802e-05, "loss": 0.241, "step": 107400 }, { "epoch": 1.0395018753466974, "grad_norm": 11.794537544250488, "learning_rate": 1.6352819635923194e-05, "loss": 0.2308, "step": 107500 }, { "epoch": 1.0396715132207277, "grad_norm": 3.21207332611084, "learning_rate": 1.634942687844259e-05, "loss": 0.2629, "step": 107600 }, { "epoch": 1.039841151094758, "grad_norm": 6.930755138397217, "learning_rate": 1.6346034120961985e-05, "loss": 0.2453, "step": 107700 }, { "epoch": 1.0400107889687884, "grad_norm": 7.506489276885986, "learning_rate": 1.6342641363481378e-05, "loss": 0.2359, "step": 107800 }, { "epoch": 1.0401804268428185, "grad_norm": 4.977521896362305, "learning_rate": 1.633924860600077e-05, "loss": 0.2263, "step": 107900 }, { "epoch": 1.0403500647168489, "grad_norm": 6.505887985229492, "learning_rate": 1.6335855848520165e-05, "loss": 0.2454, "step": 108000 }, { "epoch": 1.0405197025908792, "grad_norm": 3.8838231563568115, "learning_rate": 1.6332463091039557e-05, "loss": 0.2351, "step": 108100 }, { "epoch": 1.0406893404649096, "grad_norm": 5.722506046295166, "learning_rate": 1.6329070333558953e-05, "loss": 0.218, "step": 108200 }, { "epoch": 1.04085897833894, "grad_norm": 5.868005752563477, "learning_rate": 1.6325677576078348e-05, "loss": 0.2325, "step": 108300 }, { "epoch": 1.0410286162129703, "grad_norm": 7.107659816741943, "learning_rate": 1.632228481859774e-05, "loss": 0.2358, "step": 108400 }, { "epoch": 1.0411982540870004, "grad_norm": 6.152891635894775, "learning_rate": 1.6318892061117136e-05, "loss": 0.2318, "step": 108500 }, { "epoch": 1.0413678919610307, "grad_norm": 4.409629821777344, "learning_rate": 1.6315499303636528e-05, "loss": 0.2537, "step": 108600 }, { "epoch": 1.041537529835061, "grad_norm": 5.98974609375, "learning_rate": 1.631210654615592e-05, "loss": 0.2395, "step": 108700 }, { "epoch": 1.0417071677090914, "grad_norm": 3.762979030609131, "learning_rate": 1.6308713788675316e-05, "loss": 0.2204, "step": 108800 }, { "epoch": 1.0418768055831218, "grad_norm": 3.3951921463012695, "learning_rate": 1.630532103119471e-05, "loss": 0.2518, "step": 108900 }, { "epoch": 1.042046443457152, "grad_norm": 6.012150764465332, "learning_rate": 1.6301928273714106e-05, "loss": 0.2435, "step": 109000 }, { "epoch": 1.0422160813311823, "grad_norm": 7.6780781745910645, "learning_rate": 1.62985355162335e-05, "loss": 0.2421, "step": 109100 }, { "epoch": 1.0423857192052126, "grad_norm": 7.409370422363281, "learning_rate": 1.629514275875289e-05, "loss": 0.2232, "step": 109200 }, { "epoch": 1.042555357079243, "grad_norm": 4.268203258514404, "learning_rate": 1.6291750001272286e-05, "loss": 0.2476, "step": 109300 }, { "epoch": 1.0427249949532733, "grad_norm": 4.8573784828186035, "learning_rate": 1.6288357243791678e-05, "loss": 0.2407, "step": 109400 }, { "epoch": 1.0428946328273037, "grad_norm": 4.416388988494873, "learning_rate": 1.6284964486311074e-05, "loss": 0.2448, "step": 109500 }, { "epoch": 1.0430642707013338, "grad_norm": 11.745798110961914, "learning_rate": 1.6281571728830466e-05, "loss": 0.2464, "step": 109600 }, { "epoch": 1.0432339085753641, "grad_norm": 4.997187614440918, "learning_rate": 1.627817897134986e-05, "loss": 0.2298, "step": 109700 }, { "epoch": 1.0434035464493945, "grad_norm": 4.42112398147583, "learning_rate": 1.6274786213869253e-05, "loss": 0.2512, "step": 109800 }, { "epoch": 1.0435731843234248, "grad_norm": 7.9793009757995605, "learning_rate": 1.627139345638865e-05, "loss": 0.2343, "step": 109900 }, { "epoch": 1.0437428221974552, "grad_norm": 9.319527626037598, "learning_rate": 1.626800069890804e-05, "loss": 0.2575, "step": 110000 }, { "epoch": 1.0439124600714853, "grad_norm": 6.425539493560791, "learning_rate": 1.6264607941427437e-05, "loss": 0.2414, "step": 110100 }, { "epoch": 1.0440820979455157, "grad_norm": 7.645553112030029, "learning_rate": 1.626121518394683e-05, "loss": 0.2419, "step": 110200 }, { "epoch": 1.044251735819546, "grad_norm": 7.059961795806885, "learning_rate": 1.6257822426466224e-05, "loss": 0.2493, "step": 110300 }, { "epoch": 1.0444213736935763, "grad_norm": 10.969443321228027, "learning_rate": 1.625442966898562e-05, "loss": 0.2535, "step": 110400 }, { "epoch": 1.0445910115676067, "grad_norm": 3.7967209815979004, "learning_rate": 1.6251036911505012e-05, "loss": 0.2375, "step": 110500 }, { "epoch": 1.044760649441637, "grad_norm": 6.147740840911865, "learning_rate": 1.6247644154024404e-05, "loss": 0.2326, "step": 110600 }, { "epoch": 1.0449302873156672, "grad_norm": 3.5094704627990723, "learning_rate": 1.62442513965438e-05, "loss": 0.2376, "step": 110700 }, { "epoch": 1.0450999251896975, "grad_norm": 8.889718055725098, "learning_rate": 1.624085863906319e-05, "loss": 0.224, "step": 110800 }, { "epoch": 1.0452695630637279, "grad_norm": 12.642106056213379, "learning_rate": 1.6237465881582587e-05, "loss": 0.2404, "step": 110900 }, { "epoch": 1.0454392009377582, "grad_norm": 3.5721912384033203, "learning_rate": 1.6234073124101982e-05, "loss": 0.2433, "step": 111000 }, { "epoch": 1.0456088388117886, "grad_norm": 5.891770362854004, "learning_rate": 1.6230680366621374e-05, "loss": 0.2339, "step": 111100 }, { "epoch": 1.0457784766858187, "grad_norm": 4.572074890136719, "learning_rate": 1.622728760914077e-05, "loss": 0.2612, "step": 111200 }, { "epoch": 1.045948114559849, "grad_norm": 10.256953239440918, "learning_rate": 1.6223894851660162e-05, "loss": 0.2504, "step": 111300 }, { "epoch": 1.0461177524338794, "grad_norm": 7.475249290466309, "learning_rate": 1.6220502094179554e-05, "loss": 0.2461, "step": 111400 }, { "epoch": 1.0462873903079097, "grad_norm": 5.985665798187256, "learning_rate": 1.621710933669895e-05, "loss": 0.2516, "step": 111500 }, { "epoch": 1.04645702818194, "grad_norm": 4.418927192687988, "learning_rate": 1.6213716579218345e-05, "loss": 0.2273, "step": 111600 }, { "epoch": 1.0466266660559702, "grad_norm": 10.523200035095215, "learning_rate": 1.6210323821737737e-05, "loss": 0.2404, "step": 111700 }, { "epoch": 1.0467963039300006, "grad_norm": 4.309625148773193, "learning_rate": 1.6206931064257133e-05, "loss": 0.2423, "step": 111800 }, { "epoch": 1.046965941804031, "grad_norm": 4.631493091583252, "learning_rate": 1.6203538306776525e-05, "loss": 0.2347, "step": 111900 }, { "epoch": 1.0471355796780613, "grad_norm": 2.1207542419433594, "learning_rate": 1.620014554929592e-05, "loss": 0.2056, "step": 112000 }, { "epoch": 1.0473052175520916, "grad_norm": 5.958195209503174, "learning_rate": 1.6196752791815312e-05, "loss": 0.2475, "step": 112100 }, { "epoch": 1.047474855426122, "grad_norm": 7.326410293579102, "learning_rate": 1.6193360034334708e-05, "loss": 0.2154, "step": 112200 }, { "epoch": 1.047644493300152, "grad_norm": 2.9897406101226807, "learning_rate": 1.61899672768541e-05, "loss": 0.2428, "step": 112300 }, { "epoch": 1.0478141311741824, "grad_norm": 7.1019511222839355, "learning_rate": 1.6186574519373496e-05, "loss": 0.2608, "step": 112400 }, { "epoch": 1.0479837690482128, "grad_norm": 5.309335231781006, "learning_rate": 1.6183181761892888e-05, "loss": 0.2282, "step": 112500 }, { "epoch": 1.0481534069222431, "grad_norm": 12.645231246948242, "learning_rate": 1.6179789004412283e-05, "loss": 0.2443, "step": 112600 }, { "epoch": 1.0483230447962735, "grad_norm": 6.158746719360352, "learning_rate": 1.6176396246931675e-05, "loss": 0.2399, "step": 112700 }, { "epoch": 1.0484926826703036, "grad_norm": 8.90205192565918, "learning_rate": 1.6173003489451067e-05, "loss": 0.2386, "step": 112800 }, { "epoch": 1.048662320544334, "grad_norm": 7.905735492706299, "learning_rate": 1.6169610731970463e-05, "loss": 0.2405, "step": 112900 }, { "epoch": 1.0488319584183643, "grad_norm": 2.3853213787078857, "learning_rate": 1.616621797448986e-05, "loss": 0.2256, "step": 113000 }, { "epoch": 1.0490015962923946, "grad_norm": 8.268918991088867, "learning_rate": 1.6162825217009254e-05, "loss": 0.2499, "step": 113100 }, { "epoch": 1.049171234166425, "grad_norm": 7.998558521270752, "learning_rate": 1.6159432459528646e-05, "loss": 0.2416, "step": 113200 }, { "epoch": 1.0493408720404553, "grad_norm": 4.571617126464844, "learning_rate": 1.6156039702048038e-05, "loss": 0.2177, "step": 113300 }, { "epoch": 1.0495105099144855, "grad_norm": 5.831237316131592, "learning_rate": 1.6152646944567433e-05, "loss": 0.2578, "step": 113400 }, { "epoch": 1.0496801477885158, "grad_norm": 10.54854965209961, "learning_rate": 1.6149254187086826e-05, "loss": 0.2337, "step": 113500 }, { "epoch": 1.0498497856625462, "grad_norm": 7.420141220092773, "learning_rate": 1.614586142960622e-05, "loss": 0.2443, "step": 113600 }, { "epoch": 1.0500194235365765, "grad_norm": 6.875145435333252, "learning_rate": 1.6142468672125617e-05, "loss": 0.2486, "step": 113700 }, { "epoch": 1.0501890614106069, "grad_norm": 5.095635414123535, "learning_rate": 1.613907591464501e-05, "loss": 0.2186, "step": 113800 }, { "epoch": 1.050358699284637, "grad_norm": 4.867079734802246, "learning_rate": 1.6135683157164404e-05, "loss": 0.2354, "step": 113900 }, { "epoch": 1.0505283371586673, "grad_norm": 3.795738697052002, "learning_rate": 1.6132290399683796e-05, "loss": 0.2287, "step": 114000 }, { "epoch": 1.0506979750326977, "grad_norm": 5.828240871429443, "learning_rate": 1.612889764220319e-05, "loss": 0.2423, "step": 114100 }, { "epoch": 1.050867612906728, "grad_norm": 4.970689296722412, "learning_rate": 1.6125504884722584e-05, "loss": 0.2518, "step": 114200 }, { "epoch": 1.0510372507807584, "grad_norm": 5.329409599304199, "learning_rate": 1.612211212724198e-05, "loss": 0.2211, "step": 114300 }, { "epoch": 1.0512068886547887, "grad_norm": 4.091852188110352, "learning_rate": 1.611871936976137e-05, "loss": 0.2513, "step": 114400 }, { "epoch": 1.0513765265288189, "grad_norm": 9.502054214477539, "learning_rate": 1.6115326612280767e-05, "loss": 0.2411, "step": 114500 }, { "epoch": 1.0515461644028492, "grad_norm": 5.292558670043945, "learning_rate": 1.611193385480016e-05, "loss": 0.2651, "step": 114600 }, { "epoch": 1.0517158022768796, "grad_norm": 3.7065558433532715, "learning_rate": 1.610854109731955e-05, "loss": 0.2431, "step": 114700 }, { "epoch": 1.05188544015091, "grad_norm": 2.9846434593200684, "learning_rate": 1.6105148339838947e-05, "loss": 0.2356, "step": 114800 }, { "epoch": 1.0520550780249402, "grad_norm": 4.114966869354248, "learning_rate": 1.6101755582358342e-05, "loss": 0.2415, "step": 114900 }, { "epoch": 1.0522247158989704, "grad_norm": 4.87261962890625, "learning_rate": 1.6098362824877734e-05, "loss": 0.2515, "step": 115000 }, { "epoch": 1.0523943537730007, "grad_norm": 8.873214721679688, "learning_rate": 1.609497006739713e-05, "loss": 0.2401, "step": 115100 }, { "epoch": 1.052563991647031, "grad_norm": 6.640275955200195, "learning_rate": 1.6091577309916522e-05, "loss": 0.2404, "step": 115200 }, { "epoch": 1.0527336295210614, "grad_norm": 4.99779748916626, "learning_rate": 1.6088184552435917e-05, "loss": 0.2365, "step": 115300 }, { "epoch": 1.0529032673950918, "grad_norm": 3.688915967941284, "learning_rate": 1.608479179495531e-05, "loss": 0.2421, "step": 115400 }, { "epoch": 1.0530729052691221, "grad_norm": 8.071297645568848, "learning_rate": 1.60813990374747e-05, "loss": 0.2294, "step": 115500 }, { "epoch": 1.0532425431431522, "grad_norm": 21.961544036865234, "learning_rate": 1.6078006279994097e-05, "loss": 0.2286, "step": 115600 }, { "epoch": 1.0534121810171826, "grad_norm": 5.899078845977783, "learning_rate": 1.6074613522513492e-05, "loss": 0.2372, "step": 115700 }, { "epoch": 1.053581818891213, "grad_norm": 3.79107666015625, "learning_rate": 1.6071220765032888e-05, "loss": 0.2406, "step": 115800 }, { "epoch": 1.0537514567652433, "grad_norm": 2.1656477451324463, "learning_rate": 1.606782800755228e-05, "loss": 0.2261, "step": 115900 }, { "epoch": 1.0539210946392736, "grad_norm": 6.869843006134033, "learning_rate": 1.6064435250071672e-05, "loss": 0.2296, "step": 116000 }, { "epoch": 1.0540907325133038, "grad_norm": 2.790172576904297, "learning_rate": 1.6061042492591068e-05, "loss": 0.2243, "step": 116100 }, { "epoch": 1.0542603703873341, "grad_norm": 6.450159072875977, "learning_rate": 1.605764973511046e-05, "loss": 0.2472, "step": 116200 }, { "epoch": 1.0544300082613645, "grad_norm": 3.537853479385376, "learning_rate": 1.6054256977629855e-05, "loss": 0.2378, "step": 116300 }, { "epoch": 1.0545996461353948, "grad_norm": 4.830201148986816, "learning_rate": 1.605086422014925e-05, "loss": 0.2363, "step": 116400 }, { "epoch": 1.0547692840094252, "grad_norm": 4.202089309692383, "learning_rate": 1.6047471462668643e-05, "loss": 0.2369, "step": 116500 }, { "epoch": 1.0549389218834553, "grad_norm": 6.864770889282227, "learning_rate": 1.6044078705188035e-05, "loss": 0.2538, "step": 116600 }, { "epoch": 1.0551085597574856, "grad_norm": 7.62398624420166, "learning_rate": 1.604068594770743e-05, "loss": 0.2236, "step": 116700 }, { "epoch": 1.055278197631516, "grad_norm": 7.105021953582764, "learning_rate": 1.6037293190226823e-05, "loss": 0.2344, "step": 116800 }, { "epoch": 1.0554478355055463, "grad_norm": 7.355177879333496, "learning_rate": 1.6033900432746218e-05, "loss": 0.2305, "step": 116900 }, { "epoch": 1.0556174733795767, "grad_norm": 8.725106239318848, "learning_rate": 1.6030507675265614e-05, "loss": 0.2444, "step": 117000 }, { "epoch": 1.055787111253607, "grad_norm": 4.852562427520752, "learning_rate": 1.6027114917785006e-05, "loss": 0.2223, "step": 117100 }, { "epoch": 1.0559567491276372, "grad_norm": 6.5674567222595215, "learning_rate": 1.60237221603044e-05, "loss": 0.2472, "step": 117200 }, { "epoch": 1.0561263870016675, "grad_norm": 7.591166019439697, "learning_rate": 1.6020329402823793e-05, "loss": 0.2262, "step": 117300 }, { "epoch": 1.0562960248756978, "grad_norm": 4.275929927825928, "learning_rate": 1.6016936645343185e-05, "loss": 0.2564, "step": 117400 }, { "epoch": 1.0564656627497282, "grad_norm": 4.789520263671875, "learning_rate": 1.601354388786258e-05, "loss": 0.2272, "step": 117500 }, { "epoch": 1.0566353006237585, "grad_norm": 1.8538156747817993, "learning_rate": 1.6010151130381973e-05, "loss": 0.2318, "step": 117600 }, { "epoch": 1.0568049384977887, "grad_norm": 8.85676383972168, "learning_rate": 1.600675837290137e-05, "loss": 0.2296, "step": 117700 }, { "epoch": 1.056974576371819, "grad_norm": 2.1343743801116943, "learning_rate": 1.6003365615420764e-05, "loss": 0.2469, "step": 117800 }, { "epoch": 1.0571442142458494, "grad_norm": 5.544300079345703, "learning_rate": 1.5999972857940156e-05, "loss": 0.2338, "step": 117900 }, { "epoch": 1.0573138521198797, "grad_norm": 7.693223476409912, "learning_rate": 1.599658010045955e-05, "loss": 0.2325, "step": 118000 }, { "epoch": 1.05748348999391, "grad_norm": 11.487672805786133, "learning_rate": 1.5993187342978944e-05, "loss": 0.2216, "step": 118100 }, { "epoch": 1.0576531278679404, "grad_norm": 5.807964324951172, "learning_rate": 1.5989794585498336e-05, "loss": 0.242, "step": 118200 }, { "epoch": 1.0578227657419705, "grad_norm": 3.3225419521331787, "learning_rate": 1.598640182801773e-05, "loss": 0.2278, "step": 118300 }, { "epoch": 1.0579924036160009, "grad_norm": 2.246917963027954, "learning_rate": 1.5983009070537127e-05, "loss": 0.2262, "step": 118400 }, { "epoch": 1.0581620414900312, "grad_norm": 4.453568935394287, "learning_rate": 1.597961631305652e-05, "loss": 0.226, "step": 118500 }, { "epoch": 1.0583316793640616, "grad_norm": 7.195823669433594, "learning_rate": 1.5976223555575914e-05, "loss": 0.2435, "step": 118600 }, { "epoch": 1.058501317238092, "grad_norm": 8.033218383789062, "learning_rate": 1.5972830798095306e-05, "loss": 0.2489, "step": 118700 }, { "epoch": 1.058670955112122, "grad_norm": 5.09698486328125, "learning_rate": 1.5969438040614702e-05, "loss": 0.2232, "step": 118800 }, { "epoch": 1.0588405929861524, "grad_norm": 3.377126455307007, "learning_rate": 1.5966045283134094e-05, "loss": 0.231, "step": 118900 }, { "epoch": 1.0590102308601828, "grad_norm": 4.04354190826416, "learning_rate": 1.596265252565349e-05, "loss": 0.2248, "step": 119000 }, { "epoch": 1.059179868734213, "grad_norm": 3.9495084285736084, "learning_rate": 1.5959259768172885e-05, "loss": 0.2438, "step": 119100 }, { "epoch": 1.0593495066082435, "grad_norm": 6.687170505523682, "learning_rate": 1.5955867010692277e-05, "loss": 0.2247, "step": 119200 }, { "epoch": 1.0595191444822736, "grad_norm": 8.95685863494873, "learning_rate": 1.595247425321167e-05, "loss": 0.2398, "step": 119300 }, { "epoch": 1.059688782356304, "grad_norm": 6.043853282928467, "learning_rate": 1.5949081495731065e-05, "loss": 0.2534, "step": 119400 }, { "epoch": 1.0598584202303343, "grad_norm": 7.00341796875, "learning_rate": 1.5945688738250457e-05, "loss": 0.2297, "step": 119500 }, { "epoch": 1.0600280581043646, "grad_norm": 5.334883689880371, "learning_rate": 1.5942295980769852e-05, "loss": 0.2345, "step": 119600 }, { "epoch": 1.060197695978395, "grad_norm": 6.179337024688721, "learning_rate": 1.5938903223289244e-05, "loss": 0.24, "step": 119700 }, { "epoch": 1.0603673338524253, "grad_norm": 3.675476551055908, "learning_rate": 1.593551046580864e-05, "loss": 0.2168, "step": 119800 }, { "epoch": 1.0605369717264554, "grad_norm": 6.982387065887451, "learning_rate": 1.5932117708328035e-05, "loss": 0.2406, "step": 119900 }, { "epoch": 1.0607066096004858, "grad_norm": 3.3264312744140625, "learning_rate": 1.5928724950847427e-05, "loss": 0.2144, "step": 120000 }, { "epoch": 1.0608762474745161, "grad_norm": 4.445656776428223, "learning_rate": 1.592533219336682e-05, "loss": 0.2412, "step": 120100 }, { "epoch": 1.0610458853485465, "grad_norm": 6.824995517730713, "learning_rate": 1.5921939435886215e-05, "loss": 0.2499, "step": 120200 }, { "epoch": 1.0612155232225768, "grad_norm": 4.406157970428467, "learning_rate": 1.5918546678405607e-05, "loss": 0.2282, "step": 120300 }, { "epoch": 1.061385161096607, "grad_norm": 7.7202019691467285, "learning_rate": 1.5915153920925003e-05, "loss": 0.2397, "step": 120400 }, { "epoch": 1.0615547989706373, "grad_norm": 4.394982814788818, "learning_rate": 1.5911761163444398e-05, "loss": 0.2393, "step": 120500 }, { "epoch": 1.0617244368446677, "grad_norm": 3.7022652626037598, "learning_rate": 1.590836840596379e-05, "loss": 0.238, "step": 120600 }, { "epoch": 1.061894074718698, "grad_norm": 4.0278778076171875, "learning_rate": 1.5904975648483186e-05, "loss": 0.2322, "step": 120700 }, { "epoch": 1.0620637125927284, "grad_norm": 8.155988693237305, "learning_rate": 1.5901582891002578e-05, "loss": 0.2429, "step": 120800 }, { "epoch": 1.0622333504667587, "grad_norm": 3.1102092266082764, "learning_rate": 1.589819013352197e-05, "loss": 0.2468, "step": 120900 }, { "epoch": 1.0624029883407888, "grad_norm": 4.492911338806152, "learning_rate": 1.5894797376041365e-05, "loss": 0.2399, "step": 121000 }, { "epoch": 1.0625726262148192, "grad_norm": 5.099388599395752, "learning_rate": 1.589140461856076e-05, "loss": 0.2177, "step": 121100 }, { "epoch": 1.0627422640888495, "grad_norm": 3.9275214672088623, "learning_rate": 1.5888011861080153e-05, "loss": 0.2184, "step": 121200 }, { "epoch": 1.0629119019628799, "grad_norm": 5.1093430519104, "learning_rate": 1.588461910359955e-05, "loss": 0.2553, "step": 121300 }, { "epoch": 1.0630815398369102, "grad_norm": 8.20578670501709, "learning_rate": 1.588122634611894e-05, "loss": 0.2315, "step": 121400 }, { "epoch": 1.0632511777109404, "grad_norm": 5.164793014526367, "learning_rate": 1.5877833588638336e-05, "loss": 0.2387, "step": 121500 }, { "epoch": 1.0634208155849707, "grad_norm": 3.872385025024414, "learning_rate": 1.5874440831157728e-05, "loss": 0.2361, "step": 121600 }, { "epoch": 1.063590453459001, "grad_norm": 10.027320861816406, "learning_rate": 1.5871048073677124e-05, "loss": 0.2272, "step": 121700 }, { "epoch": 1.0637600913330314, "grad_norm": 5.171230792999268, "learning_rate": 1.586765531619652e-05, "loss": 0.2225, "step": 121800 }, { "epoch": 1.0639297292070617, "grad_norm": 5.909432411193848, "learning_rate": 1.586426255871591e-05, "loss": 0.2399, "step": 121900 }, { "epoch": 1.064099367081092, "grad_norm": 8.448285102844238, "learning_rate": 1.5860869801235303e-05, "loss": 0.2414, "step": 122000 }, { "epoch": 1.0642690049551222, "grad_norm": 9.792718887329102, "learning_rate": 1.58574770437547e-05, "loss": 0.2327, "step": 122100 }, { "epoch": 1.0644386428291526, "grad_norm": 5.214166164398193, "learning_rate": 1.585408428627409e-05, "loss": 0.2294, "step": 122200 }, { "epoch": 1.064608280703183, "grad_norm": 6.5533127784729, "learning_rate": 1.5850691528793486e-05, "loss": 0.2373, "step": 122300 }, { "epoch": 1.0647779185772133, "grad_norm": 6.691283226013184, "learning_rate": 1.584729877131288e-05, "loss": 0.2296, "step": 122400 }, { "epoch": 1.0649475564512436, "grad_norm": 7.022083282470703, "learning_rate": 1.5843906013832274e-05, "loss": 0.2201, "step": 122500 }, { "epoch": 1.0651171943252737, "grad_norm": 3.8691043853759766, "learning_rate": 1.584051325635167e-05, "loss": 0.227, "step": 122600 }, { "epoch": 1.065286832199304, "grad_norm": 6.242950916290283, "learning_rate": 1.583712049887106e-05, "loss": 0.2393, "step": 122700 }, { "epoch": 1.0654564700733344, "grad_norm": 6.819797515869141, "learning_rate": 1.5833727741390454e-05, "loss": 0.2382, "step": 122800 }, { "epoch": 1.0656261079473648, "grad_norm": 8.522147178649902, "learning_rate": 1.583033498390985e-05, "loss": 0.2356, "step": 122900 }, { "epoch": 1.0657957458213951, "grad_norm": 5.093167781829834, "learning_rate": 1.582694222642924e-05, "loss": 0.2329, "step": 123000 }, { "epoch": 1.0659653836954255, "grad_norm": 8.274190902709961, "learning_rate": 1.5823549468948637e-05, "loss": 0.2498, "step": 123100 }, { "epoch": 1.0661350215694556, "grad_norm": 2.790396213531494, "learning_rate": 1.5820156711468032e-05, "loss": 0.2456, "step": 123200 }, { "epoch": 1.066304659443486, "grad_norm": 6.437891006469727, "learning_rate": 1.5816763953987424e-05, "loss": 0.2345, "step": 123300 }, { "epoch": 1.0664742973175163, "grad_norm": 6.20482063293457, "learning_rate": 1.581337119650682e-05, "loss": 0.2291, "step": 123400 }, { "epoch": 1.0666439351915467, "grad_norm": 3.1559793949127197, "learning_rate": 1.5809978439026212e-05, "loss": 0.2502, "step": 123500 }, { "epoch": 1.066813573065577, "grad_norm": 5.069153308868408, "learning_rate": 1.5806585681545604e-05, "loss": 0.2296, "step": 123600 }, { "epoch": 1.0669832109396071, "grad_norm": 5.30411958694458, "learning_rate": 1.5803192924065e-05, "loss": 0.2471, "step": 123700 }, { "epoch": 1.0671528488136375, "grad_norm": 5.784866809844971, "learning_rate": 1.5799800166584395e-05, "loss": 0.2339, "step": 123800 }, { "epoch": 1.0673224866876678, "grad_norm": 4.596671104431152, "learning_rate": 1.5796407409103787e-05, "loss": 0.2297, "step": 123900 }, { "epoch": 1.0674921245616982, "grad_norm": 5.818383693695068, "learning_rate": 1.5793014651623183e-05, "loss": 0.244, "step": 124000 }, { "epoch": 1.0676617624357285, "grad_norm": 5.671211242675781, "learning_rate": 1.5789621894142575e-05, "loss": 0.251, "step": 124100 }, { "epoch": 1.0678314003097586, "grad_norm": 4.624828815460205, "learning_rate": 1.5786229136661967e-05, "loss": 0.2526, "step": 124200 }, { "epoch": 1.068001038183789, "grad_norm": 6.929238796234131, "learning_rate": 1.5782836379181362e-05, "loss": 0.2372, "step": 124300 }, { "epoch": 1.0681706760578193, "grad_norm": 4.815464973449707, "learning_rate": 1.5779443621700758e-05, "loss": 0.245, "step": 124400 }, { "epoch": 1.0683403139318497, "grad_norm": 3.3467283248901367, "learning_rate": 1.577605086422015e-05, "loss": 0.2306, "step": 124500 }, { "epoch": 1.06850995180588, "grad_norm": 6.17009162902832, "learning_rate": 1.5772658106739545e-05, "loss": 0.237, "step": 124600 }, { "epoch": 1.0686795896799104, "grad_norm": 4.564947128295898, "learning_rate": 1.5769265349258937e-05, "loss": 0.253, "step": 124700 }, { "epoch": 1.0688492275539405, "grad_norm": 3.9206085205078125, "learning_rate": 1.5765872591778333e-05, "loss": 0.232, "step": 124800 }, { "epoch": 1.0690188654279709, "grad_norm": 5.2053985595703125, "learning_rate": 1.5762479834297725e-05, "loss": 0.2181, "step": 124900 }, { "epoch": 1.0691885033020012, "grad_norm": 11.436452865600586, "learning_rate": 1.575908707681712e-05, "loss": 0.2405, "step": 125000 }, { "epoch": 1.0693581411760316, "grad_norm": 3.8938348293304443, "learning_rate": 1.5755694319336513e-05, "loss": 0.2343, "step": 125100 }, { "epoch": 1.069527779050062, "grad_norm": 5.917283058166504, "learning_rate": 1.5752301561855908e-05, "loss": 0.237, "step": 125200 }, { "epoch": 1.069697416924092, "grad_norm": 6.296077728271484, "learning_rate": 1.57489088043753e-05, "loss": 0.2222, "step": 125300 }, { "epoch": 1.0698670547981224, "grad_norm": 6.308600902557373, "learning_rate": 1.5745516046894696e-05, "loss": 0.2329, "step": 125400 }, { "epoch": 1.0700366926721527, "grad_norm": 9.107645988464355, "learning_rate": 1.5742123289414088e-05, "loss": 0.2386, "step": 125500 }, { "epoch": 1.070206330546183, "grad_norm": 2.6373414993286133, "learning_rate": 1.5738730531933483e-05, "loss": 0.2366, "step": 125600 }, { "epoch": 1.0703759684202134, "grad_norm": 6.619503021240234, "learning_rate": 1.5735337774452875e-05, "loss": 0.2212, "step": 125700 }, { "epoch": 1.0705456062942438, "grad_norm": 4.672668933868408, "learning_rate": 1.573194501697227e-05, "loss": 0.2335, "step": 125800 }, { "epoch": 1.070715244168274, "grad_norm": 4.542766571044922, "learning_rate": 1.5728552259491666e-05, "loss": 0.2402, "step": 125900 }, { "epoch": 1.0708848820423043, "grad_norm": 4.3836870193481445, "learning_rate": 1.572515950201106e-05, "loss": 0.2211, "step": 126000 }, { "epoch": 1.0710545199163346, "grad_norm": 4.291752815246582, "learning_rate": 1.572176674453045e-05, "loss": 0.2274, "step": 126100 }, { "epoch": 1.071224157790365, "grad_norm": 4.834810733795166, "learning_rate": 1.5718373987049846e-05, "loss": 0.2397, "step": 126200 }, { "epoch": 1.0713937956643953, "grad_norm": 6.486642837524414, "learning_rate": 1.5714981229569238e-05, "loss": 0.2231, "step": 126300 }, { "epoch": 1.0715634335384254, "grad_norm": 6.639422416687012, "learning_rate": 1.5711588472088634e-05, "loss": 0.2293, "step": 126400 }, { "epoch": 1.0717330714124558, "grad_norm": 6.3961405754089355, "learning_rate": 1.570819571460803e-05, "loss": 0.2436, "step": 126500 }, { "epoch": 1.0719027092864861, "grad_norm": 6.134659290313721, "learning_rate": 1.570480295712742e-05, "loss": 0.2375, "step": 126600 }, { "epoch": 1.0720723471605165, "grad_norm": 3.5775723457336426, "learning_rate": 1.5701410199646817e-05, "loss": 0.2368, "step": 126700 }, { "epoch": 1.0722419850345468, "grad_norm": 7.442634105682373, "learning_rate": 1.569801744216621e-05, "loss": 0.2416, "step": 126800 }, { "epoch": 1.072411622908577, "grad_norm": 5.716022968292236, "learning_rate": 1.56946246846856e-05, "loss": 0.237, "step": 126900 }, { "epoch": 1.0725812607826073, "grad_norm": 6.90067720413208, "learning_rate": 1.5691231927204996e-05, "loss": 0.2374, "step": 127000 }, { "epoch": 1.0727508986566376, "grad_norm": 4.052627086639404, "learning_rate": 1.5687839169724392e-05, "loss": 0.2431, "step": 127100 }, { "epoch": 1.072920536530668, "grad_norm": 8.269156455993652, "learning_rate": 1.5684446412243784e-05, "loss": 0.229, "step": 127200 }, { "epoch": 1.0730901744046983, "grad_norm": 3.8054099082946777, "learning_rate": 1.568105365476318e-05, "loss": 0.2354, "step": 127300 }, { "epoch": 1.0732598122787287, "grad_norm": 9.106431007385254, "learning_rate": 1.567766089728257e-05, "loss": 0.2234, "step": 127400 }, { "epoch": 1.0734294501527588, "grad_norm": 7.059177398681641, "learning_rate": 1.5674268139801967e-05, "loss": 0.2374, "step": 127500 }, { "epoch": 1.0735990880267892, "grad_norm": 3.3186845779418945, "learning_rate": 1.567087538232136e-05, "loss": 0.2261, "step": 127600 }, { "epoch": 1.0737687259008195, "grad_norm": 6.5789103507995605, "learning_rate": 1.566748262484075e-05, "loss": 0.2447, "step": 127700 }, { "epoch": 1.0739383637748499, "grad_norm": 5.162630081176758, "learning_rate": 1.5664089867360147e-05, "loss": 0.2315, "step": 127800 }, { "epoch": 1.0741080016488802, "grad_norm": 4.033233642578125, "learning_rate": 1.5660697109879542e-05, "loss": 0.2399, "step": 127900 }, { "epoch": 1.0742776395229106, "grad_norm": 9.12545108795166, "learning_rate": 1.5657304352398934e-05, "loss": 0.2321, "step": 128000 }, { "epoch": 1.0744472773969407, "grad_norm": 4.372875213623047, "learning_rate": 1.565391159491833e-05, "loss": 0.2348, "step": 128100 }, { "epoch": 1.074616915270971, "grad_norm": 2.6398613452911377, "learning_rate": 1.5650518837437722e-05, "loss": 0.261, "step": 128200 }, { "epoch": 1.0747865531450014, "grad_norm": 6.015939712524414, "learning_rate": 1.5647126079957118e-05, "loss": 0.2339, "step": 128300 }, { "epoch": 1.0749561910190317, "grad_norm": 4.605691432952881, "learning_rate": 1.564373332247651e-05, "loss": 0.2217, "step": 128400 }, { "epoch": 1.075125828893062, "grad_norm": 4.227110862731934, "learning_rate": 1.5640340564995905e-05, "loss": 0.2404, "step": 128500 }, { "epoch": 1.0752954667670922, "grad_norm": 10.45752239227295, "learning_rate": 1.56369478075153e-05, "loss": 0.2434, "step": 128600 }, { "epoch": 1.0754651046411225, "grad_norm": 4.90556001663208, "learning_rate": 1.5633555050034693e-05, "loss": 0.2427, "step": 128700 }, { "epoch": 1.075634742515153, "grad_norm": 4.458987712860107, "learning_rate": 1.5630162292554085e-05, "loss": 0.2456, "step": 128800 }, { "epoch": 1.0758043803891832, "grad_norm": 6.1580305099487305, "learning_rate": 1.562676953507348e-05, "loss": 0.2496, "step": 128900 }, { "epoch": 1.0759740182632136, "grad_norm": 1.1206755638122559, "learning_rate": 1.5623376777592872e-05, "loss": 0.2219, "step": 129000 }, { "epoch": 1.0761436561372437, "grad_norm": 4.1399149894714355, "learning_rate": 1.5619984020112268e-05, "loss": 0.226, "step": 129100 }, { "epoch": 1.076313294011274, "grad_norm": 9.925853729248047, "learning_rate": 1.5616591262631663e-05, "loss": 0.2271, "step": 129200 }, { "epoch": 1.0764829318853044, "grad_norm": 5.2665510177612305, "learning_rate": 1.5613198505151055e-05, "loss": 0.236, "step": 129300 }, { "epoch": 1.0766525697593348, "grad_norm": 3.72847318649292, "learning_rate": 1.560980574767045e-05, "loss": 0.239, "step": 129400 }, { "epoch": 1.0768222076333651, "grad_norm": 3.6626086235046387, "learning_rate": 1.5606412990189843e-05, "loss": 0.2429, "step": 129500 }, { "epoch": 1.0769918455073955, "grad_norm": 4.427401065826416, "learning_rate": 1.5603020232709235e-05, "loss": 0.2279, "step": 129600 }, { "epoch": 1.0771614833814256, "grad_norm": 5.988269329071045, "learning_rate": 1.559962747522863e-05, "loss": 0.2403, "step": 129700 }, { "epoch": 1.077331121255456, "grad_norm": 4.7642388343811035, "learning_rate": 1.5596234717748026e-05, "loss": 0.238, "step": 129800 }, { "epoch": 1.0775007591294863, "grad_norm": 4.121160984039307, "learning_rate": 1.5592841960267418e-05, "loss": 0.2399, "step": 129900 }, { "epoch": 1.0776703970035166, "grad_norm": 7.005282878875732, "learning_rate": 1.5589449202786814e-05, "loss": 0.2356, "step": 130000 }, { "epoch": 1.077840034877547, "grad_norm": 7.202126979827881, "learning_rate": 1.5586056445306206e-05, "loss": 0.2146, "step": 130100 }, { "epoch": 1.078009672751577, "grad_norm": 4.791199684143066, "learning_rate": 1.55826636878256e-05, "loss": 0.247, "step": 130200 }, { "epoch": 1.0781793106256075, "grad_norm": 5.784078121185303, "learning_rate": 1.5579270930344993e-05, "loss": 0.2429, "step": 130300 }, { "epoch": 1.0783489484996378, "grad_norm": 5.536164283752441, "learning_rate": 1.5575878172864386e-05, "loss": 0.2428, "step": 130400 }, { "epoch": 1.0785185863736682, "grad_norm": 3.9472568035125732, "learning_rate": 1.557248541538378e-05, "loss": 0.2369, "step": 130500 }, { "epoch": 1.0786882242476985, "grad_norm": 6.471737384796143, "learning_rate": 1.5569092657903176e-05, "loss": 0.2314, "step": 130600 }, { "epoch": 1.0788578621217289, "grad_norm": 6.633335113525391, "learning_rate": 1.556569990042257e-05, "loss": 0.2398, "step": 130700 }, { "epoch": 1.079027499995759, "grad_norm": 3.7377395629882812, "learning_rate": 1.5562307142941964e-05, "loss": 0.2309, "step": 130800 }, { "epoch": 1.0791971378697893, "grad_norm": 5.688544750213623, "learning_rate": 1.5558914385461356e-05, "loss": 0.2417, "step": 130900 }, { "epoch": 1.0793667757438197, "grad_norm": 5.299839019775391, "learning_rate": 1.5555521627980748e-05, "loss": 0.2335, "step": 131000 }, { "epoch": 1.07953641361785, "grad_norm": 3.087064266204834, "learning_rate": 1.5552128870500144e-05, "loss": 0.2231, "step": 131100 }, { "epoch": 1.0797060514918804, "grad_norm": 4.761734962463379, "learning_rate": 1.554873611301954e-05, "loss": 0.251, "step": 131200 }, { "epoch": 1.0798756893659105, "grad_norm": 2.7992093563079834, "learning_rate": 1.5545343355538935e-05, "loss": 0.2324, "step": 131300 }, { "epoch": 1.0800453272399408, "grad_norm": 5.059682846069336, "learning_rate": 1.5541950598058327e-05, "loss": 0.2381, "step": 131400 }, { "epoch": 1.0802149651139712, "grad_norm": 6.297108173370361, "learning_rate": 1.553855784057772e-05, "loss": 0.2281, "step": 131500 }, { "epoch": 1.0803846029880015, "grad_norm": 5.910891056060791, "learning_rate": 1.5535165083097114e-05, "loss": 0.2493, "step": 131600 }, { "epoch": 1.080554240862032, "grad_norm": 1.8360539674758911, "learning_rate": 1.5531772325616507e-05, "loss": 0.2263, "step": 131700 }, { "epoch": 1.080723878736062, "grad_norm": 2.342198371887207, "learning_rate": 1.5528379568135902e-05, "loss": 0.2329, "step": 131800 }, { "epoch": 1.0808935166100924, "grad_norm": 6.693944931030273, "learning_rate": 1.5524986810655298e-05, "loss": 0.237, "step": 131900 }, { "epoch": 1.0810631544841227, "grad_norm": 4.89886999130249, "learning_rate": 1.552159405317469e-05, "loss": 0.237, "step": 132000 }, { "epoch": 1.081232792358153, "grad_norm": 6.643464088439941, "learning_rate": 1.5518201295694085e-05, "loss": 0.2388, "step": 132100 }, { "epoch": 1.0814024302321834, "grad_norm": 5.549567222595215, "learning_rate": 1.5514808538213477e-05, "loss": 0.2334, "step": 132200 }, { "epoch": 1.0815720681062138, "grad_norm": 5.2260870933532715, "learning_rate": 1.551141578073287e-05, "loss": 0.2445, "step": 132300 }, { "epoch": 1.0817417059802439, "grad_norm": 11.953410148620605, "learning_rate": 1.5508023023252265e-05, "loss": 0.2398, "step": 132400 }, { "epoch": 1.0819113438542742, "grad_norm": 5.38473653793335, "learning_rate": 1.5504630265771657e-05, "loss": 0.2335, "step": 132500 }, { "epoch": 1.0820809817283046, "grad_norm": 4.497625827789307, "learning_rate": 1.5501237508291052e-05, "loss": 0.253, "step": 132600 }, { "epoch": 1.082250619602335, "grad_norm": 8.68681526184082, "learning_rate": 1.5497844750810448e-05, "loss": 0.2297, "step": 132700 }, { "epoch": 1.0824202574763653, "grad_norm": 4.686168193817139, "learning_rate": 1.549445199332984e-05, "loss": 0.238, "step": 132800 }, { "epoch": 1.0825898953503956, "grad_norm": 5.757005214691162, "learning_rate": 1.5491059235849232e-05, "loss": 0.2405, "step": 132900 }, { "epoch": 1.0827595332244258, "grad_norm": 6.429945945739746, "learning_rate": 1.5487666478368628e-05, "loss": 0.2436, "step": 133000 }, { "epoch": 1.082929171098456, "grad_norm": 6.607705116271973, "learning_rate": 1.548427372088802e-05, "loss": 0.2371, "step": 133100 }, { "epoch": 1.0830988089724864, "grad_norm": 5.967549800872803, "learning_rate": 1.5480880963407415e-05, "loss": 0.2417, "step": 133200 }, { "epoch": 1.0832684468465168, "grad_norm": 5.192127704620361, "learning_rate": 1.547748820592681e-05, "loss": 0.236, "step": 133300 }, { "epoch": 1.0834380847205471, "grad_norm": 4.789306640625, "learning_rate": 1.5474095448446203e-05, "loss": 0.2381, "step": 133400 }, { "epoch": 1.0836077225945773, "grad_norm": 4.094269275665283, "learning_rate": 1.5470702690965598e-05, "loss": 0.2416, "step": 133500 }, { "epoch": 1.0837773604686076, "grad_norm": 3.6857314109802246, "learning_rate": 1.546730993348499e-05, "loss": 0.2403, "step": 133600 }, { "epoch": 1.083946998342638, "grad_norm": 5.700229644775391, "learning_rate": 1.5463917176004382e-05, "loss": 0.2405, "step": 133700 }, { "epoch": 1.0841166362166683, "grad_norm": 8.23345947265625, "learning_rate": 1.5460524418523778e-05, "loss": 0.2266, "step": 133800 }, { "epoch": 1.0842862740906987, "grad_norm": 6.393547534942627, "learning_rate": 1.5457131661043173e-05, "loss": 0.2182, "step": 133900 }, { "epoch": 1.0844559119647288, "grad_norm": 7.1295061111450195, "learning_rate": 1.545373890356257e-05, "loss": 0.225, "step": 134000 }, { "epoch": 1.0846255498387591, "grad_norm": 6.740891933441162, "learning_rate": 1.545034614608196e-05, "loss": 0.239, "step": 134100 }, { "epoch": 1.0847951877127895, "grad_norm": 5.166084289550781, "learning_rate": 1.5446953388601353e-05, "loss": 0.2443, "step": 134200 }, { "epoch": 1.0849648255868198, "grad_norm": 7.578908920288086, "learning_rate": 1.544356063112075e-05, "loss": 0.2359, "step": 134300 }, { "epoch": 1.0851344634608502, "grad_norm": 8.352858543395996, "learning_rate": 1.544016787364014e-05, "loss": 0.2479, "step": 134400 }, { "epoch": 1.0853041013348805, "grad_norm": 8.804341316223145, "learning_rate": 1.5436775116159536e-05, "loss": 0.2428, "step": 134500 }, { "epoch": 1.0854737392089107, "grad_norm": 6.319941997528076, "learning_rate": 1.543338235867893e-05, "loss": 0.2382, "step": 134600 }, { "epoch": 1.085643377082941, "grad_norm": 5.80189323425293, "learning_rate": 1.5429989601198324e-05, "loss": 0.2213, "step": 134700 }, { "epoch": 1.0858130149569714, "grad_norm": 7.600187301635742, "learning_rate": 1.5426596843717716e-05, "loss": 0.2254, "step": 134800 }, { "epoch": 1.0859826528310017, "grad_norm": 9.828634262084961, "learning_rate": 1.542320408623711e-05, "loss": 0.2403, "step": 134900 }, { "epoch": 1.086152290705032, "grad_norm": 15.055134773254395, "learning_rate": 1.5419811328756504e-05, "loss": 0.2366, "step": 135000 }, { "epoch": 1.0863219285790622, "grad_norm": 6.687788963317871, "learning_rate": 1.54164185712759e-05, "loss": 0.2435, "step": 135100 }, { "epoch": 1.0864915664530925, "grad_norm": 4.3503804206848145, "learning_rate": 1.541302581379529e-05, "loss": 0.2197, "step": 135200 }, { "epoch": 1.0866612043271229, "grad_norm": 8.343756675720215, "learning_rate": 1.5409633056314687e-05, "loss": 0.218, "step": 135300 }, { "epoch": 1.0868308422011532, "grad_norm": 5.638715744018555, "learning_rate": 1.5406240298834082e-05, "loss": 0.2297, "step": 135400 }, { "epoch": 1.0870004800751836, "grad_norm": 4.605719089508057, "learning_rate": 1.5402847541353474e-05, "loss": 0.2298, "step": 135500 }, { "epoch": 1.087170117949214, "grad_norm": 8.46635913848877, "learning_rate": 1.5399454783872866e-05, "loss": 0.2337, "step": 135600 }, { "epoch": 1.087339755823244, "grad_norm": 5.5247483253479, "learning_rate": 1.5396062026392262e-05, "loss": 0.2301, "step": 135700 }, { "epoch": 1.0875093936972744, "grad_norm": 4.064679145812988, "learning_rate": 1.5392669268911654e-05, "loss": 0.22, "step": 135800 }, { "epoch": 1.0876790315713047, "grad_norm": 10.272905349731445, "learning_rate": 1.538927651143105e-05, "loss": 0.2264, "step": 135900 }, { "epoch": 1.087848669445335, "grad_norm": 7.4483323097229, "learning_rate": 1.5385883753950445e-05, "loss": 0.2361, "step": 136000 }, { "epoch": 1.0880183073193654, "grad_norm": 2.381577730178833, "learning_rate": 1.5382490996469837e-05, "loss": 0.2366, "step": 136100 }, { "epoch": 1.0881879451933956, "grad_norm": 3.1244561672210693, "learning_rate": 1.5379098238989232e-05, "loss": 0.2512, "step": 136200 }, { "epoch": 1.088357583067426, "grad_norm": 4.0624542236328125, "learning_rate": 1.5375705481508625e-05, "loss": 0.2508, "step": 136300 }, { "epoch": 1.0885272209414563, "grad_norm": 7.808289527893066, "learning_rate": 1.5372312724028017e-05, "loss": 0.2398, "step": 136400 }, { "epoch": 1.0886968588154866, "grad_norm": 3.5493435859680176, "learning_rate": 1.5368919966547412e-05, "loss": 0.2306, "step": 136500 }, { "epoch": 1.088866496689517, "grad_norm": 8.596064567565918, "learning_rate": 1.5365527209066808e-05, "loss": 0.2529, "step": 136600 }, { "epoch": 1.089036134563547, "grad_norm": 1.461173176765442, "learning_rate": 1.53621344515862e-05, "loss": 0.2363, "step": 136700 }, { "epoch": 1.0892057724375774, "grad_norm": 3.5652148723602295, "learning_rate": 1.5358741694105595e-05, "loss": 0.2356, "step": 136800 }, { "epoch": 1.0893754103116078, "grad_norm": 2.165330410003662, "learning_rate": 1.5355348936624987e-05, "loss": 0.2341, "step": 136900 }, { "epoch": 1.0895450481856381, "grad_norm": 4.922107696533203, "learning_rate": 1.5351956179144383e-05, "loss": 0.2464, "step": 137000 }, { "epoch": 1.0897146860596685, "grad_norm": 7.283497333526611, "learning_rate": 1.5348563421663775e-05, "loss": 0.2372, "step": 137100 }, { "epoch": 1.0898843239336988, "grad_norm": 4.497725009918213, "learning_rate": 1.534517066418317e-05, "loss": 0.2466, "step": 137200 }, { "epoch": 1.090053961807729, "grad_norm": 5.571370601654053, "learning_rate": 1.5341777906702562e-05, "loss": 0.2167, "step": 137300 }, { "epoch": 1.0902235996817593, "grad_norm": 3.3652894496917725, "learning_rate": 1.5338385149221958e-05, "loss": 0.2113, "step": 137400 }, { "epoch": 1.0903932375557897, "grad_norm": 6.516097068786621, "learning_rate": 1.533499239174135e-05, "loss": 0.2323, "step": 137500 }, { "epoch": 1.09056287542982, "grad_norm": 2.62005352973938, "learning_rate": 1.5331599634260746e-05, "loss": 0.2226, "step": 137600 }, { "epoch": 1.0907325133038503, "grad_norm": 7.302656650543213, "learning_rate": 1.5328206876780138e-05, "loss": 0.2485, "step": 137700 }, { "epoch": 1.0909021511778805, "grad_norm": 4.264378547668457, "learning_rate": 1.532481411929953e-05, "loss": 0.2315, "step": 137800 }, { "epoch": 1.0910717890519108, "grad_norm": 11.101664543151855, "learning_rate": 1.5321421361818925e-05, "loss": 0.236, "step": 137900 }, { "epoch": 1.0912414269259412, "grad_norm": 7.9779953956604, "learning_rate": 1.531802860433832e-05, "loss": 0.2444, "step": 138000 }, { "epoch": 1.0914110647999715, "grad_norm": 5.35723352432251, "learning_rate": 1.5314635846857716e-05, "loss": 0.2319, "step": 138100 }, { "epoch": 1.0915807026740019, "grad_norm": 6.075881004333496, "learning_rate": 1.531124308937711e-05, "loss": 0.23, "step": 138200 }, { "epoch": 1.0917503405480322, "grad_norm": 5.234502792358398, "learning_rate": 1.53078503318965e-05, "loss": 0.2398, "step": 138300 }, { "epoch": 1.0919199784220623, "grad_norm": 6.223726272583008, "learning_rate": 1.5304457574415896e-05, "loss": 0.2346, "step": 138400 }, { "epoch": 1.0920896162960927, "grad_norm": 5.578091144561768, "learning_rate": 1.5301064816935288e-05, "loss": 0.2195, "step": 138500 }, { "epoch": 1.092259254170123, "grad_norm": 4.573008060455322, "learning_rate": 1.5297672059454684e-05, "loss": 0.2248, "step": 138600 }, { "epoch": 1.0924288920441534, "grad_norm": 4.710226535797119, "learning_rate": 1.529427930197408e-05, "loss": 0.2273, "step": 138700 }, { "epoch": 1.0925985299181837, "grad_norm": 9.57243824005127, "learning_rate": 1.529088654449347e-05, "loss": 0.2337, "step": 138800 }, { "epoch": 1.0927681677922139, "grad_norm": 4.8943562507629395, "learning_rate": 1.5287493787012867e-05, "loss": 0.2216, "step": 138900 }, { "epoch": 1.0929378056662442, "grad_norm": 5.643235683441162, "learning_rate": 1.528410102953226e-05, "loss": 0.2387, "step": 139000 }, { "epoch": 1.0931074435402746, "grad_norm": 9.292625427246094, "learning_rate": 1.528070827205165e-05, "loss": 0.2404, "step": 139100 }, { "epoch": 1.093277081414305, "grad_norm": 7.144375324249268, "learning_rate": 1.5277315514571046e-05, "loss": 0.2232, "step": 139200 }, { "epoch": 1.0934467192883353, "grad_norm": 4.605229377746582, "learning_rate": 1.5273922757090442e-05, "loss": 0.2311, "step": 139300 }, { "epoch": 1.0936163571623654, "grad_norm": 7.876538276672363, "learning_rate": 1.5270529999609834e-05, "loss": 0.2471, "step": 139400 }, { "epoch": 1.0937859950363957, "grad_norm": 3.9406492710113525, "learning_rate": 1.526713724212923e-05, "loss": 0.2328, "step": 139500 }, { "epoch": 1.093955632910426, "grad_norm": 3.0912091732025146, "learning_rate": 1.526374448464862e-05, "loss": 0.2437, "step": 139600 }, { "epoch": 1.0941252707844564, "grad_norm": 4.278962135314941, "learning_rate": 1.5260351727168014e-05, "loss": 0.2418, "step": 139700 }, { "epoch": 1.0942949086584868, "grad_norm": 4.393928050994873, "learning_rate": 1.5256958969687409e-05, "loss": 0.2448, "step": 139800 }, { "epoch": 1.0944645465325171, "grad_norm": 5.63219690322876, "learning_rate": 1.5253566212206803e-05, "loss": 0.2418, "step": 139900 }, { "epoch": 1.0946341844065473, "grad_norm": 10.668187141418457, "learning_rate": 1.5250173454726198e-05, "loss": 0.2364, "step": 140000 }, { "epoch": 1.0948038222805776, "grad_norm": 4.428503513336182, "learning_rate": 1.524678069724559e-05, "loss": 0.236, "step": 140100 }, { "epoch": 1.094973460154608, "grad_norm": 3.337808847427368, "learning_rate": 1.5243387939764984e-05, "loss": 0.2295, "step": 140200 }, { "epoch": 1.0951430980286383, "grad_norm": 6.650169372558594, "learning_rate": 1.523999518228438e-05, "loss": 0.2287, "step": 140300 }, { "epoch": 1.0953127359026686, "grad_norm": 3.5115139484405518, "learning_rate": 1.5236602424803772e-05, "loss": 0.2676, "step": 140400 }, { "epoch": 1.095482373776699, "grad_norm": 6.34481143951416, "learning_rate": 1.5233209667323166e-05, "loss": 0.2421, "step": 140500 }, { "epoch": 1.0956520116507291, "grad_norm": 4.807790279388428, "learning_rate": 1.5229816909842561e-05, "loss": 0.2118, "step": 140600 }, { "epoch": 1.0958216495247595, "grad_norm": 10.171920776367188, "learning_rate": 1.5226424152361953e-05, "loss": 0.2297, "step": 140700 }, { "epoch": 1.0959912873987898, "grad_norm": 3.021878242492676, "learning_rate": 1.5223031394881349e-05, "loss": 0.2306, "step": 140800 }, { "epoch": 1.0961609252728202, "grad_norm": 6.064032554626465, "learning_rate": 1.5219638637400743e-05, "loss": 0.2311, "step": 140900 }, { "epoch": 1.0963305631468505, "grad_norm": 5.043793678283691, "learning_rate": 1.5216245879920135e-05, "loss": 0.2204, "step": 141000 }, { "epoch": 1.0965002010208806, "grad_norm": 3.8513879776000977, "learning_rate": 1.521285312243953e-05, "loss": 0.2478, "step": 141100 }, { "epoch": 1.096669838894911, "grad_norm": 4.644242763519287, "learning_rate": 1.5209460364958924e-05, "loss": 0.239, "step": 141200 }, { "epoch": 1.0968394767689413, "grad_norm": 13.003419876098633, "learning_rate": 1.5206067607478316e-05, "loss": 0.2149, "step": 141300 }, { "epoch": 1.0970091146429717, "grad_norm": 6.20938777923584, "learning_rate": 1.5202674849997712e-05, "loss": 0.2351, "step": 141400 }, { "epoch": 1.097178752517002, "grad_norm": 8.280740737915039, "learning_rate": 1.5199282092517105e-05, "loss": 0.2197, "step": 141500 }, { "epoch": 1.0973483903910322, "grad_norm": 8.330668449401855, "learning_rate": 1.5195889335036497e-05, "loss": 0.2356, "step": 141600 }, { "epoch": 1.0975180282650625, "grad_norm": 6.068686485290527, "learning_rate": 1.5192496577555893e-05, "loss": 0.2443, "step": 141700 }, { "epoch": 1.0976876661390929, "grad_norm": 3.7416603565216064, "learning_rate": 1.5189103820075287e-05, "loss": 0.2404, "step": 141800 }, { "epoch": 1.0978573040131232, "grad_norm": 4.799732208251953, "learning_rate": 1.518571106259468e-05, "loss": 0.2385, "step": 141900 }, { "epoch": 1.0980269418871536, "grad_norm": 6.028345584869385, "learning_rate": 1.5182318305114074e-05, "loss": 0.2276, "step": 142000 }, { "epoch": 1.098196579761184, "grad_norm": 2.867041826248169, "learning_rate": 1.5178925547633468e-05, "loss": 0.2478, "step": 142100 }, { "epoch": 1.098366217635214, "grad_norm": 7.36961030960083, "learning_rate": 1.5175532790152862e-05, "loss": 0.231, "step": 142200 }, { "epoch": 1.0985358555092444, "grad_norm": 3.9825003147125244, "learning_rate": 1.5172140032672256e-05, "loss": 0.2264, "step": 142300 }, { "epoch": 1.0987054933832747, "grad_norm": 8.71201229095459, "learning_rate": 1.516874727519165e-05, "loss": 0.2347, "step": 142400 }, { "epoch": 1.098875131257305, "grad_norm": 8.074851989746094, "learning_rate": 1.5165354517711043e-05, "loss": 0.2673, "step": 142500 }, { "epoch": 1.0990447691313354, "grad_norm": 5.12618350982666, "learning_rate": 1.5161961760230437e-05, "loss": 0.2159, "step": 142600 }, { "epoch": 1.0992144070053655, "grad_norm": 3.819607734680176, "learning_rate": 1.5158569002749833e-05, "loss": 0.2306, "step": 142700 }, { "epoch": 1.099384044879396, "grad_norm": 7.210899353027344, "learning_rate": 1.5155176245269225e-05, "loss": 0.2413, "step": 142800 }, { "epoch": 1.0995536827534262, "grad_norm": 7.005092620849609, "learning_rate": 1.5151783487788618e-05, "loss": 0.2516, "step": 142900 }, { "epoch": 1.0997233206274566, "grad_norm": 3.3487868309020996, "learning_rate": 1.5148390730308014e-05, "loss": 0.2373, "step": 143000 }, { "epoch": 1.099892958501487, "grad_norm": 4.8929290771484375, "learning_rate": 1.5144997972827406e-05, "loss": 0.2239, "step": 143100 }, { "epoch": 1.1000625963755173, "grad_norm": 5.267115592956543, "learning_rate": 1.51416052153468e-05, "loss": 0.2386, "step": 143200 }, { "epoch": 1.1002322342495474, "grad_norm": 7.7546000480651855, "learning_rate": 1.5138212457866195e-05, "loss": 0.2316, "step": 143300 }, { "epoch": 1.1004018721235778, "grad_norm": 5.796301364898682, "learning_rate": 1.5134819700385587e-05, "loss": 0.2314, "step": 143400 }, { "epoch": 1.100571509997608, "grad_norm": 8.033646583557129, "learning_rate": 1.5131426942904981e-05, "loss": 0.2376, "step": 143500 }, { "epoch": 1.1007411478716385, "grad_norm": 3.758758306503296, "learning_rate": 1.5128034185424377e-05, "loss": 0.2608, "step": 143600 }, { "epoch": 1.1009107857456688, "grad_norm": 5.599483489990234, "learning_rate": 1.5124641427943769e-05, "loss": 0.2412, "step": 143700 }, { "epoch": 1.101080423619699, "grad_norm": 3.9385006427764893, "learning_rate": 1.5121248670463164e-05, "loss": 0.2334, "step": 143800 }, { "epoch": 1.1012500614937293, "grad_norm": 6.374066352844238, "learning_rate": 1.5117855912982558e-05, "loss": 0.2273, "step": 143900 }, { "epoch": 1.1014196993677596, "grad_norm": 8.210346221923828, "learning_rate": 1.511446315550195e-05, "loss": 0.214, "step": 144000 }, { "epoch": 1.10158933724179, "grad_norm": 7.938467502593994, "learning_rate": 1.5111070398021346e-05, "loss": 0.2247, "step": 144100 }, { "epoch": 1.1017589751158203, "grad_norm": 5.397073268890381, "learning_rate": 1.510767764054074e-05, "loss": 0.2296, "step": 144200 }, { "epoch": 1.1019286129898505, "grad_norm": 3.714932441711426, "learning_rate": 1.5104284883060132e-05, "loss": 0.2334, "step": 144300 }, { "epoch": 1.1020982508638808, "grad_norm": 2.607745885848999, "learning_rate": 1.5100892125579527e-05, "loss": 0.226, "step": 144400 }, { "epoch": 1.1022678887379112, "grad_norm": 12.286839485168457, "learning_rate": 1.5097499368098921e-05, "loss": 0.2343, "step": 144500 }, { "epoch": 1.1024375266119415, "grad_norm": 8.409263610839844, "learning_rate": 1.5094106610618315e-05, "loss": 0.2657, "step": 144600 }, { "epoch": 1.1026071644859718, "grad_norm": 6.540457248687744, "learning_rate": 1.5090713853137708e-05, "loss": 0.2237, "step": 144700 }, { "epoch": 1.1027768023600022, "grad_norm": 9.15771198272705, "learning_rate": 1.5087321095657102e-05, "loss": 0.223, "step": 144800 }, { "epoch": 1.1029464402340323, "grad_norm": 8.687532424926758, "learning_rate": 1.5083928338176496e-05, "loss": 0.252, "step": 144900 }, { "epoch": 1.1031160781080627, "grad_norm": 6.865411758422852, "learning_rate": 1.508053558069589e-05, "loss": 0.2384, "step": 145000 }, { "epoch": 1.103285715982093, "grad_norm": 5.365298271179199, "learning_rate": 1.5077142823215282e-05, "loss": 0.2238, "step": 145100 }, { "epoch": 1.1034553538561234, "grad_norm": 5.358958721160889, "learning_rate": 1.5073750065734677e-05, "loss": 0.2345, "step": 145200 }, { "epoch": 1.1036249917301537, "grad_norm": 9.38072681427002, "learning_rate": 1.5070357308254071e-05, "loss": 0.2368, "step": 145300 }, { "epoch": 1.103794629604184, "grad_norm": 7.759133338928223, "learning_rate": 1.5066964550773463e-05, "loss": 0.2453, "step": 145400 }, { "epoch": 1.1039642674782142, "grad_norm": 9.516827583312988, "learning_rate": 1.5063571793292859e-05, "loss": 0.2344, "step": 145500 }, { "epoch": 1.1041339053522445, "grad_norm": 7.547486305236816, "learning_rate": 1.5060179035812253e-05, "loss": 0.2435, "step": 145600 }, { "epoch": 1.1043035432262749, "grad_norm": 7.848860740661621, "learning_rate": 1.5056786278331648e-05, "loss": 0.2278, "step": 145700 }, { "epoch": 1.1044731811003052, "grad_norm": 3.8868985176086426, "learning_rate": 1.505339352085104e-05, "loss": 0.2357, "step": 145800 }, { "epoch": 1.1046428189743356, "grad_norm": 6.225032329559326, "learning_rate": 1.5050000763370434e-05, "loss": 0.2291, "step": 145900 }, { "epoch": 1.1048124568483657, "grad_norm": 3.6380577087402344, "learning_rate": 1.504660800588983e-05, "loss": 0.227, "step": 146000 }, { "epoch": 1.104982094722396, "grad_norm": 4.411957740783691, "learning_rate": 1.5043215248409222e-05, "loss": 0.2352, "step": 146100 }, { "epoch": 1.1051517325964264, "grad_norm": 6.17123556137085, "learning_rate": 1.5039822490928615e-05, "loss": 0.2371, "step": 146200 }, { "epoch": 1.1053213704704568, "grad_norm": 5.114989757537842, "learning_rate": 1.5036429733448011e-05, "loss": 0.2424, "step": 146300 }, { "epoch": 1.105491008344487, "grad_norm": 7.662924289703369, "learning_rate": 1.5033036975967403e-05, "loss": 0.2194, "step": 146400 }, { "epoch": 1.1056606462185172, "grad_norm": 7.344404697418213, "learning_rate": 1.5029644218486798e-05, "loss": 0.2146, "step": 146500 }, { "epoch": 1.1058302840925476, "grad_norm": 5.116180896759033, "learning_rate": 1.5026251461006192e-05, "loss": 0.2402, "step": 146600 }, { "epoch": 1.105999921966578, "grad_norm": 3.678295135498047, "learning_rate": 1.5022858703525584e-05, "loss": 0.2446, "step": 146700 }, { "epoch": 1.1061695598406083, "grad_norm": 4.739404201507568, "learning_rate": 1.501946594604498e-05, "loss": 0.2195, "step": 146800 }, { "epoch": 1.1063391977146386, "grad_norm": 4.838621139526367, "learning_rate": 1.5016073188564374e-05, "loss": 0.2284, "step": 146900 }, { "epoch": 1.106508835588669, "grad_norm": 4.191385269165039, "learning_rate": 1.5012680431083766e-05, "loss": 0.2212, "step": 147000 }, { "epoch": 1.106678473462699, "grad_norm": 7.098555564880371, "learning_rate": 1.5009287673603161e-05, "loss": 0.255, "step": 147100 }, { "epoch": 1.1068481113367294, "grad_norm": 4.93930196762085, "learning_rate": 1.5005894916122555e-05, "loss": 0.2252, "step": 147200 }, { "epoch": 1.1070177492107598, "grad_norm": 6.311878204345703, "learning_rate": 1.5002502158641947e-05, "loss": 0.2149, "step": 147300 }, { "epoch": 1.1071873870847901, "grad_norm": 6.08720588684082, "learning_rate": 1.4999109401161343e-05, "loss": 0.2172, "step": 147400 }, { "epoch": 1.1073570249588205, "grad_norm": 8.128152847290039, "learning_rate": 1.4995716643680735e-05, "loss": 0.2522, "step": 147500 }, { "epoch": 1.1075266628328506, "grad_norm": 6.269208908081055, "learning_rate": 1.499232388620013e-05, "loss": 0.2299, "step": 147600 }, { "epoch": 1.107696300706881, "grad_norm": 3.40877103805542, "learning_rate": 1.4988931128719524e-05, "loss": 0.2322, "step": 147700 }, { "epoch": 1.1078659385809113, "grad_norm": 5.5784687995910645, "learning_rate": 1.4985538371238916e-05, "loss": 0.2306, "step": 147800 }, { "epoch": 1.1080355764549417, "grad_norm": 7.757547855377197, "learning_rate": 1.4982145613758312e-05, "loss": 0.2302, "step": 147900 }, { "epoch": 1.108205214328972, "grad_norm": 5.98262357711792, "learning_rate": 1.4978752856277705e-05, "loss": 0.2309, "step": 148000 }, { "epoch": 1.1083748522030024, "grad_norm": 4.85951566696167, "learning_rate": 1.4975360098797098e-05, "loss": 0.235, "step": 148100 }, { "epoch": 1.1085444900770325, "grad_norm": 3.013167142868042, "learning_rate": 1.4971967341316493e-05, "loss": 0.228, "step": 148200 }, { "epoch": 1.1087141279510628, "grad_norm": 6.347944259643555, "learning_rate": 1.4968574583835887e-05, "loss": 0.2355, "step": 148300 }, { "epoch": 1.1088837658250932, "grad_norm": 7.9081196784973145, "learning_rate": 1.4965181826355282e-05, "loss": 0.2356, "step": 148400 }, { "epoch": 1.1090534036991235, "grad_norm": 4.880331993103027, "learning_rate": 1.4961789068874674e-05, "loss": 0.2398, "step": 148500 }, { "epoch": 1.1092230415731539, "grad_norm": 9.213129043579102, "learning_rate": 1.4958396311394068e-05, "loss": 0.2215, "step": 148600 }, { "epoch": 1.109392679447184, "grad_norm": 7.315440654754639, "learning_rate": 1.4955003553913464e-05, "loss": 0.24, "step": 148700 }, { "epoch": 1.1095623173212144, "grad_norm": 7.229963779449463, "learning_rate": 1.4951610796432856e-05, "loss": 0.2281, "step": 148800 }, { "epoch": 1.1097319551952447, "grad_norm": 7.311700344085693, "learning_rate": 1.494821803895225e-05, "loss": 0.2298, "step": 148900 }, { "epoch": 1.109901593069275, "grad_norm": 3.780348777770996, "learning_rate": 1.4944825281471645e-05, "loss": 0.2394, "step": 149000 }, { "epoch": 1.1100712309433054, "grad_norm": 7.603663921356201, "learning_rate": 1.4941432523991037e-05, "loss": 0.2231, "step": 149100 }, { "epoch": 1.1102408688173355, "grad_norm": 5.632371425628662, "learning_rate": 1.4938039766510431e-05, "loss": 0.231, "step": 149200 }, { "epoch": 1.1104105066913659, "grad_norm": 5.567756652832031, "learning_rate": 1.4934647009029826e-05, "loss": 0.2304, "step": 149300 }, { "epoch": 1.1105801445653962, "grad_norm": 8.136098861694336, "learning_rate": 1.4931254251549219e-05, "loss": 0.2349, "step": 149400 }, { "epoch": 1.1107497824394266, "grad_norm": 5.423779010772705, "learning_rate": 1.4927861494068614e-05, "loss": 0.2415, "step": 149500 }, { "epoch": 1.110919420313457, "grad_norm": 6.469794750213623, "learning_rate": 1.4924468736588008e-05, "loss": 0.2411, "step": 149600 }, { "epoch": 1.1110890581874873, "grad_norm": 6.251758098602295, "learning_rate": 1.49210759791074e-05, "loss": 0.2441, "step": 149700 }, { "epoch": 1.1112586960615174, "grad_norm": 4.532764434814453, "learning_rate": 1.4917683221626795e-05, "loss": 0.2368, "step": 149800 }, { "epoch": 1.1114283339355477, "grad_norm": 4.787090301513672, "learning_rate": 1.4914290464146188e-05, "loss": 0.2252, "step": 149900 }, { "epoch": 1.111597971809578, "grad_norm": 4.637678146362305, "learning_rate": 1.4910897706665581e-05, "loss": 0.2377, "step": 150000 }, { "epoch": 1.1117676096836084, "grad_norm": 2.7543866634368896, "learning_rate": 1.4907504949184977e-05, "loss": 0.2265, "step": 150100 }, { "epoch": 1.1119372475576388, "grad_norm": 6.55967378616333, "learning_rate": 1.4904112191704369e-05, "loss": 0.2428, "step": 150200 }, { "epoch": 1.112106885431669, "grad_norm": 6.578974723815918, "learning_rate": 1.4900719434223763e-05, "loss": 0.2434, "step": 150300 }, { "epoch": 1.1122765233056993, "grad_norm": 5.444930553436279, "learning_rate": 1.4897326676743158e-05, "loss": 0.2339, "step": 150400 }, { "epoch": 1.1124461611797296, "grad_norm": 7.350747585296631, "learning_rate": 1.489393391926255e-05, "loss": 0.2139, "step": 150500 }, { "epoch": 1.11261579905376, "grad_norm": 3.7058515548706055, "learning_rate": 1.4890541161781946e-05, "loss": 0.2251, "step": 150600 }, { "epoch": 1.1127854369277903, "grad_norm": 8.709574699401855, "learning_rate": 1.488714840430134e-05, "loss": 0.2305, "step": 150700 }, { "epoch": 1.1129550748018207, "grad_norm": 6.8740739822387695, "learning_rate": 1.4883755646820732e-05, "loss": 0.2244, "step": 150800 }, { "epoch": 1.1131247126758508, "grad_norm": 8.471184730529785, "learning_rate": 1.4880362889340127e-05, "loss": 0.243, "step": 150900 }, { "epoch": 1.1132943505498811, "grad_norm": 4.773309230804443, "learning_rate": 1.4876970131859521e-05, "loss": 0.2297, "step": 151000 }, { "epoch": 1.1134639884239115, "grad_norm": 6.799368858337402, "learning_rate": 1.4873577374378913e-05, "loss": 0.2212, "step": 151100 }, { "epoch": 1.1136336262979418, "grad_norm": 6.754696846008301, "learning_rate": 1.4870184616898309e-05, "loss": 0.2244, "step": 151200 }, { "epoch": 1.1138032641719722, "grad_norm": 4.202824592590332, "learning_rate": 1.4866791859417702e-05, "loss": 0.2391, "step": 151300 }, { "epoch": 1.1139729020460023, "grad_norm": 5.462973594665527, "learning_rate": 1.4863399101937098e-05, "loss": 0.2287, "step": 151400 }, { "epoch": 1.1141425399200326, "grad_norm": 5.903615951538086, "learning_rate": 1.486000634445649e-05, "loss": 0.2357, "step": 151500 }, { "epoch": 1.114312177794063, "grad_norm": 4.678394794464111, "learning_rate": 1.4856613586975884e-05, "loss": 0.2315, "step": 151600 }, { "epoch": 1.1144818156680933, "grad_norm": 4.968438148498535, "learning_rate": 1.485322082949528e-05, "loss": 0.2348, "step": 151700 }, { "epoch": 1.1146514535421237, "grad_norm": 7.492313385009766, "learning_rate": 1.4849828072014671e-05, "loss": 0.2358, "step": 151800 }, { "epoch": 1.1148210914161538, "grad_norm": 6.301089286804199, "learning_rate": 1.4846435314534065e-05, "loss": 0.2322, "step": 151900 }, { "epoch": 1.1149907292901842, "grad_norm": 11.526525497436523, "learning_rate": 1.4843042557053459e-05, "loss": 0.2272, "step": 152000 }, { "epoch": 1.1151603671642145, "grad_norm": 5.912302017211914, "learning_rate": 1.4839649799572853e-05, "loss": 0.2305, "step": 152100 }, { "epoch": 1.1153300050382449, "grad_norm": 5.881083965301514, "learning_rate": 1.4836257042092247e-05, "loss": 0.2249, "step": 152200 }, { "epoch": 1.1154996429122752, "grad_norm": 6.068069934844971, "learning_rate": 1.483286428461164e-05, "loss": 0.228, "step": 152300 }, { "epoch": 1.1156692807863056, "grad_norm": 5.964060306549072, "learning_rate": 1.4829471527131034e-05, "loss": 0.2454, "step": 152400 }, { "epoch": 1.1158389186603357, "grad_norm": 8.167543411254883, "learning_rate": 1.482607876965043e-05, "loss": 0.2272, "step": 152500 }, { "epoch": 1.116008556534366, "grad_norm": 7.178010940551758, "learning_rate": 1.4822686012169822e-05, "loss": 0.2252, "step": 152600 }, { "epoch": 1.1161781944083964, "grad_norm": 7.273097991943359, "learning_rate": 1.4819293254689215e-05, "loss": 0.2527, "step": 152700 }, { "epoch": 1.1163478322824267, "grad_norm": 5.088498115539551, "learning_rate": 1.4815900497208611e-05, "loss": 0.2544, "step": 152800 }, { "epoch": 1.116517470156457, "grad_norm": 6.125808238983154, "learning_rate": 1.4812507739728003e-05, "loss": 0.2389, "step": 152900 }, { "epoch": 1.1166871080304874, "grad_norm": 5.293359756469727, "learning_rate": 1.4809114982247397e-05, "loss": 0.2335, "step": 153000 }, { "epoch": 1.1168567459045176, "grad_norm": 11.554478645324707, "learning_rate": 1.4805722224766792e-05, "loss": 0.2119, "step": 153100 }, { "epoch": 1.117026383778548, "grad_norm": 4.730544090270996, "learning_rate": 1.4802329467286184e-05, "loss": 0.2412, "step": 153200 }, { "epoch": 1.1171960216525783, "grad_norm": 3.641746759414673, "learning_rate": 1.479893670980558e-05, "loss": 0.2295, "step": 153300 }, { "epoch": 1.1173656595266086, "grad_norm": 7.59814453125, "learning_rate": 1.4795543952324974e-05, "loss": 0.2242, "step": 153400 }, { "epoch": 1.117535297400639, "grad_norm": 3.951085090637207, "learning_rate": 1.4792151194844366e-05, "loss": 0.2354, "step": 153500 }, { "epoch": 1.117704935274669, "grad_norm": 6.98118257522583, "learning_rate": 1.4788758437363761e-05, "loss": 0.2311, "step": 153600 }, { "epoch": 1.1178745731486994, "grad_norm": 6.985825061798096, "learning_rate": 1.4785365679883155e-05, "loss": 0.2343, "step": 153700 }, { "epoch": 1.1180442110227298, "grad_norm": 7.039731025695801, "learning_rate": 1.4781972922402547e-05, "loss": 0.2277, "step": 153800 }, { "epoch": 1.1182138488967601, "grad_norm": 10.21862506866455, "learning_rate": 1.4778580164921943e-05, "loss": 0.2387, "step": 153900 }, { "epoch": 1.1183834867707905, "grad_norm": 3.9023890495300293, "learning_rate": 1.4775187407441337e-05, "loss": 0.2227, "step": 154000 }, { "epoch": 1.1185531246448206, "grad_norm": 9.423526763916016, "learning_rate": 1.4771794649960729e-05, "loss": 0.2433, "step": 154100 }, { "epoch": 1.118722762518851, "grad_norm": 8.418224334716797, "learning_rate": 1.4768401892480124e-05, "loss": 0.2279, "step": 154200 }, { "epoch": 1.1188924003928813, "grad_norm": 4.063918590545654, "learning_rate": 1.4765009134999518e-05, "loss": 0.241, "step": 154300 }, { "epoch": 1.1190620382669116, "grad_norm": 4.075032711029053, "learning_rate": 1.4761616377518912e-05, "loss": 0.235, "step": 154400 }, { "epoch": 1.119231676140942, "grad_norm": 9.321063041687012, "learning_rate": 1.4758223620038306e-05, "loss": 0.2255, "step": 154500 }, { "epoch": 1.1194013140149723, "grad_norm": 5.548722267150879, "learning_rate": 1.47548308625577e-05, "loss": 0.2316, "step": 154600 }, { "epoch": 1.1195709518890025, "grad_norm": 4.869016170501709, "learning_rate": 1.4751438105077093e-05, "loss": 0.2334, "step": 154700 }, { "epoch": 1.1197405897630328, "grad_norm": 6.608687400817871, "learning_rate": 1.4748045347596487e-05, "loss": 0.2211, "step": 154800 }, { "epoch": 1.1199102276370632, "grad_norm": 1.1324548721313477, "learning_rate": 1.474465259011588e-05, "loss": 0.2334, "step": 154900 }, { "epoch": 1.1200798655110935, "grad_norm": 3.7304611206054688, "learning_rate": 1.4741259832635274e-05, "loss": 0.2218, "step": 155000 }, { "epoch": 1.1202495033851239, "grad_norm": 5.588261604309082, "learning_rate": 1.4737867075154668e-05, "loss": 0.2407, "step": 155100 }, { "epoch": 1.120419141259154, "grad_norm": 7.1734418869018555, "learning_rate": 1.4734474317674064e-05, "loss": 0.2307, "step": 155200 }, { "epoch": 1.1205887791331843, "grad_norm": 6.361325263977051, "learning_rate": 1.4731081560193456e-05, "loss": 0.2356, "step": 155300 }, { "epoch": 1.1207584170072147, "grad_norm": 5.172506809234619, "learning_rate": 1.472768880271285e-05, "loss": 0.2293, "step": 155400 }, { "epoch": 1.120928054881245, "grad_norm": 5.069222927093506, "learning_rate": 1.4724296045232245e-05, "loss": 0.2305, "step": 155500 }, { "epoch": 1.1210976927552754, "grad_norm": 6.1346755027771, "learning_rate": 1.4720903287751637e-05, "loss": 0.2285, "step": 155600 }, { "epoch": 1.1212673306293057, "grad_norm": 2.1004979610443115, "learning_rate": 1.4717510530271031e-05, "loss": 0.2246, "step": 155700 }, { "epoch": 1.1214369685033359, "grad_norm": 4.859232425689697, "learning_rate": 1.4714117772790427e-05, "loss": 0.2444, "step": 155800 }, { "epoch": 1.1216066063773662, "grad_norm": 8.745616912841797, "learning_rate": 1.4710725015309819e-05, "loss": 0.2252, "step": 155900 }, { "epoch": 1.1217762442513965, "grad_norm": 2.796124219894409, "learning_rate": 1.4707332257829212e-05, "loss": 0.2284, "step": 156000 }, { "epoch": 1.121945882125427, "grad_norm": 5.784322738647461, "learning_rate": 1.4703939500348608e-05, "loss": 0.2246, "step": 156100 }, { "epoch": 1.1221155199994572, "grad_norm": 3.8672056198120117, "learning_rate": 1.4700546742868e-05, "loss": 0.2308, "step": 156200 }, { "epoch": 1.1222851578734874, "grad_norm": 3.855318307876587, "learning_rate": 1.4697153985387396e-05, "loss": 0.2203, "step": 156300 }, { "epoch": 1.1224547957475177, "grad_norm": 8.94708251953125, "learning_rate": 1.469376122790679e-05, "loss": 0.2353, "step": 156400 }, { "epoch": 1.122624433621548, "grad_norm": 7.380045413970947, "learning_rate": 1.4690368470426181e-05, "loss": 0.2313, "step": 156500 }, { "epoch": 1.1227940714955784, "grad_norm": 9.852660179138184, "learning_rate": 1.4686975712945577e-05, "loss": 0.2266, "step": 156600 }, { "epoch": 1.1229637093696088, "grad_norm": 5.81839656829834, "learning_rate": 1.468358295546497e-05, "loss": 0.2194, "step": 156700 }, { "epoch": 1.123133347243639, "grad_norm": 5.318418979644775, "learning_rate": 1.4680190197984363e-05, "loss": 0.2271, "step": 156800 }, { "epoch": 1.1233029851176692, "grad_norm": 6.465486526489258, "learning_rate": 1.4676797440503758e-05, "loss": 0.2258, "step": 156900 }, { "epoch": 1.1234726229916996, "grad_norm": 4.283987998962402, "learning_rate": 1.4673404683023152e-05, "loss": 0.2421, "step": 157000 }, { "epoch": 1.12364226086573, "grad_norm": 3.673734426498413, "learning_rate": 1.4670011925542546e-05, "loss": 0.2333, "step": 157100 }, { "epoch": 1.1238118987397603, "grad_norm": 5.92421817779541, "learning_rate": 1.466661916806194e-05, "loss": 0.236, "step": 157200 }, { "epoch": 1.1239815366137906, "grad_norm": 7.361629486083984, "learning_rate": 1.4663226410581333e-05, "loss": 0.2457, "step": 157300 }, { "epoch": 1.1241511744878208, "grad_norm": 7.7452874183654785, "learning_rate": 1.4659833653100727e-05, "loss": 0.2376, "step": 157400 }, { "epoch": 1.124320812361851, "grad_norm": 10.300382614135742, "learning_rate": 1.4656440895620121e-05, "loss": 0.2229, "step": 157500 }, { "epoch": 1.1244904502358815, "grad_norm": 4.852621555328369, "learning_rate": 1.4653048138139513e-05, "loss": 0.2302, "step": 157600 }, { "epoch": 1.1246600881099118, "grad_norm": 5.1902241706848145, "learning_rate": 1.4649655380658909e-05, "loss": 0.2513, "step": 157700 }, { "epoch": 1.1248297259839422, "grad_norm": 6.102197647094727, "learning_rate": 1.4646262623178302e-05, "loss": 0.2382, "step": 157800 }, { "epoch": 1.1249993638579725, "grad_norm": 3.8082492351531982, "learning_rate": 1.4642869865697695e-05, "loss": 0.2257, "step": 157900 }, { "epoch": 1.1251690017320026, "grad_norm": 6.240933895111084, "learning_rate": 1.463947710821709e-05, "loss": 0.2102, "step": 158000 }, { "epoch": 1.125338639606033, "grad_norm": 5.081489562988281, "learning_rate": 1.4636084350736484e-05, "loss": 0.2323, "step": 158100 }, { "epoch": 1.1255082774800633, "grad_norm": 6.4700927734375, "learning_rate": 1.463269159325588e-05, "loss": 0.2407, "step": 158200 }, { "epoch": 1.1256779153540937, "grad_norm": 6.741516590118408, "learning_rate": 1.4629298835775271e-05, "loss": 0.2355, "step": 158300 }, { "epoch": 1.125847553228124, "grad_norm": 2.978228807449341, "learning_rate": 1.4625906078294665e-05, "loss": 0.2304, "step": 158400 }, { "epoch": 1.1260171911021541, "grad_norm": 3.63726544380188, "learning_rate": 1.462251332081406e-05, "loss": 0.2239, "step": 158500 }, { "epoch": 1.1261868289761845, "grad_norm": 7.158109664916992, "learning_rate": 1.4619120563333453e-05, "loss": 0.2441, "step": 158600 }, { "epoch": 1.1263564668502148, "grad_norm": 5.292952060699463, "learning_rate": 1.4615727805852847e-05, "loss": 0.236, "step": 158700 }, { "epoch": 1.1265261047242452, "grad_norm": 2.6970670223236084, "learning_rate": 1.4612335048372242e-05, "loss": 0.2134, "step": 158800 }, { "epoch": 1.1266957425982755, "grad_norm": 2.9427387714385986, "learning_rate": 1.4608942290891634e-05, "loss": 0.2283, "step": 158900 }, { "epoch": 1.1268653804723057, "grad_norm": 8.5725736618042, "learning_rate": 1.460554953341103e-05, "loss": 0.2208, "step": 159000 }, { "epoch": 1.127035018346336, "grad_norm": 5.533041954040527, "learning_rate": 1.4602156775930423e-05, "loss": 0.2353, "step": 159100 }, { "epoch": 1.1272046562203664, "grad_norm": 3.6451432704925537, "learning_rate": 1.4598764018449816e-05, "loss": 0.2534, "step": 159200 }, { "epoch": 1.1273742940943967, "grad_norm": 4.257209777832031, "learning_rate": 1.4595371260969211e-05, "loss": 0.2168, "step": 159300 }, { "epoch": 1.127543931968427, "grad_norm": 7.53319787979126, "learning_rate": 1.4591978503488605e-05, "loss": 0.2332, "step": 159400 }, { "epoch": 1.1277135698424572, "grad_norm": 4.665531635284424, "learning_rate": 1.4588585746007997e-05, "loss": 0.2359, "step": 159500 }, { "epoch": 1.1278832077164875, "grad_norm": 6.633471965789795, "learning_rate": 1.4585192988527392e-05, "loss": 0.2357, "step": 159600 }, { "epoch": 1.1280528455905179, "grad_norm": 9.785979270935059, "learning_rate": 1.4581800231046786e-05, "loss": 0.2208, "step": 159700 }, { "epoch": 1.1282224834645482, "grad_norm": 5.978979110717773, "learning_rate": 1.4578407473566178e-05, "loss": 0.2255, "step": 159800 }, { "epoch": 1.1283921213385786, "grad_norm": 4.1492486000061035, "learning_rate": 1.4575014716085574e-05, "loss": 0.2306, "step": 159900 }, { "epoch": 1.128561759212609, "grad_norm": 3.6145384311676025, "learning_rate": 1.4571621958604966e-05, "loss": 0.2374, "step": 160000 }, { "epoch": 1.128731397086639, "grad_norm": 7.373808860778809, "learning_rate": 1.4568229201124361e-05, "loss": 0.2309, "step": 160100 }, { "epoch": 1.1289010349606694, "grad_norm": 5.701391220092773, "learning_rate": 1.4564836443643755e-05, "loss": 0.2188, "step": 160200 }, { "epoch": 1.1290706728346998, "grad_norm": 7.448570251464844, "learning_rate": 1.4561443686163147e-05, "loss": 0.2263, "step": 160300 }, { "epoch": 1.12924031070873, "grad_norm": 4.086200714111328, "learning_rate": 1.4558050928682543e-05, "loss": 0.2154, "step": 160400 }, { "epoch": 1.1294099485827604, "grad_norm": 8.463911056518555, "learning_rate": 1.4554658171201937e-05, "loss": 0.2278, "step": 160500 }, { "epoch": 1.1295795864567908, "grad_norm": 6.603061676025391, "learning_rate": 1.4551265413721329e-05, "loss": 0.2271, "step": 160600 }, { "epoch": 1.129749224330821, "grad_norm": 4.200881004333496, "learning_rate": 1.4547872656240724e-05, "loss": 0.225, "step": 160700 }, { "epoch": 1.1299188622048513, "grad_norm": 4.619681358337402, "learning_rate": 1.4544479898760118e-05, "loss": 0.2299, "step": 160800 }, { "epoch": 1.1300885000788816, "grad_norm": 2.3037397861480713, "learning_rate": 1.454108714127951e-05, "loss": 0.2255, "step": 160900 }, { "epoch": 1.130258137952912, "grad_norm": 5.2309770584106445, "learning_rate": 1.4537694383798906e-05, "loss": 0.2144, "step": 161000 }, { "epoch": 1.1304277758269423, "grad_norm": 7.357405662536621, "learning_rate": 1.45343016263183e-05, "loss": 0.2381, "step": 161100 }, { "epoch": 1.1305974137009724, "grad_norm": 7.998576641082764, "learning_rate": 1.4530908868837695e-05, "loss": 0.2193, "step": 161200 }, { "epoch": 1.1307670515750028, "grad_norm": 7.933591842651367, "learning_rate": 1.4527516111357087e-05, "loss": 0.2417, "step": 161300 }, { "epoch": 1.1309366894490331, "grad_norm": 6.345344543457031, "learning_rate": 1.452412335387648e-05, "loss": 0.2315, "step": 161400 }, { "epoch": 1.1311063273230635, "grad_norm": 8.401808738708496, "learning_rate": 1.4520730596395876e-05, "loss": 0.2421, "step": 161500 }, { "epoch": 1.1312759651970938, "grad_norm": 3.220946788787842, "learning_rate": 1.4517337838915268e-05, "loss": 0.2286, "step": 161600 }, { "epoch": 1.131445603071124, "grad_norm": 6.96453332901001, "learning_rate": 1.4513945081434662e-05, "loss": 0.2201, "step": 161700 }, { "epoch": 1.1316152409451543, "grad_norm": 6.678293704986572, "learning_rate": 1.4510552323954058e-05, "loss": 0.2393, "step": 161800 }, { "epoch": 1.1317848788191847, "grad_norm": 5.806995391845703, "learning_rate": 1.450715956647345e-05, "loss": 0.2329, "step": 161900 }, { "epoch": 1.131954516693215, "grad_norm": 9.75451946258545, "learning_rate": 1.4503766808992845e-05, "loss": 0.2432, "step": 162000 }, { "epoch": 1.1321241545672454, "grad_norm": 10.60030746459961, "learning_rate": 1.4500374051512237e-05, "loss": 0.2408, "step": 162100 }, { "epoch": 1.1322937924412757, "grad_norm": 5.684751987457275, "learning_rate": 1.4496981294031631e-05, "loss": 0.2344, "step": 162200 }, { "epoch": 1.1324634303153058, "grad_norm": 4.055408954620361, "learning_rate": 1.4493588536551027e-05, "loss": 0.2232, "step": 162300 }, { "epoch": 1.1326330681893362, "grad_norm": 10.01876449584961, "learning_rate": 1.4490195779070419e-05, "loss": 0.2287, "step": 162400 }, { "epoch": 1.1328027060633665, "grad_norm": 7.000214099884033, "learning_rate": 1.4486803021589813e-05, "loss": 0.2193, "step": 162500 }, { "epoch": 1.1329723439373969, "grad_norm": 8.165783882141113, "learning_rate": 1.4483410264109208e-05, "loss": 0.2383, "step": 162600 }, { "epoch": 1.1331419818114272, "grad_norm": 6.5257248878479, "learning_rate": 1.44800175066286e-05, "loss": 0.2365, "step": 162700 }, { "epoch": 1.1333116196854576, "grad_norm": 4.892368316650391, "learning_rate": 1.4476624749147994e-05, "loss": 0.248, "step": 162800 }, { "epoch": 1.1334812575594877, "grad_norm": 6.547915458679199, "learning_rate": 1.447323199166739e-05, "loss": 0.2247, "step": 162900 }, { "epoch": 1.133650895433518, "grad_norm": 3.8118743896484375, "learning_rate": 1.4469839234186782e-05, "loss": 0.2327, "step": 163000 }, { "epoch": 1.1338205333075484, "grad_norm": 5.215146064758301, "learning_rate": 1.4466446476706177e-05, "loss": 0.2402, "step": 163100 }, { "epoch": 1.1339901711815787, "grad_norm": 3.5644094944000244, "learning_rate": 1.446305371922557e-05, "loss": 0.2292, "step": 163200 }, { "epoch": 1.134159809055609, "grad_norm": 7.187602519989014, "learning_rate": 1.4459660961744963e-05, "loss": 0.2288, "step": 163300 }, { "epoch": 1.1343294469296392, "grad_norm": 6.7500786781311035, "learning_rate": 1.4456268204264358e-05, "loss": 0.2454, "step": 163400 }, { "epoch": 1.1344990848036696, "grad_norm": 5.58074951171875, "learning_rate": 1.4452875446783752e-05, "loss": 0.2415, "step": 163500 }, { "epoch": 1.1346687226777, "grad_norm": 3.5008938312530518, "learning_rate": 1.4449482689303144e-05, "loss": 0.2276, "step": 163600 }, { "epoch": 1.1348383605517303, "grad_norm": 11.439308166503906, "learning_rate": 1.444608993182254e-05, "loss": 0.2384, "step": 163700 }, { "epoch": 1.1350079984257606, "grad_norm": 11.266529083251953, "learning_rate": 1.4442697174341934e-05, "loss": 0.2394, "step": 163800 }, { "epoch": 1.1351776362997907, "grad_norm": 5.603114604949951, "learning_rate": 1.4439304416861329e-05, "loss": 0.2458, "step": 163900 }, { "epoch": 1.135347274173821, "grad_norm": 4.037384510040283, "learning_rate": 1.4435911659380721e-05, "loss": 0.2137, "step": 164000 }, { "epoch": 1.1355169120478514, "grad_norm": 4.556715488433838, "learning_rate": 1.4432518901900115e-05, "loss": 0.2209, "step": 164100 }, { "epoch": 1.1356865499218818, "grad_norm": 5.521581649780273, "learning_rate": 1.442912614441951e-05, "loss": 0.2212, "step": 164200 }, { "epoch": 1.1358561877959121, "grad_norm": 6.365002632141113, "learning_rate": 1.4425733386938903e-05, "loss": 0.2263, "step": 164300 }, { "epoch": 1.1360258256699423, "grad_norm": 4.393952369689941, "learning_rate": 1.4422340629458296e-05, "loss": 0.2488, "step": 164400 }, { "epoch": 1.1361954635439726, "grad_norm": 5.557241916656494, "learning_rate": 1.441894787197769e-05, "loss": 0.2225, "step": 164500 }, { "epoch": 1.136365101418003, "grad_norm": 8.301450729370117, "learning_rate": 1.4415555114497084e-05, "loss": 0.2421, "step": 164600 }, { "epoch": 1.1365347392920333, "grad_norm": 13.580363273620605, "learning_rate": 1.4412162357016478e-05, "loss": 0.2247, "step": 164700 }, { "epoch": 1.1367043771660637, "grad_norm": 6.211540222167969, "learning_rate": 1.4408769599535872e-05, "loss": 0.2487, "step": 164800 }, { "epoch": 1.136874015040094, "grad_norm": 3.9741625785827637, "learning_rate": 1.4405376842055265e-05, "loss": 0.1956, "step": 164900 }, { "epoch": 1.1370436529141241, "grad_norm": 7.942294120788574, "learning_rate": 1.440198408457466e-05, "loss": 0.2329, "step": 165000 }, { "epoch": 1.1372132907881545, "grad_norm": 9.055078506469727, "learning_rate": 1.4398591327094053e-05, "loss": 0.2253, "step": 165100 }, { "epoch": 1.1373829286621848, "grad_norm": 6.044885158538818, "learning_rate": 1.4395198569613447e-05, "loss": 0.2278, "step": 165200 }, { "epoch": 1.1375525665362152, "grad_norm": 6.962028980255127, "learning_rate": 1.4391805812132842e-05, "loss": 0.2191, "step": 165300 }, { "epoch": 1.1377222044102455, "grad_norm": 4.3691253662109375, "learning_rate": 1.4388413054652234e-05, "loss": 0.2475, "step": 165400 }, { "epoch": 1.1378918422842759, "grad_norm": 6.183272838592529, "learning_rate": 1.4385020297171628e-05, "loss": 0.2241, "step": 165500 }, { "epoch": 1.138061480158306, "grad_norm": 4.886302947998047, "learning_rate": 1.4381627539691024e-05, "loss": 0.2332, "step": 165600 }, { "epoch": 1.1382311180323363, "grad_norm": 6.9967360496521, "learning_rate": 1.4378234782210416e-05, "loss": 0.238, "step": 165700 }, { "epoch": 1.1384007559063667, "grad_norm": 4.981508255004883, "learning_rate": 1.4374842024729811e-05, "loss": 0.2329, "step": 165800 }, { "epoch": 1.138570393780397, "grad_norm": 4.247314453125, "learning_rate": 1.4371449267249205e-05, "loss": 0.2302, "step": 165900 }, { "epoch": 1.1387400316544274, "grad_norm": 6.629524230957031, "learning_rate": 1.4368056509768597e-05, "loss": 0.2351, "step": 166000 }, { "epoch": 1.1389096695284575, "grad_norm": 5.1867594718933105, "learning_rate": 1.4364663752287993e-05, "loss": 0.2319, "step": 166100 }, { "epoch": 1.1390793074024879, "grad_norm": 6.137604236602783, "learning_rate": 1.4361270994807386e-05, "loss": 0.2326, "step": 166200 }, { "epoch": 1.1392489452765182, "grad_norm": 6.502552032470703, "learning_rate": 1.4357878237326778e-05, "loss": 0.2172, "step": 166300 }, { "epoch": 1.1394185831505486, "grad_norm": 1.4197795391082764, "learning_rate": 1.4354485479846174e-05, "loss": 0.2326, "step": 166400 }, { "epoch": 1.139588221024579, "grad_norm": 8.38801383972168, "learning_rate": 1.4351092722365568e-05, "loss": 0.2329, "step": 166500 }, { "epoch": 1.139757858898609, "grad_norm": 5.718846797943115, "learning_rate": 1.434769996488496e-05, "loss": 0.2474, "step": 166600 }, { "epoch": 1.1399274967726394, "grad_norm": 3.239778995513916, "learning_rate": 1.4344307207404355e-05, "loss": 0.2174, "step": 166700 }, { "epoch": 1.1400971346466697, "grad_norm": 4.766222953796387, "learning_rate": 1.4340914449923749e-05, "loss": 0.2372, "step": 166800 }, { "epoch": 1.1402667725207, "grad_norm": 5.420569896697998, "learning_rate": 1.4337521692443143e-05, "loss": 0.2428, "step": 166900 }, { "epoch": 1.1404364103947304, "grad_norm": 9.13536548614502, "learning_rate": 1.4334128934962537e-05, "loss": 0.2348, "step": 167000 }, { "epoch": 1.1406060482687606, "grad_norm": 6.464512348175049, "learning_rate": 1.433073617748193e-05, "loss": 0.2437, "step": 167100 }, { "epoch": 1.140775686142791, "grad_norm": 7.253960609436035, "learning_rate": 1.4327343420001324e-05, "loss": 0.2148, "step": 167200 }, { "epoch": 1.1409453240168212, "grad_norm": 6.730727672576904, "learning_rate": 1.4323950662520718e-05, "loss": 0.2341, "step": 167300 }, { "epoch": 1.1411149618908516, "grad_norm": 5.958529949188232, "learning_rate": 1.4320557905040112e-05, "loss": 0.2317, "step": 167400 }, { "epoch": 1.141284599764882, "grad_norm": 3.964656114578247, "learning_rate": 1.4317165147559506e-05, "loss": 0.2217, "step": 167500 }, { "epoch": 1.1414542376389123, "grad_norm": 2.4381179809570312, "learning_rate": 1.43137723900789e-05, "loss": 0.2219, "step": 167600 }, { "epoch": 1.1416238755129426, "grad_norm": 2.8895606994628906, "learning_rate": 1.4310379632598295e-05, "loss": 0.2421, "step": 167700 }, { "epoch": 1.1417935133869728, "grad_norm": 13.193029403686523, "learning_rate": 1.4306986875117687e-05, "loss": 0.2312, "step": 167800 }, { "epoch": 1.1419631512610031, "grad_norm": 6.234099864959717, "learning_rate": 1.4303594117637081e-05, "loss": 0.2288, "step": 167900 }, { "epoch": 1.1421327891350335, "grad_norm": 5.747440338134766, "learning_rate": 1.4300201360156476e-05, "loss": 0.2287, "step": 168000 }, { "epoch": 1.1423024270090638, "grad_norm": 2.277885913848877, "learning_rate": 1.4296808602675868e-05, "loss": 0.2164, "step": 168100 }, { "epoch": 1.1424720648830942, "grad_norm": 5.081198692321777, "learning_rate": 1.4293415845195262e-05, "loss": 0.2324, "step": 168200 }, { "epoch": 1.1426417027571243, "grad_norm": 6.1313629150390625, "learning_rate": 1.4290023087714658e-05, "loss": 0.2245, "step": 168300 }, { "epoch": 1.1428113406311546, "grad_norm": 5.965959072113037, "learning_rate": 1.428663033023405e-05, "loss": 0.2347, "step": 168400 }, { "epoch": 1.1428588392358832, "eval_accuracy": 0.8019801388963532, "eval_f1": 0.8644807441317004, "eval_loss": 0.5284786224365234, "eval_runtime": 386.2292, "eval_samples_per_second": 867.169, "eval_steps_per_second": 27.1, "step": 168428 }, { "epoch": 2.0001221392693016, "grad_norm": 7.916367053985596, "learning_rate": 1.4283237572753444e-05, "loss": 0.2167, "step": 168500 }, { "epoch": 2.000291777143332, "grad_norm": 4.163214206695557, "learning_rate": 1.427984481527284e-05, "loss": 0.2031, "step": 168600 }, { "epoch": 2.0004614150173623, "grad_norm": 4.9825439453125, "learning_rate": 1.4276452057792231e-05, "loss": 0.193, "step": 168700 }, { "epoch": 2.000631052891393, "grad_norm": 3.90004563331604, "learning_rate": 1.4273059300311627e-05, "loss": 0.1989, "step": 168800 }, { "epoch": 2.000800690765423, "grad_norm": 8.324400901794434, "learning_rate": 1.426966654283102e-05, "loss": 0.1853, "step": 168900 }, { "epoch": 2.0009703286394536, "grad_norm": 8.134799003601074, "learning_rate": 1.4266273785350413e-05, "loss": 0.1869, "step": 169000 }, { "epoch": 2.0011399665134837, "grad_norm": 4.882049083709717, "learning_rate": 1.4262881027869808e-05, "loss": 0.202, "step": 169100 }, { "epoch": 2.001309604387514, "grad_norm": 5.7459282875061035, "learning_rate": 1.4259488270389202e-05, "loss": 0.1808, "step": 169200 }, { "epoch": 2.0014792422615444, "grad_norm": 4.531463146209717, "learning_rate": 1.4256095512908594e-05, "loss": 0.2135, "step": 169300 }, { "epoch": 2.0016488801355745, "grad_norm": 9.194780349731445, "learning_rate": 1.425270275542799e-05, "loss": 0.2023, "step": 169400 }, { "epoch": 2.001818518009605, "grad_norm": 5.360759735107422, "learning_rate": 1.4249309997947383e-05, "loss": 0.1834, "step": 169500 }, { "epoch": 2.001988155883635, "grad_norm": 4.825377941131592, "learning_rate": 1.4245917240466777e-05, "loss": 0.1835, "step": 169600 }, { "epoch": 2.0021577937576653, "grad_norm": 9.065420150756836, "learning_rate": 1.4242524482986171e-05, "loss": 0.1948, "step": 169700 }, { "epoch": 2.002327431631696, "grad_norm": 12.182185173034668, "learning_rate": 1.4239131725505565e-05, "loss": 0.1858, "step": 169800 }, { "epoch": 2.002497069505726, "grad_norm": 7.947506427764893, "learning_rate": 1.4235738968024958e-05, "loss": 0.1916, "step": 169900 }, { "epoch": 2.0026667073797566, "grad_norm": 8.394538879394531, "learning_rate": 1.4232346210544352e-05, "loss": 0.1976, "step": 170000 }, { "epoch": 2.0028363452537867, "grad_norm": 3.7438418865203857, "learning_rate": 1.4228953453063744e-05, "loss": 0.1946, "step": 170100 }, { "epoch": 2.003005983127817, "grad_norm": 6.767313480377197, "learning_rate": 1.422556069558314e-05, "loss": 0.2003, "step": 170200 }, { "epoch": 2.0031756210018474, "grad_norm": 9.204118728637695, "learning_rate": 1.4222167938102534e-05, "loss": 0.195, "step": 170300 }, { "epoch": 2.0033452588758776, "grad_norm": 6.7684221267700195, "learning_rate": 1.4218775180621926e-05, "loss": 0.1911, "step": 170400 }, { "epoch": 2.003514896749908, "grad_norm": 9.638067245483398, "learning_rate": 1.4215382423141321e-05, "loss": 0.183, "step": 170500 }, { "epoch": 2.0036845346239383, "grad_norm": 6.892617702484131, "learning_rate": 1.4211989665660715e-05, "loss": 0.1921, "step": 170600 }, { "epoch": 2.0038541724979684, "grad_norm": 9.645166397094727, "learning_rate": 1.420859690818011e-05, "loss": 0.2031, "step": 170700 }, { "epoch": 2.004023810371999, "grad_norm": 1.1071406602859497, "learning_rate": 1.4205204150699503e-05, "loss": 0.1865, "step": 170800 }, { "epoch": 2.004193448246029, "grad_norm": 8.576038360595703, "learning_rate": 1.4201811393218896e-05, "loss": 0.1977, "step": 170900 }, { "epoch": 2.0043630861200596, "grad_norm": 3.5592777729034424, "learning_rate": 1.4198418635738292e-05, "loss": 0.1789, "step": 171000 }, { "epoch": 2.0045327239940898, "grad_norm": 4.691405773162842, "learning_rate": 1.4195025878257684e-05, "loss": 0.1864, "step": 171100 }, { "epoch": 2.0047023618681203, "grad_norm": 6.888957500457764, "learning_rate": 1.4191633120777078e-05, "loss": 0.1845, "step": 171200 }, { "epoch": 2.0048719997421505, "grad_norm": 14.68862533569336, "learning_rate": 1.4188240363296473e-05, "loss": 0.174, "step": 171300 }, { "epoch": 2.0050416376161806, "grad_norm": 3.7277324199676514, "learning_rate": 1.4184847605815865e-05, "loss": 0.1755, "step": 171400 }, { "epoch": 2.005211275490211, "grad_norm": 5.1596903800964355, "learning_rate": 1.4181454848335261e-05, "loss": 0.1886, "step": 171500 }, { "epoch": 2.0053809133642413, "grad_norm": 8.506818771362305, "learning_rate": 1.4178062090854655e-05, "loss": 0.1993, "step": 171600 }, { "epoch": 2.005550551238272, "grad_norm": 11.234387397766113, "learning_rate": 1.4174669333374047e-05, "loss": 0.1936, "step": 171700 }, { "epoch": 2.005720189112302, "grad_norm": 8.304226875305176, "learning_rate": 1.4171276575893442e-05, "loss": 0.1859, "step": 171800 }, { "epoch": 2.005889826986332, "grad_norm": 11.702751159667969, "learning_rate": 1.4167883818412836e-05, "loss": 0.1833, "step": 171900 }, { "epoch": 2.0060594648603627, "grad_norm": 9.802342414855957, "learning_rate": 1.4164491060932228e-05, "loss": 0.1928, "step": 172000 }, { "epoch": 2.006229102734393, "grad_norm": 3.4006121158599854, "learning_rate": 1.4161098303451624e-05, "loss": 0.203, "step": 172100 }, { "epoch": 2.0063987406084234, "grad_norm": 6.945215225219727, "learning_rate": 1.4157705545971017e-05, "loss": 0.1991, "step": 172200 }, { "epoch": 2.0065683784824535, "grad_norm": 5.915694236755371, "learning_rate": 1.415431278849041e-05, "loss": 0.1842, "step": 172300 }, { "epoch": 2.0067380163564836, "grad_norm": 10.1319580078125, "learning_rate": 1.4150920031009805e-05, "loss": 0.1911, "step": 172400 }, { "epoch": 2.006907654230514, "grad_norm": 6.596518039703369, "learning_rate": 1.4147527273529197e-05, "loss": 0.1993, "step": 172500 }, { "epoch": 2.0070772921045443, "grad_norm": 10.574604988098145, "learning_rate": 1.4144134516048593e-05, "loss": 0.1898, "step": 172600 }, { "epoch": 2.007246929978575, "grad_norm": 16.311120986938477, "learning_rate": 1.4140741758567986e-05, "loss": 0.1848, "step": 172700 }, { "epoch": 2.007416567852605, "grad_norm": 8.004034042358398, "learning_rate": 1.4137349001087379e-05, "loss": 0.1897, "step": 172800 }, { "epoch": 2.007586205726635, "grad_norm": 6.610294818878174, "learning_rate": 1.4133956243606774e-05, "loss": 0.1964, "step": 172900 }, { "epoch": 2.0077558436006657, "grad_norm": 4.768069267272949, "learning_rate": 1.4130563486126168e-05, "loss": 0.194, "step": 173000 }, { "epoch": 2.007925481474696, "grad_norm": 6.319394588470459, "learning_rate": 1.412717072864556e-05, "loss": 0.2036, "step": 173100 }, { "epoch": 2.0080951193487264, "grad_norm": 3.5767197608947754, "learning_rate": 1.4123777971164955e-05, "loss": 0.1883, "step": 173200 }, { "epoch": 2.0082647572227565, "grad_norm": 10.442258834838867, "learning_rate": 1.412038521368435e-05, "loss": 0.1903, "step": 173300 }, { "epoch": 2.0084343950967867, "grad_norm": 6.499282360076904, "learning_rate": 1.4116992456203741e-05, "loss": 0.1838, "step": 173400 }, { "epoch": 2.0086040329708172, "grad_norm": 3.500115156173706, "learning_rate": 1.4113599698723137e-05, "loss": 0.1875, "step": 173500 }, { "epoch": 2.0087736708448474, "grad_norm": 4.648612022399902, "learning_rate": 1.411020694124253e-05, "loss": 0.1943, "step": 173600 }, { "epoch": 2.008943308718878, "grad_norm": 4.9727783203125, "learning_rate": 1.4106814183761926e-05, "loss": 0.1872, "step": 173700 }, { "epoch": 2.009112946592908, "grad_norm": 11.233717918395996, "learning_rate": 1.4103421426281318e-05, "loss": 0.1941, "step": 173800 }, { "epoch": 2.0092825844669386, "grad_norm": 5.4135260581970215, "learning_rate": 1.4100028668800712e-05, "loss": 0.1888, "step": 173900 }, { "epoch": 2.0094522223409688, "grad_norm": 5.7923688888549805, "learning_rate": 1.4096635911320108e-05, "loss": 0.1707, "step": 174000 }, { "epoch": 2.009621860214999, "grad_norm": 9.065203666687012, "learning_rate": 1.40932431538395e-05, "loss": 0.1909, "step": 174100 }, { "epoch": 2.0097914980890295, "grad_norm": 10.589447975158691, "learning_rate": 1.4089850396358893e-05, "loss": 0.1839, "step": 174200 }, { "epoch": 2.0099611359630596, "grad_norm": 14.52007007598877, "learning_rate": 1.4086457638878289e-05, "loss": 0.1761, "step": 174300 }, { "epoch": 2.01013077383709, "grad_norm": 6.971049785614014, "learning_rate": 1.4083064881397681e-05, "loss": 0.2182, "step": 174400 }, { "epoch": 2.0103004117111203, "grad_norm": 4.257385730743408, "learning_rate": 1.4079672123917076e-05, "loss": 0.1779, "step": 174500 }, { "epoch": 2.0104700495851504, "grad_norm": 2.9859261512756348, "learning_rate": 1.4076279366436469e-05, "loss": 0.1918, "step": 174600 }, { "epoch": 2.010639687459181, "grad_norm": 5.438234806060791, "learning_rate": 1.4072886608955862e-05, "loss": 0.1997, "step": 174700 }, { "epoch": 2.010809325333211, "grad_norm": 9.906579971313477, "learning_rate": 1.4069493851475258e-05, "loss": 0.2044, "step": 174800 }, { "epoch": 2.0109789632072417, "grad_norm": 3.3486168384552, "learning_rate": 1.406610109399465e-05, "loss": 0.1946, "step": 174900 }, { "epoch": 2.011148601081272, "grad_norm": 7.863454818725586, "learning_rate": 1.4062708336514044e-05, "loss": 0.1717, "step": 175000 }, { "epoch": 2.011318238955302, "grad_norm": 10.974206924438477, "learning_rate": 1.405931557903344e-05, "loss": 0.1823, "step": 175100 }, { "epoch": 2.0114878768293325, "grad_norm": 9.711405754089355, "learning_rate": 1.4055922821552831e-05, "loss": 0.206, "step": 175200 }, { "epoch": 2.0116575147033626, "grad_norm": 7.226758003234863, "learning_rate": 1.4052530064072225e-05, "loss": 0.1809, "step": 175300 }, { "epoch": 2.011827152577393, "grad_norm": 3.171283006668091, "learning_rate": 1.404913730659162e-05, "loss": 0.1862, "step": 175400 }, { "epoch": 2.0119967904514233, "grad_norm": 11.826248168945312, "learning_rate": 1.4045744549111013e-05, "loss": 0.1985, "step": 175500 }, { "epoch": 2.0121664283254534, "grad_norm": 2.4670896530151367, "learning_rate": 1.4042351791630408e-05, "loss": 0.1894, "step": 175600 }, { "epoch": 2.012336066199484, "grad_norm": 4.756000995635986, "learning_rate": 1.4038959034149802e-05, "loss": 0.1925, "step": 175700 }, { "epoch": 2.012505704073514, "grad_norm": 4.847017765045166, "learning_rate": 1.4035566276669194e-05, "loss": 0.2064, "step": 175800 }, { "epoch": 2.0126753419475447, "grad_norm": 9.018815994262695, "learning_rate": 1.403217351918859e-05, "loss": 0.2018, "step": 175900 }, { "epoch": 2.012844979821575, "grad_norm": 4.527077674865723, "learning_rate": 1.4028780761707983e-05, "loss": 0.1724, "step": 176000 }, { "epoch": 2.0130146176956054, "grad_norm": 7.276963710784912, "learning_rate": 1.4025388004227376e-05, "loss": 0.1927, "step": 176100 }, { "epoch": 2.0131842555696355, "grad_norm": 6.4675164222717285, "learning_rate": 1.4021995246746771e-05, "loss": 0.1874, "step": 176200 }, { "epoch": 2.0133538934436657, "grad_norm": 8.948766708374023, "learning_rate": 1.4018602489266165e-05, "loss": 0.1839, "step": 176300 }, { "epoch": 2.0135235313176962, "grad_norm": 5.918384552001953, "learning_rate": 1.401520973178556e-05, "loss": 0.194, "step": 176400 }, { "epoch": 2.0136931691917264, "grad_norm": 3.682623863220215, "learning_rate": 1.4011816974304952e-05, "loss": 0.1928, "step": 176500 }, { "epoch": 2.013862807065757, "grad_norm": 4.670161724090576, "learning_rate": 1.4008424216824346e-05, "loss": 0.1846, "step": 176600 }, { "epoch": 2.014032444939787, "grad_norm": 8.821544647216797, "learning_rate": 1.4005031459343742e-05, "loss": 0.2011, "step": 176700 }, { "epoch": 2.014202082813817, "grad_norm": 5.230496883392334, "learning_rate": 1.4001638701863134e-05, "loss": 0.1838, "step": 176800 }, { "epoch": 2.0143717206878478, "grad_norm": 14.750777244567871, "learning_rate": 1.3998245944382528e-05, "loss": 0.1961, "step": 176900 }, { "epoch": 2.014541358561878, "grad_norm": 5.336324691772461, "learning_rate": 1.3994853186901921e-05, "loss": 0.2056, "step": 177000 }, { "epoch": 2.0147109964359085, "grad_norm": 11.197516441345215, "learning_rate": 1.3991460429421315e-05, "loss": 0.1983, "step": 177100 }, { "epoch": 2.0148806343099386, "grad_norm": 3.4094271659851074, "learning_rate": 1.3988067671940709e-05, "loss": 0.1871, "step": 177200 }, { "epoch": 2.0150502721839687, "grad_norm": 9.821170806884766, "learning_rate": 1.3984674914460103e-05, "loss": 0.1963, "step": 177300 }, { "epoch": 2.0152199100579993, "grad_norm": 12.778410911560059, "learning_rate": 1.3981282156979497e-05, "loss": 0.2008, "step": 177400 }, { "epoch": 2.0153895479320294, "grad_norm": 9.539752960205078, "learning_rate": 1.3977889399498892e-05, "loss": 0.2072, "step": 177500 }, { "epoch": 2.01555918580606, "grad_norm": 6.088284015655518, "learning_rate": 1.3974496642018284e-05, "loss": 0.1997, "step": 177600 }, { "epoch": 2.01572882368009, "grad_norm": 5.633551597595215, "learning_rate": 1.3971103884537678e-05, "loss": 0.198, "step": 177700 }, { "epoch": 2.0158984615541202, "grad_norm": 7.894077777862549, "learning_rate": 1.3967711127057073e-05, "loss": 0.195, "step": 177800 }, { "epoch": 2.016068099428151, "grad_norm": 6.14935827255249, "learning_rate": 1.3964318369576466e-05, "loss": 0.2018, "step": 177900 }, { "epoch": 2.016237737302181, "grad_norm": 7.406183242797852, "learning_rate": 1.396092561209586e-05, "loss": 0.1934, "step": 178000 }, { "epoch": 2.0164073751762115, "grad_norm": 10.12109088897705, "learning_rate": 1.3957532854615255e-05, "loss": 0.1854, "step": 178100 }, { "epoch": 2.0165770130502416, "grad_norm": 7.550734043121338, "learning_rate": 1.3954140097134647e-05, "loss": 0.1888, "step": 178200 }, { "epoch": 2.0167466509242717, "grad_norm": 7.152534484863281, "learning_rate": 1.3950747339654042e-05, "loss": 0.1884, "step": 178300 }, { "epoch": 2.0169162887983023, "grad_norm": 13.513175010681152, "learning_rate": 1.3947354582173436e-05, "loss": 0.1844, "step": 178400 }, { "epoch": 2.0170859266723324, "grad_norm": 19.67890167236328, "learning_rate": 1.3943961824692828e-05, "loss": 0.1863, "step": 178500 }, { "epoch": 2.017255564546363, "grad_norm": 5.531754016876221, "learning_rate": 1.3940569067212224e-05, "loss": 0.1997, "step": 178600 }, { "epoch": 2.017425202420393, "grad_norm": 4.287716388702393, "learning_rate": 1.3937176309731618e-05, "loss": 0.1931, "step": 178700 }, { "epoch": 2.0175948402944237, "grad_norm": 9.62065315246582, "learning_rate": 1.393378355225101e-05, "loss": 0.1746, "step": 178800 }, { "epoch": 2.017764478168454, "grad_norm": 6.5100274085998535, "learning_rate": 1.3930390794770405e-05, "loss": 0.2062, "step": 178900 }, { "epoch": 2.017934116042484, "grad_norm": 7.372591972351074, "learning_rate": 1.3926998037289799e-05, "loss": 0.1902, "step": 179000 }, { "epoch": 2.0181037539165145, "grad_norm": 7.127958297729492, "learning_rate": 1.3923605279809191e-05, "loss": 0.1766, "step": 179100 }, { "epoch": 2.0182733917905447, "grad_norm": 6.794496536254883, "learning_rate": 1.3920212522328587e-05, "loss": 0.2025, "step": 179200 }, { "epoch": 2.0184430296645752, "grad_norm": 8.349127769470215, "learning_rate": 1.391681976484798e-05, "loss": 0.192, "step": 179300 }, { "epoch": 2.0186126675386054, "grad_norm": 16.9593563079834, "learning_rate": 1.3913427007367374e-05, "loss": 0.208, "step": 179400 }, { "epoch": 2.0187823054126355, "grad_norm": 6.9921650886535645, "learning_rate": 1.3910034249886768e-05, "loss": 0.1965, "step": 179500 }, { "epoch": 2.018951943286666, "grad_norm": 10.093411445617676, "learning_rate": 1.3906641492406162e-05, "loss": 0.1862, "step": 179600 }, { "epoch": 2.019121581160696, "grad_norm": 7.904622554779053, "learning_rate": 1.3903248734925556e-05, "loss": 0.1907, "step": 179700 }, { "epoch": 2.0192912190347267, "grad_norm": 2.977637529373169, "learning_rate": 1.389985597744495e-05, "loss": 0.1906, "step": 179800 }, { "epoch": 2.019460856908757, "grad_norm": 8.087541580200195, "learning_rate": 1.3896463219964343e-05, "loss": 0.1866, "step": 179900 }, { "epoch": 2.019630494782787, "grad_norm": 6.765992641448975, "learning_rate": 1.3893070462483737e-05, "loss": 0.1958, "step": 180000 }, { "epoch": 2.0198001326568176, "grad_norm": 9.74838924407959, "learning_rate": 1.388967770500313e-05, "loss": 0.1931, "step": 180100 }, { "epoch": 2.0199697705308477, "grad_norm": 4.761765480041504, "learning_rate": 1.3886284947522526e-05, "loss": 0.1889, "step": 180200 }, { "epoch": 2.0201394084048783, "grad_norm": 7.805159091949463, "learning_rate": 1.3882892190041918e-05, "loss": 0.1817, "step": 180300 }, { "epoch": 2.0203090462789084, "grad_norm": 6.775490760803223, "learning_rate": 1.3879499432561312e-05, "loss": 0.1871, "step": 180400 }, { "epoch": 2.0204786841529385, "grad_norm": 8.722590446472168, "learning_rate": 1.3876106675080708e-05, "loss": 0.1954, "step": 180500 }, { "epoch": 2.020648322026969, "grad_norm": 7.982821941375732, "learning_rate": 1.38727139176001e-05, "loss": 0.2057, "step": 180600 }, { "epoch": 2.020817959900999, "grad_norm": 7.348810195922852, "learning_rate": 1.3869321160119494e-05, "loss": 0.1896, "step": 180700 }, { "epoch": 2.02098759777503, "grad_norm": 9.341002464294434, "learning_rate": 1.3865928402638889e-05, "loss": 0.1926, "step": 180800 }, { "epoch": 2.02115723564906, "grad_norm": 3.7878944873809814, "learning_rate": 1.3862535645158281e-05, "loss": 0.1997, "step": 180900 }, { "epoch": 2.02132687352309, "grad_norm": 8.112997055053711, "learning_rate": 1.3859142887677675e-05, "loss": 0.1935, "step": 181000 }, { "epoch": 2.0214965113971206, "grad_norm": 3.699326276779175, "learning_rate": 1.385575013019707e-05, "loss": 0.2063, "step": 181100 }, { "epoch": 2.0216661492711507, "grad_norm": 8.689441680908203, "learning_rate": 1.3852357372716462e-05, "loss": 0.2003, "step": 181200 }, { "epoch": 2.0218357871451813, "grad_norm": 2.563828468322754, "learning_rate": 1.3848964615235858e-05, "loss": 0.1858, "step": 181300 }, { "epoch": 2.0220054250192114, "grad_norm": 7.546522617340088, "learning_rate": 1.3845571857755252e-05, "loss": 0.1968, "step": 181400 }, { "epoch": 2.022175062893242, "grad_norm": 5.755893707275391, "learning_rate": 1.3842179100274644e-05, "loss": 0.1934, "step": 181500 }, { "epoch": 2.022344700767272, "grad_norm": 6.550370216369629, "learning_rate": 1.383878634279404e-05, "loss": 0.1971, "step": 181600 }, { "epoch": 2.0225143386413023, "grad_norm": 5.04830265045166, "learning_rate": 1.3835393585313433e-05, "loss": 0.199, "step": 181700 }, { "epoch": 2.022683976515333, "grad_norm": 2.636538505554199, "learning_rate": 1.3832000827832825e-05, "loss": 0.1895, "step": 181800 }, { "epoch": 2.022853614389363, "grad_norm": 6.463318347930908, "learning_rate": 1.382860807035222e-05, "loss": 0.198, "step": 181900 }, { "epoch": 2.0230232522633935, "grad_norm": 6.915595531463623, "learning_rate": 1.3825215312871615e-05, "loss": 0.1983, "step": 182000 }, { "epoch": 2.0231928901374236, "grad_norm": 8.169347763061523, "learning_rate": 1.3821822555391008e-05, "loss": 0.19, "step": 182100 }, { "epoch": 2.0233625280114538, "grad_norm": 6.569549083709717, "learning_rate": 1.3818429797910402e-05, "loss": 0.1874, "step": 182200 }, { "epoch": 2.0235321658854843, "grad_norm": 6.753419399261475, "learning_rate": 1.3815037040429796e-05, "loss": 0.1721, "step": 182300 }, { "epoch": 2.0237018037595145, "grad_norm": 8.113062858581543, "learning_rate": 1.381164428294919e-05, "loss": 0.1857, "step": 182400 }, { "epoch": 2.023871441633545, "grad_norm": 7.362775802612305, "learning_rate": 1.3808251525468584e-05, "loss": 0.1849, "step": 182500 }, { "epoch": 2.024041079507575, "grad_norm": 7.930744171142578, "learning_rate": 1.3804858767987976e-05, "loss": 0.1931, "step": 182600 }, { "epoch": 2.0242107173816053, "grad_norm": 11.364336013793945, "learning_rate": 1.3801466010507371e-05, "loss": 0.2012, "step": 182700 }, { "epoch": 2.024380355255636, "grad_norm": 8.673807144165039, "learning_rate": 1.3798073253026765e-05, "loss": 0.1813, "step": 182800 }, { "epoch": 2.024549993129666, "grad_norm": 12.512649536132812, "learning_rate": 1.3794680495546157e-05, "loss": 0.1994, "step": 182900 }, { "epoch": 2.0247196310036966, "grad_norm": 7.447030544281006, "learning_rate": 1.3791287738065552e-05, "loss": 0.202, "step": 183000 }, { "epoch": 2.0248892688777267, "grad_norm": 6.819461345672607, "learning_rate": 1.3787894980584946e-05, "loss": 0.1902, "step": 183100 }, { "epoch": 2.025058906751757, "grad_norm": 5.1691975593566895, "learning_rate": 1.3784502223104342e-05, "loss": 0.1889, "step": 183200 }, { "epoch": 2.0252285446257874, "grad_norm": 6.351062297821045, "learning_rate": 1.3781109465623734e-05, "loss": 0.198, "step": 183300 }, { "epoch": 2.0253981824998175, "grad_norm": 11.768117904663086, "learning_rate": 1.3777716708143128e-05, "loss": 0.18, "step": 183400 }, { "epoch": 2.025567820373848, "grad_norm": 11.776639938354492, "learning_rate": 1.3774323950662523e-05, "loss": 0.2106, "step": 183500 }, { "epoch": 2.025737458247878, "grad_norm": 11.376594543457031, "learning_rate": 1.3770931193181915e-05, "loss": 0.1952, "step": 183600 }, { "epoch": 2.025907096121909, "grad_norm": 5.3198161125183105, "learning_rate": 1.3767538435701309e-05, "loss": 0.181, "step": 183700 }, { "epoch": 2.026076733995939, "grad_norm": 29.431217193603516, "learning_rate": 1.3764145678220705e-05, "loss": 0.2092, "step": 183800 }, { "epoch": 2.026246371869969, "grad_norm": 8.39002799987793, "learning_rate": 1.3760752920740097e-05, "loss": 0.183, "step": 183900 }, { "epoch": 2.0264160097439996, "grad_norm": 10.67718505859375, "learning_rate": 1.3757360163259492e-05, "loss": 0.2105, "step": 184000 }, { "epoch": 2.0265856476180297, "grad_norm": 9.744851112365723, "learning_rate": 1.3753967405778886e-05, "loss": 0.1915, "step": 184100 }, { "epoch": 2.0267552854920603, "grad_norm": 5.5628557205200195, "learning_rate": 1.3750574648298278e-05, "loss": 0.1716, "step": 184200 }, { "epoch": 2.0269249233660904, "grad_norm": 2.8659534454345703, "learning_rate": 1.3747181890817674e-05, "loss": 0.199, "step": 184300 }, { "epoch": 2.0270945612401206, "grad_norm": 12.633681297302246, "learning_rate": 1.3743789133337067e-05, "loss": 0.192, "step": 184400 }, { "epoch": 2.027264199114151, "grad_norm": 9.892876625061035, "learning_rate": 1.374039637585646e-05, "loss": 0.1914, "step": 184500 }, { "epoch": 2.0274338369881812, "grad_norm": 10.392839431762695, "learning_rate": 1.3737003618375855e-05, "loss": 0.1806, "step": 184600 }, { "epoch": 2.027603474862212, "grad_norm": 8.992154121398926, "learning_rate": 1.3733610860895247e-05, "loss": 0.2089, "step": 184700 }, { "epoch": 2.027773112736242, "grad_norm": 10.647764205932617, "learning_rate": 1.373021810341464e-05, "loss": 0.1945, "step": 184800 }, { "epoch": 2.027942750610272, "grad_norm": 8.927639961242676, "learning_rate": 1.3726825345934036e-05, "loss": 0.1978, "step": 184900 }, { "epoch": 2.0281123884843026, "grad_norm": 2.4121272563934326, "learning_rate": 1.3723432588453428e-05, "loss": 0.1846, "step": 185000 }, { "epoch": 2.0282820263583328, "grad_norm": 5.591588497161865, "learning_rate": 1.3720039830972824e-05, "loss": 0.1943, "step": 185100 }, { "epoch": 2.0284516642323633, "grad_norm": 7.772965431213379, "learning_rate": 1.3716647073492218e-05, "loss": 0.2008, "step": 185200 }, { "epoch": 2.0286213021063935, "grad_norm": 8.23934268951416, "learning_rate": 1.371325431601161e-05, "loss": 0.1947, "step": 185300 }, { "epoch": 2.0287909399804236, "grad_norm": 8.898578643798828, "learning_rate": 1.3709861558531005e-05, "loss": 0.1926, "step": 185400 }, { "epoch": 2.028960577854454, "grad_norm": 3.4063193798065186, "learning_rate": 1.3706468801050399e-05, "loss": 0.2086, "step": 185500 }, { "epoch": 2.0291302157284843, "grad_norm": 8.958745956420898, "learning_rate": 1.3703076043569791e-05, "loss": 0.1823, "step": 185600 }, { "epoch": 2.029299853602515, "grad_norm": 6.691910743713379, "learning_rate": 1.3699683286089187e-05, "loss": 0.1972, "step": 185700 }, { "epoch": 2.029469491476545, "grad_norm": 15.731667518615723, "learning_rate": 1.369629052860858e-05, "loss": 0.1983, "step": 185800 }, { "epoch": 2.029639129350575, "grad_norm": 4.626884937286377, "learning_rate": 1.3692897771127973e-05, "loss": 0.1814, "step": 185900 }, { "epoch": 2.0298087672246057, "grad_norm": 7.6923627853393555, "learning_rate": 1.3689505013647368e-05, "loss": 0.1729, "step": 186000 }, { "epoch": 2.029978405098636, "grad_norm": 8.782466888427734, "learning_rate": 1.3686112256166762e-05, "loss": 0.1996, "step": 186100 }, { "epoch": 2.0301480429726664, "grad_norm": 11.543107986450195, "learning_rate": 1.3682719498686157e-05, "loss": 0.1874, "step": 186200 }, { "epoch": 2.0303176808466965, "grad_norm": 3.326141357421875, "learning_rate": 1.367932674120555e-05, "loss": 0.1892, "step": 186300 }, { "epoch": 2.030487318720727, "grad_norm": 6.099785327911377, "learning_rate": 1.3675933983724943e-05, "loss": 0.171, "step": 186400 }, { "epoch": 2.030656956594757, "grad_norm": 7.367239475250244, "learning_rate": 1.3672541226244339e-05, "loss": 0.182, "step": 186500 }, { "epoch": 2.0308265944687873, "grad_norm": 4.7671122550964355, "learning_rate": 1.366914846876373e-05, "loss": 0.196, "step": 186600 }, { "epoch": 2.030996232342818, "grad_norm": 9.152883529663086, "learning_rate": 1.3665755711283125e-05, "loss": 0.1942, "step": 186700 }, { "epoch": 2.031165870216848, "grad_norm": 3.245002508163452, "learning_rate": 1.366236295380252e-05, "loss": 0.1976, "step": 186800 }, { "epoch": 2.0313355080908786, "grad_norm": 9.208182334899902, "learning_rate": 1.3658970196321912e-05, "loss": 0.1769, "step": 186900 }, { "epoch": 2.0315051459649087, "grad_norm": 5.912868499755859, "learning_rate": 1.3655577438841308e-05, "loss": 0.2035, "step": 187000 }, { "epoch": 2.031674783838939, "grad_norm": 4.554296016693115, "learning_rate": 1.36521846813607e-05, "loss": 0.1869, "step": 187100 }, { "epoch": 2.0318444217129694, "grad_norm": 8.921333312988281, "learning_rate": 1.3648791923880094e-05, "loss": 0.1925, "step": 187200 }, { "epoch": 2.0320140595869995, "grad_norm": 7.173794269561768, "learning_rate": 1.3645399166399489e-05, "loss": 0.2025, "step": 187300 }, { "epoch": 2.03218369746103, "grad_norm": 3.308924436569214, "learning_rate": 1.3642006408918881e-05, "loss": 0.1972, "step": 187400 }, { "epoch": 2.0323533353350602, "grad_norm": 16.532819747924805, "learning_rate": 1.3638613651438275e-05, "loss": 0.2043, "step": 187500 }, { "epoch": 2.0325229732090904, "grad_norm": 11.72754192352295, "learning_rate": 1.363522089395767e-05, "loss": 0.2037, "step": 187600 }, { "epoch": 2.032692611083121, "grad_norm": 7.590754985809326, "learning_rate": 1.3631828136477063e-05, "loss": 0.1922, "step": 187700 }, { "epoch": 2.032862248957151, "grad_norm": 3.767186403274536, "learning_rate": 1.3628435378996456e-05, "loss": 0.1896, "step": 187800 }, { "epoch": 2.0330318868311816, "grad_norm": 11.246227264404297, "learning_rate": 1.3625042621515852e-05, "loss": 0.1811, "step": 187900 }, { "epoch": 2.0332015247052118, "grad_norm": 11.231407165527344, "learning_rate": 1.3621649864035244e-05, "loss": 0.1833, "step": 188000 }, { "epoch": 2.033371162579242, "grad_norm": 5.980721473693848, "learning_rate": 1.361825710655464e-05, "loss": 0.194, "step": 188100 }, { "epoch": 2.0335408004532725, "grad_norm": 6.406867980957031, "learning_rate": 1.3614864349074033e-05, "loss": 0.1915, "step": 188200 }, { "epoch": 2.0337104383273026, "grad_norm": 8.393624305725098, "learning_rate": 1.3611471591593425e-05, "loss": 0.2024, "step": 188300 }, { "epoch": 2.033880076201333, "grad_norm": 6.611584186553955, "learning_rate": 1.360807883411282e-05, "loss": 0.1943, "step": 188400 }, { "epoch": 2.0340497140753633, "grad_norm": 4.973874092102051, "learning_rate": 1.3604686076632215e-05, "loss": 0.193, "step": 188500 }, { "epoch": 2.0342193519493934, "grad_norm": 10.264262199401855, "learning_rate": 1.3601293319151607e-05, "loss": 0.1971, "step": 188600 }, { "epoch": 2.034388989823424, "grad_norm": 3.6636641025543213, "learning_rate": 1.3597900561671002e-05, "loss": 0.193, "step": 188700 }, { "epoch": 2.034558627697454, "grad_norm": 10.401516914367676, "learning_rate": 1.3594507804190396e-05, "loss": 0.2001, "step": 188800 }, { "epoch": 2.0347282655714847, "grad_norm": 1.9320553541183472, "learning_rate": 1.3591115046709792e-05, "loss": 0.1864, "step": 188900 }, { "epoch": 2.034897903445515, "grad_norm": 8.332460403442383, "learning_rate": 1.3587722289229184e-05, "loss": 0.1944, "step": 189000 }, { "epoch": 2.0350675413195454, "grad_norm": 9.770222663879395, "learning_rate": 1.3584329531748577e-05, "loss": 0.2028, "step": 189100 }, { "epoch": 2.0352371791935755, "grad_norm": 8.866495132446289, "learning_rate": 1.3580936774267973e-05, "loss": 0.2039, "step": 189200 }, { "epoch": 2.0354068170676056, "grad_norm": 10.754511833190918, "learning_rate": 1.3577544016787365e-05, "loss": 0.2073, "step": 189300 }, { "epoch": 2.035576454941636, "grad_norm": 1.3255038261413574, "learning_rate": 1.3574151259306759e-05, "loss": 0.1745, "step": 189400 }, { "epoch": 2.0357460928156663, "grad_norm": 13.717827796936035, "learning_rate": 1.3570758501826153e-05, "loss": 0.1949, "step": 189500 }, { "epoch": 2.035915730689697, "grad_norm": 6.770783424377441, "learning_rate": 1.3567365744345546e-05, "loss": 0.1865, "step": 189600 }, { "epoch": 2.036085368563727, "grad_norm": 12.540611267089844, "learning_rate": 1.356397298686494e-05, "loss": 0.2006, "step": 189700 }, { "epoch": 2.036255006437757, "grad_norm": 7.678775787353516, "learning_rate": 1.3560580229384334e-05, "loss": 0.1845, "step": 189800 }, { "epoch": 2.0364246443117877, "grad_norm": 8.447525978088379, "learning_rate": 1.3557187471903728e-05, "loss": 0.1922, "step": 189900 }, { "epoch": 2.036594282185818, "grad_norm": 6.60821008682251, "learning_rate": 1.3553794714423123e-05, "loss": 0.1935, "step": 190000 }, { "epoch": 2.0367639200598484, "grad_norm": 7.688208103179932, "learning_rate": 1.3550401956942515e-05, "loss": 0.1874, "step": 190100 }, { "epoch": 2.0369335579338785, "grad_norm": 4.5239129066467285, "learning_rate": 1.354700919946191e-05, "loss": 0.1854, "step": 190200 }, { "epoch": 2.0371031958079087, "grad_norm": 6.646376609802246, "learning_rate": 1.3543616441981305e-05, "loss": 0.2022, "step": 190300 }, { "epoch": 2.0372728336819392, "grad_norm": 6.580179691314697, "learning_rate": 1.3540223684500697e-05, "loss": 0.1941, "step": 190400 }, { "epoch": 2.0374424715559694, "grad_norm": 9.297142028808594, "learning_rate": 1.353683092702009e-05, "loss": 0.1796, "step": 190500 }, { "epoch": 2.03761210943, "grad_norm": 4.6530303955078125, "learning_rate": 1.3533438169539486e-05, "loss": 0.1891, "step": 190600 }, { "epoch": 2.03778174730403, "grad_norm": 4.5977582931518555, "learning_rate": 1.3530045412058878e-05, "loss": 0.1967, "step": 190700 }, { "epoch": 2.03795138517806, "grad_norm": 7.947295665740967, "learning_rate": 1.3526652654578274e-05, "loss": 0.201, "step": 190800 }, { "epoch": 2.0381210230520908, "grad_norm": 7.843332767486572, "learning_rate": 1.3523259897097667e-05, "loss": 0.1923, "step": 190900 }, { "epoch": 2.038290660926121, "grad_norm": 8.19943618774414, "learning_rate": 1.351986713961706e-05, "loss": 0.1796, "step": 191000 }, { "epoch": 2.0384602988001514, "grad_norm": 8.603262901306152, "learning_rate": 1.3516474382136455e-05, "loss": 0.2055, "step": 191100 }, { "epoch": 2.0386299366741816, "grad_norm": 5.062843322753906, "learning_rate": 1.3513081624655849e-05, "loss": 0.2048, "step": 191200 }, { "epoch": 2.038799574548212, "grad_norm": 5.872547626495361, "learning_rate": 1.3509688867175241e-05, "loss": 0.187, "step": 191300 }, { "epoch": 2.0389692124222423, "grad_norm": 6.552206516265869, "learning_rate": 1.3506296109694636e-05, "loss": 0.1947, "step": 191400 }, { "epoch": 2.0391388502962724, "grad_norm": 10.187865257263184, "learning_rate": 1.350290335221403e-05, "loss": 0.186, "step": 191500 }, { "epoch": 2.039308488170303, "grad_norm": 12.584325790405273, "learning_rate": 1.3499510594733422e-05, "loss": 0.1953, "step": 191600 }, { "epoch": 2.039478126044333, "grad_norm": 6.762688636779785, "learning_rate": 1.3496117837252818e-05, "loss": 0.189, "step": 191700 }, { "epoch": 2.0396477639183637, "grad_norm": 5.884616374969482, "learning_rate": 1.3492725079772212e-05, "loss": 0.1909, "step": 191800 }, { "epoch": 2.039817401792394, "grad_norm": 10.598699569702148, "learning_rate": 1.3489332322291605e-05, "loss": 0.1824, "step": 191900 }, { "epoch": 2.039987039666424, "grad_norm": 11.955621719360352, "learning_rate": 1.3485939564811e-05, "loss": 0.1911, "step": 192000 }, { "epoch": 2.0401566775404545, "grad_norm": 3.957789182662964, "learning_rate": 1.3482546807330393e-05, "loss": 0.1983, "step": 192100 }, { "epoch": 2.0403263154144846, "grad_norm": 4.768653869628906, "learning_rate": 1.3479154049849787e-05, "loss": 0.212, "step": 192200 }, { "epoch": 2.040495953288515, "grad_norm": 6.0809454917907715, "learning_rate": 1.347576129236918e-05, "loss": 0.1906, "step": 192300 }, { "epoch": 2.0406655911625453, "grad_norm": 3.2514383792877197, "learning_rate": 1.3472368534888574e-05, "loss": 0.1978, "step": 192400 }, { "epoch": 2.0408352290365754, "grad_norm": 6.7238264083862305, "learning_rate": 1.3468975777407968e-05, "loss": 0.1744, "step": 192500 }, { "epoch": 2.041004866910606, "grad_norm": 3.983375310897827, "learning_rate": 1.3465583019927362e-05, "loss": 0.1833, "step": 192600 }, { "epoch": 2.041174504784636, "grad_norm": 4.411906719207764, "learning_rate": 1.3462190262446757e-05, "loss": 0.1918, "step": 192700 }, { "epoch": 2.0413441426586667, "grad_norm": 15.519475936889648, "learning_rate": 1.345879750496615e-05, "loss": 0.1931, "step": 192800 }, { "epoch": 2.041513780532697, "grad_norm": 12.48142147064209, "learning_rate": 1.3455404747485543e-05, "loss": 0.1944, "step": 192900 }, { "epoch": 2.041683418406727, "grad_norm": 2.7643537521362305, "learning_rate": 1.3452011990004939e-05, "loss": 0.1867, "step": 193000 }, { "epoch": 2.0418530562807575, "grad_norm": 7.920635223388672, "learning_rate": 1.3448619232524331e-05, "loss": 0.1944, "step": 193100 }, { "epoch": 2.0420226941547877, "grad_norm": 16.575090408325195, "learning_rate": 1.3445226475043725e-05, "loss": 0.1834, "step": 193200 }, { "epoch": 2.0421923320288182, "grad_norm": 7.885792255401611, "learning_rate": 1.344183371756312e-05, "loss": 0.1978, "step": 193300 }, { "epoch": 2.0423619699028484, "grad_norm": 7.946920394897461, "learning_rate": 1.3438440960082512e-05, "loss": 0.1944, "step": 193400 }, { "epoch": 2.042531607776879, "grad_norm": 9.839588165283203, "learning_rate": 1.3435048202601906e-05, "loss": 0.1978, "step": 193500 }, { "epoch": 2.042701245650909, "grad_norm": 9.879121780395508, "learning_rate": 1.3431655445121302e-05, "loss": 0.1888, "step": 193600 }, { "epoch": 2.042870883524939, "grad_norm": 4.946008682250977, "learning_rate": 1.3428262687640694e-05, "loss": 0.194, "step": 193700 }, { "epoch": 2.0430405213989697, "grad_norm": 5.986511707305908, "learning_rate": 1.342486993016009e-05, "loss": 0.1972, "step": 193800 }, { "epoch": 2.043210159273, "grad_norm": 6.465215682983398, "learning_rate": 1.3421477172679483e-05, "loss": 0.2018, "step": 193900 }, { "epoch": 2.0433797971470304, "grad_norm": 4.568042755126953, "learning_rate": 1.3418084415198875e-05, "loss": 0.1818, "step": 194000 }, { "epoch": 2.0435494350210606, "grad_norm": 12.52780818939209, "learning_rate": 1.341469165771827e-05, "loss": 0.1924, "step": 194100 }, { "epoch": 2.0437190728950907, "grad_norm": 12.59299373626709, "learning_rate": 1.3411298900237664e-05, "loss": 0.1995, "step": 194200 }, { "epoch": 2.0438887107691213, "grad_norm": 6.480070114135742, "learning_rate": 1.3407906142757056e-05, "loss": 0.2014, "step": 194300 }, { "epoch": 2.0440583486431514, "grad_norm": 2.0554418563842773, "learning_rate": 1.3404513385276452e-05, "loss": 0.1929, "step": 194400 }, { "epoch": 2.044227986517182, "grad_norm": 10.666851043701172, "learning_rate": 1.3401120627795846e-05, "loss": 0.189, "step": 194500 }, { "epoch": 2.044397624391212, "grad_norm": 10.362600326538086, "learning_rate": 1.339772787031524e-05, "loss": 0.1922, "step": 194600 }, { "epoch": 2.044567262265242, "grad_norm": 4.196156978607178, "learning_rate": 1.3394335112834633e-05, "loss": 0.1894, "step": 194700 }, { "epoch": 2.044736900139273, "grad_norm": 1.8054924011230469, "learning_rate": 1.3390942355354027e-05, "loss": 0.2148, "step": 194800 }, { "epoch": 2.044906538013303, "grad_norm": 6.08188009262085, "learning_rate": 1.3387549597873421e-05, "loss": 0.1896, "step": 194900 }, { "epoch": 2.0450761758873335, "grad_norm": 6.369657516479492, "learning_rate": 1.3384156840392815e-05, "loss": 0.2068, "step": 195000 }, { "epoch": 2.0452458137613636, "grad_norm": 6.679615020751953, "learning_rate": 1.3380764082912207e-05, "loss": 0.2041, "step": 195100 }, { "epoch": 2.0454154516353937, "grad_norm": 9.488202095031738, "learning_rate": 1.3377371325431602e-05, "loss": 0.1906, "step": 195200 }, { "epoch": 2.0455850895094243, "grad_norm": 8.936306953430176, "learning_rate": 1.3373978567950996e-05, "loss": 0.1994, "step": 195300 }, { "epoch": 2.0457547273834544, "grad_norm": 9.00567626953125, "learning_rate": 1.3370585810470388e-05, "loss": 0.1907, "step": 195400 }, { "epoch": 2.045924365257485, "grad_norm": 16.74872398376465, "learning_rate": 1.3367193052989784e-05, "loss": 0.1774, "step": 195500 }, { "epoch": 2.046094003131515, "grad_norm": 6.625627040863037, "learning_rate": 1.3363800295509178e-05, "loss": 0.1941, "step": 195600 }, { "epoch": 2.0462636410055453, "grad_norm": 7.7932658195495605, "learning_rate": 1.3360407538028573e-05, "loss": 0.1754, "step": 195700 }, { "epoch": 2.046433278879576, "grad_norm": 4.849085807800293, "learning_rate": 1.3357014780547965e-05, "loss": 0.1978, "step": 195800 }, { "epoch": 2.046602916753606, "grad_norm": 5.853183746337891, "learning_rate": 1.3353622023067359e-05, "loss": 0.1968, "step": 195900 }, { "epoch": 2.0467725546276365, "grad_norm": 3.5187366008758545, "learning_rate": 1.3350229265586754e-05, "loss": 0.1813, "step": 196000 }, { "epoch": 2.0469421925016666, "grad_norm": 8.676417350769043, "learning_rate": 1.3346836508106146e-05, "loss": 0.1757, "step": 196100 }, { "epoch": 2.047111830375697, "grad_norm": 8.45637035369873, "learning_rate": 1.334344375062554e-05, "loss": 0.1818, "step": 196200 }, { "epoch": 2.0472814682497273, "grad_norm": 3.94053053855896, "learning_rate": 1.3340050993144936e-05, "loss": 0.2028, "step": 196300 }, { "epoch": 2.0474511061237575, "grad_norm": 4.95117712020874, "learning_rate": 1.3336658235664328e-05, "loss": 0.1966, "step": 196400 }, { "epoch": 2.047620743997788, "grad_norm": 4.228691101074219, "learning_rate": 1.3333265478183723e-05, "loss": 0.1922, "step": 196500 }, { "epoch": 2.047790381871818, "grad_norm": 9.729278564453125, "learning_rate": 1.3329872720703117e-05, "loss": 0.189, "step": 196600 }, { "epoch": 2.0479600197458487, "grad_norm": 3.2934677600860596, "learning_rate": 1.332647996322251e-05, "loss": 0.1809, "step": 196700 }, { "epoch": 2.048129657619879, "grad_norm": 8.613676071166992, "learning_rate": 1.3323087205741905e-05, "loss": 0.1973, "step": 196800 }, { "epoch": 2.048299295493909, "grad_norm": 8.219529151916504, "learning_rate": 1.3319694448261299e-05, "loss": 0.1908, "step": 196900 }, { "epoch": 2.0484689333679396, "grad_norm": 4.776409149169922, "learning_rate": 1.331630169078069e-05, "loss": 0.186, "step": 197000 }, { "epoch": 2.0486385712419697, "grad_norm": 9.676149368286133, "learning_rate": 1.3312908933300086e-05, "loss": 0.1964, "step": 197100 }, { "epoch": 2.0488082091160003, "grad_norm": 8.236662864685059, "learning_rate": 1.3309516175819478e-05, "loss": 0.1925, "step": 197200 }, { "epoch": 2.0489778469900304, "grad_norm": 7.135667324066162, "learning_rate": 1.3306123418338872e-05, "loss": 0.1818, "step": 197300 }, { "epoch": 2.0491474848640605, "grad_norm": 8.626110076904297, "learning_rate": 1.3302730660858268e-05, "loss": 0.1955, "step": 197400 }, { "epoch": 2.049317122738091, "grad_norm": 4.884781837463379, "learning_rate": 1.329933790337766e-05, "loss": 0.1984, "step": 197500 }, { "epoch": 2.049486760612121, "grad_norm": 11.682696342468262, "learning_rate": 1.3295945145897055e-05, "loss": 0.1892, "step": 197600 }, { "epoch": 2.0496563984861518, "grad_norm": 4.399531364440918, "learning_rate": 1.3292552388416449e-05, "loss": 0.1953, "step": 197700 }, { "epoch": 2.049826036360182, "grad_norm": 9.568199157714844, "learning_rate": 1.3289159630935841e-05, "loss": 0.2018, "step": 197800 }, { "epoch": 2.049995674234212, "grad_norm": 12.463345527648926, "learning_rate": 1.3285766873455237e-05, "loss": 0.1936, "step": 197900 }, { "epoch": 2.0501653121082426, "grad_norm": 6.043578624725342, "learning_rate": 1.328237411597463e-05, "loss": 0.189, "step": 198000 }, { "epoch": 2.0503349499822727, "grad_norm": 2.9240152835845947, "learning_rate": 1.3278981358494022e-05, "loss": 0.1953, "step": 198100 }, { "epoch": 2.0505045878563033, "grad_norm": 3.301474094390869, "learning_rate": 1.3275588601013418e-05, "loss": 0.1695, "step": 198200 }, { "epoch": 2.0506742257303334, "grad_norm": 3.667085647583008, "learning_rate": 1.3272195843532812e-05, "loss": 0.2012, "step": 198300 }, { "epoch": 2.0508438636043635, "grad_norm": 2.5832698345184326, "learning_rate": 1.3268803086052204e-05, "loss": 0.183, "step": 198400 }, { "epoch": 2.051013501478394, "grad_norm": 7.2634782791137695, "learning_rate": 1.32654103285716e-05, "loss": 0.1763, "step": 198500 }, { "epoch": 2.0511831393524242, "grad_norm": 11.096768379211426, "learning_rate": 1.3262017571090993e-05, "loss": 0.1903, "step": 198600 }, { "epoch": 2.051352777226455, "grad_norm": 2.75386118888855, "learning_rate": 1.3258624813610389e-05, "loss": 0.1947, "step": 198700 }, { "epoch": 2.051522415100485, "grad_norm": 9.866362571716309, "learning_rate": 1.325523205612978e-05, "loss": 0.1745, "step": 198800 }, { "epoch": 2.0516920529745155, "grad_norm": 11.537663459777832, "learning_rate": 1.3251839298649174e-05, "loss": 0.1962, "step": 198900 }, { "epoch": 2.0518616908485456, "grad_norm": 7.3345160484313965, "learning_rate": 1.324844654116857e-05, "loss": 0.1853, "step": 199000 }, { "epoch": 2.0520313287225758, "grad_norm": 19.695510864257812, "learning_rate": 1.3245053783687962e-05, "loss": 0.183, "step": 199100 }, { "epoch": 2.0522009665966063, "grad_norm": 6.187350273132324, "learning_rate": 1.3241661026207356e-05, "loss": 0.1964, "step": 199200 }, { "epoch": 2.0523706044706365, "grad_norm": 7.322741508483887, "learning_rate": 1.3238268268726751e-05, "loss": 0.1852, "step": 199300 }, { "epoch": 2.052540242344667, "grad_norm": 3.082277297973633, "learning_rate": 1.3234875511246143e-05, "loss": 0.1779, "step": 199400 }, { "epoch": 2.052709880218697, "grad_norm": 7.339090347290039, "learning_rate": 1.3231482753765539e-05, "loss": 0.1926, "step": 199500 }, { "epoch": 2.0528795180927273, "grad_norm": 5.019044876098633, "learning_rate": 1.3228089996284931e-05, "loss": 0.1944, "step": 199600 }, { "epoch": 2.053049155966758, "grad_norm": 9.583891868591309, "learning_rate": 1.3224697238804325e-05, "loss": 0.1938, "step": 199700 }, { "epoch": 2.053218793840788, "grad_norm": 7.017909049987793, "learning_rate": 1.322130448132372e-05, "loss": 0.1864, "step": 199800 }, { "epoch": 2.0533884317148186, "grad_norm": 7.259591102600098, "learning_rate": 1.3217911723843112e-05, "loss": 0.1733, "step": 199900 }, { "epoch": 2.0535580695888487, "grad_norm": 5.969758987426758, "learning_rate": 1.3214518966362506e-05, "loss": 0.1978, "step": 200000 }, { "epoch": 2.053727707462879, "grad_norm": 11.943553924560547, "learning_rate": 1.3211126208881902e-05, "loss": 0.1674, "step": 200100 }, { "epoch": 2.0538973453369094, "grad_norm": 8.667548179626465, "learning_rate": 1.3207733451401294e-05, "loss": 0.2028, "step": 200200 }, { "epoch": 2.0540669832109395, "grad_norm": 3.8372387886047363, "learning_rate": 1.3204340693920688e-05, "loss": 0.1853, "step": 200300 }, { "epoch": 2.05423662108497, "grad_norm": 7.1270952224731445, "learning_rate": 1.3200947936440083e-05, "loss": 0.1903, "step": 200400 }, { "epoch": 2.054406258959, "grad_norm": 11.11916446685791, "learning_rate": 1.3197555178959475e-05, "loss": 0.1944, "step": 200500 }, { "epoch": 2.0545758968330303, "grad_norm": 6.086613655090332, "learning_rate": 1.319416242147887e-05, "loss": 0.1899, "step": 200600 }, { "epoch": 2.054745534707061, "grad_norm": 3.8742802143096924, "learning_rate": 1.3190769663998264e-05, "loss": 0.1946, "step": 200700 }, { "epoch": 2.054915172581091, "grad_norm": 8.757858276367188, "learning_rate": 1.3187376906517657e-05, "loss": 0.2025, "step": 200800 }, { "epoch": 2.0550848104551216, "grad_norm": 10.361157417297363, "learning_rate": 1.3183984149037052e-05, "loss": 0.2052, "step": 200900 }, { "epoch": 2.0552544483291517, "grad_norm": 3.330179452896118, "learning_rate": 1.3180591391556446e-05, "loss": 0.1941, "step": 201000 }, { "epoch": 2.055424086203182, "grad_norm": 22.475528717041016, "learning_rate": 1.3177198634075838e-05, "loss": 0.193, "step": 201100 }, { "epoch": 2.0555937240772124, "grad_norm": 7.462094783782959, "learning_rate": 1.3173805876595233e-05, "loss": 0.186, "step": 201200 }, { "epoch": 2.0557633619512425, "grad_norm": 4.720335006713867, "learning_rate": 1.3170413119114627e-05, "loss": 0.1789, "step": 201300 }, { "epoch": 2.055932999825273, "grad_norm": 8.799880027770996, "learning_rate": 1.3167020361634023e-05, "loss": 0.2042, "step": 201400 }, { "epoch": 2.0561026376993032, "grad_norm": 6.465872764587402, "learning_rate": 1.3163627604153415e-05, "loss": 0.1888, "step": 201500 }, { "epoch": 2.056272275573334, "grad_norm": 11.183770179748535, "learning_rate": 1.3160234846672809e-05, "loss": 0.1827, "step": 201600 }, { "epoch": 2.056441913447364, "grad_norm": 5.316091060638428, "learning_rate": 1.3156842089192204e-05, "loss": 0.1848, "step": 201700 }, { "epoch": 2.056611551321394, "grad_norm": 11.857697486877441, "learning_rate": 1.3153449331711596e-05, "loss": 0.1924, "step": 201800 }, { "epoch": 2.0567811891954246, "grad_norm": 12.643261909484863, "learning_rate": 1.315005657423099e-05, "loss": 0.2142, "step": 201900 }, { "epoch": 2.0569508270694548, "grad_norm": 2.7740955352783203, "learning_rate": 1.3146663816750384e-05, "loss": 0.18, "step": 202000 }, { "epoch": 2.0571204649434853, "grad_norm": 4.607290267944336, "learning_rate": 1.3143271059269778e-05, "loss": 0.1747, "step": 202100 }, { "epoch": 2.0572901028175155, "grad_norm": 11.687403678894043, "learning_rate": 1.3139878301789171e-05, "loss": 0.2017, "step": 202200 }, { "epoch": 2.0574597406915456, "grad_norm": 1.6699429750442505, "learning_rate": 1.3136485544308565e-05, "loss": 0.2084, "step": 202300 }, { "epoch": 2.057629378565576, "grad_norm": 13.128548622131348, "learning_rate": 1.3133092786827959e-05, "loss": 0.2105, "step": 202400 }, { "epoch": 2.0577990164396063, "grad_norm": 3.7054314613342285, "learning_rate": 1.3129700029347354e-05, "loss": 0.1859, "step": 202500 }, { "epoch": 2.057968654313637, "grad_norm": 5.522262096405029, "learning_rate": 1.3126307271866747e-05, "loss": 0.2072, "step": 202600 }, { "epoch": 2.058138292187667, "grad_norm": 10.551847457885742, "learning_rate": 1.312291451438614e-05, "loss": 0.1872, "step": 202700 }, { "epoch": 2.058307930061697, "grad_norm": 4.949223518371582, "learning_rate": 1.3119521756905536e-05, "loss": 0.1987, "step": 202800 }, { "epoch": 2.0584775679357277, "grad_norm": 11.203483581542969, "learning_rate": 1.3116128999424928e-05, "loss": 0.2021, "step": 202900 }, { "epoch": 2.058647205809758, "grad_norm": 10.554142951965332, "learning_rate": 1.3112736241944322e-05, "loss": 0.1932, "step": 203000 }, { "epoch": 2.0588168436837884, "grad_norm": 7.468832969665527, "learning_rate": 1.3109343484463717e-05, "loss": 0.196, "step": 203100 }, { "epoch": 2.0589864815578185, "grad_norm": 5.934240818023682, "learning_rate": 1.310595072698311e-05, "loss": 0.2084, "step": 203200 }, { "epoch": 2.0591561194318486, "grad_norm": 8.120330810546875, "learning_rate": 1.3102557969502505e-05, "loss": 0.1961, "step": 203300 }, { "epoch": 2.059325757305879, "grad_norm": 5.771285533905029, "learning_rate": 1.3099165212021899e-05, "loss": 0.1935, "step": 203400 }, { "epoch": 2.0594953951799093, "grad_norm": 1.8816614151000977, "learning_rate": 1.309577245454129e-05, "loss": 0.2042, "step": 203500 }, { "epoch": 2.05966503305394, "grad_norm": 11.052457809448242, "learning_rate": 1.3092379697060686e-05, "loss": 0.2079, "step": 203600 }, { "epoch": 2.05983467092797, "grad_norm": 13.122213363647461, "learning_rate": 1.308898693958008e-05, "loss": 0.1896, "step": 203700 }, { "epoch": 2.0600043088020006, "grad_norm": 20.611143112182617, "learning_rate": 1.3085594182099472e-05, "loss": 0.1991, "step": 203800 }, { "epoch": 2.0601739466760307, "grad_norm": 4.678999900817871, "learning_rate": 1.3082201424618868e-05, "loss": 0.1879, "step": 203900 }, { "epoch": 2.060343584550061, "grad_norm": 5.663878440856934, "learning_rate": 1.3078808667138261e-05, "loss": 0.1833, "step": 204000 }, { "epoch": 2.0605132224240914, "grad_norm": 4.8230366706848145, "learning_rate": 1.3075415909657654e-05, "loss": 0.2007, "step": 204100 }, { "epoch": 2.0606828602981215, "grad_norm": 2.594977378845215, "learning_rate": 1.3072023152177049e-05, "loss": 0.1826, "step": 204200 }, { "epoch": 2.060852498172152, "grad_norm": 10.990616798400879, "learning_rate": 1.3068630394696443e-05, "loss": 0.1948, "step": 204300 }, { "epoch": 2.0610221360461822, "grad_norm": 11.730507850646973, "learning_rate": 1.3065237637215837e-05, "loss": 0.2009, "step": 204400 }, { "epoch": 2.0611917739202124, "grad_norm": 7.233855247497559, "learning_rate": 1.306184487973523e-05, "loss": 0.1884, "step": 204500 }, { "epoch": 2.061361411794243, "grad_norm": 11.244887351989746, "learning_rate": 1.3058452122254624e-05, "loss": 0.1913, "step": 204600 }, { "epoch": 2.061531049668273, "grad_norm": 11.509984016418457, "learning_rate": 1.3055059364774018e-05, "loss": 0.2033, "step": 204700 }, { "epoch": 2.0617006875423036, "grad_norm": 5.886805534362793, "learning_rate": 1.3051666607293412e-05, "loss": 0.2056, "step": 204800 }, { "epoch": 2.0618703254163337, "grad_norm": 1.6255496740341187, "learning_rate": 1.3048273849812806e-05, "loss": 0.2031, "step": 204900 }, { "epoch": 2.062039963290364, "grad_norm": 7.444450855255127, "learning_rate": 1.30448810923322e-05, "loss": 0.1922, "step": 205000 }, { "epoch": 2.0622096011643944, "grad_norm": 5.758943557739258, "learning_rate": 1.3041488334851593e-05, "loss": 0.1911, "step": 205100 }, { "epoch": 2.0623792390384246, "grad_norm": 3.89597225189209, "learning_rate": 1.3038095577370989e-05, "loss": 0.1946, "step": 205200 }, { "epoch": 2.062548876912455, "grad_norm": 5.777465343475342, "learning_rate": 1.303470281989038e-05, "loss": 0.1902, "step": 205300 }, { "epoch": 2.0627185147864853, "grad_norm": 1.7225890159606934, "learning_rate": 1.3031310062409775e-05, "loss": 0.1989, "step": 205400 }, { "epoch": 2.0628881526605154, "grad_norm": 15.89594554901123, "learning_rate": 1.302791730492917e-05, "loss": 0.1766, "step": 205500 }, { "epoch": 2.063057790534546, "grad_norm": 5.426549434661865, "learning_rate": 1.3024524547448562e-05, "loss": 0.1964, "step": 205600 }, { "epoch": 2.063227428408576, "grad_norm": 8.354893684387207, "learning_rate": 1.3021131789967956e-05, "loss": 0.1915, "step": 205700 }, { "epoch": 2.0633970662826067, "grad_norm": 8.07347297668457, "learning_rate": 1.3017739032487351e-05, "loss": 0.1831, "step": 205800 }, { "epoch": 2.063566704156637, "grad_norm": 4.234877109527588, "learning_rate": 1.3014346275006744e-05, "loss": 0.1866, "step": 205900 }, { "epoch": 2.0637363420306674, "grad_norm": 14.894997596740723, "learning_rate": 1.3010953517526137e-05, "loss": 0.1834, "step": 206000 }, { "epoch": 2.0639059799046975, "grad_norm": 0.9771266579627991, "learning_rate": 1.3007560760045533e-05, "loss": 0.2017, "step": 206100 }, { "epoch": 2.0640756177787276, "grad_norm": 19.548673629760742, "learning_rate": 1.3004168002564925e-05, "loss": 0.2023, "step": 206200 }, { "epoch": 2.064245255652758, "grad_norm": 2.252000570297241, "learning_rate": 1.300077524508432e-05, "loss": 0.1869, "step": 206300 }, { "epoch": 2.0644148935267883, "grad_norm": 7.359702110290527, "learning_rate": 1.2997382487603714e-05, "loss": 0.1902, "step": 206400 }, { "epoch": 2.064584531400819, "grad_norm": 5.802046298980713, "learning_rate": 1.2993989730123106e-05, "loss": 0.1878, "step": 206500 }, { "epoch": 2.064754169274849, "grad_norm": 9.156872749328613, "learning_rate": 1.2990596972642502e-05, "loss": 0.1872, "step": 206600 }, { "epoch": 2.064923807148879, "grad_norm": 7.065124034881592, "learning_rate": 1.2987204215161896e-05, "loss": 0.1902, "step": 206700 }, { "epoch": 2.0650934450229097, "grad_norm": 5.853184223175049, "learning_rate": 1.2983811457681288e-05, "loss": 0.2011, "step": 206800 }, { "epoch": 2.06526308289694, "grad_norm": 6.522987365722656, "learning_rate": 1.2980418700200683e-05, "loss": 0.2012, "step": 206900 }, { "epoch": 2.0654327207709704, "grad_norm": 4.316494464874268, "learning_rate": 1.2977025942720077e-05, "loss": 0.1996, "step": 207000 }, { "epoch": 2.0656023586450005, "grad_norm": 12.100545883178711, "learning_rate": 1.297363318523947e-05, "loss": 0.1959, "step": 207100 }, { "epoch": 2.0657719965190307, "grad_norm": 13.146650314331055, "learning_rate": 1.2970240427758865e-05, "loss": 0.1854, "step": 207200 }, { "epoch": 2.065941634393061, "grad_norm": 3.5107998847961426, "learning_rate": 1.2966847670278257e-05, "loss": 0.1947, "step": 207300 }, { "epoch": 2.0661112722670913, "grad_norm": 8.45229434967041, "learning_rate": 1.2963454912797652e-05, "loss": 0.1926, "step": 207400 }, { "epoch": 2.066280910141122, "grad_norm": 2.331770896911621, "learning_rate": 1.2960062155317046e-05, "loss": 0.2042, "step": 207500 }, { "epoch": 2.066450548015152, "grad_norm": 7.211934566497803, "learning_rate": 1.2956669397836438e-05, "loss": 0.1799, "step": 207600 }, { "epoch": 2.066620185889182, "grad_norm": 8.143392562866211, "learning_rate": 1.2953276640355834e-05, "loss": 0.1776, "step": 207700 }, { "epoch": 2.0667898237632127, "grad_norm": 3.8880624771118164, "learning_rate": 1.2949883882875227e-05, "loss": 0.1946, "step": 207800 }, { "epoch": 2.066959461637243, "grad_norm": 8.199936866760254, "learning_rate": 1.294649112539462e-05, "loss": 0.2202, "step": 207900 }, { "epoch": 2.0671290995112734, "grad_norm": 3.5876893997192383, "learning_rate": 1.2943098367914015e-05, "loss": 0.187, "step": 208000 }, { "epoch": 2.0672987373853036, "grad_norm": 9.107784271240234, "learning_rate": 1.2939705610433409e-05, "loss": 0.199, "step": 208100 }, { "epoch": 2.0674683752593337, "grad_norm": 3.451442003250122, "learning_rate": 1.2936312852952804e-05, "loss": 0.1972, "step": 208200 }, { "epoch": 2.0676380131333643, "grad_norm": 7.029556751251221, "learning_rate": 1.2932920095472196e-05, "loss": 0.1849, "step": 208300 }, { "epoch": 2.0678076510073944, "grad_norm": 7.283563137054443, "learning_rate": 1.292952733799159e-05, "loss": 0.191, "step": 208400 }, { "epoch": 2.067977288881425, "grad_norm": 17.303056716918945, "learning_rate": 1.2926134580510986e-05, "loss": 0.1991, "step": 208500 }, { "epoch": 2.068146926755455, "grad_norm": 6.135007381439209, "learning_rate": 1.2922741823030378e-05, "loss": 0.1954, "step": 208600 }, { "epoch": 2.0683165646294857, "grad_norm": 9.410603523254395, "learning_rate": 1.2919349065549772e-05, "loss": 0.1958, "step": 208700 }, { "epoch": 2.068486202503516, "grad_norm": 9.756718635559082, "learning_rate": 1.2915956308069167e-05, "loss": 0.1946, "step": 208800 }, { "epoch": 2.068655840377546, "grad_norm": 5.086231708526611, "learning_rate": 1.2912563550588559e-05, "loss": 0.2, "step": 208900 }, { "epoch": 2.0688254782515765, "grad_norm": 7.878388404846191, "learning_rate": 1.2909170793107953e-05, "loss": 0.1872, "step": 209000 }, { "epoch": 2.0689951161256066, "grad_norm": 3.9899446964263916, "learning_rate": 1.2905778035627348e-05, "loss": 0.1909, "step": 209100 }, { "epoch": 2.069164753999637, "grad_norm": 15.879865646362305, "learning_rate": 1.290238527814674e-05, "loss": 0.2019, "step": 209200 }, { "epoch": 2.0693343918736673, "grad_norm": 9.863194465637207, "learning_rate": 1.2898992520666136e-05, "loss": 0.2052, "step": 209300 }, { "epoch": 2.0695040297476974, "grad_norm": 6.2490997314453125, "learning_rate": 1.289559976318553e-05, "loss": 0.1884, "step": 209400 }, { "epoch": 2.069673667621728, "grad_norm": 7.1877851486206055, "learning_rate": 1.2892207005704922e-05, "loss": 0.2033, "step": 209500 }, { "epoch": 2.069843305495758, "grad_norm": 4.709298133850098, "learning_rate": 1.2888814248224317e-05, "loss": 0.1911, "step": 209600 }, { "epoch": 2.0700129433697887, "grad_norm": 9.592456817626953, "learning_rate": 1.288542149074371e-05, "loss": 0.1919, "step": 209700 }, { "epoch": 2.070182581243819, "grad_norm": 5.424144744873047, "learning_rate": 1.2882028733263103e-05, "loss": 0.1878, "step": 209800 }, { "epoch": 2.070352219117849, "grad_norm": 3.6838934421539307, "learning_rate": 1.2878635975782499e-05, "loss": 0.1948, "step": 209900 }, { "epoch": 2.0705218569918795, "grad_norm": 7.946353912353516, "learning_rate": 1.2875243218301891e-05, "loss": 0.2056, "step": 210000 }, { "epoch": 2.0706914948659096, "grad_norm": 3.535989284515381, "learning_rate": 1.2871850460821286e-05, "loss": 0.21, "step": 210100 }, { "epoch": 2.07086113273994, "grad_norm": 4.305080890655518, "learning_rate": 1.286845770334068e-05, "loss": 0.182, "step": 210200 }, { "epoch": 2.0710307706139703, "grad_norm": 5.943207263946533, "learning_rate": 1.2865064945860072e-05, "loss": 0.1889, "step": 210300 }, { "epoch": 2.0712004084880005, "grad_norm": 2.706958532333374, "learning_rate": 1.2861672188379468e-05, "loss": 0.1813, "step": 210400 }, { "epoch": 2.071370046362031, "grad_norm": 6.771958827972412, "learning_rate": 1.2858279430898862e-05, "loss": 0.2056, "step": 210500 }, { "epoch": 2.071539684236061, "grad_norm": 4.442162990570068, "learning_rate": 1.2854886673418254e-05, "loss": 0.1957, "step": 210600 }, { "epoch": 2.0717093221100917, "grad_norm": 5.242990493774414, "learning_rate": 1.2851493915937649e-05, "loss": 0.1994, "step": 210700 }, { "epoch": 2.071878959984122, "grad_norm": 6.074418544769287, "learning_rate": 1.2848101158457043e-05, "loss": 0.1956, "step": 210800 }, { "epoch": 2.072048597858152, "grad_norm": 2.311633586883545, "learning_rate": 1.2844708400976435e-05, "loss": 0.1682, "step": 210900 }, { "epoch": 2.0722182357321826, "grad_norm": 6.063413143157959, "learning_rate": 1.284131564349583e-05, "loss": 0.2021, "step": 211000 }, { "epoch": 2.0723878736062127, "grad_norm": 8.913256645202637, "learning_rate": 1.2837922886015224e-05, "loss": 0.1917, "step": 211100 }, { "epoch": 2.0725575114802433, "grad_norm": 7.716264724731445, "learning_rate": 1.283453012853462e-05, "loss": 0.1848, "step": 211200 }, { "epoch": 2.0727271493542734, "grad_norm": 11.818100929260254, "learning_rate": 1.2831137371054012e-05, "loss": 0.1929, "step": 211300 }, { "epoch": 2.072896787228304, "grad_norm": 10.222314834594727, "learning_rate": 1.2827744613573406e-05, "loss": 0.1909, "step": 211400 }, { "epoch": 2.073066425102334, "grad_norm": 10.459920883178711, "learning_rate": 1.2824351856092801e-05, "loss": 0.2054, "step": 211500 }, { "epoch": 2.073236062976364, "grad_norm": 7.7573418617248535, "learning_rate": 1.2820959098612193e-05, "loss": 0.1882, "step": 211600 }, { "epoch": 2.0734057008503948, "grad_norm": 8.41683292388916, "learning_rate": 1.2817566341131587e-05, "loss": 0.1872, "step": 211700 }, { "epoch": 2.073575338724425, "grad_norm": 2.3180766105651855, "learning_rate": 1.2814173583650983e-05, "loss": 0.1908, "step": 211800 }, { "epoch": 2.0737449765984555, "grad_norm": 5.816251277923584, "learning_rate": 1.2810780826170375e-05, "loss": 0.1956, "step": 211900 }, { "epoch": 2.0739146144724856, "grad_norm": 10.846295356750488, "learning_rate": 1.280738806868977e-05, "loss": 0.1895, "step": 212000 }, { "epoch": 2.0740842523465157, "grad_norm": 10.05728816986084, "learning_rate": 1.2803995311209162e-05, "loss": 0.1959, "step": 212100 }, { "epoch": 2.0742538902205463, "grad_norm": 9.393750190734863, "learning_rate": 1.2800602553728556e-05, "loss": 0.1812, "step": 212200 }, { "epoch": 2.0744235280945764, "grad_norm": 8.44619083404541, "learning_rate": 1.2797209796247952e-05, "loss": 0.1904, "step": 212300 }, { "epoch": 2.074593165968607, "grad_norm": 9.321249961853027, "learning_rate": 1.2793817038767344e-05, "loss": 0.2002, "step": 212400 }, { "epoch": 2.074762803842637, "grad_norm": 1.1132696866989136, "learning_rate": 1.2790424281286737e-05, "loss": 0.1971, "step": 212500 }, { "epoch": 2.0749324417166672, "grad_norm": 3.3004181385040283, "learning_rate": 1.2787031523806133e-05, "loss": 0.1965, "step": 212600 }, { "epoch": 2.075102079590698, "grad_norm": 8.170100212097168, "learning_rate": 1.2783638766325525e-05, "loss": 0.1864, "step": 212700 }, { "epoch": 2.075271717464728, "grad_norm": 5.74677038192749, "learning_rate": 1.2780246008844919e-05, "loss": 0.1984, "step": 212800 }, { "epoch": 2.0754413553387585, "grad_norm": 5.013864040374756, "learning_rate": 1.2776853251364314e-05, "loss": 0.1974, "step": 212900 }, { "epoch": 2.0756109932127886, "grad_norm": 7.446988105773926, "learning_rate": 1.2773460493883706e-05, "loss": 0.1751, "step": 213000 }, { "epoch": 2.0757806310868188, "grad_norm": 4.888339042663574, "learning_rate": 1.2770067736403102e-05, "loss": 0.1742, "step": 213100 }, { "epoch": 2.0759502689608493, "grad_norm": 4.34816312789917, "learning_rate": 1.2766674978922496e-05, "loss": 0.2015, "step": 213200 }, { "epoch": 2.0761199068348795, "grad_norm": 14.998065948486328, "learning_rate": 1.2763282221441888e-05, "loss": 0.203, "step": 213300 }, { "epoch": 2.07628954470891, "grad_norm": 9.211222648620605, "learning_rate": 1.2759889463961283e-05, "loss": 0.1811, "step": 213400 }, { "epoch": 2.07645918258294, "grad_norm": 4.669536113739014, "learning_rate": 1.2756496706480677e-05, "loss": 0.1915, "step": 213500 }, { "epoch": 2.0766288204569703, "grad_norm": 11.059216499328613, "learning_rate": 1.275310394900007e-05, "loss": 0.2019, "step": 213600 }, { "epoch": 2.076798458331001, "grad_norm": 16.573156356811523, "learning_rate": 1.2749711191519465e-05, "loss": 0.1866, "step": 213700 }, { "epoch": 2.076968096205031, "grad_norm": 1.1291776895523071, "learning_rate": 1.2746318434038858e-05, "loss": 0.1935, "step": 213800 }, { "epoch": 2.0771377340790615, "grad_norm": 3.9333109855651855, "learning_rate": 1.2742925676558254e-05, "loss": 0.1897, "step": 213900 }, { "epoch": 2.0773073719530917, "grad_norm": 13.124249458312988, "learning_rate": 1.2739532919077646e-05, "loss": 0.2003, "step": 214000 }, { "epoch": 2.0774770098271222, "grad_norm": 10.219476699829102, "learning_rate": 1.273614016159704e-05, "loss": 0.2013, "step": 214100 }, { "epoch": 2.0776466477011524, "grad_norm": 4.0450921058654785, "learning_rate": 1.2732747404116434e-05, "loss": 0.1846, "step": 214200 }, { "epoch": 2.0778162855751825, "grad_norm": 12.395927429199219, "learning_rate": 1.2729354646635827e-05, "loss": 0.1941, "step": 214300 }, { "epoch": 2.077985923449213, "grad_norm": 18.5852108001709, "learning_rate": 1.2725961889155221e-05, "loss": 0.1945, "step": 214400 }, { "epoch": 2.078155561323243, "grad_norm": 4.5804829597473145, "learning_rate": 1.2722569131674615e-05, "loss": 0.1872, "step": 214500 }, { "epoch": 2.0783251991972738, "grad_norm": 5.867319583892822, "learning_rate": 1.2719176374194009e-05, "loss": 0.1787, "step": 214600 }, { "epoch": 2.078494837071304, "grad_norm": 15.991630554199219, "learning_rate": 1.2715783616713403e-05, "loss": 0.1778, "step": 214700 }, { "epoch": 2.078664474945334, "grad_norm": 10.04265308380127, "learning_rate": 1.2712390859232796e-05, "loss": 0.1848, "step": 214800 }, { "epoch": 2.0788341128193646, "grad_norm": 8.702296257019043, "learning_rate": 1.270899810175219e-05, "loss": 0.2018, "step": 214900 }, { "epoch": 2.0790037506933947, "grad_norm": 5.8428635597229, "learning_rate": 1.2705605344271586e-05, "loss": 0.1871, "step": 215000 }, { "epoch": 2.0791733885674253, "grad_norm": 14.92750072479248, "learning_rate": 1.2702212586790978e-05, "loss": 0.1907, "step": 215100 }, { "epoch": 2.0793430264414554, "grad_norm": 7.074461460113525, "learning_rate": 1.2698819829310372e-05, "loss": 0.1909, "step": 215200 }, { "epoch": 2.0795126643154855, "grad_norm": 6.224263668060303, "learning_rate": 1.2695427071829767e-05, "loss": 0.202, "step": 215300 }, { "epoch": 2.079682302189516, "grad_norm": 9.481513023376465, "learning_rate": 1.269203431434916e-05, "loss": 0.1901, "step": 215400 }, { "epoch": 2.0798519400635462, "grad_norm": 7.6238627433776855, "learning_rate": 1.2688641556868553e-05, "loss": 0.1976, "step": 215500 }, { "epoch": 2.080021577937577, "grad_norm": 5.20045280456543, "learning_rate": 1.2685248799387948e-05, "loss": 0.178, "step": 215600 }, { "epoch": 2.080191215811607, "grad_norm": 11.684486389160156, "learning_rate": 1.268185604190734e-05, "loss": 0.1878, "step": 215700 }, { "epoch": 2.080360853685637, "grad_norm": 9.253644943237305, "learning_rate": 1.2678463284426736e-05, "loss": 0.1996, "step": 215800 }, { "epoch": 2.0805304915596676, "grad_norm": 7.7199225425720215, "learning_rate": 1.267507052694613e-05, "loss": 0.1983, "step": 215900 }, { "epoch": 2.0807001294336978, "grad_norm": 8.392951011657715, "learning_rate": 1.2671677769465522e-05, "loss": 0.1801, "step": 216000 }, { "epoch": 2.0808697673077283, "grad_norm": 10.250505447387695, "learning_rate": 1.2668285011984917e-05, "loss": 0.1843, "step": 216100 }, { "epoch": 2.0810394051817585, "grad_norm": 13.790194511413574, "learning_rate": 1.2664892254504311e-05, "loss": 0.1836, "step": 216200 }, { "epoch": 2.081209043055789, "grad_norm": 8.138324737548828, "learning_rate": 1.2661499497023703e-05, "loss": 0.1807, "step": 216300 }, { "epoch": 2.081378680929819, "grad_norm": 8.713359832763672, "learning_rate": 1.2658106739543099e-05, "loss": 0.2149, "step": 216400 }, { "epoch": 2.0815483188038493, "grad_norm": 9.374500274658203, "learning_rate": 1.2654713982062493e-05, "loss": 0.1914, "step": 216500 }, { "epoch": 2.08171795667788, "grad_norm": 3.881978988647461, "learning_rate": 1.2651321224581885e-05, "loss": 0.1826, "step": 216600 }, { "epoch": 2.08188759455191, "grad_norm": 12.80359935760498, "learning_rate": 1.264792846710128e-05, "loss": 0.1769, "step": 216700 }, { "epoch": 2.0820572324259405, "grad_norm": 3.326504945755005, "learning_rate": 1.2644535709620674e-05, "loss": 0.2018, "step": 216800 }, { "epoch": 2.0822268702999707, "grad_norm": 5.914657115936279, "learning_rate": 1.2641142952140068e-05, "loss": 0.1948, "step": 216900 }, { "epoch": 2.082396508174001, "grad_norm": 4.625913619995117, "learning_rate": 1.2637750194659462e-05, "loss": 0.1839, "step": 217000 }, { "epoch": 2.0825661460480314, "grad_norm": 7.694644927978516, "learning_rate": 1.2634357437178855e-05, "loss": 0.197, "step": 217100 }, { "epoch": 2.0827357839220615, "grad_norm": 10.181940078735352, "learning_rate": 1.263096467969825e-05, "loss": 0.2068, "step": 217200 }, { "epoch": 2.082905421796092, "grad_norm": 4.35053014755249, "learning_rate": 1.2627571922217643e-05, "loss": 0.1886, "step": 217300 }, { "epoch": 2.083075059670122, "grad_norm": 2.464212656021118, "learning_rate": 1.2624179164737037e-05, "loss": 0.1876, "step": 217400 }, { "epoch": 2.0832446975441523, "grad_norm": 11.739908218383789, "learning_rate": 1.262078640725643e-05, "loss": 0.1872, "step": 217500 }, { "epoch": 2.083414335418183, "grad_norm": 17.35321044921875, "learning_rate": 1.2617393649775824e-05, "loss": 0.2056, "step": 217600 }, { "epoch": 2.083583973292213, "grad_norm": 8.750527381896973, "learning_rate": 1.261400089229522e-05, "loss": 0.2022, "step": 217700 }, { "epoch": 2.0837536111662436, "grad_norm": 13.314850807189941, "learning_rate": 1.2610608134814612e-05, "loss": 0.1843, "step": 217800 }, { "epoch": 2.0839232490402737, "grad_norm": 6.248717784881592, "learning_rate": 1.2607215377334006e-05, "loss": 0.2062, "step": 217900 }, { "epoch": 2.084092886914304, "grad_norm": 4.818118095397949, "learning_rate": 1.2603822619853401e-05, "loss": 0.186, "step": 218000 }, { "epoch": 2.0842625247883344, "grad_norm": 7.437923431396484, "learning_rate": 1.2600429862372793e-05, "loss": 0.194, "step": 218100 }, { "epoch": 2.0844321626623645, "grad_norm": 6.660145282745361, "learning_rate": 1.2597037104892187e-05, "loss": 0.1726, "step": 218200 }, { "epoch": 2.084601800536395, "grad_norm": 4.922725677490234, "learning_rate": 1.2593644347411583e-05, "loss": 0.1976, "step": 218300 }, { "epoch": 2.0847714384104252, "grad_norm": 10.401130676269531, "learning_rate": 1.2590251589930975e-05, "loss": 0.1968, "step": 218400 }, { "epoch": 2.084941076284456, "grad_norm": 13.157453536987305, "learning_rate": 1.2586858832450369e-05, "loss": 0.1872, "step": 218500 }, { "epoch": 2.085110714158486, "grad_norm": 7.789931774139404, "learning_rate": 1.2583466074969764e-05, "loss": 0.204, "step": 218600 }, { "epoch": 2.085280352032516, "grad_norm": 3.73468279838562, "learning_rate": 1.2580073317489156e-05, "loss": 0.2088, "step": 218700 }, { "epoch": 2.0854499899065466, "grad_norm": 7.5194010734558105, "learning_rate": 1.2576680560008552e-05, "loss": 0.1837, "step": 218800 }, { "epoch": 2.0856196277805767, "grad_norm": 4.679093360900879, "learning_rate": 1.2573287802527945e-05, "loss": 0.1923, "step": 218900 }, { "epoch": 2.0857892656546073, "grad_norm": 6.97086763381958, "learning_rate": 1.2569895045047338e-05, "loss": 0.1895, "step": 219000 }, { "epoch": 2.0859589035286374, "grad_norm": 5.347015380859375, "learning_rate": 1.2566502287566733e-05, "loss": 0.1959, "step": 219100 }, { "epoch": 2.0861285414026676, "grad_norm": 8.408075332641602, "learning_rate": 1.2563109530086127e-05, "loss": 0.1638, "step": 219200 }, { "epoch": 2.086298179276698, "grad_norm": 14.707612991333008, "learning_rate": 1.2559716772605519e-05, "loss": 0.2099, "step": 219300 }, { "epoch": 2.0864678171507283, "grad_norm": 3.77673077583313, "learning_rate": 1.2556324015124914e-05, "loss": 0.1961, "step": 219400 }, { "epoch": 2.086637455024759, "grad_norm": 7.2944464683532715, "learning_rate": 1.2552931257644308e-05, "loss": 0.2008, "step": 219500 }, { "epoch": 2.086807092898789, "grad_norm": 9.081720352172852, "learning_rate": 1.2549538500163702e-05, "loss": 0.1967, "step": 219600 }, { "epoch": 2.086976730772819, "grad_norm": 18.67855453491211, "learning_rate": 1.2546145742683096e-05, "loss": 0.1895, "step": 219700 }, { "epoch": 2.0871463686468497, "grad_norm": 6.165328502655029, "learning_rate": 1.2542752985202488e-05, "loss": 0.1948, "step": 219800 }, { "epoch": 2.08731600652088, "grad_norm": 1.7220886945724487, "learning_rate": 1.2539360227721883e-05, "loss": 0.1677, "step": 219900 }, { "epoch": 2.0874856443949104, "grad_norm": 9.412108421325684, "learning_rate": 1.2535967470241277e-05, "loss": 0.2056, "step": 220000 }, { "epoch": 2.0876552822689405, "grad_norm": 13.694067001342773, "learning_rate": 1.253257471276067e-05, "loss": 0.1892, "step": 220100 }, { "epoch": 2.0878249201429706, "grad_norm": 13.564189910888672, "learning_rate": 1.2529181955280065e-05, "loss": 0.184, "step": 220200 }, { "epoch": 2.087994558017001, "grad_norm": 7.272484302520752, "learning_rate": 1.2525789197799459e-05, "loss": 0.2041, "step": 220300 }, { "epoch": 2.0881641958910313, "grad_norm": 7.406403541564941, "learning_rate": 1.252239644031885e-05, "loss": 0.1807, "step": 220400 }, { "epoch": 2.088333833765062, "grad_norm": 5.5541839599609375, "learning_rate": 1.2519003682838246e-05, "loss": 0.2163, "step": 220500 }, { "epoch": 2.088503471639092, "grad_norm": 3.7778735160827637, "learning_rate": 1.251561092535764e-05, "loss": 0.1868, "step": 220600 }, { "epoch": 2.088673109513122, "grad_norm": 10.905168533325195, "learning_rate": 1.2512218167877035e-05, "loss": 0.1963, "step": 220700 }, { "epoch": 2.0888427473871527, "grad_norm": 4.514163494110107, "learning_rate": 1.2508825410396428e-05, "loss": 0.1729, "step": 220800 }, { "epoch": 2.089012385261183, "grad_norm": 19.455162048339844, "learning_rate": 1.2505432652915821e-05, "loss": 0.1953, "step": 220900 }, { "epoch": 2.0891820231352134, "grad_norm": 2.6853537559509277, "learning_rate": 1.2502039895435217e-05, "loss": 0.1751, "step": 221000 }, { "epoch": 2.0893516610092435, "grad_norm": 6.408797740936279, "learning_rate": 1.2498647137954609e-05, "loss": 0.1866, "step": 221100 }, { "epoch": 2.089521298883274, "grad_norm": 10.144122123718262, "learning_rate": 1.2495254380474003e-05, "loss": 0.1753, "step": 221200 }, { "epoch": 2.089690936757304, "grad_norm": 5.796857833862305, "learning_rate": 1.2491861622993398e-05, "loss": 0.1989, "step": 221300 }, { "epoch": 2.0898605746313343, "grad_norm": 5.695582866668701, "learning_rate": 1.248846886551279e-05, "loss": 0.1812, "step": 221400 }, { "epoch": 2.090030212505365, "grad_norm": 6.271152019500732, "learning_rate": 1.2485076108032184e-05, "loss": 0.1913, "step": 221500 }, { "epoch": 2.090199850379395, "grad_norm": 9.600709915161133, "learning_rate": 1.248168335055158e-05, "loss": 0.1988, "step": 221600 }, { "epoch": 2.0903694882534256, "grad_norm": 9.842452049255371, "learning_rate": 1.2478290593070972e-05, "loss": 0.1883, "step": 221700 }, { "epoch": 2.0905391261274557, "grad_norm": 3.589883327484131, "learning_rate": 1.2474897835590367e-05, "loss": 0.2028, "step": 221800 }, { "epoch": 2.090708764001486, "grad_norm": 10.551797866821289, "learning_rate": 1.2471505078109761e-05, "loss": 0.1871, "step": 221900 }, { "epoch": 2.0908784018755164, "grad_norm": 13.489990234375, "learning_rate": 1.2468112320629153e-05, "loss": 0.1817, "step": 222000 }, { "epoch": 2.0910480397495466, "grad_norm": 5.565113067626953, "learning_rate": 1.2464719563148549e-05, "loss": 0.1744, "step": 222100 }, { "epoch": 2.091217677623577, "grad_norm": 12.278332710266113, "learning_rate": 1.246132680566794e-05, "loss": 0.1857, "step": 222200 }, { "epoch": 2.0913873154976073, "grad_norm": 7.235238552093506, "learning_rate": 1.2457934048187334e-05, "loss": 0.2034, "step": 222300 }, { "epoch": 2.0915569533716374, "grad_norm": 8.37668228149414, "learning_rate": 1.245454129070673e-05, "loss": 0.1888, "step": 222400 }, { "epoch": 2.091726591245668, "grad_norm": 5.659432888031006, "learning_rate": 1.2451148533226122e-05, "loss": 0.163, "step": 222500 }, { "epoch": 2.091896229119698, "grad_norm": 8.085991859436035, "learning_rate": 1.2447755775745518e-05, "loss": 0.2075, "step": 222600 }, { "epoch": 2.0920658669937287, "grad_norm": 13.94326400756836, "learning_rate": 1.2444363018264911e-05, "loss": 0.204, "step": 222700 }, { "epoch": 2.0922355048677588, "grad_norm": 6.385529041290283, "learning_rate": 1.2440970260784303e-05, "loss": 0.1744, "step": 222800 }, { "epoch": 2.092405142741789, "grad_norm": 10.112266540527344, "learning_rate": 1.2437577503303699e-05, "loss": 0.1859, "step": 222900 }, { "epoch": 2.0925747806158195, "grad_norm": 5.445046424865723, "learning_rate": 1.2434184745823093e-05, "loss": 0.1713, "step": 223000 }, { "epoch": 2.0927444184898496, "grad_norm": 7.234668254852295, "learning_rate": 1.2430791988342485e-05, "loss": 0.1974, "step": 223100 }, { "epoch": 2.09291405636388, "grad_norm": 8.221323013305664, "learning_rate": 1.242739923086188e-05, "loss": 0.1925, "step": 223200 }, { "epoch": 2.0930836942379103, "grad_norm": 6.7646284103393555, "learning_rate": 1.2424006473381274e-05, "loss": 0.1917, "step": 223300 }, { "epoch": 2.0932533321119404, "grad_norm": 7.981011390686035, "learning_rate": 1.2420613715900666e-05, "loss": 0.1967, "step": 223400 }, { "epoch": 2.093422969985971, "grad_norm": 9.38730239868164, "learning_rate": 1.2417220958420062e-05, "loss": 0.2001, "step": 223500 }, { "epoch": 2.093592607860001, "grad_norm": 7.78818416595459, "learning_rate": 1.2413828200939456e-05, "loss": 0.1896, "step": 223600 }, { "epoch": 2.0937622457340317, "grad_norm": 3.2018401622772217, "learning_rate": 1.2410435443458851e-05, "loss": 0.2106, "step": 223700 }, { "epoch": 2.093931883608062, "grad_norm": 3.9925522804260254, "learning_rate": 1.2407042685978243e-05, "loss": 0.1931, "step": 223800 }, { "epoch": 2.0941015214820924, "grad_norm": 7.034592151641846, "learning_rate": 1.2403649928497637e-05, "loss": 0.1883, "step": 223900 }, { "epoch": 2.0942711593561225, "grad_norm": 14.24042797088623, "learning_rate": 1.2400257171017032e-05, "loss": 0.2085, "step": 224000 }, { "epoch": 2.0944407972301526, "grad_norm": 7.332042694091797, "learning_rate": 1.2396864413536425e-05, "loss": 0.1903, "step": 224100 }, { "epoch": 2.094610435104183, "grad_norm": 15.157562255859375, "learning_rate": 1.2393471656055818e-05, "loss": 0.1892, "step": 224200 }, { "epoch": 2.0947800729782133, "grad_norm": 8.967671394348145, "learning_rate": 1.2390078898575214e-05, "loss": 0.1835, "step": 224300 }, { "epoch": 2.094949710852244, "grad_norm": 7.775735855102539, "learning_rate": 1.2386686141094606e-05, "loss": 0.1737, "step": 224400 }, { "epoch": 2.095119348726274, "grad_norm": 10.923720359802246, "learning_rate": 1.2383293383614001e-05, "loss": 0.1776, "step": 224500 }, { "epoch": 2.095288986600304, "grad_norm": 5.585629940032959, "learning_rate": 1.2379900626133393e-05, "loss": 0.1921, "step": 224600 }, { "epoch": 2.0954586244743347, "grad_norm": 3.7639684677124023, "learning_rate": 1.2376507868652787e-05, "loss": 0.2048, "step": 224700 }, { "epoch": 2.095628262348365, "grad_norm": 12.24715805053711, "learning_rate": 1.2373115111172183e-05, "loss": 0.1852, "step": 224800 }, { "epoch": 2.0957979002223954, "grad_norm": 4.2190775871276855, "learning_rate": 1.2369722353691575e-05, "loss": 0.1856, "step": 224900 }, { "epoch": 2.0959675380964256, "grad_norm": 0.908769428730011, "learning_rate": 1.2366329596210969e-05, "loss": 0.1846, "step": 225000 }, { "epoch": 2.0961371759704557, "grad_norm": 3.402691602706909, "learning_rate": 1.2362936838730364e-05, "loss": 0.1817, "step": 225100 }, { "epoch": 2.0963068138444862, "grad_norm": 9.161250114440918, "learning_rate": 1.2359544081249756e-05, "loss": 0.1902, "step": 225200 }, { "epoch": 2.0964764517185164, "grad_norm": 14.212464332580566, "learning_rate": 1.235615132376915e-05, "loss": 0.1828, "step": 225300 }, { "epoch": 2.096646089592547, "grad_norm": 5.785683631896973, "learning_rate": 1.2352758566288546e-05, "loss": 0.2066, "step": 225400 }, { "epoch": 2.096815727466577, "grad_norm": 8.131082534790039, "learning_rate": 1.2349365808807938e-05, "loss": 0.181, "step": 225500 }, { "epoch": 2.096985365340607, "grad_norm": 4.6114420890808105, "learning_rate": 1.2345973051327333e-05, "loss": 0.1873, "step": 225600 }, { "epoch": 2.0971550032146378, "grad_norm": 11.117013931274414, "learning_rate": 1.2342580293846727e-05, "loss": 0.2041, "step": 225700 }, { "epoch": 2.097324641088668, "grad_norm": 3.8855056762695312, "learning_rate": 1.2339187536366119e-05, "loss": 0.174, "step": 225800 }, { "epoch": 2.0974942789626985, "grad_norm": 4.059105396270752, "learning_rate": 1.2335794778885515e-05, "loss": 0.1769, "step": 225900 }, { "epoch": 2.0976639168367286, "grad_norm": 27.802343368530273, "learning_rate": 1.2332402021404908e-05, "loss": 0.2032, "step": 226000 }, { "epoch": 2.0978335547107587, "grad_norm": 11.1795654296875, "learning_rate": 1.23290092639243e-05, "loss": 0.189, "step": 226100 }, { "epoch": 2.0980031925847893, "grad_norm": 3.2582054138183594, "learning_rate": 1.2325616506443696e-05, "loss": 0.182, "step": 226200 }, { "epoch": 2.0981728304588194, "grad_norm": 21.560626983642578, "learning_rate": 1.232222374896309e-05, "loss": 0.1835, "step": 226300 }, { "epoch": 2.09834246833285, "grad_norm": 1.5302034616470337, "learning_rate": 1.2318830991482485e-05, "loss": 0.1863, "step": 226400 }, { "epoch": 2.09851210620688, "grad_norm": 3.807628870010376, "learning_rate": 1.2315438234001877e-05, "loss": 0.1795, "step": 226500 }, { "epoch": 2.0986817440809107, "grad_norm": 10.363687515258789, "learning_rate": 1.2312045476521271e-05, "loss": 0.2053, "step": 226600 }, { "epoch": 2.098851381954941, "grad_norm": 8.4721097946167, "learning_rate": 1.2308652719040665e-05, "loss": 0.2005, "step": 226700 }, { "epoch": 2.099021019828971, "grad_norm": 11.235547065734863, "learning_rate": 1.2305259961560059e-05, "loss": 0.1785, "step": 226800 }, { "epoch": 2.0991906577030015, "grad_norm": 5.156883239746094, "learning_rate": 1.2301867204079452e-05, "loss": 0.1746, "step": 226900 }, { "epoch": 2.0993602955770316, "grad_norm": 8.049545288085938, "learning_rate": 1.2298474446598846e-05, "loss": 0.2015, "step": 227000 }, { "epoch": 2.099529933451062, "grad_norm": 5.177642822265625, "learning_rate": 1.229508168911824e-05, "loss": 0.1822, "step": 227100 }, { "epoch": 2.0996995713250923, "grad_norm": 5.7784857749938965, "learning_rate": 1.2291688931637634e-05, "loss": 0.1879, "step": 227200 }, { "epoch": 2.0998692091991225, "grad_norm": 4.522280693054199, "learning_rate": 1.2288296174157028e-05, "loss": 0.1831, "step": 227300 }, { "epoch": 2.100038847073153, "grad_norm": 1.4746720790863037, "learning_rate": 1.2284903416676421e-05, "loss": 0.1911, "step": 227400 }, { "epoch": 2.100208484947183, "grad_norm": 5.492417335510254, "learning_rate": 1.2281510659195817e-05, "loss": 0.1933, "step": 227500 }, { "epoch": 2.1003781228212137, "grad_norm": 11.554398536682129, "learning_rate": 1.2278117901715209e-05, "loss": 0.1969, "step": 227600 }, { "epoch": 2.100547760695244, "grad_norm": 6.773679256439209, "learning_rate": 1.2274725144234603e-05, "loss": 0.2078, "step": 227700 }, { "epoch": 2.100717398569274, "grad_norm": 6.229804992675781, "learning_rate": 1.2271332386753998e-05, "loss": 0.1871, "step": 227800 }, { "epoch": 2.1008870364433045, "grad_norm": 4.01672887802124, "learning_rate": 1.226793962927339e-05, "loss": 0.1862, "step": 227900 }, { "epoch": 2.1010566743173347, "grad_norm": 11.721128463745117, "learning_rate": 1.2264546871792784e-05, "loss": 0.1947, "step": 228000 }, { "epoch": 2.1012263121913652, "grad_norm": 6.516722202301025, "learning_rate": 1.226115411431218e-05, "loss": 0.2054, "step": 228100 }, { "epoch": 2.1013959500653954, "grad_norm": 4.1069817543029785, "learning_rate": 1.2257761356831572e-05, "loss": 0.1769, "step": 228200 }, { "epoch": 2.1015655879394255, "grad_norm": 9.872206687927246, "learning_rate": 1.2254368599350967e-05, "loss": 0.1946, "step": 228300 }, { "epoch": 2.101735225813456, "grad_norm": 7.206471920013428, "learning_rate": 1.2250975841870361e-05, "loss": 0.1801, "step": 228400 }, { "epoch": 2.101904863687486, "grad_norm": 10.839624404907227, "learning_rate": 1.2247583084389753e-05, "loss": 0.1998, "step": 228500 }, { "epoch": 2.1020745015615168, "grad_norm": 11.516242980957031, "learning_rate": 1.2244190326909149e-05, "loss": 0.1988, "step": 228600 }, { "epoch": 2.102244139435547, "grad_norm": 4.798713684082031, "learning_rate": 1.2240797569428542e-05, "loss": 0.2053, "step": 228700 }, { "epoch": 2.1024137773095775, "grad_norm": 5.792641639709473, "learning_rate": 1.2237404811947935e-05, "loss": 0.1946, "step": 228800 }, { "epoch": 2.1025834151836076, "grad_norm": 8.372696876525879, "learning_rate": 1.223401205446733e-05, "loss": 0.1909, "step": 228900 }, { "epoch": 2.1027530530576377, "grad_norm": 8.683647155761719, "learning_rate": 1.2230619296986724e-05, "loss": 0.1802, "step": 229000 }, { "epoch": 2.1029226909316683, "grad_norm": 8.587242126464844, "learning_rate": 1.2227226539506116e-05, "loss": 0.1762, "step": 229100 }, { "epoch": 2.1030923288056984, "grad_norm": 10.026189804077148, "learning_rate": 1.2223833782025511e-05, "loss": 0.2019, "step": 229200 }, { "epoch": 2.103261966679729, "grad_norm": 8.513427734375, "learning_rate": 1.2220441024544905e-05, "loss": 0.1887, "step": 229300 }, { "epoch": 2.103431604553759, "grad_norm": 7.496162414550781, "learning_rate": 1.2217048267064299e-05, "loss": 0.1865, "step": 229400 }, { "epoch": 2.1036012424277892, "grad_norm": 6.00783109664917, "learning_rate": 1.2213655509583693e-05, "loss": 0.1973, "step": 229500 }, { "epoch": 2.10377088030182, "grad_norm": 11.47477912902832, "learning_rate": 1.2210262752103087e-05, "loss": 0.2028, "step": 229600 }, { "epoch": 2.10394051817585, "grad_norm": 1.9984732866287231, "learning_rate": 1.220686999462248e-05, "loss": 0.1881, "step": 229700 }, { "epoch": 2.1041101560498805, "grad_norm": 5.684045314788818, "learning_rate": 1.2203477237141874e-05, "loss": 0.1927, "step": 229800 }, { "epoch": 2.1042797939239106, "grad_norm": 7.106762886047363, "learning_rate": 1.2200084479661266e-05, "loss": 0.1835, "step": 229900 }, { "epoch": 2.1044494317979408, "grad_norm": 8.293580055236816, "learning_rate": 1.2196691722180662e-05, "loss": 0.1855, "step": 230000 }, { "epoch": 2.1046190696719713, "grad_norm": 11.053361892700195, "learning_rate": 1.2193298964700056e-05, "loss": 0.1994, "step": 230100 }, { "epoch": 2.1047887075460014, "grad_norm": 6.150400638580322, "learning_rate": 1.2189906207219451e-05, "loss": 0.1867, "step": 230200 }, { "epoch": 2.104958345420032, "grad_norm": 4.572146892547607, "learning_rate": 1.2186513449738843e-05, "loss": 0.1943, "step": 230300 }, { "epoch": 2.105127983294062, "grad_norm": 8.67697525024414, "learning_rate": 1.2183120692258237e-05, "loss": 0.195, "step": 230400 }, { "epoch": 2.1052976211680923, "grad_norm": 16.86577796936035, "learning_rate": 1.2179727934777633e-05, "loss": 0.195, "step": 230500 }, { "epoch": 2.105467259042123, "grad_norm": 4.134090423583984, "learning_rate": 1.2176335177297025e-05, "loss": 0.1861, "step": 230600 }, { "epoch": 2.105636896916153, "grad_norm": 6.354436874389648, "learning_rate": 1.2172942419816418e-05, "loss": 0.1817, "step": 230700 }, { "epoch": 2.1058065347901835, "grad_norm": 5.9506049156188965, "learning_rate": 1.2169549662335814e-05, "loss": 0.1914, "step": 230800 }, { "epoch": 2.1059761726642137, "grad_norm": 5.517002582550049, "learning_rate": 1.2166156904855206e-05, "loss": 0.1932, "step": 230900 }, { "epoch": 2.1061458105382442, "grad_norm": 7.1101765632629395, "learning_rate": 1.21627641473746e-05, "loss": 0.1843, "step": 231000 }, { "epoch": 2.1063154484122744, "grad_norm": 4.773265838623047, "learning_rate": 1.2159371389893995e-05, "loss": 0.2098, "step": 231100 }, { "epoch": 2.1064850862863045, "grad_norm": 12.346076011657715, "learning_rate": 1.2155978632413387e-05, "loss": 0.2054, "step": 231200 }, { "epoch": 2.106654724160335, "grad_norm": 6.791500091552734, "learning_rate": 1.2152585874932783e-05, "loss": 0.1886, "step": 231300 }, { "epoch": 2.106824362034365, "grad_norm": 2.456672191619873, "learning_rate": 1.2149193117452177e-05, "loss": 0.1876, "step": 231400 }, { "epoch": 2.1069939999083958, "grad_norm": 5.7534871101379395, "learning_rate": 1.2145800359971569e-05, "loss": 0.2069, "step": 231500 }, { "epoch": 2.107163637782426, "grad_norm": 6.715973854064941, "learning_rate": 1.2142407602490964e-05, "loss": 0.1742, "step": 231600 }, { "epoch": 2.107333275656456, "grad_norm": 10.007668495178223, "learning_rate": 1.2139014845010358e-05, "loss": 0.1808, "step": 231700 }, { "epoch": 2.1075029135304866, "grad_norm": 2.784212112426758, "learning_rate": 1.213562208752975e-05, "loss": 0.2071, "step": 231800 }, { "epoch": 2.1076725514045167, "grad_norm": 7.16627311706543, "learning_rate": 1.2132229330049146e-05, "loss": 0.1919, "step": 231900 }, { "epoch": 2.1078421892785473, "grad_norm": 1.8712211847305298, "learning_rate": 1.212883657256854e-05, "loss": 0.1824, "step": 232000 }, { "epoch": 2.1080118271525774, "grad_norm": 7.499845504760742, "learning_rate": 1.2125443815087933e-05, "loss": 0.1834, "step": 232100 }, { "epoch": 2.1081814650266075, "grad_norm": 4.943541049957275, "learning_rate": 1.2122051057607327e-05, "loss": 0.1776, "step": 232200 }, { "epoch": 2.108351102900638, "grad_norm": 11.756087303161621, "learning_rate": 1.2118658300126719e-05, "loss": 0.1816, "step": 232300 }, { "epoch": 2.1085207407746682, "grad_norm": 4.334687232971191, "learning_rate": 1.2115265542646115e-05, "loss": 0.1886, "step": 232400 }, { "epoch": 2.108690378648699, "grad_norm": 9.227747917175293, "learning_rate": 1.2111872785165508e-05, "loss": 0.1932, "step": 232500 }, { "epoch": 2.108860016522729, "grad_norm": 8.567967414855957, "learning_rate": 1.21084800276849e-05, "loss": 0.193, "step": 232600 }, { "epoch": 2.109029654396759, "grad_norm": 4.510258674621582, "learning_rate": 1.2105087270204296e-05, "loss": 0.1662, "step": 232700 }, { "epoch": 2.1091992922707896, "grad_norm": 3.6258339881896973, "learning_rate": 1.210169451272369e-05, "loss": 0.2012, "step": 232800 }, { "epoch": 2.1093689301448197, "grad_norm": 7.065094947814941, "learning_rate": 1.2098301755243082e-05, "loss": 0.192, "step": 232900 }, { "epoch": 2.1095385680188503, "grad_norm": 6.844850540161133, "learning_rate": 1.2094908997762477e-05, "loss": 0.1956, "step": 233000 }, { "epoch": 2.1097082058928804, "grad_norm": 7.2336225509643555, "learning_rate": 1.2091516240281871e-05, "loss": 0.1988, "step": 233100 }, { "epoch": 2.1098778437669106, "grad_norm": 17.718706130981445, "learning_rate": 1.2088123482801267e-05, "loss": 0.1769, "step": 233200 }, { "epoch": 2.110047481640941, "grad_norm": 12.268726348876953, "learning_rate": 1.2084730725320659e-05, "loss": 0.188, "step": 233300 }, { "epoch": 2.1102171195149713, "grad_norm": 6.233530521392822, "learning_rate": 1.2081337967840053e-05, "loss": 0.203, "step": 233400 }, { "epoch": 2.110386757389002, "grad_norm": 8.900837898254395, "learning_rate": 1.2077945210359448e-05, "loss": 0.1778, "step": 233500 }, { "epoch": 2.110556395263032, "grad_norm": 4.448532581329346, "learning_rate": 1.207455245287884e-05, "loss": 0.186, "step": 233600 }, { "epoch": 2.1107260331370625, "grad_norm": 5.80373477935791, "learning_rate": 1.2071159695398234e-05, "loss": 0.1972, "step": 233700 }, { "epoch": 2.1108956710110927, "grad_norm": 7.28659200668335, "learning_rate": 1.206776693791763e-05, "loss": 0.176, "step": 233800 }, { "epoch": 2.111065308885123, "grad_norm": 10.52536678314209, "learning_rate": 1.2064374180437022e-05, "loss": 0.1841, "step": 233900 }, { "epoch": 2.1112349467591534, "grad_norm": 7.720422267913818, "learning_rate": 1.2060981422956415e-05, "loss": 0.1949, "step": 234000 }, { "epoch": 2.1114045846331835, "grad_norm": 4.044959545135498, "learning_rate": 1.2057588665475811e-05, "loss": 0.1787, "step": 234100 }, { "epoch": 2.111574222507214, "grad_norm": 6.96904993057251, "learning_rate": 1.2054195907995203e-05, "loss": 0.1939, "step": 234200 }, { "epoch": 2.111743860381244, "grad_norm": 12.666570663452148, "learning_rate": 1.2050803150514598e-05, "loss": 0.178, "step": 234300 }, { "epoch": 2.1119134982552743, "grad_norm": 5.619147300720215, "learning_rate": 1.2047410393033992e-05, "loss": 0.1672, "step": 234400 }, { "epoch": 2.112083136129305, "grad_norm": 3.324998140335083, "learning_rate": 1.2044017635553384e-05, "loss": 0.1923, "step": 234500 }, { "epoch": 2.112252774003335, "grad_norm": 5.033210754394531, "learning_rate": 1.204062487807278e-05, "loss": 0.1986, "step": 234600 }, { "epoch": 2.1124224118773656, "grad_norm": 11.690618515014648, "learning_rate": 1.2037232120592172e-05, "loss": 0.184, "step": 234700 }, { "epoch": 2.1125920497513957, "grad_norm": 1.9585484266281128, "learning_rate": 1.2033839363111566e-05, "loss": 0.1787, "step": 234800 }, { "epoch": 2.112761687625426, "grad_norm": 13.266183853149414, "learning_rate": 1.2030446605630961e-05, "loss": 0.1938, "step": 234900 }, { "epoch": 2.1129313254994564, "grad_norm": 3.259474039077759, "learning_rate": 1.2027053848150353e-05, "loss": 0.1864, "step": 235000 }, { "epoch": 2.1131009633734865, "grad_norm": 2.7183687686920166, "learning_rate": 1.2023661090669749e-05, "loss": 0.1995, "step": 235100 }, { "epoch": 2.113270601247517, "grad_norm": 8.34109878540039, "learning_rate": 1.2020268333189143e-05, "loss": 0.1854, "step": 235200 }, { "epoch": 2.113440239121547, "grad_norm": 2.2283740043640137, "learning_rate": 1.2016875575708535e-05, "loss": 0.1712, "step": 235300 }, { "epoch": 2.1136098769955773, "grad_norm": 8.350970268249512, "learning_rate": 1.201348281822793e-05, "loss": 0.1849, "step": 235400 }, { "epoch": 2.113779514869608, "grad_norm": 8.520951271057129, "learning_rate": 1.2010090060747324e-05, "loss": 0.1741, "step": 235500 }, { "epoch": 2.113949152743638, "grad_norm": 6.124274253845215, "learning_rate": 1.2006697303266716e-05, "loss": 0.1706, "step": 235600 }, { "epoch": 2.1141187906176686, "grad_norm": 8.81298828125, "learning_rate": 1.2003304545786112e-05, "loss": 0.2022, "step": 235700 }, { "epoch": 2.1142884284916987, "grad_norm": 9.624695777893066, "learning_rate": 1.1999911788305505e-05, "loss": 0.1847, "step": 235800 }, { "epoch": 2.114458066365729, "grad_norm": 11.68756103515625, "learning_rate": 1.1996519030824897e-05, "loss": 0.1828, "step": 235900 }, { "epoch": 2.1146277042397594, "grad_norm": 11.155129432678223, "learning_rate": 1.1993126273344293e-05, "loss": 0.1897, "step": 236000 }, { "epoch": 2.1147973421137896, "grad_norm": 4.27059268951416, "learning_rate": 1.1989733515863687e-05, "loss": 0.1898, "step": 236100 }, { "epoch": 2.11496697998782, "grad_norm": 5.267179012298584, "learning_rate": 1.1986340758383082e-05, "loss": 0.1805, "step": 236200 }, { "epoch": 2.1151366178618503, "grad_norm": 4.11684513092041, "learning_rate": 1.1982948000902474e-05, "loss": 0.1903, "step": 236300 }, { "epoch": 2.115306255735881, "grad_norm": 6.799238681793213, "learning_rate": 1.1979555243421868e-05, "loss": 0.1889, "step": 236400 }, { "epoch": 2.115475893609911, "grad_norm": 3.201465368270874, "learning_rate": 1.1976162485941264e-05, "loss": 0.1851, "step": 236500 }, { "epoch": 2.115645531483941, "grad_norm": 5.185305595397949, "learning_rate": 1.1972769728460656e-05, "loss": 0.1942, "step": 236600 }, { "epoch": 2.1158151693579716, "grad_norm": 10.803401947021484, "learning_rate": 1.196937697098005e-05, "loss": 0.1826, "step": 236700 }, { "epoch": 2.1159848072320018, "grad_norm": 5.601503849029541, "learning_rate": 1.1965984213499443e-05, "loss": 0.1883, "step": 236800 }, { "epoch": 2.1161544451060323, "grad_norm": 6.7415266036987305, "learning_rate": 1.1962591456018837e-05, "loss": 0.19, "step": 236900 }, { "epoch": 2.1163240829800625, "grad_norm": 10.093852043151855, "learning_rate": 1.1959198698538233e-05, "loss": 0.1858, "step": 237000 }, { "epoch": 2.1164937208540926, "grad_norm": 10.029623985290527, "learning_rate": 1.1955805941057625e-05, "loss": 0.2023, "step": 237100 }, { "epoch": 2.116663358728123, "grad_norm": 5.3388776779174805, "learning_rate": 1.1952413183577019e-05, "loss": 0.2054, "step": 237200 }, { "epoch": 2.1168329966021533, "grad_norm": 6.3686747550964355, "learning_rate": 1.1949020426096414e-05, "loss": 0.1952, "step": 237300 }, { "epoch": 2.117002634476184, "grad_norm": 3.421414375305176, "learning_rate": 1.1945627668615806e-05, "loss": 0.1633, "step": 237400 }, { "epoch": 2.117172272350214, "grad_norm": 4.883300304412842, "learning_rate": 1.19422349111352e-05, "loss": 0.1679, "step": 237500 }, { "epoch": 2.117341910224244, "grad_norm": 10.557600975036621, "learning_rate": 1.1938842153654595e-05, "loss": 0.1902, "step": 237600 }, { "epoch": 2.1175115480982747, "grad_norm": 8.51529312133789, "learning_rate": 1.1935449396173987e-05, "loss": 0.1886, "step": 237700 }, { "epoch": 2.117681185972305, "grad_norm": 9.17726993560791, "learning_rate": 1.1932056638693381e-05, "loss": 0.1751, "step": 237800 }, { "epoch": 2.1178508238463354, "grad_norm": 23.27602195739746, "learning_rate": 1.1928663881212777e-05, "loss": 0.1832, "step": 237900 }, { "epoch": 2.1180204617203655, "grad_norm": 2.8281631469726562, "learning_rate": 1.1925271123732169e-05, "loss": 0.1951, "step": 238000 }, { "epoch": 2.1181900995943956, "grad_norm": 2.3195724487304688, "learning_rate": 1.1921878366251564e-05, "loss": 0.1865, "step": 238100 }, { "epoch": 2.118359737468426, "grad_norm": 9.13061237335205, "learning_rate": 1.1918485608770958e-05, "loss": 0.1937, "step": 238200 }, { "epoch": 2.1185293753424563, "grad_norm": 5.133619785308838, "learning_rate": 1.191509285129035e-05, "loss": 0.1859, "step": 238300 }, { "epoch": 2.118699013216487, "grad_norm": 7.453373908996582, "learning_rate": 1.1911700093809746e-05, "loss": 0.1948, "step": 238400 }, { "epoch": 2.118868651090517, "grad_norm": 9.566646575927734, "learning_rate": 1.190830733632914e-05, "loss": 0.1951, "step": 238500 }, { "epoch": 2.119038288964547, "grad_norm": 4.604611396789551, "learning_rate": 1.1904914578848532e-05, "loss": 0.1773, "step": 238600 }, { "epoch": 2.1192079268385777, "grad_norm": 8.186748504638672, "learning_rate": 1.1901521821367927e-05, "loss": 0.2062, "step": 238700 }, { "epoch": 2.119377564712608, "grad_norm": 8.87588882446289, "learning_rate": 1.1898129063887321e-05, "loss": 0.1734, "step": 238800 }, { "epoch": 2.1195472025866384, "grad_norm": 4.553011417388916, "learning_rate": 1.1894736306406716e-05, "loss": 0.185, "step": 238900 }, { "epoch": 2.1197168404606685, "grad_norm": 6.592287540435791, "learning_rate": 1.1891343548926109e-05, "loss": 0.182, "step": 239000 }, { "epoch": 2.119886478334699, "grad_norm": 5.786715030670166, "learning_rate": 1.1887950791445502e-05, "loss": 0.1904, "step": 239100 }, { "epoch": 2.1200561162087292, "grad_norm": 5.531994819641113, "learning_rate": 1.1884558033964896e-05, "loss": 0.1983, "step": 239200 }, { "epoch": 2.1202257540827594, "grad_norm": 7.879891395568848, "learning_rate": 1.188116527648429e-05, "loss": 0.2042, "step": 239300 }, { "epoch": 2.12039539195679, "grad_norm": 4.6721320152282715, "learning_rate": 1.1877772519003684e-05, "loss": 0.1825, "step": 239400 }, { "epoch": 2.12056502983082, "grad_norm": 7.120616912841797, "learning_rate": 1.1874379761523078e-05, "loss": 0.1776, "step": 239500 }, { "epoch": 2.1207346677048506, "grad_norm": 11.006314277648926, "learning_rate": 1.1870987004042471e-05, "loss": 0.1998, "step": 239600 }, { "epoch": 2.1209043055788808, "grad_norm": 7.216954231262207, "learning_rate": 1.1867594246561865e-05, "loss": 0.2118, "step": 239700 }, { "epoch": 2.121073943452911, "grad_norm": 13.835487365722656, "learning_rate": 1.1864201489081259e-05, "loss": 0.1837, "step": 239800 }, { "epoch": 2.1212435813269415, "grad_norm": 3.5803258419036865, "learning_rate": 1.1860808731600653e-05, "loss": 0.1834, "step": 239900 }, { "epoch": 2.1214132192009716, "grad_norm": 7.1089653968811035, "learning_rate": 1.1857415974120048e-05, "loss": 0.1741, "step": 240000 }, { "epoch": 2.121582857075002, "grad_norm": 8.84593391418457, "learning_rate": 1.185402321663944e-05, "loss": 0.1878, "step": 240100 }, { "epoch": 2.1217524949490323, "grad_norm": 3.6169581413269043, "learning_rate": 1.1850630459158834e-05, "loss": 0.1747, "step": 240200 }, { "epoch": 2.1219221328230624, "grad_norm": 8.328496932983398, "learning_rate": 1.184723770167823e-05, "loss": 0.1932, "step": 240300 }, { "epoch": 2.122091770697093, "grad_norm": 6.281271457672119, "learning_rate": 1.1843844944197622e-05, "loss": 0.2079, "step": 240400 }, { "epoch": 2.122261408571123, "grad_norm": 4.165229797363281, "learning_rate": 1.1840452186717015e-05, "loss": 0.1881, "step": 240500 }, { "epoch": 2.1224310464451537, "grad_norm": 14.28010368347168, "learning_rate": 1.1837059429236411e-05, "loss": 0.1898, "step": 240600 }, { "epoch": 2.122600684319184, "grad_norm": 6.624906063079834, "learning_rate": 1.1833666671755803e-05, "loss": 0.1766, "step": 240700 }, { "epoch": 2.122770322193214, "grad_norm": 8.398733139038086, "learning_rate": 1.1830273914275199e-05, "loss": 0.1845, "step": 240800 }, { "epoch": 2.1229399600672445, "grad_norm": 5.61748743057251, "learning_rate": 1.1826881156794592e-05, "loss": 0.1799, "step": 240900 }, { "epoch": 2.1231095979412746, "grad_norm": 13.98038387298584, "learning_rate": 1.1823488399313984e-05, "loss": 0.1948, "step": 241000 }, { "epoch": 2.123279235815305, "grad_norm": 3.3029749393463135, "learning_rate": 1.182009564183338e-05, "loss": 0.1847, "step": 241100 }, { "epoch": 2.1234488736893353, "grad_norm": 6.48779821395874, "learning_rate": 1.1816702884352774e-05, "loss": 0.1797, "step": 241200 }, { "epoch": 2.123618511563366, "grad_norm": 11.097245216369629, "learning_rate": 1.1813310126872166e-05, "loss": 0.1848, "step": 241300 }, { "epoch": 2.123788149437396, "grad_norm": 8.509100914001465, "learning_rate": 1.1809917369391561e-05, "loss": 0.1957, "step": 241400 }, { "epoch": 2.123957787311426, "grad_norm": 19.04422950744629, "learning_rate": 1.1806524611910955e-05, "loss": 0.1872, "step": 241500 }, { "epoch": 2.1241274251854567, "grad_norm": 4.552314758300781, "learning_rate": 1.1803131854430347e-05, "loss": 0.2108, "step": 241600 }, { "epoch": 2.124297063059487, "grad_norm": 10.521430969238281, "learning_rate": 1.1799739096949743e-05, "loss": 0.201, "step": 241700 }, { "epoch": 2.1244667009335174, "grad_norm": 8.90468978881836, "learning_rate": 1.1796346339469136e-05, "loss": 0.1885, "step": 241800 }, { "epoch": 2.1246363388075475, "grad_norm": 5.7342352867126465, "learning_rate": 1.179295358198853e-05, "loss": 0.1982, "step": 241900 }, { "epoch": 2.1248059766815777, "grad_norm": 7.043406009674072, "learning_rate": 1.1789560824507924e-05, "loss": 0.1845, "step": 242000 }, { "epoch": 2.1249756145556082, "grad_norm": 5.993015766143799, "learning_rate": 1.1786168067027318e-05, "loss": 0.1878, "step": 242100 }, { "epoch": 2.1251452524296384, "grad_norm": 3.03867244720459, "learning_rate": 1.1782775309546712e-05, "loss": 0.2009, "step": 242200 }, { "epoch": 2.125314890303669, "grad_norm": 2.781801462173462, "learning_rate": 1.1779382552066105e-05, "loss": 0.1981, "step": 242300 }, { "epoch": 2.125484528177699, "grad_norm": 4.493129730224609, "learning_rate": 1.1775989794585498e-05, "loss": 0.1803, "step": 242400 }, { "epoch": 2.125654166051729, "grad_norm": 5.861032009124756, "learning_rate": 1.1772597037104893e-05, "loss": 0.1937, "step": 242500 }, { "epoch": 2.1258238039257598, "grad_norm": 3.724652051925659, "learning_rate": 1.1769204279624287e-05, "loss": 0.1698, "step": 242600 }, { "epoch": 2.12599344179979, "grad_norm": 1.5517131090164185, "learning_rate": 1.1765811522143682e-05, "loss": 0.1784, "step": 242700 }, { "epoch": 2.1261630796738205, "grad_norm": 7.5770745277404785, "learning_rate": 1.1762418764663074e-05, "loss": 0.1854, "step": 242800 }, { "epoch": 2.1263327175478506, "grad_norm": 4.220036506652832, "learning_rate": 1.1759026007182468e-05, "loss": 0.1694, "step": 242900 }, { "epoch": 2.1265023554218807, "grad_norm": 7.1957106590271, "learning_rate": 1.1755633249701864e-05, "loss": 0.1958, "step": 243000 }, { "epoch": 2.1266719932959113, "grad_norm": 6.904239177703857, "learning_rate": 1.1752240492221256e-05, "loss": 0.1838, "step": 243100 }, { "epoch": 2.1268416311699414, "grad_norm": 7.904985427856445, "learning_rate": 1.174884773474065e-05, "loss": 0.1901, "step": 243200 }, { "epoch": 2.127011269043972, "grad_norm": 4.854261875152588, "learning_rate": 1.1745454977260045e-05, "loss": 0.1925, "step": 243300 }, { "epoch": 2.127180906918002, "grad_norm": 4.531145095825195, "learning_rate": 1.1742062219779437e-05, "loss": 0.1785, "step": 243400 }, { "epoch": 2.1273505447920327, "grad_norm": 6.811899662017822, "learning_rate": 1.1738669462298831e-05, "loss": 0.1944, "step": 243500 }, { "epoch": 2.127520182666063, "grad_norm": 6.484658718109131, "learning_rate": 1.1735276704818227e-05, "loss": 0.1788, "step": 243600 }, { "epoch": 2.127689820540093, "grad_norm": 7.494356155395508, "learning_rate": 1.1731883947337619e-05, "loss": 0.2007, "step": 243700 }, { "epoch": 2.1278594584141235, "grad_norm": 5.929379940032959, "learning_rate": 1.1728491189857014e-05, "loss": 0.1884, "step": 243800 }, { "epoch": 2.1280290962881536, "grad_norm": 8.362099647521973, "learning_rate": 1.1725098432376408e-05, "loss": 0.1869, "step": 243900 }, { "epoch": 2.128198734162184, "grad_norm": 8.643105506896973, "learning_rate": 1.17217056748958e-05, "loss": 0.1819, "step": 244000 }, { "epoch": 2.1283683720362143, "grad_norm": 8.171627044677734, "learning_rate": 1.1718312917415195e-05, "loss": 0.1808, "step": 244100 }, { "epoch": 2.1285380099102444, "grad_norm": 11.349912643432617, "learning_rate": 1.171492015993459e-05, "loss": 0.1933, "step": 244200 }, { "epoch": 2.128707647784275, "grad_norm": 2.869845151901245, "learning_rate": 1.1711527402453981e-05, "loss": 0.1901, "step": 244300 }, { "epoch": 2.128877285658305, "grad_norm": 7.505216598510742, "learning_rate": 1.1708134644973377e-05, "loss": 0.183, "step": 244400 }, { "epoch": 2.1290469235323357, "grad_norm": 4.731356143951416, "learning_rate": 1.170474188749277e-05, "loss": 0.2026, "step": 244500 }, { "epoch": 2.129216561406366, "grad_norm": 2.3445475101470947, "learning_rate": 1.1701349130012163e-05, "loss": 0.1697, "step": 244600 }, { "epoch": 2.129386199280396, "grad_norm": 12.573566436767578, "learning_rate": 1.1697956372531558e-05, "loss": 0.2028, "step": 244700 }, { "epoch": 2.1295558371544265, "grad_norm": 14.310277938842773, "learning_rate": 1.169456361505095e-05, "loss": 0.1781, "step": 244800 }, { "epoch": 2.1297254750284567, "grad_norm": 10.487683296203613, "learning_rate": 1.1691170857570346e-05, "loss": 0.2043, "step": 244900 }, { "epoch": 2.1298951129024872, "grad_norm": 6.824893951416016, "learning_rate": 1.168777810008974e-05, "loss": 0.1721, "step": 245000 }, { "epoch": 2.1300647507765174, "grad_norm": 5.843127727508545, "learning_rate": 1.1684385342609132e-05, "loss": 0.1858, "step": 245100 }, { "epoch": 2.1302343886505475, "grad_norm": 5.878823280334473, "learning_rate": 1.1680992585128527e-05, "loss": 0.1874, "step": 245200 }, { "epoch": 2.130404026524578, "grad_norm": 9.4090576171875, "learning_rate": 1.1677599827647921e-05, "loss": 0.1883, "step": 245300 }, { "epoch": 2.130573664398608, "grad_norm": 3.012777328491211, "learning_rate": 1.1674207070167313e-05, "loss": 0.1779, "step": 245400 }, { "epoch": 2.1307433022726388, "grad_norm": 9.27148151397705, "learning_rate": 1.1670814312686709e-05, "loss": 0.1894, "step": 245500 }, { "epoch": 2.130912940146669, "grad_norm": 10.496716499328613, "learning_rate": 1.1667421555206102e-05, "loss": 0.1814, "step": 245600 }, { "epoch": 2.131082578020699, "grad_norm": 17.873456954956055, "learning_rate": 1.1664028797725498e-05, "loss": 0.1827, "step": 245700 }, { "epoch": 2.1312522158947296, "grad_norm": 4.582610607147217, "learning_rate": 1.166063604024489e-05, "loss": 0.2001, "step": 245800 }, { "epoch": 2.1314218537687597, "grad_norm": 18.17752456665039, "learning_rate": 1.1657243282764284e-05, "loss": 0.1972, "step": 245900 }, { "epoch": 2.1315914916427903, "grad_norm": 6.14479923248291, "learning_rate": 1.165385052528368e-05, "loss": 0.1922, "step": 246000 }, { "epoch": 2.1317611295168204, "grad_norm": 9.692614555358887, "learning_rate": 1.1650457767803071e-05, "loss": 0.1805, "step": 246100 }, { "epoch": 2.131930767390851, "grad_norm": 4.1453776359558105, "learning_rate": 1.1647065010322465e-05, "loss": 0.193, "step": 246200 }, { "epoch": 2.132100405264881, "grad_norm": 5.202688694000244, "learning_rate": 1.164367225284186e-05, "loss": 0.1841, "step": 246300 }, { "epoch": 2.132270043138911, "grad_norm": 4.797952651977539, "learning_rate": 1.1640279495361253e-05, "loss": 0.1849, "step": 246400 }, { "epoch": 2.132439681012942, "grad_norm": 8.852509498596191, "learning_rate": 1.1636886737880647e-05, "loss": 0.1734, "step": 246500 }, { "epoch": 2.132609318886972, "grad_norm": 4.011237144470215, "learning_rate": 1.1633493980400042e-05, "loss": 0.1887, "step": 246600 }, { "epoch": 2.1327789567610025, "grad_norm": 1.6306049823760986, "learning_rate": 1.1630101222919434e-05, "loss": 0.1881, "step": 246700 }, { "epoch": 2.1329485946350326, "grad_norm": 6.635383605957031, "learning_rate": 1.162670846543883e-05, "loss": 0.2023, "step": 246800 }, { "epoch": 2.1331182325090627, "grad_norm": 9.43075942993164, "learning_rate": 1.1623315707958223e-05, "loss": 0.1837, "step": 246900 }, { "epoch": 2.1332878703830933, "grad_norm": 4.119908809661865, "learning_rate": 1.1619922950477616e-05, "loss": 0.1852, "step": 247000 }, { "epoch": 2.1334575082571234, "grad_norm": 1.8754686117172241, "learning_rate": 1.1616530192997011e-05, "loss": 0.1698, "step": 247100 }, { "epoch": 2.133627146131154, "grad_norm": 7.817907810211182, "learning_rate": 1.1613137435516403e-05, "loss": 0.1875, "step": 247200 }, { "epoch": 2.133796784005184, "grad_norm": 6.326663494110107, "learning_rate": 1.1609744678035797e-05, "loss": 0.2084, "step": 247300 }, { "epoch": 2.1339664218792143, "grad_norm": 7.274734973907471, "learning_rate": 1.1606351920555192e-05, "loss": 0.1831, "step": 247400 }, { "epoch": 2.134136059753245, "grad_norm": 12.36933708190918, "learning_rate": 1.1602959163074585e-05, "loss": 0.1756, "step": 247500 }, { "epoch": 2.134305697627275, "grad_norm": 7.392027378082275, "learning_rate": 1.159956640559398e-05, "loss": 0.1972, "step": 247600 }, { "epoch": 2.1344753355013055, "grad_norm": 5.983292579650879, "learning_rate": 1.1596173648113374e-05, "loss": 0.1949, "step": 247700 }, { "epoch": 2.1346449733753357, "grad_norm": 2.011563539505005, "learning_rate": 1.1592780890632766e-05, "loss": 0.1928, "step": 247800 }, { "epoch": 2.134814611249366, "grad_norm": 6.264834880828857, "learning_rate": 1.1589388133152161e-05, "loss": 0.1892, "step": 247900 }, { "epoch": 2.1349842491233963, "grad_norm": 6.2508931159973145, "learning_rate": 1.1585995375671555e-05, "loss": 0.1836, "step": 248000 }, { "epoch": 2.1351538869974265, "grad_norm": 9.63463306427002, "learning_rate": 1.1582602618190947e-05, "loss": 0.1957, "step": 248100 }, { "epoch": 2.135323524871457, "grad_norm": 7.196920394897461, "learning_rate": 1.1579209860710343e-05, "loss": 0.1801, "step": 248200 }, { "epoch": 2.135493162745487, "grad_norm": 4.582245349884033, "learning_rate": 1.1575817103229737e-05, "loss": 0.1858, "step": 248300 }, { "epoch": 2.1356628006195173, "grad_norm": 5.813337802886963, "learning_rate": 1.1572424345749129e-05, "loss": 0.1822, "step": 248400 }, { "epoch": 2.135832438493548, "grad_norm": 6.3441996574401855, "learning_rate": 1.1569031588268524e-05, "loss": 0.1885, "step": 248500 }, { "epoch": 2.136002076367578, "grad_norm": 3.106865644454956, "learning_rate": 1.1565638830787918e-05, "loss": 0.1914, "step": 248600 }, { "epoch": 2.1361717142416086, "grad_norm": 7.907399654388428, "learning_rate": 1.1562246073307313e-05, "loss": 0.1965, "step": 248700 }, { "epoch": 2.1363413521156387, "grad_norm": 10.346025466918945, "learning_rate": 1.1558853315826706e-05, "loss": 0.1935, "step": 248800 }, { "epoch": 2.1365109899896693, "grad_norm": 6.634618282318115, "learning_rate": 1.15554605583461e-05, "loss": 0.1981, "step": 248900 }, { "epoch": 2.1366806278636994, "grad_norm": 13.242652893066406, "learning_rate": 1.1552067800865495e-05, "loss": 0.1723, "step": 249000 }, { "epoch": 2.1368502657377295, "grad_norm": 8.057804107666016, "learning_rate": 1.1548675043384887e-05, "loss": 0.1878, "step": 249100 }, { "epoch": 2.13701990361176, "grad_norm": 5.566277980804443, "learning_rate": 1.154528228590428e-05, "loss": 0.1834, "step": 249200 }, { "epoch": 2.13718954148579, "grad_norm": 3.2228307723999023, "learning_rate": 1.1541889528423675e-05, "loss": 0.1819, "step": 249300 }, { "epoch": 2.137359179359821, "grad_norm": 1.5053019523620605, "learning_rate": 1.1538496770943068e-05, "loss": 0.1958, "step": 249400 }, { "epoch": 2.137528817233851, "grad_norm": 7.785714626312256, "learning_rate": 1.1535104013462464e-05, "loss": 0.1806, "step": 249500 }, { "epoch": 2.137698455107881, "grad_norm": 5.866556644439697, "learning_rate": 1.1531711255981856e-05, "loss": 0.1971, "step": 249600 }, { "epoch": 2.1378680929819116, "grad_norm": 7.383463382720947, "learning_rate": 1.152831849850125e-05, "loss": 0.1916, "step": 249700 }, { "epoch": 2.1380377308559417, "grad_norm": 9.722503662109375, "learning_rate": 1.1524925741020645e-05, "loss": 0.178, "step": 249800 }, { "epoch": 2.1382073687299723, "grad_norm": 3.867753505706787, "learning_rate": 1.1521532983540037e-05, "loss": 0.1918, "step": 249900 }, { "epoch": 2.1383770066040024, "grad_norm": 8.436537742614746, "learning_rate": 1.1518140226059431e-05, "loss": 0.1794, "step": 250000 }, { "epoch": 2.1385466444780326, "grad_norm": 12.660341262817383, "learning_rate": 1.1514747468578827e-05, "loss": 0.178, "step": 250100 }, { "epoch": 2.138716282352063, "grad_norm": 9.986969947814941, "learning_rate": 1.1511354711098219e-05, "loss": 0.1955, "step": 250200 }, { "epoch": 2.1388859202260933, "grad_norm": 13.627443313598633, "learning_rate": 1.1507961953617613e-05, "loss": 0.1879, "step": 250300 }, { "epoch": 2.139055558100124, "grad_norm": 10.75405502319336, "learning_rate": 1.1504569196137008e-05, "loss": 0.1973, "step": 250400 }, { "epoch": 2.139225195974154, "grad_norm": 10.38640308380127, "learning_rate": 1.15011764386564e-05, "loss": 0.1826, "step": 250500 }, { "epoch": 2.139394833848184, "grad_norm": 9.381989479064941, "learning_rate": 1.1497783681175796e-05, "loss": 0.2197, "step": 250600 }, { "epoch": 2.1395644717222146, "grad_norm": 7.092848300933838, "learning_rate": 1.149439092369519e-05, "loss": 0.1802, "step": 250700 }, { "epoch": 2.1397341095962448, "grad_norm": 2.813396692276001, "learning_rate": 1.1490998166214581e-05, "loss": 0.202, "step": 250800 }, { "epoch": 2.1399037474702753, "grad_norm": 4.201328277587891, "learning_rate": 1.1487605408733977e-05, "loss": 0.1909, "step": 250900 }, { "epoch": 2.1400733853443055, "grad_norm": 4.403903961181641, "learning_rate": 1.148421265125337e-05, "loss": 0.1776, "step": 251000 }, { "epoch": 2.1402430232183356, "grad_norm": 7.295247554779053, "learning_rate": 1.1480819893772763e-05, "loss": 0.181, "step": 251100 }, { "epoch": 2.140412661092366, "grad_norm": 8.18549919128418, "learning_rate": 1.1477427136292158e-05, "loss": 0.1797, "step": 251200 }, { "epoch": 2.1405822989663963, "grad_norm": 3.105384349822998, "learning_rate": 1.1474034378811552e-05, "loss": 0.1988, "step": 251300 }, { "epoch": 2.140751936840427, "grad_norm": 10.096595764160156, "learning_rate": 1.1470641621330948e-05, "loss": 0.1794, "step": 251400 }, { "epoch": 2.140921574714457, "grad_norm": 4.723698616027832, "learning_rate": 1.146724886385034e-05, "loss": 0.1787, "step": 251500 }, { "epoch": 2.1410912125884876, "grad_norm": 8.417098999023438, "learning_rate": 1.1463856106369734e-05, "loss": 0.169, "step": 251600 }, { "epoch": 2.1412608504625177, "grad_norm": 5.481856822967529, "learning_rate": 1.1460463348889127e-05, "loss": 0.1942, "step": 251700 }, { "epoch": 2.141430488336548, "grad_norm": 5.337045669555664, "learning_rate": 1.1457070591408521e-05, "loss": 0.1854, "step": 251800 }, { "epoch": 2.1416001262105784, "grad_norm": 8.74197769165039, "learning_rate": 1.1453677833927915e-05, "loss": 0.1822, "step": 251900 }, { "epoch": 2.1417697640846085, "grad_norm": 5.057628631591797, "learning_rate": 1.1450285076447309e-05, "loss": 0.2013, "step": 252000 }, { "epoch": 2.141939401958639, "grad_norm": 8.981176376342773, "learning_rate": 1.1446892318966703e-05, "loss": 0.1765, "step": 252100 }, { "epoch": 2.142109039832669, "grad_norm": 7.047201156616211, "learning_rate": 1.1443499561486096e-05, "loss": 0.1926, "step": 252200 }, { "epoch": 2.1422786777066993, "grad_norm": 10.283426284790039, "learning_rate": 1.144010680400549e-05, "loss": 0.1878, "step": 252300 }, { "epoch": 2.14244831558073, "grad_norm": 8.41259479522705, "learning_rate": 1.1436714046524884e-05, "loss": 0.2018, "step": 252400 }, { "epoch": 2.14261795345476, "grad_norm": 2.6027989387512207, "learning_rate": 1.143332128904428e-05, "loss": 0.1748, "step": 252500 }, { "epoch": 2.1427875913287906, "grad_norm": 5.654937744140625, "learning_rate": 1.1429928531563672e-05, "loss": 0.2006, "step": 252600 }, { "epoch": 2.142858839235883, "eval_accuracy": 0.8332303255047384, "eval_f1": 0.8840214713296911, "eval_loss": 0.4817821979522705, "eval_runtime": 386.6686, "eval_samples_per_second": 866.184, "eval_steps_per_second": 27.07, "step": 252642 }, { "epoch": 3.000098389966938, "grad_norm": 6.859489917755127, "learning_rate": 1.1426535774083065e-05, "loss": 0.1759, "step": 252700 }, { "epoch": 3.000268027840968, "grad_norm": 5.377769470214844, "learning_rate": 1.142314301660246e-05, "loss": 0.138, "step": 252800 }, { "epoch": 3.000437665714998, "grad_norm": 11.909835815429688, "learning_rate": 1.1419750259121853e-05, "loss": 0.1395, "step": 252900 }, { "epoch": 3.0006073035890286, "grad_norm": 7.841085433959961, "learning_rate": 1.1416357501641247e-05, "loss": 0.1353, "step": 253000 }, { "epoch": 3.0007769414630587, "grad_norm": 12.859881401062012, "learning_rate": 1.1412964744160642e-05, "loss": 0.1474, "step": 253100 }, { "epoch": 3.0009465793370893, "grad_norm": 4.698070049285889, "learning_rate": 1.1409571986680034e-05, "loss": 0.1591, "step": 253200 }, { "epoch": 3.0011162172111194, "grad_norm": 8.886073112487793, "learning_rate": 1.140617922919943e-05, "loss": 0.1478, "step": 253300 }, { "epoch": 3.0012858550851496, "grad_norm": 8.994847297668457, "learning_rate": 1.1402786471718824e-05, "loss": 0.1432, "step": 253400 }, { "epoch": 3.00145549295918, "grad_norm": 1.0132832527160645, "learning_rate": 1.1399393714238216e-05, "loss": 0.145, "step": 253500 }, { "epoch": 3.0016251308332103, "grad_norm": 9.308547973632812, "learning_rate": 1.1396000956757611e-05, "loss": 0.1454, "step": 253600 }, { "epoch": 3.001794768707241, "grad_norm": 3.1426219940185547, "learning_rate": 1.1392608199277005e-05, "loss": 0.1524, "step": 253700 }, { "epoch": 3.001964406581271, "grad_norm": 7.950308799743652, "learning_rate": 1.1389215441796397e-05, "loss": 0.1415, "step": 253800 }, { "epoch": 3.002134044455301, "grad_norm": 8.1004638671875, "learning_rate": 1.1385822684315793e-05, "loss": 0.1612, "step": 253900 }, { "epoch": 3.0023036823293316, "grad_norm": 9.498191833496094, "learning_rate": 1.1382429926835186e-05, "loss": 0.1417, "step": 254000 }, { "epoch": 3.0024733202033618, "grad_norm": 6.465892314910889, "learning_rate": 1.1379037169354578e-05, "loss": 0.1554, "step": 254100 }, { "epoch": 3.0026429580773923, "grad_norm": 1.6298259496688843, "learning_rate": 1.1375644411873974e-05, "loss": 0.1585, "step": 254200 }, { "epoch": 3.0028125959514225, "grad_norm": 1.428348422050476, "learning_rate": 1.1372251654393368e-05, "loss": 0.1441, "step": 254300 }, { "epoch": 3.0029822338254526, "grad_norm": 12.497976303100586, "learning_rate": 1.1368858896912762e-05, "loss": 0.1362, "step": 254400 }, { "epoch": 3.003151871699483, "grad_norm": 5.3019561767578125, "learning_rate": 1.1365466139432155e-05, "loss": 0.1362, "step": 254500 }, { "epoch": 3.0033215095735133, "grad_norm": 12.373723983764648, "learning_rate": 1.1362073381951549e-05, "loss": 0.1579, "step": 254600 }, { "epoch": 3.003491147447544, "grad_norm": 5.337221622467041, "learning_rate": 1.1358680624470943e-05, "loss": 0.1528, "step": 254700 }, { "epoch": 3.003660785321574, "grad_norm": 15.917119979858398, "learning_rate": 1.1355287866990337e-05, "loss": 0.1372, "step": 254800 }, { "epoch": 3.0038304231956046, "grad_norm": 9.744399070739746, "learning_rate": 1.1351895109509729e-05, "loss": 0.1754, "step": 254900 }, { "epoch": 3.0040000610696347, "grad_norm": 19.287105560302734, "learning_rate": 1.1348502352029124e-05, "loss": 0.1444, "step": 255000 }, { "epoch": 3.004169698943665, "grad_norm": 0.9813449382781982, "learning_rate": 1.1345109594548518e-05, "loss": 0.1608, "step": 255100 }, { "epoch": 3.0043393368176954, "grad_norm": 12.956783294677734, "learning_rate": 1.1341716837067914e-05, "loss": 0.1471, "step": 255200 }, { "epoch": 3.0045089746917255, "grad_norm": 12.616933822631836, "learning_rate": 1.1338324079587306e-05, "loss": 0.159, "step": 255300 }, { "epoch": 3.004678612565756, "grad_norm": 9.786433219909668, "learning_rate": 1.13349313221067e-05, "loss": 0.1449, "step": 255400 }, { "epoch": 3.004848250439786, "grad_norm": 15.596601486206055, "learning_rate": 1.1331538564626095e-05, "loss": 0.149, "step": 255500 }, { "epoch": 3.0050178883138163, "grad_norm": 10.437399864196777, "learning_rate": 1.1328145807145487e-05, "loss": 0.1564, "step": 255600 }, { "epoch": 3.005187526187847, "grad_norm": 14.533721923828125, "learning_rate": 1.1324753049664881e-05, "loss": 0.1648, "step": 255700 }, { "epoch": 3.005357164061877, "grad_norm": 14.869539260864258, "learning_rate": 1.1321360292184276e-05, "loss": 0.1716, "step": 255800 }, { "epoch": 3.0055268019359076, "grad_norm": 7.618862152099609, "learning_rate": 1.1317967534703668e-05, "loss": 0.1425, "step": 255900 }, { "epoch": 3.0056964398099377, "grad_norm": 2.152909755706787, "learning_rate": 1.1314574777223062e-05, "loss": 0.1376, "step": 256000 }, { "epoch": 3.005866077683968, "grad_norm": 7.5385284423828125, "learning_rate": 1.1311182019742458e-05, "loss": 0.1483, "step": 256100 }, { "epoch": 3.0060357155579984, "grad_norm": 1.9369388818740845, "learning_rate": 1.130778926226185e-05, "loss": 0.1581, "step": 256200 }, { "epoch": 3.0062053534320285, "grad_norm": 8.027322769165039, "learning_rate": 1.1304396504781245e-05, "loss": 0.1442, "step": 256300 }, { "epoch": 3.006374991306059, "grad_norm": 4.693883419036865, "learning_rate": 1.1301003747300639e-05, "loss": 0.1493, "step": 256400 }, { "epoch": 3.0065446291800892, "grad_norm": 2.3747100830078125, "learning_rate": 1.1297610989820031e-05, "loss": 0.1618, "step": 256500 }, { "epoch": 3.0067142670541194, "grad_norm": 2.9483299255371094, "learning_rate": 1.1294218232339427e-05, "loss": 0.1415, "step": 256600 }, { "epoch": 3.00688390492815, "grad_norm": 1.0412172079086304, "learning_rate": 1.129082547485882e-05, "loss": 0.1504, "step": 256700 }, { "epoch": 3.00705354280218, "grad_norm": 8.542884826660156, "learning_rate": 1.1287432717378213e-05, "loss": 0.1577, "step": 256800 }, { "epoch": 3.0072231806762106, "grad_norm": 19.342742919921875, "learning_rate": 1.1284039959897608e-05, "loss": 0.1446, "step": 256900 }, { "epoch": 3.0073928185502408, "grad_norm": 2.848705530166626, "learning_rate": 1.1280647202417002e-05, "loss": 0.1465, "step": 257000 }, { "epoch": 3.007562456424271, "grad_norm": 10.126928329467773, "learning_rate": 1.1277254444936394e-05, "loss": 0.1433, "step": 257100 }, { "epoch": 3.0077320942983015, "grad_norm": 8.105850219726562, "learning_rate": 1.127386168745579e-05, "loss": 0.1665, "step": 257200 }, { "epoch": 3.0079017321723316, "grad_norm": 3.399991035461426, "learning_rate": 1.1270468929975182e-05, "loss": 0.1311, "step": 257300 }, { "epoch": 3.008071370046362, "grad_norm": 3.10902738571167, "learning_rate": 1.1267076172494577e-05, "loss": 0.1863, "step": 257400 }, { "epoch": 3.0082410079203923, "grad_norm": 6.065944671630859, "learning_rate": 1.1263683415013971e-05, "loss": 0.1355, "step": 257500 }, { "epoch": 3.008410645794423, "grad_norm": 13.424445152282715, "learning_rate": 1.1260290657533363e-05, "loss": 0.1446, "step": 257600 }, { "epoch": 3.008580283668453, "grad_norm": 4.612052917480469, "learning_rate": 1.1256897900052758e-05, "loss": 0.1465, "step": 257700 }, { "epoch": 3.008749921542483, "grad_norm": 3.4364919662475586, "learning_rate": 1.1253505142572152e-05, "loss": 0.1518, "step": 257800 }, { "epoch": 3.0089195594165137, "grad_norm": 13.868603706359863, "learning_rate": 1.1250112385091544e-05, "loss": 0.1474, "step": 257900 }, { "epoch": 3.009089197290544, "grad_norm": 13.139039039611816, "learning_rate": 1.124671962761094e-05, "loss": 0.1519, "step": 258000 }, { "epoch": 3.0092588351645744, "grad_norm": 15.931158065795898, "learning_rate": 1.1243326870130334e-05, "loss": 0.148, "step": 258100 }, { "epoch": 3.0094284730386045, "grad_norm": 5.191665172576904, "learning_rate": 1.1239934112649729e-05, "loss": 0.1811, "step": 258200 }, { "epoch": 3.0095981109126346, "grad_norm": 5.50192928314209, "learning_rate": 1.1236541355169121e-05, "loss": 0.1659, "step": 258300 }, { "epoch": 3.009767748786665, "grad_norm": 7.319461345672607, "learning_rate": 1.1233148597688515e-05, "loss": 0.1605, "step": 258400 }, { "epoch": 3.0099373866606953, "grad_norm": 31.423480987548828, "learning_rate": 1.122975584020791e-05, "loss": 0.1627, "step": 258500 }, { "epoch": 3.010107024534726, "grad_norm": 13.53628158569336, "learning_rate": 1.1226363082727303e-05, "loss": 0.1368, "step": 258600 }, { "epoch": 3.010276662408756, "grad_norm": 13.526360511779785, "learning_rate": 1.1222970325246696e-05, "loss": 0.1624, "step": 258700 }, { "epoch": 3.010446300282786, "grad_norm": 10.5674409866333, "learning_rate": 1.1219577567766092e-05, "loss": 0.1337, "step": 258800 }, { "epoch": 3.0106159381568167, "grad_norm": 11.437397003173828, "learning_rate": 1.1216184810285484e-05, "loss": 0.1606, "step": 258900 }, { "epoch": 3.010785576030847, "grad_norm": 7.828662872314453, "learning_rate": 1.1212792052804878e-05, "loss": 0.1549, "step": 259000 }, { "epoch": 3.0109552139048774, "grad_norm": 5.249697685241699, "learning_rate": 1.1209399295324273e-05, "loss": 0.1434, "step": 259100 }, { "epoch": 3.0111248517789075, "grad_norm": 2.189824104309082, "learning_rate": 1.1206006537843665e-05, "loss": 0.1457, "step": 259200 }, { "epoch": 3.0112944896529377, "grad_norm": 16.03636932373047, "learning_rate": 1.1202613780363061e-05, "loss": 0.1403, "step": 259300 }, { "epoch": 3.0114641275269682, "grad_norm": 2.7781448364257812, "learning_rate": 1.1199221022882453e-05, "loss": 0.159, "step": 259400 }, { "epoch": 3.0116337654009984, "grad_norm": 3.875314950942993, "learning_rate": 1.1195828265401847e-05, "loss": 0.1594, "step": 259500 }, { "epoch": 3.011803403275029, "grad_norm": 5.530049800872803, "learning_rate": 1.1192435507921242e-05, "loss": 0.1572, "step": 259600 }, { "epoch": 3.011973041149059, "grad_norm": 1.389613151550293, "learning_rate": 1.1189042750440634e-05, "loss": 0.1347, "step": 259700 }, { "epoch": 3.0121426790230896, "grad_norm": 13.930469512939453, "learning_rate": 1.1185649992960028e-05, "loss": 0.1511, "step": 259800 }, { "epoch": 3.0123123168971198, "grad_norm": 16.59494972229004, "learning_rate": 1.1182257235479424e-05, "loss": 0.1453, "step": 259900 }, { "epoch": 3.01248195477115, "grad_norm": 13.14272689819336, "learning_rate": 1.1178864477998816e-05, "loss": 0.1612, "step": 260000 }, { "epoch": 3.0126515926451805, "grad_norm": 5.357202053070068, "learning_rate": 1.1175471720518211e-05, "loss": 0.1533, "step": 260100 }, { "epoch": 3.0128212305192106, "grad_norm": 5.225732803344727, "learning_rate": 1.1172078963037605e-05, "loss": 0.1628, "step": 260200 }, { "epoch": 3.012990868393241, "grad_norm": 11.406720161437988, "learning_rate": 1.1168686205556997e-05, "loss": 0.1495, "step": 260300 }, { "epoch": 3.0131605062672713, "grad_norm": 9.100525856018066, "learning_rate": 1.1165293448076393e-05, "loss": 0.161, "step": 260400 }, { "epoch": 3.0133301441413014, "grad_norm": 9.270694732666016, "learning_rate": 1.1161900690595786e-05, "loss": 0.1458, "step": 260500 }, { "epoch": 3.013499782015332, "grad_norm": 11.04570198059082, "learning_rate": 1.1158507933115179e-05, "loss": 0.1616, "step": 260600 }, { "epoch": 3.013669419889362, "grad_norm": 8.0200777053833, "learning_rate": 1.1155115175634574e-05, "loss": 0.1516, "step": 260700 }, { "epoch": 3.0138390577633927, "grad_norm": 14.767019271850586, "learning_rate": 1.1151722418153968e-05, "loss": 0.158, "step": 260800 }, { "epoch": 3.014008695637423, "grad_norm": 10.854776382446289, "learning_rate": 1.114832966067336e-05, "loss": 0.1526, "step": 260900 }, { "epoch": 3.014178333511453, "grad_norm": 9.84465217590332, "learning_rate": 1.1144936903192755e-05, "loss": 0.1535, "step": 261000 }, { "epoch": 3.0143479713854835, "grad_norm": 14.235413551330566, "learning_rate": 1.114154414571215e-05, "loss": 0.1515, "step": 261100 }, { "epoch": 3.0145176092595136, "grad_norm": 3.733858346939087, "learning_rate": 1.1138151388231545e-05, "loss": 0.157, "step": 261200 }, { "epoch": 3.014687247133544, "grad_norm": 12.766897201538086, "learning_rate": 1.1134758630750937e-05, "loss": 0.1468, "step": 261300 }, { "epoch": 3.0148568850075743, "grad_norm": 6.835260391235352, "learning_rate": 1.113136587327033e-05, "loss": 0.1489, "step": 261400 }, { "epoch": 3.0150265228816044, "grad_norm": 4.0035014152526855, "learning_rate": 1.1127973115789726e-05, "loss": 0.1467, "step": 261500 }, { "epoch": 3.015196160755635, "grad_norm": 7.1262030601501465, "learning_rate": 1.1124580358309118e-05, "loss": 0.1565, "step": 261600 }, { "epoch": 3.015365798629665, "grad_norm": 14.781464576721191, "learning_rate": 1.1121187600828512e-05, "loss": 0.1457, "step": 261700 }, { "epoch": 3.0155354365036957, "grad_norm": 18.29169464111328, "learning_rate": 1.1117794843347906e-05, "loss": 0.1638, "step": 261800 }, { "epoch": 3.015705074377726, "grad_norm": 6.718327045440674, "learning_rate": 1.11144020858673e-05, "loss": 0.1613, "step": 261900 }, { "epoch": 3.015874712251756, "grad_norm": 2.2949256896972656, "learning_rate": 1.1111009328386695e-05, "loss": 0.1529, "step": 262000 }, { "epoch": 3.0160443501257865, "grad_norm": 5.536544322967529, "learning_rate": 1.1107616570906087e-05, "loss": 0.1567, "step": 262100 }, { "epoch": 3.0162139879998167, "grad_norm": 4.796491622924805, "learning_rate": 1.1104223813425481e-05, "loss": 0.1532, "step": 262200 }, { "epoch": 3.0163836258738472, "grad_norm": 18.046443939208984, "learning_rate": 1.1100831055944876e-05, "loss": 0.1505, "step": 262300 }, { "epoch": 3.0165532637478774, "grad_norm": 5.231873989105225, "learning_rate": 1.1097438298464269e-05, "loss": 0.1435, "step": 262400 }, { "epoch": 3.016722901621908, "grad_norm": 13.044255256652832, "learning_rate": 1.1094045540983662e-05, "loss": 0.1554, "step": 262500 }, { "epoch": 3.016892539495938, "grad_norm": 9.445646286010742, "learning_rate": 1.1090652783503058e-05, "loss": 0.154, "step": 262600 }, { "epoch": 3.017062177369968, "grad_norm": 9.655503273010254, "learning_rate": 1.108726002602245e-05, "loss": 0.1497, "step": 262700 }, { "epoch": 3.0172318152439987, "grad_norm": 6.973533630371094, "learning_rate": 1.1083867268541844e-05, "loss": 0.1612, "step": 262800 }, { "epoch": 3.017401453118029, "grad_norm": 14.824217796325684, "learning_rate": 1.108047451106124e-05, "loss": 0.1565, "step": 262900 }, { "epoch": 3.0175710909920594, "grad_norm": 3.8722593784332275, "learning_rate": 1.1077081753580631e-05, "loss": 0.1435, "step": 263000 }, { "epoch": 3.0177407288660896, "grad_norm": 5.326178550720215, "learning_rate": 1.1073688996100027e-05, "loss": 0.1609, "step": 263100 }, { "epoch": 3.0179103667401197, "grad_norm": 4.236318111419678, "learning_rate": 1.107029623861942e-05, "loss": 0.1653, "step": 263200 }, { "epoch": 3.0180800046141503, "grad_norm": 8.586865425109863, "learning_rate": 1.1066903481138813e-05, "loss": 0.1478, "step": 263300 }, { "epoch": 3.0182496424881804, "grad_norm": 12.35448932647705, "learning_rate": 1.1063510723658208e-05, "loss": 0.1503, "step": 263400 }, { "epoch": 3.018419280362211, "grad_norm": 0.5385250449180603, "learning_rate": 1.1060117966177602e-05, "loss": 0.1464, "step": 263500 }, { "epoch": 3.018588918236241, "grad_norm": 10.220398902893066, "learning_rate": 1.1056725208696994e-05, "loss": 0.153, "step": 263600 }, { "epoch": 3.018758556110271, "grad_norm": 7.881062984466553, "learning_rate": 1.105333245121639e-05, "loss": 0.1657, "step": 263700 }, { "epoch": 3.018928193984302, "grad_norm": 19.34242057800293, "learning_rate": 1.1049939693735783e-05, "loss": 0.141, "step": 263800 }, { "epoch": 3.019097831858332, "grad_norm": 18.52291488647461, "learning_rate": 1.1046546936255179e-05, "loss": 0.1727, "step": 263900 }, { "epoch": 3.0192674697323625, "grad_norm": 4.987180233001709, "learning_rate": 1.1043154178774571e-05, "loss": 0.1652, "step": 264000 }, { "epoch": 3.0194371076063926, "grad_norm": 17.299802780151367, "learning_rate": 1.1039761421293965e-05, "loss": 0.1411, "step": 264100 }, { "epoch": 3.0196067454804227, "grad_norm": 9.488534927368164, "learning_rate": 1.1036368663813359e-05, "loss": 0.1644, "step": 264200 }, { "epoch": 3.0197763833544533, "grad_norm": 6.206912040710449, "learning_rate": 1.1032975906332752e-05, "loss": 0.1568, "step": 264300 }, { "epoch": 3.0199460212284834, "grad_norm": 7.081071376800537, "learning_rate": 1.1029583148852146e-05, "loss": 0.1522, "step": 264400 }, { "epoch": 3.020115659102514, "grad_norm": 2.5040862560272217, "learning_rate": 1.102619039137154e-05, "loss": 0.1554, "step": 264500 }, { "epoch": 3.020285296976544, "grad_norm": 6.770780563354492, "learning_rate": 1.1022797633890934e-05, "loss": 0.1607, "step": 264600 }, { "epoch": 3.0204549348505747, "grad_norm": 13.436602592468262, "learning_rate": 1.1019404876410328e-05, "loss": 0.1534, "step": 264700 }, { "epoch": 3.020624572724605, "grad_norm": 14.114776611328125, "learning_rate": 1.1016012118929721e-05, "loss": 0.1442, "step": 264800 }, { "epoch": 3.020794210598635, "grad_norm": 0.9489614367485046, "learning_rate": 1.1012619361449115e-05, "loss": 0.1469, "step": 264900 }, { "epoch": 3.0209638484726655, "grad_norm": 11.639456748962402, "learning_rate": 1.100922660396851e-05, "loss": 0.1593, "step": 265000 }, { "epoch": 3.0211334863466957, "grad_norm": 4.758131504058838, "learning_rate": 1.1005833846487903e-05, "loss": 0.1628, "step": 265100 }, { "epoch": 3.021303124220726, "grad_norm": 7.354145526885986, "learning_rate": 1.1002441089007297e-05, "loss": 0.1545, "step": 265200 }, { "epoch": 3.0214727620947563, "grad_norm": 11.331395149230957, "learning_rate": 1.0999048331526692e-05, "loss": 0.1454, "step": 265300 }, { "epoch": 3.0216423999687865, "grad_norm": 4.428385257720947, "learning_rate": 1.0995655574046084e-05, "loss": 0.1456, "step": 265400 }, { "epoch": 3.021812037842817, "grad_norm": 12.109479904174805, "learning_rate": 1.0992262816565478e-05, "loss": 0.1678, "step": 265500 }, { "epoch": 3.021981675716847, "grad_norm": 7.164102554321289, "learning_rate": 1.0988870059084873e-05, "loss": 0.1631, "step": 265600 }, { "epoch": 3.0221513135908777, "grad_norm": 10.159419059753418, "learning_rate": 1.0985477301604266e-05, "loss": 0.155, "step": 265700 }, { "epoch": 3.022320951464908, "grad_norm": 8.781635284423828, "learning_rate": 1.0982084544123661e-05, "loss": 0.1479, "step": 265800 }, { "epoch": 3.022490589338938, "grad_norm": 8.16838550567627, "learning_rate": 1.0978691786643055e-05, "loss": 0.1685, "step": 265900 }, { "epoch": 3.0226602272129686, "grad_norm": 2.6142830848693848, "learning_rate": 1.0975299029162447e-05, "loss": 0.1482, "step": 266000 }, { "epoch": 3.0228298650869987, "grad_norm": 0.7166082859039307, "learning_rate": 1.0971906271681842e-05, "loss": 0.1392, "step": 266100 }, { "epoch": 3.0229995029610293, "grad_norm": 10.59117603302002, "learning_rate": 1.0968513514201236e-05, "loss": 0.1598, "step": 266200 }, { "epoch": 3.0231691408350594, "grad_norm": 11.641011238098145, "learning_rate": 1.0965120756720628e-05, "loss": 0.1608, "step": 266300 }, { "epoch": 3.0233387787090895, "grad_norm": 4.453952789306641, "learning_rate": 1.0961727999240024e-05, "loss": 0.1659, "step": 266400 }, { "epoch": 3.02350841658312, "grad_norm": 0.9542881846427917, "learning_rate": 1.0958335241759418e-05, "loss": 0.1457, "step": 266500 }, { "epoch": 3.02367805445715, "grad_norm": 19.581737518310547, "learning_rate": 1.095494248427881e-05, "loss": 0.1471, "step": 266600 }, { "epoch": 3.023847692331181, "grad_norm": 14.669853210449219, "learning_rate": 1.0951549726798205e-05, "loss": 0.158, "step": 266700 }, { "epoch": 3.024017330205211, "grad_norm": 5.632186412811279, "learning_rate": 1.0948156969317599e-05, "loss": 0.1668, "step": 266800 }, { "epoch": 3.024186968079241, "grad_norm": 3.3562917709350586, "learning_rate": 1.0944764211836993e-05, "loss": 0.1375, "step": 266900 }, { "epoch": 3.0243566059532716, "grad_norm": 8.661239624023438, "learning_rate": 1.0941371454356387e-05, "loss": 0.165, "step": 267000 }, { "epoch": 3.0245262438273017, "grad_norm": 7.081549644470215, "learning_rate": 1.093797869687578e-05, "loss": 0.1632, "step": 267100 }, { "epoch": 3.0246958817013323, "grad_norm": 6.823566913604736, "learning_rate": 1.0934585939395174e-05, "loss": 0.1581, "step": 267200 }, { "epoch": 3.0248655195753624, "grad_norm": 8.413429260253906, "learning_rate": 1.0931193181914568e-05, "loss": 0.1425, "step": 267300 }, { "epoch": 3.025035157449393, "grad_norm": 3.5006725788116455, "learning_rate": 1.092780042443396e-05, "loss": 0.1565, "step": 267400 }, { "epoch": 3.025204795323423, "grad_norm": 4.0169291496276855, "learning_rate": 1.0924407666953356e-05, "loss": 0.1433, "step": 267500 }, { "epoch": 3.0253744331974532, "grad_norm": 10.510782241821289, "learning_rate": 1.092101490947275e-05, "loss": 0.1444, "step": 267600 }, { "epoch": 3.025544071071484, "grad_norm": 12.500268936157227, "learning_rate": 1.0917622151992145e-05, "loss": 0.1575, "step": 267700 }, { "epoch": 3.025713708945514, "grad_norm": 9.160432815551758, "learning_rate": 1.0914229394511537e-05, "loss": 0.1533, "step": 267800 }, { "epoch": 3.0258833468195445, "grad_norm": 7.572457313537598, "learning_rate": 1.091083663703093e-05, "loss": 0.1652, "step": 267900 }, { "epoch": 3.0260529846935746, "grad_norm": 8.465401649475098, "learning_rate": 1.0907443879550326e-05, "loss": 0.152, "step": 268000 }, { "epoch": 3.0262226225676048, "grad_norm": 12.756807327270508, "learning_rate": 1.0904051122069718e-05, "loss": 0.1622, "step": 268100 }, { "epoch": 3.0263922604416353, "grad_norm": 5.499080657958984, "learning_rate": 1.0900658364589112e-05, "loss": 0.1424, "step": 268200 }, { "epoch": 3.0265618983156655, "grad_norm": 7.71085786819458, "learning_rate": 1.0897265607108508e-05, "loss": 0.1527, "step": 268300 }, { "epoch": 3.026731536189696, "grad_norm": 1.8893438577651978, "learning_rate": 1.08938728496279e-05, "loss": 0.1605, "step": 268400 }, { "epoch": 3.026901174063726, "grad_norm": 4.8983635902404785, "learning_rate": 1.0890480092147293e-05, "loss": 0.1622, "step": 268500 }, { "epoch": 3.0270708119377563, "grad_norm": 12.47304630279541, "learning_rate": 1.0887087334666689e-05, "loss": 0.1566, "step": 268600 }, { "epoch": 3.027240449811787, "grad_norm": 2.2171988487243652, "learning_rate": 1.0883694577186081e-05, "loss": 0.1508, "step": 268700 }, { "epoch": 3.027410087685817, "grad_norm": 3.102511167526245, "learning_rate": 1.0880301819705477e-05, "loss": 0.1647, "step": 268800 }, { "epoch": 3.0275797255598476, "grad_norm": 7.014285087585449, "learning_rate": 1.087690906222487e-05, "loss": 0.1479, "step": 268900 }, { "epoch": 3.0277493634338777, "grad_norm": 13.506196975708008, "learning_rate": 1.0873516304744262e-05, "loss": 0.1522, "step": 269000 }, { "epoch": 3.027919001307908, "grad_norm": 3.4324350357055664, "learning_rate": 1.0870123547263658e-05, "loss": 0.1405, "step": 269100 }, { "epoch": 3.0280886391819384, "grad_norm": 15.12349796295166, "learning_rate": 1.0866730789783052e-05, "loss": 0.1519, "step": 269200 }, { "epoch": 3.0282582770559685, "grad_norm": 13.906213760375977, "learning_rate": 1.0863338032302444e-05, "loss": 0.165, "step": 269300 }, { "epoch": 3.028427914929999, "grad_norm": 3.7625648975372314, "learning_rate": 1.085994527482184e-05, "loss": 0.1478, "step": 269400 }, { "epoch": 3.028597552804029, "grad_norm": 10.120375633239746, "learning_rate": 1.0856552517341231e-05, "loss": 0.1522, "step": 269500 }, { "epoch": 3.0287671906780593, "grad_norm": 8.107616424560547, "learning_rate": 1.0853159759860625e-05, "loss": 0.1579, "step": 269600 }, { "epoch": 3.02893682855209, "grad_norm": 8.820052146911621, "learning_rate": 1.084976700238002e-05, "loss": 0.15, "step": 269700 }, { "epoch": 3.02910646642612, "grad_norm": 12.950883865356445, "learning_rate": 1.0846374244899413e-05, "loss": 0.1604, "step": 269800 }, { "epoch": 3.0292761043001506, "grad_norm": 7.364608287811279, "learning_rate": 1.0842981487418808e-05, "loss": 0.1604, "step": 269900 }, { "epoch": 3.0294457421741807, "grad_norm": 11.994559288024902, "learning_rate": 1.0839588729938202e-05, "loss": 0.151, "step": 270000 }, { "epoch": 3.0296153800482113, "grad_norm": 1.343295931816101, "learning_rate": 1.0836195972457594e-05, "loss": 0.166, "step": 270100 }, { "epoch": 3.0297850179222414, "grad_norm": 16.38650131225586, "learning_rate": 1.083280321497699e-05, "loss": 0.1576, "step": 270200 }, { "epoch": 3.0299546557962715, "grad_norm": 3.1557910442352295, "learning_rate": 1.0829410457496383e-05, "loss": 0.1568, "step": 270300 }, { "epoch": 3.030124293670302, "grad_norm": 5.765024185180664, "learning_rate": 1.0826017700015776e-05, "loss": 0.1592, "step": 270400 }, { "epoch": 3.0302939315443322, "grad_norm": 10.466668128967285, "learning_rate": 1.0822624942535171e-05, "loss": 0.1493, "step": 270500 }, { "epoch": 3.030463569418363, "grad_norm": 20.431995391845703, "learning_rate": 1.0819232185054565e-05, "loss": 0.1734, "step": 270600 }, { "epoch": 3.030633207292393, "grad_norm": 22.098695755004883, "learning_rate": 1.081583942757396e-05, "loss": 0.1601, "step": 270700 }, { "epoch": 3.030802845166423, "grad_norm": 3.298430919647217, "learning_rate": 1.0812446670093352e-05, "loss": 0.151, "step": 270800 }, { "epoch": 3.0309724830404536, "grad_norm": 17.58994483947754, "learning_rate": 1.0809053912612746e-05, "loss": 0.1417, "step": 270900 }, { "epoch": 3.0311421209144838, "grad_norm": 7.217146873474121, "learning_rate": 1.0805661155132142e-05, "loss": 0.1539, "step": 271000 }, { "epoch": 3.0313117587885143, "grad_norm": 9.172521591186523, "learning_rate": 1.0802268397651534e-05, "loss": 0.155, "step": 271100 }, { "epoch": 3.0314813966625445, "grad_norm": 7.927305221557617, "learning_rate": 1.0798875640170928e-05, "loss": 0.1341, "step": 271200 }, { "epoch": 3.0316510345365746, "grad_norm": 23.2873477935791, "learning_rate": 1.0795482882690323e-05, "loss": 0.1606, "step": 271300 }, { "epoch": 3.031820672410605, "grad_norm": 14.955362319946289, "learning_rate": 1.0792090125209715e-05, "loss": 0.1589, "step": 271400 }, { "epoch": 3.0319903102846353, "grad_norm": 6.056772708892822, "learning_rate": 1.0788697367729109e-05, "loss": 0.1612, "step": 271500 }, { "epoch": 3.032159948158666, "grad_norm": 9.204275131225586, "learning_rate": 1.0785304610248505e-05, "loss": 0.142, "step": 271600 }, { "epoch": 3.032329586032696, "grad_norm": 25.9403076171875, "learning_rate": 1.0781911852767897e-05, "loss": 0.1549, "step": 271700 }, { "epoch": 3.032499223906726, "grad_norm": 5.354026794433594, "learning_rate": 1.0778519095287292e-05, "loss": 0.1436, "step": 271800 }, { "epoch": 3.0326688617807567, "grad_norm": 0.5512380599975586, "learning_rate": 1.0775126337806684e-05, "loss": 0.1482, "step": 271900 }, { "epoch": 3.032838499654787, "grad_norm": 18.330427169799805, "learning_rate": 1.0771733580326078e-05, "loss": 0.1704, "step": 272000 }, { "epoch": 3.0330081375288174, "grad_norm": 6.092288970947266, "learning_rate": 1.0768340822845474e-05, "loss": 0.1609, "step": 272100 }, { "epoch": 3.0331777754028475, "grad_norm": 20.31742286682129, "learning_rate": 1.0764948065364866e-05, "loss": 0.1463, "step": 272200 }, { "epoch": 3.0333474132768776, "grad_norm": 8.367773056030273, "learning_rate": 1.076155530788426e-05, "loss": 0.1451, "step": 272300 }, { "epoch": 3.033517051150908, "grad_norm": 5.444825649261475, "learning_rate": 1.0758162550403655e-05, "loss": 0.1514, "step": 272400 }, { "epoch": 3.0336866890249383, "grad_norm": 9.22183609008789, "learning_rate": 1.0754769792923047e-05, "loss": 0.1554, "step": 272500 }, { "epoch": 3.033856326898969, "grad_norm": 10.532670021057129, "learning_rate": 1.0751377035442442e-05, "loss": 0.1514, "step": 272600 }, { "epoch": 3.034025964772999, "grad_norm": 10.975471496582031, "learning_rate": 1.0747984277961836e-05, "loss": 0.1534, "step": 272700 }, { "epoch": 3.0341956026470296, "grad_norm": 11.464454650878906, "learning_rate": 1.0744591520481228e-05, "loss": 0.1455, "step": 272800 }, { "epoch": 3.0343652405210597, "grad_norm": 7.674105644226074, "learning_rate": 1.0741198763000624e-05, "loss": 0.1639, "step": 272900 }, { "epoch": 3.03453487839509, "grad_norm": 14.340052604675293, "learning_rate": 1.0737806005520018e-05, "loss": 0.1577, "step": 273000 }, { "epoch": 3.0347045162691204, "grad_norm": 7.500619411468506, "learning_rate": 1.073441324803941e-05, "loss": 0.1543, "step": 273100 }, { "epoch": 3.0348741541431505, "grad_norm": 2.9548473358154297, "learning_rate": 1.0731020490558805e-05, "loss": 0.157, "step": 273200 }, { "epoch": 3.035043792017181, "grad_norm": 10.908401489257812, "learning_rate": 1.0727627733078199e-05, "loss": 0.1649, "step": 273300 }, { "epoch": 3.0352134298912112, "grad_norm": 11.687898635864258, "learning_rate": 1.0724234975597591e-05, "loss": 0.1603, "step": 273400 }, { "epoch": 3.0353830677652414, "grad_norm": 5.1739397048950195, "learning_rate": 1.0720842218116987e-05, "loss": 0.1628, "step": 273500 }, { "epoch": 3.035552705639272, "grad_norm": 13.077430725097656, "learning_rate": 1.071744946063638e-05, "loss": 0.1626, "step": 273600 }, { "epoch": 3.035722343513302, "grad_norm": 9.085679054260254, "learning_rate": 1.0714056703155776e-05, "loss": 0.1454, "step": 273700 }, { "epoch": 3.0358919813873326, "grad_norm": 11.874102592468262, "learning_rate": 1.0710663945675168e-05, "loss": 0.1594, "step": 273800 }, { "epoch": 3.0360616192613628, "grad_norm": 6.318803787231445, "learning_rate": 1.0707271188194562e-05, "loss": 0.1464, "step": 273900 }, { "epoch": 3.036231257135393, "grad_norm": 3.6595921516418457, "learning_rate": 1.0703878430713957e-05, "loss": 0.1514, "step": 274000 }, { "epoch": 3.0364008950094235, "grad_norm": 6.508599281311035, "learning_rate": 1.070048567323335e-05, "loss": 0.1579, "step": 274100 }, { "epoch": 3.0365705328834536, "grad_norm": 7.110990524291992, "learning_rate": 1.0697092915752743e-05, "loss": 0.1536, "step": 274200 }, { "epoch": 3.036740170757484, "grad_norm": 9.764284133911133, "learning_rate": 1.0693700158272137e-05, "loss": 0.1625, "step": 274300 }, { "epoch": 3.0369098086315143, "grad_norm": 2.1606273651123047, "learning_rate": 1.069030740079153e-05, "loss": 0.1537, "step": 274400 }, { "epoch": 3.0370794465055444, "grad_norm": 10.484155654907227, "learning_rate": 1.0686914643310926e-05, "loss": 0.1541, "step": 274500 }, { "epoch": 3.037249084379575, "grad_norm": 12.764707565307617, "learning_rate": 1.0683521885830318e-05, "loss": 0.144, "step": 274600 }, { "epoch": 3.037418722253605, "grad_norm": 11.691524505615234, "learning_rate": 1.0680129128349712e-05, "loss": 0.1433, "step": 274700 }, { "epoch": 3.0375883601276357, "grad_norm": 11.436172485351562, "learning_rate": 1.0676736370869108e-05, "loss": 0.1449, "step": 274800 }, { "epoch": 3.037757998001666, "grad_norm": 12.15080451965332, "learning_rate": 1.06733436133885e-05, "loss": 0.1678, "step": 274900 }, { "epoch": 3.0379276358756964, "grad_norm": 9.971243858337402, "learning_rate": 1.0669950855907894e-05, "loss": 0.1374, "step": 275000 }, { "epoch": 3.0380972737497265, "grad_norm": 1.8355307579040527, "learning_rate": 1.0666558098427289e-05, "loss": 0.1534, "step": 275100 }, { "epoch": 3.0382669116237566, "grad_norm": 10.537922859191895, "learning_rate": 1.0663165340946681e-05, "loss": 0.141, "step": 275200 }, { "epoch": 3.038436549497787, "grad_norm": 10.619355201721191, "learning_rate": 1.0659772583466075e-05, "loss": 0.1562, "step": 275300 }, { "epoch": 3.0386061873718173, "grad_norm": 2.8350930213928223, "learning_rate": 1.065637982598547e-05, "loss": 0.1472, "step": 275400 }, { "epoch": 3.038775825245848, "grad_norm": 6.209840297698975, "learning_rate": 1.0652987068504863e-05, "loss": 0.1494, "step": 275500 }, { "epoch": 3.038945463119878, "grad_norm": 11.679935455322266, "learning_rate": 1.0649594311024258e-05, "loss": 0.158, "step": 275600 }, { "epoch": 3.039115100993908, "grad_norm": 11.010242462158203, "learning_rate": 1.0646201553543652e-05, "loss": 0.1441, "step": 275700 }, { "epoch": 3.0392847388679387, "grad_norm": 5.518909454345703, "learning_rate": 1.0642808796063044e-05, "loss": 0.1512, "step": 275800 }, { "epoch": 3.039454376741969, "grad_norm": 6.219600677490234, "learning_rate": 1.063941603858244e-05, "loss": 0.1499, "step": 275900 }, { "epoch": 3.0396240146159994, "grad_norm": 10.654524803161621, "learning_rate": 1.0636023281101833e-05, "loss": 0.1618, "step": 276000 }, { "epoch": 3.0397936524900295, "grad_norm": 6.117408275604248, "learning_rate": 1.0632630523621225e-05, "loss": 0.1484, "step": 276100 }, { "epoch": 3.0399632903640597, "grad_norm": 3.0971076488494873, "learning_rate": 1.062923776614062e-05, "loss": 0.1523, "step": 276200 }, { "epoch": 3.0401329282380902, "grad_norm": 1.460566759109497, "learning_rate": 1.0625845008660015e-05, "loss": 0.1416, "step": 276300 }, { "epoch": 3.0403025661121204, "grad_norm": 9.416996002197266, "learning_rate": 1.062245225117941e-05, "loss": 0.1528, "step": 276400 }, { "epoch": 3.040472203986151, "grad_norm": 8.550689697265625, "learning_rate": 1.0619059493698802e-05, "loss": 0.1484, "step": 276500 }, { "epoch": 3.040641841860181, "grad_norm": 8.4969482421875, "learning_rate": 1.0615666736218196e-05, "loss": 0.175, "step": 276600 }, { "epoch": 3.040811479734211, "grad_norm": 9.158646583557129, "learning_rate": 1.061227397873759e-05, "loss": 0.1587, "step": 276700 }, { "epoch": 3.0409811176082417, "grad_norm": 12.332466125488281, "learning_rate": 1.0608881221256984e-05, "loss": 0.1601, "step": 276800 }, { "epoch": 3.041150755482272, "grad_norm": 12.582658767700195, "learning_rate": 1.0605488463776377e-05, "loss": 0.1528, "step": 276900 }, { "epoch": 3.0413203933563024, "grad_norm": 8.893961906433105, "learning_rate": 1.0602095706295771e-05, "loss": 0.1658, "step": 277000 }, { "epoch": 3.0414900312303326, "grad_norm": 7.1197919845581055, "learning_rate": 1.0598702948815165e-05, "loss": 0.1624, "step": 277100 }, { "epoch": 3.041659669104363, "grad_norm": 8.699812889099121, "learning_rate": 1.0595310191334559e-05, "loss": 0.1543, "step": 277200 }, { "epoch": 3.0418293069783933, "grad_norm": 12.104179382324219, "learning_rate": 1.0591917433853953e-05, "loss": 0.1453, "step": 277300 }, { "epoch": 3.0419989448524234, "grad_norm": 1.827694296836853, "learning_rate": 1.0588524676373346e-05, "loss": 0.1578, "step": 277400 }, { "epoch": 3.042168582726454, "grad_norm": 7.066198825836182, "learning_rate": 1.0585131918892742e-05, "loss": 0.1541, "step": 277500 }, { "epoch": 3.042338220600484, "grad_norm": 6.5160980224609375, "learning_rate": 1.0581739161412134e-05, "loss": 0.1591, "step": 277600 }, { "epoch": 3.0425078584745147, "grad_norm": 7.305414199829102, "learning_rate": 1.0578346403931528e-05, "loss": 0.1426, "step": 277700 }, { "epoch": 3.042677496348545, "grad_norm": 33.0880012512207, "learning_rate": 1.0574953646450923e-05, "loss": 0.1402, "step": 277800 }, { "epoch": 3.042847134222575, "grad_norm": 21.34502601623535, "learning_rate": 1.0571560888970315e-05, "loss": 0.1427, "step": 277900 }, { "epoch": 3.0430167720966055, "grad_norm": 8.229662895202637, "learning_rate": 1.0568168131489709e-05, "loss": 0.145, "step": 278000 }, { "epoch": 3.0431864099706356, "grad_norm": 3.8914146423339844, "learning_rate": 1.0564775374009105e-05, "loss": 0.1598, "step": 278100 }, { "epoch": 3.043356047844666, "grad_norm": 1.3231157064437866, "learning_rate": 1.0561382616528497e-05, "loss": 0.1451, "step": 278200 }, { "epoch": 3.0435256857186963, "grad_norm": 11.843729019165039, "learning_rate": 1.0557989859047892e-05, "loss": 0.1625, "step": 278300 }, { "epoch": 3.0436953235927264, "grad_norm": 12.33315372467041, "learning_rate": 1.0554597101567286e-05, "loss": 0.16, "step": 278400 }, { "epoch": 3.043864961466757, "grad_norm": 17.10000991821289, "learning_rate": 1.0551204344086678e-05, "loss": 0.1479, "step": 278500 }, { "epoch": 3.044034599340787, "grad_norm": 10.123991012573242, "learning_rate": 1.0547811586606074e-05, "loss": 0.1563, "step": 278600 }, { "epoch": 3.0442042372148177, "grad_norm": 5.522684097290039, "learning_rate": 1.0544418829125467e-05, "loss": 0.1516, "step": 278700 }, { "epoch": 3.044373875088848, "grad_norm": 8.821853637695312, "learning_rate": 1.054102607164486e-05, "loss": 0.155, "step": 278800 }, { "epoch": 3.044543512962878, "grad_norm": 22.614259719848633, "learning_rate": 1.0537633314164255e-05, "loss": 0.1434, "step": 278900 }, { "epoch": 3.0447131508369085, "grad_norm": 10.30262279510498, "learning_rate": 1.0534240556683649e-05, "loss": 0.1687, "step": 279000 }, { "epoch": 3.0448827887109386, "grad_norm": 10.847270965576172, "learning_rate": 1.0530847799203041e-05, "loss": 0.1515, "step": 279100 }, { "epoch": 3.045052426584969, "grad_norm": 0.8757480978965759, "learning_rate": 1.0527455041722436e-05, "loss": 0.1542, "step": 279200 }, { "epoch": 3.0452220644589993, "grad_norm": 22.479379653930664, "learning_rate": 1.052406228424183e-05, "loss": 0.1564, "step": 279300 }, { "epoch": 3.0453917023330295, "grad_norm": 7.732309818267822, "learning_rate": 1.0520669526761224e-05, "loss": 0.1466, "step": 279400 }, { "epoch": 3.04556134020706, "grad_norm": 5.350580215454102, "learning_rate": 1.0517276769280618e-05, "loss": 0.1665, "step": 279500 }, { "epoch": 3.04573097808109, "grad_norm": 15.878984451293945, "learning_rate": 1.0513884011800012e-05, "loss": 0.1567, "step": 279600 }, { "epoch": 3.0459006159551207, "grad_norm": 23.68988609313965, "learning_rate": 1.0510491254319405e-05, "loss": 0.1515, "step": 279700 }, { "epoch": 3.046070253829151, "grad_norm": 20.24547004699707, "learning_rate": 1.05070984968388e-05, "loss": 0.1567, "step": 279800 }, { "epoch": 3.0462398917031814, "grad_norm": 12.247384071350098, "learning_rate": 1.0503705739358191e-05, "loss": 0.1548, "step": 279900 }, { "epoch": 3.0464095295772116, "grad_norm": 9.208386421203613, "learning_rate": 1.0500312981877587e-05, "loss": 0.1603, "step": 280000 }, { "epoch": 3.0465791674512417, "grad_norm": 6.749161720275879, "learning_rate": 1.049692022439698e-05, "loss": 0.1492, "step": 280100 }, { "epoch": 3.0467488053252723, "grad_norm": 11.327890396118164, "learning_rate": 1.0493527466916376e-05, "loss": 0.1581, "step": 280200 }, { "epoch": 3.0469184431993024, "grad_norm": 12.144312858581543, "learning_rate": 1.0490134709435768e-05, "loss": 0.1625, "step": 280300 }, { "epoch": 3.047088081073333, "grad_norm": 8.07168197631836, "learning_rate": 1.0486741951955162e-05, "loss": 0.1515, "step": 280400 }, { "epoch": 3.047257718947363, "grad_norm": 7.432772636413574, "learning_rate": 1.0483349194474557e-05, "loss": 0.1397, "step": 280500 }, { "epoch": 3.047427356821393, "grad_norm": 1.4622809886932373, "learning_rate": 1.047995643699395e-05, "loss": 0.1552, "step": 280600 }, { "epoch": 3.0475969946954238, "grad_norm": 17.0101261138916, "learning_rate": 1.0476563679513343e-05, "loss": 0.1627, "step": 280700 }, { "epoch": 3.047766632569454, "grad_norm": 6.399352550506592, "learning_rate": 1.0473170922032739e-05, "loss": 0.1525, "step": 280800 }, { "epoch": 3.0479362704434845, "grad_norm": 16.494226455688477, "learning_rate": 1.0469778164552131e-05, "loss": 0.1444, "step": 280900 }, { "epoch": 3.0481059083175146, "grad_norm": 9.555428504943848, "learning_rate": 1.0466385407071525e-05, "loss": 0.1429, "step": 281000 }, { "epoch": 3.0482755461915447, "grad_norm": 15.844372749328613, "learning_rate": 1.046299264959092e-05, "loss": 0.1582, "step": 281100 }, { "epoch": 3.0484451840655753, "grad_norm": 20.859193801879883, "learning_rate": 1.0459599892110312e-05, "loss": 0.1341, "step": 281200 }, { "epoch": 3.0486148219396054, "grad_norm": 10.637293815612793, "learning_rate": 1.0456207134629708e-05, "loss": 0.1537, "step": 281300 }, { "epoch": 3.048784459813636, "grad_norm": 0.6746477484703064, "learning_rate": 1.0452814377149102e-05, "loss": 0.1446, "step": 281400 }, { "epoch": 3.048954097687666, "grad_norm": 8.761613845825195, "learning_rate": 1.0449421619668494e-05, "loss": 0.1575, "step": 281500 }, { "epoch": 3.0491237355616962, "grad_norm": 14.853021621704102, "learning_rate": 1.044602886218789e-05, "loss": 0.1596, "step": 281600 }, { "epoch": 3.049293373435727, "grad_norm": 4.094226837158203, "learning_rate": 1.0442636104707283e-05, "loss": 0.1491, "step": 281700 }, { "epoch": 3.049463011309757, "grad_norm": 8.79259204864502, "learning_rate": 1.0439243347226675e-05, "loss": 0.1596, "step": 281800 }, { "epoch": 3.0496326491837875, "grad_norm": 5.393545150756836, "learning_rate": 1.043585058974607e-05, "loss": 0.1553, "step": 281900 }, { "epoch": 3.0498022870578176, "grad_norm": 10.10053539276123, "learning_rate": 1.0432457832265463e-05, "loss": 0.1486, "step": 282000 }, { "epoch": 3.0499719249318478, "grad_norm": 10.174798965454102, "learning_rate": 1.0429065074784856e-05, "loss": 0.1605, "step": 282100 }, { "epoch": 3.0501415628058783, "grad_norm": 4.397825717926025, "learning_rate": 1.0425672317304252e-05, "loss": 0.1452, "step": 282200 }, { "epoch": 3.0503112006799085, "grad_norm": 14.1172513961792, "learning_rate": 1.0422279559823644e-05, "loss": 0.14, "step": 282300 }, { "epoch": 3.050480838553939, "grad_norm": 13.556111335754395, "learning_rate": 1.041888680234304e-05, "loss": 0.1703, "step": 282400 }, { "epoch": 3.050650476427969, "grad_norm": 17.3558349609375, "learning_rate": 1.0415494044862433e-05, "loss": 0.1479, "step": 282500 }, { "epoch": 3.0508201143019997, "grad_norm": 4.454570770263672, "learning_rate": 1.0412101287381825e-05, "loss": 0.17, "step": 282600 }, { "epoch": 3.05098975217603, "grad_norm": 10.001524925231934, "learning_rate": 1.0408708529901221e-05, "loss": 0.1551, "step": 282700 }, { "epoch": 3.05115939005006, "grad_norm": 18.696088790893555, "learning_rate": 1.0405315772420615e-05, "loss": 0.1716, "step": 282800 }, { "epoch": 3.0513290279240906, "grad_norm": 4.925704002380371, "learning_rate": 1.0401923014940007e-05, "loss": 0.1629, "step": 282900 }, { "epoch": 3.0514986657981207, "grad_norm": 6.537905693054199, "learning_rate": 1.0398530257459402e-05, "loss": 0.154, "step": 283000 }, { "epoch": 3.0516683036721513, "grad_norm": 14.683539390563965, "learning_rate": 1.0395137499978796e-05, "loss": 0.1562, "step": 283100 }, { "epoch": 3.0518379415461814, "grad_norm": 15.45533275604248, "learning_rate": 1.0391744742498192e-05, "loss": 0.1557, "step": 283200 }, { "epoch": 3.0520075794202115, "grad_norm": 8.611940383911133, "learning_rate": 1.0388351985017584e-05, "loss": 0.1726, "step": 283300 }, { "epoch": 3.052177217294242, "grad_norm": 9.38478946685791, "learning_rate": 1.0384959227536977e-05, "loss": 0.1406, "step": 283400 }, { "epoch": 3.052346855168272, "grad_norm": 10.511954307556152, "learning_rate": 1.0381566470056373e-05, "loss": 0.1615, "step": 283500 }, { "epoch": 3.0525164930423028, "grad_norm": 8.072473526000977, "learning_rate": 1.0378173712575765e-05, "loss": 0.1526, "step": 283600 }, { "epoch": 3.052686130916333, "grad_norm": 3.5554392337799072, "learning_rate": 1.0374780955095159e-05, "loss": 0.1785, "step": 283700 }, { "epoch": 3.052855768790363, "grad_norm": 14.283459663391113, "learning_rate": 1.0371388197614554e-05, "loss": 0.1524, "step": 283800 }, { "epoch": 3.0530254066643936, "grad_norm": 2.960000991821289, "learning_rate": 1.0367995440133946e-05, "loss": 0.1651, "step": 283900 }, { "epoch": 3.0531950445384237, "grad_norm": 5.733016490936279, "learning_rate": 1.036460268265334e-05, "loss": 0.1572, "step": 284000 }, { "epoch": 3.0533646824124543, "grad_norm": 5.506923198699951, "learning_rate": 1.0361209925172736e-05, "loss": 0.1728, "step": 284100 }, { "epoch": 3.0535343202864844, "grad_norm": 4.472716331481934, "learning_rate": 1.0357817167692128e-05, "loss": 0.185, "step": 284200 }, { "epoch": 3.0537039581605145, "grad_norm": 15.217207908630371, "learning_rate": 1.0354424410211523e-05, "loss": 0.1396, "step": 284300 }, { "epoch": 3.053873596034545, "grad_norm": 1.806425929069519, "learning_rate": 1.0351031652730915e-05, "loss": 0.1452, "step": 284400 }, { "epoch": 3.0540432339085752, "grad_norm": 1.7473758459091187, "learning_rate": 1.034763889525031e-05, "loss": 0.1348, "step": 284500 }, { "epoch": 3.054212871782606, "grad_norm": 19.017946243286133, "learning_rate": 1.0344246137769705e-05, "loss": 0.1503, "step": 284600 }, { "epoch": 3.054382509656636, "grad_norm": 2.08984112739563, "learning_rate": 1.0340853380289097e-05, "loss": 0.1405, "step": 284700 }, { "epoch": 3.054552147530666, "grad_norm": 12.714713096618652, "learning_rate": 1.033746062280849e-05, "loss": 0.1639, "step": 284800 }, { "epoch": 3.0547217854046966, "grad_norm": 8.354045867919922, "learning_rate": 1.0334067865327886e-05, "loss": 0.1674, "step": 284900 }, { "epoch": 3.0548914232787268, "grad_norm": 5.359758377075195, "learning_rate": 1.0330675107847278e-05, "loss": 0.1633, "step": 285000 }, { "epoch": 3.0550610611527573, "grad_norm": 1.86589777469635, "learning_rate": 1.0327282350366674e-05, "loss": 0.1383, "step": 285100 }, { "epoch": 3.0552306990267875, "grad_norm": 12.48970890045166, "learning_rate": 1.0323889592886068e-05, "loss": 0.1519, "step": 285200 }, { "epoch": 3.055400336900818, "grad_norm": 13.4497652053833, "learning_rate": 1.032049683540546e-05, "loss": 0.1582, "step": 285300 }, { "epoch": 3.055569974774848, "grad_norm": 6.420639514923096, "learning_rate": 1.0317104077924855e-05, "loss": 0.1413, "step": 285400 }, { "epoch": 3.0557396126488783, "grad_norm": 12.233366012573242, "learning_rate": 1.0313711320444249e-05, "loss": 0.1525, "step": 285500 }, { "epoch": 3.055909250522909, "grad_norm": 4.512674331665039, "learning_rate": 1.0310318562963641e-05, "loss": 0.1571, "step": 285600 }, { "epoch": 3.056078888396939, "grad_norm": 4.601346969604492, "learning_rate": 1.0306925805483036e-05, "loss": 0.1523, "step": 285700 }, { "epoch": 3.0562485262709695, "grad_norm": 10.609725952148438, "learning_rate": 1.030353304800243e-05, "loss": 0.1535, "step": 285800 }, { "epoch": 3.0564181641449997, "grad_norm": 8.47838306427002, "learning_rate": 1.0300140290521822e-05, "loss": 0.1567, "step": 285900 }, { "epoch": 3.05658780201903, "grad_norm": 1.6939266920089722, "learning_rate": 1.0296747533041218e-05, "loss": 0.1457, "step": 286000 }, { "epoch": 3.0567574398930604, "grad_norm": 3.3652281761169434, "learning_rate": 1.0293354775560612e-05, "loss": 0.167, "step": 286100 }, { "epoch": 3.0569270777670905, "grad_norm": 4.734527587890625, "learning_rate": 1.0289962018080007e-05, "loss": 0.1652, "step": 286200 }, { "epoch": 3.057096715641121, "grad_norm": 14.071426391601562, "learning_rate": 1.02865692605994e-05, "loss": 0.144, "step": 286300 }, { "epoch": 3.057266353515151, "grad_norm": 10.030948638916016, "learning_rate": 1.0283176503118793e-05, "loss": 0.1587, "step": 286400 }, { "epoch": 3.0574359913891813, "grad_norm": 8.056357383728027, "learning_rate": 1.0279783745638189e-05, "loss": 0.1576, "step": 286500 }, { "epoch": 3.057605629263212, "grad_norm": 12.558915138244629, "learning_rate": 1.027639098815758e-05, "loss": 0.1599, "step": 286600 }, { "epoch": 3.057775267137242, "grad_norm": 6.904665470123291, "learning_rate": 1.0272998230676974e-05, "loss": 0.1498, "step": 286700 }, { "epoch": 3.0579449050112726, "grad_norm": 10.15954875946045, "learning_rate": 1.0269605473196368e-05, "loss": 0.1506, "step": 286800 }, { "epoch": 3.0581145428853027, "grad_norm": 7.451841831207275, "learning_rate": 1.0266212715715762e-05, "loss": 0.1482, "step": 286900 }, { "epoch": 3.058284180759333, "grad_norm": 13.540942192077637, "learning_rate": 1.0262819958235158e-05, "loss": 0.1436, "step": 287000 }, { "epoch": 3.0584538186333634, "grad_norm": 9.730463027954102, "learning_rate": 1.025942720075455e-05, "loss": 0.1653, "step": 287100 }, { "epoch": 3.0586234565073935, "grad_norm": 6.998121738433838, "learning_rate": 1.0256034443273943e-05, "loss": 0.1566, "step": 287200 }, { "epoch": 3.058793094381424, "grad_norm": 14.938324928283691, "learning_rate": 1.0252641685793339e-05, "loss": 0.15, "step": 287300 }, { "epoch": 3.0589627322554542, "grad_norm": 14.451517105102539, "learning_rate": 1.0249248928312731e-05, "loss": 0.1407, "step": 287400 }, { "epoch": 3.059132370129485, "grad_norm": 13.281218528747559, "learning_rate": 1.0245856170832125e-05, "loss": 0.1583, "step": 287500 }, { "epoch": 3.059302008003515, "grad_norm": 8.591048240661621, "learning_rate": 1.024246341335152e-05, "loss": 0.1485, "step": 287600 }, { "epoch": 3.059471645877545, "grad_norm": 14.771340370178223, "learning_rate": 1.0239070655870912e-05, "loss": 0.1542, "step": 287700 }, { "epoch": 3.0596412837515756, "grad_norm": 5.535162925720215, "learning_rate": 1.0235677898390306e-05, "loss": 0.1595, "step": 287800 }, { "epoch": 3.0598109216256058, "grad_norm": 9.059183120727539, "learning_rate": 1.0232285140909702e-05, "loss": 0.1495, "step": 287900 }, { "epoch": 3.0599805594996363, "grad_norm": 9.929990768432617, "learning_rate": 1.0228892383429094e-05, "loss": 0.1593, "step": 288000 }, { "epoch": 3.0601501973736664, "grad_norm": 7.771573543548584, "learning_rate": 1.022549962594849e-05, "loss": 0.1574, "step": 288100 }, { "epoch": 3.0603198352476966, "grad_norm": 16.755142211914062, "learning_rate": 1.0222106868467883e-05, "loss": 0.172, "step": 288200 }, { "epoch": 3.060489473121727, "grad_norm": 6.495147228240967, "learning_rate": 1.0218714110987275e-05, "loss": 0.1537, "step": 288300 }, { "epoch": 3.0606591109957573, "grad_norm": 5.46190881729126, "learning_rate": 1.021532135350667e-05, "loss": 0.1558, "step": 288400 }, { "epoch": 3.060828748869788, "grad_norm": 20.5092716217041, "learning_rate": 1.0211928596026064e-05, "loss": 0.1673, "step": 288500 }, { "epoch": 3.060998386743818, "grad_norm": 12.621013641357422, "learning_rate": 1.0208535838545457e-05, "loss": 0.1552, "step": 288600 }, { "epoch": 3.061168024617848, "grad_norm": 4.6638922691345215, "learning_rate": 1.0205143081064852e-05, "loss": 0.1512, "step": 288700 }, { "epoch": 3.0613376624918787, "grad_norm": 16.296987533569336, "learning_rate": 1.0201750323584246e-05, "loss": 0.1514, "step": 288800 }, { "epoch": 3.061507300365909, "grad_norm": 11.383593559265137, "learning_rate": 1.019835756610364e-05, "loss": 0.1544, "step": 288900 }, { "epoch": 3.0616769382399394, "grad_norm": 12.533166885375977, "learning_rate": 1.0194964808623033e-05, "loss": 0.157, "step": 289000 }, { "epoch": 3.0618465761139695, "grad_norm": 5.4589457511901855, "learning_rate": 1.0191572051142427e-05, "loss": 0.1548, "step": 289100 }, { "epoch": 3.0620162139879996, "grad_norm": 11.434454917907715, "learning_rate": 1.0188179293661821e-05, "loss": 0.158, "step": 289200 }, { "epoch": 3.06218585186203, "grad_norm": 7.625936985015869, "learning_rate": 1.0184786536181215e-05, "loss": 0.1667, "step": 289300 }, { "epoch": 3.0623554897360603, "grad_norm": 10.938652992248535, "learning_rate": 1.0181393778700609e-05, "loss": 0.1487, "step": 289400 }, { "epoch": 3.062525127610091, "grad_norm": 8.181602478027344, "learning_rate": 1.0178001021220002e-05, "loss": 0.1532, "step": 289500 }, { "epoch": 3.062694765484121, "grad_norm": 8.93181324005127, "learning_rate": 1.0174608263739396e-05, "loss": 0.1621, "step": 289600 }, { "epoch": 3.0628644033581516, "grad_norm": 1.7509123086929321, "learning_rate": 1.017121550625879e-05, "loss": 0.1561, "step": 289700 }, { "epoch": 3.0630340412321817, "grad_norm": 10.677427291870117, "learning_rate": 1.0167822748778184e-05, "loss": 0.1629, "step": 289800 }, { "epoch": 3.063203679106212, "grad_norm": 2.465341329574585, "learning_rate": 1.0164429991297578e-05, "loss": 0.1472, "step": 289900 }, { "epoch": 3.0633733169802424, "grad_norm": 9.191140174865723, "learning_rate": 1.0161037233816973e-05, "loss": 0.1607, "step": 290000 }, { "epoch": 3.0635429548542725, "grad_norm": 17.820270538330078, "learning_rate": 1.0157644476336365e-05, "loss": 0.1557, "step": 290100 }, { "epoch": 3.063712592728303, "grad_norm": 6.872378349304199, "learning_rate": 1.0154251718855759e-05, "loss": 0.1521, "step": 290200 }, { "epoch": 3.0638822306023332, "grad_norm": 3.9641504287719727, "learning_rate": 1.0150858961375154e-05, "loss": 0.1521, "step": 290300 }, { "epoch": 3.0640518684763633, "grad_norm": 10.190421104431152, "learning_rate": 1.0147466203894547e-05, "loss": 0.1679, "step": 290400 }, { "epoch": 3.064221506350394, "grad_norm": 10.26928424835205, "learning_rate": 1.014407344641394e-05, "loss": 0.1533, "step": 290500 }, { "epoch": 3.064391144224424, "grad_norm": 4.450615406036377, "learning_rate": 1.0140680688933336e-05, "loss": 0.1614, "step": 290600 }, { "epoch": 3.0645607820984546, "grad_norm": 7.007386207580566, "learning_rate": 1.0137287931452728e-05, "loss": 0.1563, "step": 290700 }, { "epoch": 3.0647304199724847, "grad_norm": 14.78759765625, "learning_rate": 1.0133895173972123e-05, "loss": 0.1557, "step": 290800 }, { "epoch": 3.064900057846515, "grad_norm": 4.8821587562561035, "learning_rate": 1.0130502416491517e-05, "loss": 0.1487, "step": 290900 }, { "epoch": 3.0650696957205454, "grad_norm": 10.25904655456543, "learning_rate": 1.012710965901091e-05, "loss": 0.1555, "step": 291000 }, { "epoch": 3.0652393335945756, "grad_norm": 9.174434661865234, "learning_rate": 1.0123716901530305e-05, "loss": 0.1581, "step": 291100 }, { "epoch": 3.065408971468606, "grad_norm": 6.67979621887207, "learning_rate": 1.0120324144049699e-05, "loss": 0.1526, "step": 291200 }, { "epoch": 3.0655786093426363, "grad_norm": 16.158374786376953, "learning_rate": 1.011693138656909e-05, "loss": 0.1509, "step": 291300 }, { "epoch": 3.0657482472166664, "grad_norm": 5.206679344177246, "learning_rate": 1.0113538629088486e-05, "loss": 0.1653, "step": 291400 }, { "epoch": 3.065917885090697, "grad_norm": 13.50239372253418, "learning_rate": 1.011014587160788e-05, "loss": 0.1611, "step": 291500 }, { "epoch": 3.066087522964727, "grad_norm": 2.074781656265259, "learning_rate": 1.0106753114127272e-05, "loss": 0.1533, "step": 291600 }, { "epoch": 3.0662571608387577, "grad_norm": 6.413392543792725, "learning_rate": 1.0103360356646668e-05, "loss": 0.1623, "step": 291700 }, { "epoch": 3.066426798712788, "grad_norm": 3.957357406616211, "learning_rate": 1.0099967599166061e-05, "loss": 0.157, "step": 291800 }, { "epoch": 3.066596436586818, "grad_norm": 2.924762725830078, "learning_rate": 1.0096574841685455e-05, "loss": 0.1584, "step": 291900 }, { "epoch": 3.0667660744608485, "grad_norm": 10.836687088012695, "learning_rate": 1.0093182084204849e-05, "loss": 0.1643, "step": 292000 }, { "epoch": 3.0669357123348786, "grad_norm": 2.1579530239105225, "learning_rate": 1.0089789326724241e-05, "loss": 0.1539, "step": 292100 }, { "epoch": 3.067105350208909, "grad_norm": 10.761563301086426, "learning_rate": 1.0086396569243637e-05, "loss": 0.1499, "step": 292200 }, { "epoch": 3.0672749880829393, "grad_norm": 6.928286075592041, "learning_rate": 1.008300381176303e-05, "loss": 0.1331, "step": 292300 }, { "epoch": 3.06744462595697, "grad_norm": 5.1343913078308105, "learning_rate": 1.0079611054282422e-05, "loss": 0.1677, "step": 292400 }, { "epoch": 3.067614263831, "grad_norm": 12.268917083740234, "learning_rate": 1.0076218296801818e-05, "loss": 0.1358, "step": 292500 }, { "epoch": 3.06778390170503, "grad_norm": 10.127782821655273, "learning_rate": 1.0072825539321212e-05, "loss": 0.1414, "step": 292600 }, { "epoch": 3.0679535395790607, "grad_norm": 3.008007049560547, "learning_rate": 1.0069432781840604e-05, "loss": 0.1332, "step": 292700 }, { "epoch": 3.068123177453091, "grad_norm": 3.3134982585906982, "learning_rate": 1.006604002436e-05, "loss": 0.1492, "step": 292800 }, { "epoch": 3.0682928153271214, "grad_norm": 20.15691375732422, "learning_rate": 1.0062647266879393e-05, "loss": 0.163, "step": 292900 }, { "epoch": 3.0684624532011515, "grad_norm": 14.074825286865234, "learning_rate": 1.0059254509398789e-05, "loss": 0.1476, "step": 293000 }, { "epoch": 3.0686320910751816, "grad_norm": 15.28664493560791, "learning_rate": 1.005586175191818e-05, "loss": 0.1423, "step": 293100 }, { "epoch": 3.068801728949212, "grad_norm": 7.060014724731445, "learning_rate": 1.0052468994437575e-05, "loss": 0.1565, "step": 293200 }, { "epoch": 3.0689713668232423, "grad_norm": 7.609553813934326, "learning_rate": 1.004907623695697e-05, "loss": 0.1574, "step": 293300 }, { "epoch": 3.069141004697273, "grad_norm": 0.6592801213264465, "learning_rate": 1.0045683479476362e-05, "loss": 0.1498, "step": 293400 }, { "epoch": 3.069310642571303, "grad_norm": 13.036972999572754, "learning_rate": 1.0042290721995756e-05, "loss": 0.1674, "step": 293500 }, { "epoch": 3.069480280445333, "grad_norm": 5.463217735290527, "learning_rate": 1.0038897964515151e-05, "loss": 0.1512, "step": 293600 }, { "epoch": 3.0696499183193637, "grad_norm": 3.785287857055664, "learning_rate": 1.0035505207034544e-05, "loss": 0.1527, "step": 293700 }, { "epoch": 3.069819556193394, "grad_norm": 5.100829124450684, "learning_rate": 1.0032112449553939e-05, "loss": 0.156, "step": 293800 }, { "epoch": 3.0699891940674244, "grad_norm": 7.399212837219238, "learning_rate": 1.0028719692073333e-05, "loss": 0.1669, "step": 293900 }, { "epoch": 3.0701588319414546, "grad_norm": 10.142976760864258, "learning_rate": 1.0025326934592725e-05, "loss": 0.1549, "step": 294000 }, { "epoch": 3.0703284698154847, "grad_norm": 11.383374214172363, "learning_rate": 1.002193417711212e-05, "loss": 0.1552, "step": 294100 }, { "epoch": 3.0704981076895153, "grad_norm": 9.65023422241211, "learning_rate": 1.0018541419631514e-05, "loss": 0.1522, "step": 294200 }, { "epoch": 3.0706677455635454, "grad_norm": 3.3531785011291504, "learning_rate": 1.0015148662150906e-05, "loss": 0.1461, "step": 294300 }, { "epoch": 3.070837383437576, "grad_norm": 7.738694667816162, "learning_rate": 1.0011755904670302e-05, "loss": 0.1575, "step": 294400 }, { "epoch": 3.071007021311606, "grad_norm": 8.140894889831543, "learning_rate": 1.0008363147189694e-05, "loss": 0.1679, "step": 294500 }, { "epoch": 3.071176659185636, "grad_norm": 18.283039093017578, "learning_rate": 1.0004970389709088e-05, "loss": 0.158, "step": 294600 }, { "epoch": 3.0713462970596668, "grad_norm": 1.8624451160430908, "learning_rate": 1.0001577632228483e-05, "loss": 0.1595, "step": 294700 }, { "epoch": 3.071515934933697, "grad_norm": 12.813203811645508, "learning_rate": 9.998184874747875e-06, "loss": 0.1515, "step": 294800 }, { "epoch": 3.0716855728077275, "grad_norm": 11.285210609436035, "learning_rate": 9.99479211726727e-06, "loss": 0.1541, "step": 294900 }, { "epoch": 3.0718552106817576, "grad_norm": 5.851372241973877, "learning_rate": 9.991399359786665e-06, "loss": 0.1597, "step": 295000 }, { "epoch": 3.072024848555788, "grad_norm": 8.74927806854248, "learning_rate": 9.988006602306058e-06, "loss": 0.17, "step": 295100 }, { "epoch": 3.0721944864298183, "grad_norm": 9.946389198303223, "learning_rate": 9.984613844825452e-06, "loss": 0.151, "step": 295200 }, { "epoch": 3.0723641243038484, "grad_norm": 2.90785551071167, "learning_rate": 9.981221087344846e-06, "loss": 0.1612, "step": 295300 }, { "epoch": 3.072533762177879, "grad_norm": 9.060962677001953, "learning_rate": 9.97782832986424e-06, "loss": 0.1495, "step": 295400 }, { "epoch": 3.072703400051909, "grad_norm": 4.12734317779541, "learning_rate": 9.974435572383634e-06, "loss": 0.1496, "step": 295500 }, { "epoch": 3.0728730379259397, "grad_norm": 9.046548843383789, "learning_rate": 9.971042814903027e-06, "loss": 0.1462, "step": 295600 }, { "epoch": 3.07304267579997, "grad_norm": 10.788717269897461, "learning_rate": 9.967650057422421e-06, "loss": 0.1423, "step": 295700 }, { "epoch": 3.073212313674, "grad_norm": 10.79653549194336, "learning_rate": 9.964257299941815e-06, "loss": 0.1583, "step": 295800 }, { "epoch": 3.0733819515480305, "grad_norm": 12.135979652404785, "learning_rate": 9.960864542461209e-06, "loss": 0.1596, "step": 295900 }, { "epoch": 3.0735515894220606, "grad_norm": 21.967288970947266, "learning_rate": 9.957471784980603e-06, "loss": 0.1396, "step": 296000 }, { "epoch": 3.073721227296091, "grad_norm": 10.614531517028809, "learning_rate": 9.954079027499996e-06, "loss": 0.1552, "step": 296100 }, { "epoch": 3.0738908651701213, "grad_norm": 19.905366897583008, "learning_rate": 9.95068627001939e-06, "loss": 0.145, "step": 296200 }, { "epoch": 3.0740605030441515, "grad_norm": 12.674455642700195, "learning_rate": 9.947293512538784e-06, "loss": 0.1462, "step": 296300 }, { "epoch": 3.074230140918182, "grad_norm": 15.07858657836914, "learning_rate": 9.943900755058178e-06, "loss": 0.1559, "step": 296400 }, { "epoch": 3.074399778792212, "grad_norm": 4.476396560668945, "learning_rate": 9.940507997577571e-06, "loss": 0.1619, "step": 296500 }, { "epoch": 3.0745694166662427, "grad_norm": 8.040300369262695, "learning_rate": 9.937115240096967e-06, "loss": 0.1697, "step": 296600 }, { "epoch": 3.074739054540273, "grad_norm": 7.1312432289123535, "learning_rate": 9.933722482616359e-06, "loss": 0.1555, "step": 296700 }, { "epoch": 3.074908692414303, "grad_norm": 9.426125526428223, "learning_rate": 9.930329725135753e-06, "loss": 0.1475, "step": 296800 }, { "epoch": 3.0750783302883336, "grad_norm": 18.610984802246094, "learning_rate": 9.926936967655147e-06, "loss": 0.1571, "step": 296900 }, { "epoch": 3.0752479681623637, "grad_norm": 4.225525856018066, "learning_rate": 9.923544210174542e-06, "loss": 0.1426, "step": 297000 }, { "epoch": 3.0754176060363942, "grad_norm": 18.358247756958008, "learning_rate": 9.920151452693934e-06, "loss": 0.1546, "step": 297100 }, { "epoch": 3.0755872439104244, "grad_norm": 16.723167419433594, "learning_rate": 9.916758695213328e-06, "loss": 0.149, "step": 297200 }, { "epoch": 3.0757568817844545, "grad_norm": 3.5464885234832764, "learning_rate": 9.913365937732724e-06, "loss": 0.1613, "step": 297300 }, { "epoch": 3.075926519658485, "grad_norm": 4.197837829589844, "learning_rate": 9.909973180252117e-06, "loss": 0.1439, "step": 297400 }, { "epoch": 3.076096157532515, "grad_norm": 13.752692222595215, "learning_rate": 9.90658042277151e-06, "loss": 0.1543, "step": 297500 }, { "epoch": 3.0762657954065458, "grad_norm": 12.094964027404785, "learning_rate": 9.903187665290905e-06, "loss": 0.1538, "step": 297600 }, { "epoch": 3.076435433280576, "grad_norm": 9.297470092773438, "learning_rate": 9.899794907810299e-06, "loss": 0.1475, "step": 297700 }, { "epoch": 3.0766050711546065, "grad_norm": 4.826433181762695, "learning_rate": 9.896402150329693e-06, "loss": 0.1467, "step": 297800 }, { "epoch": 3.0767747090286366, "grad_norm": 6.379718780517578, "learning_rate": 9.893009392849086e-06, "loss": 0.1522, "step": 297900 }, { "epoch": 3.0769443469026667, "grad_norm": 13.807291030883789, "learning_rate": 9.88961663536848e-06, "loss": 0.1512, "step": 298000 }, { "epoch": 3.0771139847766973, "grad_norm": 14.184438705444336, "learning_rate": 9.886223877887874e-06, "loss": 0.1559, "step": 298100 }, { "epoch": 3.0772836226507274, "grad_norm": 9.328975677490234, "learning_rate": 9.882831120407268e-06, "loss": 0.1491, "step": 298200 }, { "epoch": 3.077453260524758, "grad_norm": 6.197120666503906, "learning_rate": 9.879438362926662e-06, "loss": 0.1544, "step": 298300 }, { "epoch": 3.077622898398788, "grad_norm": 14.996658325195312, "learning_rate": 9.876045605446055e-06, "loss": 0.1552, "step": 298400 }, { "epoch": 3.0777925362728182, "grad_norm": 9.238858222961426, "learning_rate": 9.872652847965449e-06, "loss": 0.1499, "step": 298500 }, { "epoch": 3.077962174146849, "grad_norm": 21.469158172607422, "learning_rate": 9.869260090484843e-06, "loss": 0.1558, "step": 298600 }, { "epoch": 3.078131812020879, "grad_norm": 13.737997055053711, "learning_rate": 9.865867333004237e-06, "loss": 0.1398, "step": 298700 }, { "epoch": 3.0783014498949095, "grad_norm": 14.401994705200195, "learning_rate": 9.86247457552363e-06, "loss": 0.1744, "step": 298800 }, { "epoch": 3.0784710877689396, "grad_norm": 3.6717450618743896, "learning_rate": 9.859081818043024e-06, "loss": 0.1564, "step": 298900 }, { "epoch": 3.0786407256429698, "grad_norm": 7.037825107574463, "learning_rate": 9.855689060562418e-06, "loss": 0.1455, "step": 299000 }, { "epoch": 3.0788103635170003, "grad_norm": 18.489667892456055, "learning_rate": 9.852296303081812e-06, "loss": 0.167, "step": 299100 }, { "epoch": 3.0789800013910305, "grad_norm": 8.13353157043457, "learning_rate": 9.848903545601206e-06, "loss": 0.1498, "step": 299200 }, { "epoch": 3.079149639265061, "grad_norm": 2.6240739822387695, "learning_rate": 9.8455107881206e-06, "loss": 0.1236, "step": 299300 }, { "epoch": 3.079319277139091, "grad_norm": 4.97042179107666, "learning_rate": 9.842118030639993e-06, "loss": 0.163, "step": 299400 }, { "epoch": 3.0794889150131213, "grad_norm": 6.753870487213135, "learning_rate": 9.838725273159387e-06, "loss": 0.1625, "step": 299500 }, { "epoch": 3.079658552887152, "grad_norm": 9.701141357421875, "learning_rate": 9.835332515678781e-06, "loss": 0.1497, "step": 299600 }, { "epoch": 3.079828190761182, "grad_norm": 14.670252799987793, "learning_rate": 9.831939758198176e-06, "loss": 0.1685, "step": 299700 }, { "epoch": 3.0799978286352125, "grad_norm": 6.538000583648682, "learning_rate": 9.828547000717568e-06, "loss": 0.1534, "step": 299800 }, { "epoch": 3.0801674665092427, "grad_norm": 11.067461967468262, "learning_rate": 9.825154243236962e-06, "loss": 0.1767, "step": 299900 }, { "epoch": 3.0803371043832732, "grad_norm": 11.65058422088623, "learning_rate": 9.821761485756358e-06, "loss": 0.1554, "step": 300000 }, { "epoch": 3.0805067422573034, "grad_norm": 4.224606513977051, "learning_rate": 9.81836872827575e-06, "loss": 0.1307, "step": 300100 }, { "epoch": 3.0806763801313335, "grad_norm": 5.927542686462402, "learning_rate": 9.814975970795144e-06, "loss": 0.1681, "step": 300200 }, { "epoch": 3.080846018005364, "grad_norm": 10.269852638244629, "learning_rate": 9.811583213314539e-06, "loss": 0.1512, "step": 300300 }, { "epoch": 3.081015655879394, "grad_norm": 20.434057235717773, "learning_rate": 9.808190455833933e-06, "loss": 0.1604, "step": 300400 }, { "epoch": 3.0811852937534248, "grad_norm": 2.9734911918640137, "learning_rate": 9.804797698353325e-06, "loss": 0.1477, "step": 300500 }, { "epoch": 3.081354931627455, "grad_norm": 6.240009307861328, "learning_rate": 9.80140494087272e-06, "loss": 0.146, "step": 300600 }, { "epoch": 3.081524569501485, "grad_norm": 4.845802307128906, "learning_rate": 9.798012183392114e-06, "loss": 0.1658, "step": 300700 }, { "epoch": 3.0816942073755156, "grad_norm": 3.7777259349823, "learning_rate": 9.794619425911508e-06, "loss": 0.1543, "step": 300800 }, { "epoch": 3.0818638452495457, "grad_norm": 1.8492971658706665, "learning_rate": 9.7912266684309e-06, "loss": 0.1467, "step": 300900 }, { "epoch": 3.0820334831235763, "grad_norm": 13.3462553024292, "learning_rate": 9.787833910950296e-06, "loss": 0.1487, "step": 301000 }, { "epoch": 3.0822031209976064, "grad_norm": 17.85575294494629, "learning_rate": 9.78444115346969e-06, "loss": 0.152, "step": 301100 }, { "epoch": 3.0823727588716365, "grad_norm": 7.585082530975342, "learning_rate": 9.781048395989083e-06, "loss": 0.1567, "step": 301200 }, { "epoch": 3.082542396745667, "grad_norm": 13.604305267333984, "learning_rate": 9.777655638508477e-06, "loss": 0.1537, "step": 301300 }, { "epoch": 3.0827120346196972, "grad_norm": 7.5811896324157715, "learning_rate": 9.774262881027871e-06, "loss": 0.1577, "step": 301400 }, { "epoch": 3.082881672493728, "grad_norm": 13.424019813537598, "learning_rate": 9.770870123547265e-06, "loss": 0.1515, "step": 301500 }, { "epoch": 3.083051310367758, "grad_norm": 1.8108642101287842, "learning_rate": 9.767477366066658e-06, "loss": 0.1482, "step": 301600 }, { "epoch": 3.083220948241788, "grad_norm": 11.185663223266602, "learning_rate": 9.764084608586052e-06, "loss": 0.1597, "step": 301700 }, { "epoch": 3.0833905861158186, "grad_norm": 7.8758931159973145, "learning_rate": 9.760691851105446e-06, "loss": 0.1485, "step": 301800 }, { "epoch": 3.0835602239898487, "grad_norm": 10.892339706420898, "learning_rate": 9.75729909362484e-06, "loss": 0.1562, "step": 301900 }, { "epoch": 3.0837298618638793, "grad_norm": 8.200766563415527, "learning_rate": 9.753906336144234e-06, "loss": 0.1483, "step": 302000 }, { "epoch": 3.0838994997379094, "grad_norm": 4.081294536590576, "learning_rate": 9.750513578663627e-06, "loss": 0.1589, "step": 302100 }, { "epoch": 3.08406913761194, "grad_norm": 8.926464080810547, "learning_rate": 9.747120821183021e-06, "loss": 0.1491, "step": 302200 }, { "epoch": 3.08423877548597, "grad_norm": 13.127912521362305, "learning_rate": 9.743728063702415e-06, "loss": 0.1473, "step": 302300 }, { "epoch": 3.0844084133600003, "grad_norm": 6.387683391571045, "learning_rate": 9.740335306221809e-06, "loss": 0.14, "step": 302400 }, { "epoch": 3.084578051234031, "grad_norm": 1.0145988464355469, "learning_rate": 9.736942548741203e-06, "loss": 0.1485, "step": 302500 }, { "epoch": 3.084747689108061, "grad_norm": 9.924799919128418, "learning_rate": 9.733549791260596e-06, "loss": 0.1426, "step": 302600 }, { "epoch": 3.0849173269820915, "grad_norm": 7.480307102203369, "learning_rate": 9.730157033779992e-06, "loss": 0.1557, "step": 302700 }, { "epoch": 3.0850869648561217, "grad_norm": 14.193624496459961, "learning_rate": 9.726764276299384e-06, "loss": 0.163, "step": 302800 }, { "epoch": 3.085256602730152, "grad_norm": 7.340779781341553, "learning_rate": 9.723371518818778e-06, "loss": 0.1543, "step": 302900 }, { "epoch": 3.0854262406041824, "grad_norm": 12.474454879760742, "learning_rate": 9.719978761338172e-06, "loss": 0.1634, "step": 303000 }, { "epoch": 3.0855958784782125, "grad_norm": 22.68638038635254, "learning_rate": 9.716586003857567e-06, "loss": 0.1291, "step": 303100 }, { "epoch": 3.085765516352243, "grad_norm": 5.26704216003418, "learning_rate": 9.71319324637696e-06, "loss": 0.1548, "step": 303200 }, { "epoch": 3.085935154226273, "grad_norm": 5.4972453117370605, "learning_rate": 9.709800488896353e-06, "loss": 0.1532, "step": 303300 }, { "epoch": 3.0861047921003033, "grad_norm": 12.965251922607422, "learning_rate": 9.706407731415748e-06, "loss": 0.1367, "step": 303400 }, { "epoch": 3.086274429974334, "grad_norm": 4.3410186767578125, "learning_rate": 9.70301497393514e-06, "loss": 0.1586, "step": 303500 }, { "epoch": 3.086444067848364, "grad_norm": 6.927376747131348, "learning_rate": 9.699622216454534e-06, "loss": 0.1313, "step": 303600 }, { "epoch": 3.0866137057223946, "grad_norm": 4.838318347930908, "learning_rate": 9.69622945897393e-06, "loss": 0.1455, "step": 303700 }, { "epoch": 3.0867833435964247, "grad_norm": 14.784576416015625, "learning_rate": 9.692836701493324e-06, "loss": 0.1511, "step": 303800 }, { "epoch": 3.086952981470455, "grad_norm": 7.182317733764648, "learning_rate": 9.689443944012716e-06, "loss": 0.1644, "step": 303900 }, { "epoch": 3.0871226193444854, "grad_norm": 0.7586160898208618, "learning_rate": 9.686051186532111e-06, "loss": 0.1514, "step": 304000 }, { "epoch": 3.0872922572185155, "grad_norm": 4.344551086425781, "learning_rate": 9.682658429051505e-06, "loss": 0.1469, "step": 304100 }, { "epoch": 3.087461895092546, "grad_norm": 6.081146240234375, "learning_rate": 9.679265671570899e-06, "loss": 0.1406, "step": 304200 }, { "epoch": 3.087631532966576, "grad_norm": 9.925230026245117, "learning_rate": 9.675872914090293e-06, "loss": 0.1436, "step": 304300 }, { "epoch": 3.0878011708406063, "grad_norm": 12.138921737670898, "learning_rate": 9.672480156609686e-06, "loss": 0.1589, "step": 304400 }, { "epoch": 3.087970808714637, "grad_norm": 10.88057804107666, "learning_rate": 9.66908739912908e-06, "loss": 0.1406, "step": 304500 }, { "epoch": 3.088140446588667, "grad_norm": 9.040034294128418, "learning_rate": 9.665694641648474e-06, "loss": 0.1388, "step": 304600 }, { "epoch": 3.0883100844626976, "grad_norm": 3.554966688156128, "learning_rate": 9.662301884167868e-06, "loss": 0.1562, "step": 304700 }, { "epoch": 3.0884797223367277, "grad_norm": 1.6443816423416138, "learning_rate": 9.658909126687262e-06, "loss": 0.1407, "step": 304800 }, { "epoch": 3.0886493602107583, "grad_norm": 9.091776847839355, "learning_rate": 9.655516369206655e-06, "loss": 0.1539, "step": 304900 }, { "epoch": 3.0888189980847884, "grad_norm": 11.889481544494629, "learning_rate": 9.65212361172605e-06, "loss": 0.148, "step": 305000 }, { "epoch": 3.0889886359588186, "grad_norm": 7.746503829956055, "learning_rate": 9.648730854245443e-06, "loss": 0.1651, "step": 305100 }, { "epoch": 3.089158273832849, "grad_norm": 14.05681324005127, "learning_rate": 9.645338096764837e-06, "loss": 0.158, "step": 305200 }, { "epoch": 3.0893279117068793, "grad_norm": 3.1424617767333984, "learning_rate": 9.64194533928423e-06, "loss": 0.1618, "step": 305300 }, { "epoch": 3.08949754958091, "grad_norm": 8.952536582946777, "learning_rate": 9.638552581803624e-06, "loss": 0.152, "step": 305400 }, { "epoch": 3.08966718745494, "grad_norm": 9.546321868896484, "learning_rate": 9.635159824323018e-06, "loss": 0.1409, "step": 305500 }, { "epoch": 3.08983682532897, "grad_norm": 7.973764419555664, "learning_rate": 9.631767066842412e-06, "loss": 0.1518, "step": 305600 }, { "epoch": 3.0900064632030007, "grad_norm": 12.232881546020508, "learning_rate": 9.628374309361806e-06, "loss": 0.1453, "step": 305700 }, { "epoch": 3.090176101077031, "grad_norm": 12.702545166015625, "learning_rate": 9.6249815518812e-06, "loss": 0.1479, "step": 305800 }, { "epoch": 3.0903457389510613, "grad_norm": 8.685492515563965, "learning_rate": 9.621588794400593e-06, "loss": 0.1494, "step": 305900 }, { "epoch": 3.0905153768250915, "grad_norm": 8.630180358886719, "learning_rate": 9.618196036919987e-06, "loss": 0.1621, "step": 306000 }, { "epoch": 3.0906850146991216, "grad_norm": 8.385542869567871, "learning_rate": 9.614803279439383e-06, "loss": 0.163, "step": 306100 }, { "epoch": 3.090854652573152, "grad_norm": 9.643824577331543, "learning_rate": 9.611410521958775e-06, "loss": 0.1408, "step": 306200 }, { "epoch": 3.0910242904471823, "grad_norm": 9.798473358154297, "learning_rate": 9.608017764478169e-06, "loss": 0.1469, "step": 306300 }, { "epoch": 3.091193928321213, "grad_norm": 2.619964122772217, "learning_rate": 9.604625006997564e-06, "loss": 0.142, "step": 306400 }, { "epoch": 3.091363566195243, "grad_norm": 2.6148910522460938, "learning_rate": 9.601232249516958e-06, "loss": 0.1539, "step": 306500 }, { "epoch": 3.091533204069273, "grad_norm": 4.897170543670654, "learning_rate": 9.59783949203635e-06, "loss": 0.1504, "step": 306600 }, { "epoch": 3.0917028419433037, "grad_norm": 12.519481658935547, "learning_rate": 9.594446734555745e-06, "loss": 0.1578, "step": 306700 }, { "epoch": 3.091872479817334, "grad_norm": 13.268074989318848, "learning_rate": 9.59105397707514e-06, "loss": 0.1659, "step": 306800 }, { "epoch": 3.0920421176913644, "grad_norm": 5.449469566345215, "learning_rate": 9.587661219594533e-06, "loss": 0.1593, "step": 306900 }, { "epoch": 3.0922117555653945, "grad_norm": 7.228404521942139, "learning_rate": 9.584268462113925e-06, "loss": 0.1617, "step": 307000 }, { "epoch": 3.0923813934394246, "grad_norm": 15.553772926330566, "learning_rate": 9.58087570463332e-06, "loss": 0.1492, "step": 307100 }, { "epoch": 3.092551031313455, "grad_norm": 14.400397300720215, "learning_rate": 9.577482947152714e-06, "loss": 0.1556, "step": 307200 }, { "epoch": 3.0927206691874853, "grad_norm": 8.053176879882812, "learning_rate": 9.574090189672106e-06, "loss": 0.1621, "step": 307300 }, { "epoch": 3.092890307061516, "grad_norm": 6.700496196746826, "learning_rate": 9.570697432191502e-06, "loss": 0.1642, "step": 307400 }, { "epoch": 3.093059944935546, "grad_norm": 3.353828191757202, "learning_rate": 9.567304674710896e-06, "loss": 0.1449, "step": 307500 }, { "epoch": 3.0932295828095766, "grad_norm": 10.085803985595703, "learning_rate": 9.56391191723029e-06, "loss": 0.16, "step": 307600 }, { "epoch": 3.0933992206836067, "grad_norm": 8.42874813079834, "learning_rate": 9.560519159749683e-06, "loss": 0.1646, "step": 307700 }, { "epoch": 3.093568858557637, "grad_norm": 6.35466194152832, "learning_rate": 9.557126402269077e-06, "loss": 0.1432, "step": 307800 }, { "epoch": 3.0937384964316674, "grad_norm": 5.090446949005127, "learning_rate": 9.553733644788471e-06, "loss": 0.1406, "step": 307900 }, { "epoch": 3.0939081343056976, "grad_norm": 7.445723056793213, "learning_rate": 9.550340887307865e-06, "loss": 0.165, "step": 308000 }, { "epoch": 3.094077772179728, "grad_norm": 6.035569190979004, "learning_rate": 9.546948129827259e-06, "loss": 0.1508, "step": 308100 }, { "epoch": 3.0942474100537583, "grad_norm": 1.853783369064331, "learning_rate": 9.543555372346652e-06, "loss": 0.1592, "step": 308200 }, { "epoch": 3.0944170479277884, "grad_norm": 19.062946319580078, "learning_rate": 9.540162614866046e-06, "loss": 0.1437, "step": 308300 }, { "epoch": 3.094586685801819, "grad_norm": 4.863981246948242, "learning_rate": 9.53676985738544e-06, "loss": 0.1503, "step": 308400 }, { "epoch": 3.094756323675849, "grad_norm": 7.131556510925293, "learning_rate": 9.533377099904834e-06, "loss": 0.1441, "step": 308500 }, { "epoch": 3.0949259615498796, "grad_norm": 10.521081924438477, "learning_rate": 9.529984342424228e-06, "loss": 0.1625, "step": 308600 }, { "epoch": 3.0950955994239098, "grad_norm": 7.098443031311035, "learning_rate": 9.526591584943621e-06, "loss": 0.1603, "step": 308700 }, { "epoch": 3.09526523729794, "grad_norm": 6.363058567047119, "learning_rate": 9.523198827463015e-06, "loss": 0.16, "step": 308800 }, { "epoch": 3.0954348751719705, "grad_norm": 8.139494895935059, "learning_rate": 9.519806069982409e-06, "loss": 0.1666, "step": 308900 }, { "epoch": 3.0956045130460006, "grad_norm": 4.595174789428711, "learning_rate": 9.516413312501803e-06, "loss": 0.1524, "step": 309000 }, { "epoch": 3.095774150920031, "grad_norm": 20.2647705078125, "learning_rate": 9.513020555021198e-06, "loss": 0.1545, "step": 309100 }, { "epoch": 3.0959437887940613, "grad_norm": 12.974306106567383, "learning_rate": 9.50962779754059e-06, "loss": 0.1629, "step": 309200 }, { "epoch": 3.0961134266680914, "grad_norm": 8.19435977935791, "learning_rate": 9.506235040059984e-06, "loss": 0.1537, "step": 309300 }, { "epoch": 3.096283064542122, "grad_norm": 3.2247631549835205, "learning_rate": 9.502842282579378e-06, "loss": 0.1607, "step": 309400 }, { "epoch": 3.096452702416152, "grad_norm": 3.9568965435028076, "learning_rate": 9.499449525098773e-06, "loss": 0.1509, "step": 309500 }, { "epoch": 3.0966223402901827, "grad_norm": 6.136845111846924, "learning_rate": 9.496056767618165e-06, "loss": 0.1631, "step": 309600 }, { "epoch": 3.096791978164213, "grad_norm": 17.5639591217041, "learning_rate": 9.49266401013756e-06, "loss": 0.1496, "step": 309700 }, { "epoch": 3.096961616038243, "grad_norm": 9.503756523132324, "learning_rate": 9.489271252656955e-06, "loss": 0.1488, "step": 309800 }, { "epoch": 3.0971312539122735, "grad_norm": 12.717977523803711, "learning_rate": 9.485878495176349e-06, "loss": 0.1559, "step": 309900 }, { "epoch": 3.0973008917863036, "grad_norm": 9.592851638793945, "learning_rate": 9.48248573769574e-06, "loss": 0.157, "step": 310000 }, { "epoch": 3.097470529660334, "grad_norm": 17.88542366027832, "learning_rate": 9.479092980215136e-06, "loss": 0.1673, "step": 310100 }, { "epoch": 3.0976401675343643, "grad_norm": 8.722711563110352, "learning_rate": 9.47570022273453e-06, "loss": 0.16, "step": 310200 }, { "epoch": 3.097809805408395, "grad_norm": 8.623176574707031, "learning_rate": 9.472307465253924e-06, "loss": 0.1591, "step": 310300 }, { "epoch": 3.097979443282425, "grad_norm": 11.372925758361816, "learning_rate": 9.468914707773318e-06, "loss": 0.1589, "step": 310400 }, { "epoch": 3.098149081156455, "grad_norm": 6.604671478271484, "learning_rate": 9.465521950292711e-06, "loss": 0.1488, "step": 310500 }, { "epoch": 3.0983187190304857, "grad_norm": 6.798544883728027, "learning_rate": 9.462129192812105e-06, "loss": 0.1707, "step": 310600 }, { "epoch": 3.098488356904516, "grad_norm": 9.805066108703613, "learning_rate": 9.458736435331499e-06, "loss": 0.1617, "step": 310700 }, { "epoch": 3.0986579947785464, "grad_norm": 15.037117958068848, "learning_rate": 9.455343677850893e-06, "loss": 0.1595, "step": 310800 }, { "epoch": 3.0988276326525765, "grad_norm": 7.078458786010742, "learning_rate": 9.451950920370287e-06, "loss": 0.1418, "step": 310900 }, { "epoch": 3.0989972705266067, "grad_norm": 11.404520034790039, "learning_rate": 9.44855816288968e-06, "loss": 0.164, "step": 311000 }, { "epoch": 3.0991669084006372, "grad_norm": 21.338726043701172, "learning_rate": 9.445165405409074e-06, "loss": 0.1283, "step": 311100 }, { "epoch": 3.0993365462746674, "grad_norm": 7.9873247146606445, "learning_rate": 9.441772647928468e-06, "loss": 0.1425, "step": 311200 }, { "epoch": 3.099506184148698, "grad_norm": 11.472331047058105, "learning_rate": 9.438379890447862e-06, "loss": 0.1435, "step": 311300 }, { "epoch": 3.099675822022728, "grad_norm": 9.910642623901367, "learning_rate": 9.434987132967256e-06, "loss": 0.1446, "step": 311400 }, { "epoch": 3.099845459896758, "grad_norm": 10.790426254272461, "learning_rate": 9.43159437548665e-06, "loss": 0.1412, "step": 311500 }, { "epoch": 3.1000150977707888, "grad_norm": 0.667790949344635, "learning_rate": 9.428201618006043e-06, "loss": 0.1499, "step": 311600 }, { "epoch": 3.100184735644819, "grad_norm": 3.0795040130615234, "learning_rate": 9.424808860525437e-06, "loss": 0.157, "step": 311700 }, { "epoch": 3.1003543735188495, "grad_norm": 6.189947605133057, "learning_rate": 9.42141610304483e-06, "loss": 0.1413, "step": 311800 }, { "epoch": 3.1005240113928796, "grad_norm": 10.775115013122559, "learning_rate": 9.418023345564224e-06, "loss": 0.1664, "step": 311900 }, { "epoch": 3.1006936492669097, "grad_norm": 6.759642124176025, "learning_rate": 9.414630588083618e-06, "loss": 0.156, "step": 312000 }, { "epoch": 3.1008632871409403, "grad_norm": 9.006803512573242, "learning_rate": 9.411237830603012e-06, "loss": 0.1764, "step": 312100 }, { "epoch": 3.1010329250149704, "grad_norm": 7.014209270477295, "learning_rate": 9.407845073122408e-06, "loss": 0.1607, "step": 312200 }, { "epoch": 3.101202562889001, "grad_norm": 11.219196319580078, "learning_rate": 9.4044523156418e-06, "loss": 0.1503, "step": 312300 }, { "epoch": 3.101372200763031, "grad_norm": 10.704278945922852, "learning_rate": 9.401059558161193e-06, "loss": 0.1566, "step": 312400 }, { "epoch": 3.1015418386370617, "grad_norm": 6.441004276275635, "learning_rate": 9.397666800680589e-06, "loss": 0.163, "step": 312500 }, { "epoch": 3.101711476511092, "grad_norm": 14.618765830993652, "learning_rate": 9.394274043199981e-06, "loss": 0.1662, "step": 312600 }, { "epoch": 3.101881114385122, "grad_norm": 10.64292049407959, "learning_rate": 9.390881285719375e-06, "loss": 0.1526, "step": 312700 }, { "epoch": 3.1020507522591525, "grad_norm": 10.018407821655273, "learning_rate": 9.38748852823877e-06, "loss": 0.1532, "step": 312800 }, { "epoch": 3.1022203901331826, "grad_norm": 11.123716354370117, "learning_rate": 9.384095770758164e-06, "loss": 0.1562, "step": 312900 }, { "epoch": 3.102390028007213, "grad_norm": 9.392641067504883, "learning_rate": 9.380703013277556e-06, "loss": 0.1621, "step": 313000 }, { "epoch": 3.1025596658812433, "grad_norm": 16.611286163330078, "learning_rate": 9.37731025579695e-06, "loss": 0.1645, "step": 313100 }, { "epoch": 3.1027293037552734, "grad_norm": 0.7613620162010193, "learning_rate": 9.373917498316346e-06, "loss": 0.1341, "step": 313200 }, { "epoch": 3.102898941629304, "grad_norm": 10.596071243286133, "learning_rate": 9.37052474083574e-06, "loss": 0.1611, "step": 313300 }, { "epoch": 3.103068579503334, "grad_norm": 8.791805267333984, "learning_rate": 9.367131983355131e-06, "loss": 0.1433, "step": 313400 }, { "epoch": 3.1032382173773647, "grad_norm": 7.9409708976745605, "learning_rate": 9.363739225874527e-06, "loss": 0.1551, "step": 313500 }, { "epoch": 3.103407855251395, "grad_norm": 7.538231372833252, "learning_rate": 9.36034646839392e-06, "loss": 0.1527, "step": 313600 }, { "epoch": 3.103577493125425, "grad_norm": 11.592803001403809, "learning_rate": 9.356953710913314e-06, "loss": 0.1461, "step": 313700 }, { "epoch": 3.1037471309994555, "grad_norm": 2.859217882156372, "learning_rate": 9.353560953432708e-06, "loss": 0.1612, "step": 313800 }, { "epoch": 3.1039167688734857, "grad_norm": 1.572331428527832, "learning_rate": 9.350168195952102e-06, "loss": 0.1577, "step": 313900 }, { "epoch": 3.1040864067475162, "grad_norm": 3.357349395751953, "learning_rate": 9.346775438471496e-06, "loss": 0.1709, "step": 314000 }, { "epoch": 3.1042560446215464, "grad_norm": 10.83899211883545, "learning_rate": 9.34338268099089e-06, "loss": 0.1483, "step": 314100 }, { "epoch": 3.1044256824955765, "grad_norm": 8.259320259094238, "learning_rate": 9.339989923510283e-06, "loss": 0.1396, "step": 314200 }, { "epoch": 3.104595320369607, "grad_norm": 10.29360294342041, "learning_rate": 9.336597166029677e-06, "loss": 0.1559, "step": 314300 }, { "epoch": 3.104764958243637, "grad_norm": 11.465302467346191, "learning_rate": 9.333204408549071e-06, "loss": 0.1545, "step": 314400 }, { "epoch": 3.1049345961176678, "grad_norm": 5.906549453735352, "learning_rate": 9.329811651068465e-06, "loss": 0.1612, "step": 314500 }, { "epoch": 3.105104233991698, "grad_norm": 7.025627613067627, "learning_rate": 9.326418893587859e-06, "loss": 0.1501, "step": 314600 }, { "epoch": 3.1052738718657285, "grad_norm": 15.370887756347656, "learning_rate": 9.323026136107252e-06, "loss": 0.1591, "step": 314700 }, { "epoch": 3.1054435097397586, "grad_norm": 12.311824798583984, "learning_rate": 9.319633378626646e-06, "loss": 0.1483, "step": 314800 }, { "epoch": 3.1056131476137887, "grad_norm": 3.3376224040985107, "learning_rate": 9.31624062114604e-06, "loss": 0.1514, "step": 314900 }, { "epoch": 3.1057827854878193, "grad_norm": 2.8769049644470215, "learning_rate": 9.312847863665434e-06, "loss": 0.1663, "step": 315000 }, { "epoch": 3.1059524233618494, "grad_norm": 2.419759511947632, "learning_rate": 9.309455106184828e-06, "loss": 0.1621, "step": 315100 }, { "epoch": 3.10612206123588, "grad_norm": 13.329123497009277, "learning_rate": 9.306062348704223e-06, "loss": 0.1504, "step": 315200 }, { "epoch": 3.10629169910991, "grad_norm": 13.45897102355957, "learning_rate": 9.302669591223615e-06, "loss": 0.1586, "step": 315300 }, { "epoch": 3.1064613369839402, "grad_norm": 12.850385665893555, "learning_rate": 9.299276833743009e-06, "loss": 0.1418, "step": 315400 }, { "epoch": 3.106630974857971, "grad_norm": 9.050607681274414, "learning_rate": 9.295884076262403e-06, "loss": 0.1643, "step": 315500 }, { "epoch": 3.106800612732001, "grad_norm": 7.439171314239502, "learning_rate": 9.292491318781798e-06, "loss": 0.1413, "step": 315600 }, { "epoch": 3.1069702506060315, "grad_norm": 12.610795021057129, "learning_rate": 9.28909856130119e-06, "loss": 0.1304, "step": 315700 }, { "epoch": 3.1071398884800616, "grad_norm": 6.889320373535156, "learning_rate": 9.285705803820584e-06, "loss": 0.1423, "step": 315800 }, { "epoch": 3.1073095263540917, "grad_norm": 1.5673604011535645, "learning_rate": 9.28231304633998e-06, "loss": 0.1466, "step": 315900 }, { "epoch": 3.1074791642281223, "grad_norm": 7.443792819976807, "learning_rate": 9.278920288859372e-06, "loss": 0.1328, "step": 316000 }, { "epoch": 3.1076488021021524, "grad_norm": 17.166004180908203, "learning_rate": 9.275527531378766e-06, "loss": 0.1646, "step": 316100 }, { "epoch": 3.107818439976183, "grad_norm": 13.895317077636719, "learning_rate": 9.272134773898161e-06, "loss": 0.1467, "step": 316200 }, { "epoch": 3.107988077850213, "grad_norm": 17.768218994140625, "learning_rate": 9.268742016417555e-06, "loss": 0.1593, "step": 316300 }, { "epoch": 3.1081577157242433, "grad_norm": 9.236688613891602, "learning_rate": 9.265349258936947e-06, "loss": 0.155, "step": 316400 }, { "epoch": 3.108327353598274, "grad_norm": 7.151867866516113, "learning_rate": 9.261956501456342e-06, "loss": 0.1614, "step": 316500 }, { "epoch": 3.108496991472304, "grad_norm": 2.861132860183716, "learning_rate": 9.258563743975736e-06, "loss": 0.153, "step": 316600 }, { "epoch": 3.1086666293463345, "grad_norm": 4.528660774230957, "learning_rate": 9.25517098649513e-06, "loss": 0.147, "step": 316700 }, { "epoch": 3.1088362672203647, "grad_norm": 8.766101837158203, "learning_rate": 9.251778229014524e-06, "loss": 0.1604, "step": 316800 }, { "epoch": 3.109005905094395, "grad_norm": 3.061906099319458, "learning_rate": 9.248385471533918e-06, "loss": 0.1476, "step": 316900 }, { "epoch": 3.1091755429684254, "grad_norm": 9.351452827453613, "learning_rate": 9.244992714053311e-06, "loss": 0.1576, "step": 317000 }, { "epoch": 3.1093451808424555, "grad_norm": 13.356674194335938, "learning_rate": 9.241599956572705e-06, "loss": 0.144, "step": 317100 }, { "epoch": 3.109514818716486, "grad_norm": 8.799071311950684, "learning_rate": 9.238207199092099e-06, "loss": 0.1367, "step": 317200 }, { "epoch": 3.109684456590516, "grad_norm": 11.545010566711426, "learning_rate": 9.234814441611493e-06, "loss": 0.1776, "step": 317300 }, { "epoch": 3.1098540944645467, "grad_norm": 4.938475608825684, "learning_rate": 9.231421684130887e-06, "loss": 0.1455, "step": 317400 }, { "epoch": 3.110023732338577, "grad_norm": 3.726855516433716, "learning_rate": 9.22802892665028e-06, "loss": 0.1555, "step": 317500 }, { "epoch": 3.110193370212607, "grad_norm": 36.41159439086914, "learning_rate": 9.224636169169674e-06, "loss": 0.1485, "step": 317600 }, { "epoch": 3.1103630080866376, "grad_norm": 18.417932510375977, "learning_rate": 9.221243411689068e-06, "loss": 0.1371, "step": 317700 }, { "epoch": 3.1105326459606677, "grad_norm": 16.297130584716797, "learning_rate": 9.217850654208462e-06, "loss": 0.1667, "step": 317800 }, { "epoch": 3.1107022838346983, "grad_norm": 13.126415252685547, "learning_rate": 9.214457896727856e-06, "loss": 0.1416, "step": 317900 }, { "epoch": 3.1108719217087284, "grad_norm": 4.375078201293945, "learning_rate": 9.21106513924725e-06, "loss": 0.1548, "step": 318000 }, { "epoch": 3.1110415595827585, "grad_norm": 7.279323101043701, "learning_rate": 9.207672381766643e-06, "loss": 0.1613, "step": 318100 }, { "epoch": 3.111211197456789, "grad_norm": 8.33651065826416, "learning_rate": 9.204279624286037e-06, "loss": 0.1569, "step": 318200 }, { "epoch": 3.111380835330819, "grad_norm": 18.665924072265625, "learning_rate": 9.20088686680543e-06, "loss": 0.1636, "step": 318300 }, { "epoch": 3.11155047320485, "grad_norm": 14.927677154541016, "learning_rate": 9.197494109324825e-06, "loss": 0.1402, "step": 318400 }, { "epoch": 3.11172011107888, "grad_norm": 7.423648357391357, "learning_rate": 9.194101351844218e-06, "loss": 0.1647, "step": 318500 }, { "epoch": 3.11188974895291, "grad_norm": 9.62630844116211, "learning_rate": 9.190708594363614e-06, "loss": 0.1537, "step": 318600 }, { "epoch": 3.1120593868269406, "grad_norm": 11.103574752807617, "learning_rate": 9.187315836883006e-06, "loss": 0.143, "step": 318700 }, { "epoch": 3.1122290247009707, "grad_norm": 6.989495277404785, "learning_rate": 9.1839230794024e-06, "loss": 0.1569, "step": 318800 }, { "epoch": 3.1123986625750013, "grad_norm": 10.582026481628418, "learning_rate": 9.180530321921795e-06, "loss": 0.1477, "step": 318900 }, { "epoch": 3.1125683004490314, "grad_norm": 4.209559440612793, "learning_rate": 9.177137564441189e-06, "loss": 0.1572, "step": 319000 }, { "epoch": 3.1127379383230616, "grad_norm": 2.1376657485961914, "learning_rate": 9.173744806960581e-06, "loss": 0.1544, "step": 319100 }, { "epoch": 3.112907576197092, "grad_norm": 11.289379119873047, "learning_rate": 9.170352049479977e-06, "loss": 0.1466, "step": 319200 }, { "epoch": 3.1130772140711223, "grad_norm": 17.316179275512695, "learning_rate": 9.16695929199937e-06, "loss": 0.1489, "step": 319300 }, { "epoch": 3.113246851945153, "grad_norm": 8.989668846130371, "learning_rate": 9.163566534518764e-06, "loss": 0.16, "step": 319400 }, { "epoch": 3.113416489819183, "grad_norm": 4.3799614906311035, "learning_rate": 9.160173777038156e-06, "loss": 0.1535, "step": 319500 }, { "epoch": 3.113586127693213, "grad_norm": 6.918078899383545, "learning_rate": 9.156781019557552e-06, "loss": 0.1493, "step": 319600 }, { "epoch": 3.1137557655672436, "grad_norm": 15.436637878417969, "learning_rate": 9.153388262076946e-06, "loss": 0.1441, "step": 319700 }, { "epoch": 3.1139254034412738, "grad_norm": 14.970596313476562, "learning_rate": 9.149995504596338e-06, "loss": 0.1615, "step": 319800 }, { "epoch": 3.1140950413153043, "grad_norm": 12.728464126586914, "learning_rate": 9.146602747115733e-06, "loss": 0.1755, "step": 319900 }, { "epoch": 3.1142646791893345, "grad_norm": 8.670181274414062, "learning_rate": 9.143209989635127e-06, "loss": 0.1498, "step": 320000 }, { "epoch": 3.114434317063365, "grad_norm": 7.3130011558532715, "learning_rate": 9.13981723215452e-06, "loss": 0.1503, "step": 320100 }, { "epoch": 3.114603954937395, "grad_norm": 7.768583297729492, "learning_rate": 9.136424474673915e-06, "loss": 0.1497, "step": 320200 }, { "epoch": 3.1147735928114253, "grad_norm": 8.45137882232666, "learning_rate": 9.133031717193308e-06, "loss": 0.1533, "step": 320300 }, { "epoch": 3.114943230685456, "grad_norm": 11.041439056396484, "learning_rate": 9.129638959712702e-06, "loss": 0.1638, "step": 320400 }, { "epoch": 3.115112868559486, "grad_norm": 12.105263710021973, "learning_rate": 9.126246202232096e-06, "loss": 0.1425, "step": 320500 }, { "epoch": 3.1152825064335166, "grad_norm": 7.935965061187744, "learning_rate": 9.12285344475149e-06, "loss": 0.154, "step": 320600 }, { "epoch": 3.1154521443075467, "grad_norm": 6.290897846221924, "learning_rate": 9.119460687270884e-06, "loss": 0.1429, "step": 320700 }, { "epoch": 3.115621782181577, "grad_norm": 5.613923072814941, "learning_rate": 9.116067929790277e-06, "loss": 0.1572, "step": 320800 }, { "epoch": 3.1157914200556074, "grad_norm": 4.609667778015137, "learning_rate": 9.112675172309671e-06, "loss": 0.1489, "step": 320900 }, { "epoch": 3.1159610579296375, "grad_norm": 13.74768352508545, "learning_rate": 9.109282414829065e-06, "loss": 0.1747, "step": 321000 }, { "epoch": 3.116130695803668, "grad_norm": 11.756501197814941, "learning_rate": 9.105889657348459e-06, "loss": 0.1435, "step": 321100 }, { "epoch": 3.116300333677698, "grad_norm": 3.6802937984466553, "learning_rate": 9.102496899867853e-06, "loss": 0.1553, "step": 321200 }, { "epoch": 3.1164699715517283, "grad_norm": 4.375495433807373, "learning_rate": 9.099104142387246e-06, "loss": 0.1493, "step": 321300 }, { "epoch": 3.116639609425759, "grad_norm": 11.734832763671875, "learning_rate": 9.09571138490664e-06, "loss": 0.1509, "step": 321400 }, { "epoch": 3.116809247299789, "grad_norm": 4.484577655792236, "learning_rate": 9.092318627426034e-06, "loss": 0.1491, "step": 321500 }, { "epoch": 3.1169788851738196, "grad_norm": 12.60404109954834, "learning_rate": 9.088925869945428e-06, "loss": 0.1684, "step": 321600 }, { "epoch": 3.1171485230478497, "grad_norm": 5.369375228881836, "learning_rate": 9.085533112464822e-06, "loss": 0.1385, "step": 321700 }, { "epoch": 3.11731816092188, "grad_norm": 5.562798500061035, "learning_rate": 9.082140354984215e-06, "loss": 0.1639, "step": 321800 }, { "epoch": 3.1174877987959104, "grad_norm": 4.1967973709106445, "learning_rate": 9.078747597503609e-06, "loss": 0.1529, "step": 321900 }, { "epoch": 3.1176574366699406, "grad_norm": 5.292855262756348, "learning_rate": 9.075354840023005e-06, "loss": 0.1476, "step": 322000 }, { "epoch": 3.117827074543971, "grad_norm": 10.671407699584961, "learning_rate": 9.071962082542397e-06, "loss": 0.1585, "step": 322100 }, { "epoch": 3.1179967124180012, "grad_norm": 5.746520519256592, "learning_rate": 9.06856932506179e-06, "loss": 0.1374, "step": 322200 }, { "epoch": 3.1181663502920314, "grad_norm": 12.52679443359375, "learning_rate": 9.065176567581186e-06, "loss": 0.1694, "step": 322300 }, { "epoch": 3.118335988166062, "grad_norm": 4.903810501098633, "learning_rate": 9.06178381010058e-06, "loss": 0.1441, "step": 322400 }, { "epoch": 3.118505626040092, "grad_norm": 12.564791679382324, "learning_rate": 9.058391052619972e-06, "loss": 0.1384, "step": 322500 }, { "epoch": 3.1186752639141226, "grad_norm": 10.100198745727539, "learning_rate": 9.054998295139367e-06, "loss": 0.1484, "step": 322600 }, { "epoch": 3.1188449017881528, "grad_norm": 1.6982064247131348, "learning_rate": 9.051605537658761e-06, "loss": 0.1422, "step": 322700 }, { "epoch": 3.1190145396621833, "grad_norm": 14.451294898986816, "learning_rate": 9.048212780178155e-06, "loss": 0.1544, "step": 322800 }, { "epoch": 3.1191841775362135, "grad_norm": 5.437126636505127, "learning_rate": 9.044820022697549e-06, "loss": 0.1459, "step": 322900 }, { "epoch": 3.1193538154102436, "grad_norm": 25.893709182739258, "learning_rate": 9.041427265216943e-06, "loss": 0.1513, "step": 323000 }, { "epoch": 3.119523453284274, "grad_norm": 6.983341217041016, "learning_rate": 9.038034507736336e-06, "loss": 0.1608, "step": 323100 }, { "epoch": 3.1196930911583043, "grad_norm": 11.667720794677734, "learning_rate": 9.03464175025573e-06, "loss": 0.1507, "step": 323200 }, { "epoch": 3.119862729032335, "grad_norm": 2.4907960891723633, "learning_rate": 9.031248992775124e-06, "loss": 0.1674, "step": 323300 }, { "epoch": 3.120032366906365, "grad_norm": 2.911104202270508, "learning_rate": 9.027856235294518e-06, "loss": 0.1461, "step": 323400 }, { "epoch": 3.120202004780395, "grad_norm": 22.81617546081543, "learning_rate": 9.024463477813912e-06, "loss": 0.1471, "step": 323500 }, { "epoch": 3.1203716426544257, "grad_norm": 1.308252215385437, "learning_rate": 9.021070720333305e-06, "loss": 0.1457, "step": 323600 }, { "epoch": 3.120541280528456, "grad_norm": 4.271811008453369, "learning_rate": 9.017677962852699e-06, "loss": 0.1612, "step": 323700 }, { "epoch": 3.1207109184024864, "grad_norm": 10.921123504638672, "learning_rate": 9.014285205372093e-06, "loss": 0.1403, "step": 323800 }, { "epoch": 3.1208805562765165, "grad_norm": 15.82497501373291, "learning_rate": 9.010892447891487e-06, "loss": 0.158, "step": 323900 }, { "epoch": 3.1210501941505466, "grad_norm": 17.001861572265625, "learning_rate": 9.00749969041088e-06, "loss": 0.1511, "step": 324000 }, { "epoch": 3.121219832024577, "grad_norm": 18.677310943603516, "learning_rate": 9.004106932930274e-06, "loss": 0.1535, "step": 324100 }, { "epoch": 3.1213894698986073, "grad_norm": 28.991657257080078, "learning_rate": 9.000714175449668e-06, "loss": 0.1414, "step": 324200 }, { "epoch": 3.121559107772638, "grad_norm": 6.101358413696289, "learning_rate": 8.997321417969062e-06, "loss": 0.155, "step": 324300 }, { "epoch": 3.121728745646668, "grad_norm": 11.982998847961426, "learning_rate": 8.993928660488456e-06, "loss": 0.144, "step": 324400 }, { "epoch": 3.121898383520698, "grad_norm": 1.4227628707885742, "learning_rate": 8.99053590300785e-06, "loss": 0.1504, "step": 324500 }, { "epoch": 3.1220680213947287, "grad_norm": 8.330729484558105, "learning_rate": 8.987143145527243e-06, "loss": 0.155, "step": 324600 }, { "epoch": 3.122237659268759, "grad_norm": 6.300501346588135, "learning_rate": 8.983750388046639e-06, "loss": 0.1356, "step": 324700 }, { "epoch": 3.1224072971427894, "grad_norm": 20.837337493896484, "learning_rate": 8.980357630566031e-06, "loss": 0.1321, "step": 324800 }, { "epoch": 3.1225769350168195, "grad_norm": 4.385847568511963, "learning_rate": 8.976964873085425e-06, "loss": 0.139, "step": 324900 }, { "epoch": 3.12274657289085, "grad_norm": 4.935657024383545, "learning_rate": 8.97357211560482e-06, "loss": 0.1675, "step": 325000 }, { "epoch": 3.1229162107648802, "grad_norm": 11.19404411315918, "learning_rate": 8.970179358124212e-06, "loss": 0.1394, "step": 325100 }, { "epoch": 3.1230858486389104, "grad_norm": 2.215076446533203, "learning_rate": 8.966786600643606e-06, "loss": 0.1492, "step": 325200 }, { "epoch": 3.123255486512941, "grad_norm": 1.8280470371246338, "learning_rate": 8.963393843163002e-06, "loss": 0.1488, "step": 325300 }, { "epoch": 3.123425124386971, "grad_norm": 12.952447891235352, "learning_rate": 8.960001085682395e-06, "loss": 0.1534, "step": 325400 }, { "epoch": 3.1235947622610016, "grad_norm": 4.7561845779418945, "learning_rate": 8.956608328201787e-06, "loss": 0.1442, "step": 325500 }, { "epoch": 3.1237644001350318, "grad_norm": 8.698183059692383, "learning_rate": 8.953215570721181e-06, "loss": 0.1446, "step": 325600 }, { "epoch": 3.123934038009062, "grad_norm": 7.304073810577393, "learning_rate": 8.949822813240577e-06, "loss": 0.1485, "step": 325700 }, { "epoch": 3.1241036758830925, "grad_norm": 19.2971248626709, "learning_rate": 8.94643005575997e-06, "loss": 0.1532, "step": 325800 }, { "epoch": 3.1242733137571226, "grad_norm": 9.496908187866211, "learning_rate": 8.943037298279363e-06, "loss": 0.15, "step": 325900 }, { "epoch": 3.124442951631153, "grad_norm": 21.06388282775879, "learning_rate": 8.939644540798758e-06, "loss": 0.1552, "step": 326000 }, { "epoch": 3.1246125895051833, "grad_norm": 7.371946334838867, "learning_rate": 8.936251783318152e-06, "loss": 0.1562, "step": 326100 }, { "epoch": 3.1247822273792134, "grad_norm": 11.227481842041016, "learning_rate": 8.932859025837546e-06, "loss": 0.153, "step": 326200 }, { "epoch": 3.124951865253244, "grad_norm": 9.809083938598633, "learning_rate": 8.92946626835694e-06, "loss": 0.1419, "step": 326300 }, { "epoch": 3.125121503127274, "grad_norm": 14.668333053588867, "learning_rate": 8.926073510876333e-06, "loss": 0.1645, "step": 326400 }, { "epoch": 3.1252911410013047, "grad_norm": 15.842262268066406, "learning_rate": 8.922680753395727e-06, "loss": 0.1576, "step": 326500 }, { "epoch": 3.125460778875335, "grad_norm": 5.139595985412598, "learning_rate": 8.919287995915121e-06, "loss": 0.1508, "step": 326600 }, { "epoch": 3.125630416749365, "grad_norm": 4.800197124481201, "learning_rate": 8.915895238434515e-06, "loss": 0.1567, "step": 326700 }, { "epoch": 3.1258000546233955, "grad_norm": 0.7250486612319946, "learning_rate": 8.912502480953908e-06, "loss": 0.1542, "step": 326800 }, { "epoch": 3.1259696924974256, "grad_norm": 8.637436866760254, "learning_rate": 8.909109723473302e-06, "loss": 0.1686, "step": 326900 }, { "epoch": 3.126139330371456, "grad_norm": 32.78266525268555, "learning_rate": 8.905716965992696e-06, "loss": 0.1416, "step": 327000 }, { "epoch": 3.1263089682454863, "grad_norm": 2.3818910121917725, "learning_rate": 8.90232420851209e-06, "loss": 0.1399, "step": 327100 }, { "epoch": 3.126478606119517, "grad_norm": 20.868663787841797, "learning_rate": 8.898931451031484e-06, "loss": 0.1533, "step": 327200 }, { "epoch": 3.126648243993547, "grad_norm": 5.316283702850342, "learning_rate": 8.895538693550877e-06, "loss": 0.1477, "step": 327300 }, { "epoch": 3.126817881867577, "grad_norm": 4.440040111541748, "learning_rate": 8.892145936070271e-06, "loss": 0.1495, "step": 327400 }, { "epoch": 3.1269875197416077, "grad_norm": 6.1746015548706055, "learning_rate": 8.888753178589665e-06, "loss": 0.1374, "step": 327500 }, { "epoch": 3.127157157615638, "grad_norm": 9.59041690826416, "learning_rate": 8.885360421109059e-06, "loss": 0.1549, "step": 327600 }, { "epoch": 3.1273267954896684, "grad_norm": 4.135430812835693, "learning_rate": 8.881967663628454e-06, "loss": 0.1545, "step": 327700 }, { "epoch": 3.1274964333636985, "grad_norm": 16.396940231323242, "learning_rate": 8.878574906147846e-06, "loss": 0.1319, "step": 327800 }, { "epoch": 3.1276660712377287, "grad_norm": 10.399099349975586, "learning_rate": 8.87518214866724e-06, "loss": 0.1508, "step": 327900 }, { "epoch": 3.1278357091117592, "grad_norm": 8.851381301879883, "learning_rate": 8.871789391186634e-06, "loss": 0.1558, "step": 328000 }, { "epoch": 3.1280053469857894, "grad_norm": 9.997504234313965, "learning_rate": 8.86839663370603e-06, "loss": 0.1574, "step": 328100 }, { "epoch": 3.12817498485982, "grad_norm": 5.497226238250732, "learning_rate": 8.865003876225422e-06, "loss": 0.1551, "step": 328200 }, { "epoch": 3.12834462273385, "grad_norm": 19.404802322387695, "learning_rate": 8.861611118744815e-06, "loss": 0.1387, "step": 328300 }, { "epoch": 3.12851426060788, "grad_norm": 3.1415412425994873, "learning_rate": 8.858218361264211e-06, "loss": 0.1591, "step": 328400 }, { "epoch": 3.1286838984819108, "grad_norm": 2.0446486473083496, "learning_rate": 8.854825603783603e-06, "loss": 0.1471, "step": 328500 }, { "epoch": 3.128853536355941, "grad_norm": 18.815673828125, "learning_rate": 8.851432846302997e-06, "loss": 0.1405, "step": 328600 }, { "epoch": 3.1290231742299714, "grad_norm": 10.953410148620605, "learning_rate": 8.848040088822392e-06, "loss": 0.1529, "step": 328700 }, { "epoch": 3.1291928121040016, "grad_norm": 8.908870697021484, "learning_rate": 8.844647331341786e-06, "loss": 0.1296, "step": 328800 }, { "epoch": 3.1293624499780317, "grad_norm": 10.434511184692383, "learning_rate": 8.841254573861178e-06, "loss": 0.1658, "step": 328900 }, { "epoch": 3.1295320878520623, "grad_norm": 9.398686408996582, "learning_rate": 8.837861816380574e-06, "loss": 0.1766, "step": 329000 }, { "epoch": 3.1297017257260924, "grad_norm": 7.90044641494751, "learning_rate": 8.834469058899967e-06, "loss": 0.1456, "step": 329100 }, { "epoch": 3.129871363600123, "grad_norm": 16.060773849487305, "learning_rate": 8.831076301419361e-06, "loss": 0.1674, "step": 329200 }, { "epoch": 3.130041001474153, "grad_norm": 5.409017086029053, "learning_rate": 8.827683543938755e-06, "loss": 0.1637, "step": 329300 }, { "epoch": 3.130210639348183, "grad_norm": 9.413041114807129, "learning_rate": 8.824290786458149e-06, "loss": 0.1567, "step": 329400 }, { "epoch": 3.130380277222214, "grad_norm": 10.294463157653809, "learning_rate": 8.820898028977543e-06, "loss": 0.1642, "step": 329500 }, { "epoch": 3.130549915096244, "grad_norm": 10.808913230895996, "learning_rate": 8.817505271496936e-06, "loss": 0.1411, "step": 329600 }, { "epoch": 3.1307195529702745, "grad_norm": 6.416714191436768, "learning_rate": 8.81411251401633e-06, "loss": 0.1517, "step": 329700 }, { "epoch": 3.1308891908443046, "grad_norm": 18.73712730407715, "learning_rate": 8.810719756535724e-06, "loss": 0.1585, "step": 329800 }, { "epoch": 3.131058828718335, "grad_norm": 19.18730354309082, "learning_rate": 8.807326999055118e-06, "loss": 0.1598, "step": 329900 }, { "epoch": 3.1312284665923653, "grad_norm": 14.474937438964844, "learning_rate": 8.803934241574512e-06, "loss": 0.1467, "step": 330000 }, { "epoch": 3.1313981044663954, "grad_norm": 2.1220431327819824, "learning_rate": 8.800541484093905e-06, "loss": 0.1529, "step": 330100 }, { "epoch": 3.131567742340426, "grad_norm": 11.744819641113281, "learning_rate": 8.7971487266133e-06, "loss": 0.1492, "step": 330200 }, { "epoch": 3.131737380214456, "grad_norm": 1.2355562448501587, "learning_rate": 8.793755969132693e-06, "loss": 0.1514, "step": 330300 }, { "epoch": 3.1319070180884867, "grad_norm": 10.149300575256348, "learning_rate": 8.790363211652087e-06, "loss": 0.1578, "step": 330400 }, { "epoch": 3.132076655962517, "grad_norm": 3.975703001022339, "learning_rate": 8.78697045417148e-06, "loss": 0.1387, "step": 330500 }, { "epoch": 3.132246293836547, "grad_norm": 5.372679233551025, "learning_rate": 8.783577696690874e-06, "loss": 0.151, "step": 330600 }, { "epoch": 3.1324159317105775, "grad_norm": 4.228768825531006, "learning_rate": 8.780184939210268e-06, "loss": 0.1395, "step": 330700 }, { "epoch": 3.1325855695846077, "grad_norm": 14.677135467529297, "learning_rate": 8.776792181729662e-06, "loss": 0.1418, "step": 330800 }, { "epoch": 3.1327552074586382, "grad_norm": 6.7939324378967285, "learning_rate": 8.773399424249056e-06, "loss": 0.1472, "step": 330900 }, { "epoch": 3.1329248453326684, "grad_norm": 7.787801742553711, "learning_rate": 8.77000666676845e-06, "loss": 0.1622, "step": 331000 }, { "epoch": 3.1330944832066985, "grad_norm": 10.322552680969238, "learning_rate": 8.766613909287845e-06, "loss": 0.1448, "step": 331100 }, { "epoch": 3.133264121080729, "grad_norm": 14.07001781463623, "learning_rate": 8.763221151807237e-06, "loss": 0.1448, "step": 331200 }, { "epoch": 3.133433758954759, "grad_norm": 10.692039489746094, "learning_rate": 8.759828394326631e-06, "loss": 0.1608, "step": 331300 }, { "epoch": 3.1336033968287897, "grad_norm": 5.534763813018799, "learning_rate": 8.756435636846026e-06, "loss": 0.1541, "step": 331400 }, { "epoch": 3.13377303470282, "grad_norm": 8.916943550109863, "learning_rate": 8.75304287936542e-06, "loss": 0.1644, "step": 331500 }, { "epoch": 3.13394267257685, "grad_norm": 4.890268325805664, "learning_rate": 8.749650121884812e-06, "loss": 0.1663, "step": 331600 }, { "epoch": 3.1341123104508806, "grad_norm": 6.1551032066345215, "learning_rate": 8.746257364404208e-06, "loss": 0.1459, "step": 331700 }, { "epoch": 3.1342819483249107, "grad_norm": 3.5279381275177, "learning_rate": 8.742864606923602e-06, "loss": 0.1397, "step": 331800 }, { "epoch": 3.1344515861989413, "grad_norm": 5.114406108856201, "learning_rate": 8.739471849442994e-06, "loss": 0.1414, "step": 331900 }, { "epoch": 3.1346212240729714, "grad_norm": 5.314475059509277, "learning_rate": 8.736079091962388e-06, "loss": 0.1598, "step": 332000 }, { "epoch": 3.1347908619470015, "grad_norm": 2.272326707839966, "learning_rate": 8.732686334481783e-06, "loss": 0.1707, "step": 332100 }, { "epoch": 3.134960499821032, "grad_norm": 10.833784103393555, "learning_rate": 8.729293577001177e-06, "loss": 0.1438, "step": 332200 }, { "epoch": 3.135130137695062, "grad_norm": 7.149417877197266, "learning_rate": 8.725900819520569e-06, "loss": 0.1592, "step": 332300 }, { "epoch": 3.135299775569093, "grad_norm": 10.843116760253906, "learning_rate": 8.722508062039964e-06, "loss": 0.1534, "step": 332400 }, { "epoch": 3.135469413443123, "grad_norm": 13.283733367919922, "learning_rate": 8.719115304559358e-06, "loss": 0.144, "step": 332500 }, { "epoch": 3.1356390513171535, "grad_norm": 22.778839111328125, "learning_rate": 8.715722547078752e-06, "loss": 0.1573, "step": 332600 }, { "epoch": 3.1358086891911836, "grad_norm": 9.50833797454834, "learning_rate": 8.712329789598146e-06, "loss": 0.1375, "step": 332700 }, { "epoch": 3.1359783270652137, "grad_norm": 12.108532905578613, "learning_rate": 8.70893703211754e-06, "loss": 0.141, "step": 332800 }, { "epoch": 3.1361479649392443, "grad_norm": 10.512304306030273, "learning_rate": 8.705544274636933e-06, "loss": 0.155, "step": 332900 }, { "epoch": 3.1363176028132744, "grad_norm": 12.338032722473145, "learning_rate": 8.702151517156327e-06, "loss": 0.1382, "step": 333000 }, { "epoch": 3.136487240687305, "grad_norm": 8.894229888916016, "learning_rate": 8.698758759675721e-06, "loss": 0.1571, "step": 333100 }, { "epoch": 3.136656878561335, "grad_norm": 23.01858139038086, "learning_rate": 8.695366002195115e-06, "loss": 0.1385, "step": 333200 }, { "epoch": 3.1368265164353653, "grad_norm": 5.573966979980469, "learning_rate": 8.691973244714509e-06, "loss": 0.1265, "step": 333300 }, { "epoch": 3.136996154309396, "grad_norm": 16.47337532043457, "learning_rate": 8.688580487233902e-06, "loss": 0.1517, "step": 333400 }, { "epoch": 3.137165792183426, "grad_norm": 4.0820088386535645, "learning_rate": 8.685187729753296e-06, "loss": 0.1515, "step": 333500 }, { "epoch": 3.1373354300574565, "grad_norm": 1.55823814868927, "learning_rate": 8.68179497227269e-06, "loss": 0.1478, "step": 333600 }, { "epoch": 3.1375050679314866, "grad_norm": 6.9192795753479, "learning_rate": 8.678402214792084e-06, "loss": 0.1502, "step": 333700 }, { "epoch": 3.1376747058055168, "grad_norm": 1.5402929782867432, "learning_rate": 8.675009457311478e-06, "loss": 0.1649, "step": 333800 }, { "epoch": 3.1378443436795473, "grad_norm": 15.63942813873291, "learning_rate": 8.671616699830871e-06, "loss": 0.1467, "step": 333900 }, { "epoch": 3.1380139815535775, "grad_norm": 10.198447227478027, "learning_rate": 8.668223942350265e-06, "loss": 0.1503, "step": 334000 }, { "epoch": 3.138183619427608, "grad_norm": 4.773581504821777, "learning_rate": 8.664831184869659e-06, "loss": 0.1338, "step": 334100 }, { "epoch": 3.138353257301638, "grad_norm": 13.391327857971191, "learning_rate": 8.661438427389053e-06, "loss": 0.1582, "step": 334200 }, { "epoch": 3.1385228951756683, "grad_norm": 13.790149688720703, "learning_rate": 8.658045669908447e-06, "loss": 0.1323, "step": 334300 }, { "epoch": 3.138692533049699, "grad_norm": 10.463839530944824, "learning_rate": 8.65465291242784e-06, "loss": 0.1514, "step": 334400 }, { "epoch": 3.138862170923729, "grad_norm": 1.6945717334747314, "learning_rate": 8.651260154947236e-06, "loss": 0.1388, "step": 334500 }, { "epoch": 3.1390318087977596, "grad_norm": 12.751996040344238, "learning_rate": 8.647867397466628e-06, "loss": 0.1535, "step": 334600 }, { "epoch": 3.1392014466717897, "grad_norm": 2.94106125831604, "learning_rate": 8.644474639986022e-06, "loss": 0.1556, "step": 334700 }, { "epoch": 3.13937108454582, "grad_norm": 6.561633110046387, "learning_rate": 8.641081882505417e-06, "loss": 0.1544, "step": 334800 }, { "epoch": 3.1395407224198504, "grad_norm": 3.846176862716675, "learning_rate": 8.637689125024811e-06, "loss": 0.1539, "step": 334900 }, { "epoch": 3.1397103602938805, "grad_norm": 10.778132438659668, "learning_rate": 8.634296367544203e-06, "loss": 0.1627, "step": 335000 }, { "epoch": 3.139879998167911, "grad_norm": 8.251314163208008, "learning_rate": 8.630903610063599e-06, "loss": 0.1423, "step": 335100 }, { "epoch": 3.140049636041941, "grad_norm": 3.559891939163208, "learning_rate": 8.627510852582992e-06, "loss": 0.1487, "step": 335200 }, { "epoch": 3.1402192739159718, "grad_norm": 5.410735607147217, "learning_rate": 8.624118095102386e-06, "loss": 0.1494, "step": 335300 }, { "epoch": 3.140388911790002, "grad_norm": 5.394093036651611, "learning_rate": 8.62072533762178e-06, "loss": 0.1497, "step": 335400 }, { "epoch": 3.140558549664032, "grad_norm": 5.447688102722168, "learning_rate": 8.617332580141174e-06, "loss": 0.1586, "step": 335500 }, { "epoch": 3.1407281875380626, "grad_norm": 5.747609615325928, "learning_rate": 8.613939822660568e-06, "loss": 0.1484, "step": 335600 }, { "epoch": 3.1408978254120927, "grad_norm": 9.34107494354248, "learning_rate": 8.61054706517996e-06, "loss": 0.155, "step": 335700 }, { "epoch": 3.1410674632861233, "grad_norm": 11.651909828186035, "learning_rate": 8.607154307699355e-06, "loss": 0.156, "step": 335800 }, { "epoch": 3.1412371011601534, "grad_norm": 7.69114351272583, "learning_rate": 8.603761550218749e-06, "loss": 0.1588, "step": 335900 }, { "epoch": 3.1414067390341835, "grad_norm": 5.9808197021484375, "learning_rate": 8.600368792738143e-06, "loss": 0.1457, "step": 336000 }, { "epoch": 3.141576376908214, "grad_norm": 14.738529205322266, "learning_rate": 8.596976035257537e-06, "loss": 0.1444, "step": 336100 }, { "epoch": 3.1417460147822442, "grad_norm": 6.872000217437744, "learning_rate": 8.59358327777693e-06, "loss": 0.1539, "step": 336200 }, { "epoch": 3.141915652656275, "grad_norm": 16.223583221435547, "learning_rate": 8.590190520296324e-06, "loss": 0.147, "step": 336300 }, { "epoch": 3.142085290530305, "grad_norm": 1.6831421852111816, "learning_rate": 8.586797762815718e-06, "loss": 0.1456, "step": 336400 }, { "epoch": 3.142254928404335, "grad_norm": 3.8319926261901855, "learning_rate": 8.583405005335112e-06, "loss": 0.1556, "step": 336500 }, { "epoch": 3.1424245662783656, "grad_norm": 4.042234420776367, "learning_rate": 8.580012247854506e-06, "loss": 0.1493, "step": 336600 }, { "epoch": 3.1425942041523958, "grad_norm": 10.577070236206055, "learning_rate": 8.5766194903739e-06, "loss": 0.1535, "step": 336700 }, { "epoch": 3.1427638420264263, "grad_norm": 2.1118826866149902, "learning_rate": 8.573226732893293e-06, "loss": 0.1502, "step": 336800 }, { "epoch": 3.142858839235883, "eval_accuracy": 0.8158623098833773, "eval_f1": 0.8736179377189995, "eval_loss": 0.6319847106933594, "eval_runtime": 385.1385, "eval_samples_per_second": 869.625, "eval_steps_per_second": 27.177, "step": 336856 }, { "epoch": 4.0000746406645735, "grad_norm": 1.4753037691116333, "learning_rate": 8.569833975412687e-06, "loss": 0.1169, "step": 336900 }, { "epoch": 4.000244278538603, "grad_norm": 25.817707061767578, "learning_rate": 8.56644121793208e-06, "loss": 0.1312, "step": 337000 }, { "epoch": 4.000413916412634, "grad_norm": 6.65255880355835, "learning_rate": 8.563048460451475e-06, "loss": 0.1144, "step": 337100 }, { "epoch": 4.000583554286664, "grad_norm": 7.5764055252075195, "learning_rate": 8.55965570297087e-06, "loss": 0.1126, "step": 337200 }, { "epoch": 4.000753192160695, "grad_norm": 2.29899263381958, "learning_rate": 8.556262945490262e-06, "loss": 0.1273, "step": 337300 }, { "epoch": 4.000922830034725, "grad_norm": 7.7006330490112305, "learning_rate": 8.552870188009656e-06, "loss": 0.1256, "step": 337400 }, { "epoch": 4.001092467908755, "grad_norm": 8.635316848754883, "learning_rate": 8.549477430529051e-06, "loss": 0.1302, "step": 337500 }, { "epoch": 4.001262105782786, "grad_norm": 3.5421018600463867, "learning_rate": 8.546084673048444e-06, "loss": 0.1019, "step": 337600 }, { "epoch": 4.001431743656815, "grad_norm": 2.1323189735412598, "learning_rate": 8.542691915567837e-06, "loss": 0.1311, "step": 337700 }, { "epoch": 4.001601381530846, "grad_norm": 12.08141803741455, "learning_rate": 8.539299158087233e-06, "loss": 0.1271, "step": 337800 }, { "epoch": 4.001771019404877, "grad_norm": 4.371209621429443, "learning_rate": 8.535906400606627e-06, "loss": 0.1209, "step": 337900 }, { "epoch": 4.001940657278907, "grad_norm": 20.55004119873047, "learning_rate": 8.532513643126019e-06, "loss": 0.1123, "step": 338000 }, { "epoch": 4.002110295152937, "grad_norm": 6.263615131378174, "learning_rate": 8.529120885645412e-06, "loss": 0.124, "step": 338100 }, { "epoch": 4.002279933026967, "grad_norm": 5.724860191345215, "learning_rate": 8.525728128164808e-06, "loss": 0.1359, "step": 338200 }, { "epoch": 4.002449570900998, "grad_norm": 14.596488952636719, "learning_rate": 8.522335370684202e-06, "loss": 0.1397, "step": 338300 }, { "epoch": 4.002619208775028, "grad_norm": 5.167860984802246, "learning_rate": 8.518942613203594e-06, "loss": 0.1299, "step": 338400 }, { "epoch": 4.002788846649058, "grad_norm": 18.229463577270508, "learning_rate": 8.51554985572299e-06, "loss": 0.1112, "step": 338500 }, { "epoch": 4.002958484523089, "grad_norm": 1.9730621576309204, "learning_rate": 8.512157098242383e-06, "loss": 0.1289, "step": 338600 }, { "epoch": 4.0031281223971185, "grad_norm": 6.861379623413086, "learning_rate": 8.508764340761777e-06, "loss": 0.1177, "step": 338700 }, { "epoch": 4.003297760271149, "grad_norm": 8.073102951049805, "learning_rate": 8.50537158328117e-06, "loss": 0.1207, "step": 338800 }, { "epoch": 4.00346739814518, "grad_norm": 14.724177360534668, "learning_rate": 8.501978825800565e-06, "loss": 0.1178, "step": 338900 }, { "epoch": 4.00363703601921, "grad_norm": 25.075544357299805, "learning_rate": 8.498586068319958e-06, "loss": 0.1241, "step": 339000 }, { "epoch": 4.00380667389324, "grad_norm": 15.72407054901123, "learning_rate": 8.495193310839352e-06, "loss": 0.1318, "step": 339100 }, { "epoch": 4.00397631176727, "grad_norm": 2.8275187015533447, "learning_rate": 8.491800553358746e-06, "loss": 0.1065, "step": 339200 }, { "epoch": 4.004145949641301, "grad_norm": 16.065927505493164, "learning_rate": 8.48840779587814e-06, "loss": 0.1162, "step": 339300 }, { "epoch": 4.004315587515331, "grad_norm": 13.263042449951172, "learning_rate": 8.485015038397534e-06, "loss": 0.1246, "step": 339400 }, { "epoch": 4.004485225389361, "grad_norm": 4.651915073394775, "learning_rate": 8.481622280916927e-06, "loss": 0.1249, "step": 339500 }, { "epoch": 4.004654863263392, "grad_norm": 4.318575859069824, "learning_rate": 8.478229523436321e-06, "loss": 0.1147, "step": 339600 }, { "epoch": 4.004824501137422, "grad_norm": 11.587611198425293, "learning_rate": 8.474836765955715e-06, "loss": 0.1279, "step": 339700 }, { "epoch": 4.004994139011452, "grad_norm": 5.598023414611816, "learning_rate": 8.471444008475109e-06, "loss": 0.1209, "step": 339800 }, { "epoch": 4.005163776885483, "grad_norm": 11.661540985107422, "learning_rate": 8.468051250994502e-06, "loss": 0.1268, "step": 339900 }, { "epoch": 4.005333414759513, "grad_norm": 31.349014282226562, "learning_rate": 8.464658493513896e-06, "loss": 0.1228, "step": 340000 }, { "epoch": 4.005503052633543, "grad_norm": 59.5808219909668, "learning_rate": 8.46126573603329e-06, "loss": 0.1296, "step": 340100 }, { "epoch": 4.0056726905075735, "grad_norm": 6.532795429229736, "learning_rate": 8.457872978552686e-06, "loss": 0.1222, "step": 340200 }, { "epoch": 4.005842328381604, "grad_norm": 9.745274543762207, "learning_rate": 8.454480221072078e-06, "loss": 0.1188, "step": 340300 }, { "epoch": 4.006011966255634, "grad_norm": 23.338224411010742, "learning_rate": 8.451087463591471e-06, "loss": 0.1475, "step": 340400 }, { "epoch": 4.006181604129664, "grad_norm": 4.777886867523193, "learning_rate": 8.447694706110865e-06, "loss": 0.105, "step": 340500 }, { "epoch": 4.006351242003695, "grad_norm": 4.203643321990967, "learning_rate": 8.44430194863026e-06, "loss": 0.118, "step": 340600 }, { "epoch": 4.006520879877725, "grad_norm": 8.315308570861816, "learning_rate": 8.440909191149653e-06, "loss": 0.1328, "step": 340700 }, { "epoch": 4.006690517751755, "grad_norm": 4.954531669616699, "learning_rate": 8.437516433669047e-06, "loss": 0.1292, "step": 340800 }, { "epoch": 4.006860155625786, "grad_norm": 11.588252067565918, "learning_rate": 8.434123676188442e-06, "loss": 0.1126, "step": 340900 }, { "epoch": 4.007029793499816, "grad_norm": 7.825423240661621, "learning_rate": 8.430730918707834e-06, "loss": 0.1089, "step": 341000 }, { "epoch": 4.007199431373846, "grad_norm": 23.660751342773438, "learning_rate": 8.427338161227228e-06, "loss": 0.1128, "step": 341100 }, { "epoch": 4.0073690692478765, "grad_norm": 30.48399543762207, "learning_rate": 8.423945403746624e-06, "loss": 0.1037, "step": 341200 }, { "epoch": 4.007538707121907, "grad_norm": 24.171367645263672, "learning_rate": 8.420552646266017e-06, "loss": 0.1353, "step": 341300 }, { "epoch": 4.007708344995937, "grad_norm": 42.28245544433594, "learning_rate": 8.41715988878541e-06, "loss": 0.1225, "step": 341400 }, { "epoch": 4.007877982869967, "grad_norm": 13.029651641845703, "learning_rate": 8.413767131304805e-06, "loss": 0.137, "step": 341500 }, { "epoch": 4.008047620743998, "grad_norm": 7.880374431610107, "learning_rate": 8.410374373824199e-06, "loss": 0.118, "step": 341600 }, { "epoch": 4.0082172586180285, "grad_norm": 8.456289291381836, "learning_rate": 8.406981616343593e-06, "loss": 0.1332, "step": 341700 }, { "epoch": 4.008386896492058, "grad_norm": 14.888591766357422, "learning_rate": 8.403588858862986e-06, "loss": 0.1408, "step": 341800 }, { "epoch": 4.008556534366089, "grad_norm": 20.587812423706055, "learning_rate": 8.40019610138238e-06, "loss": 0.1222, "step": 341900 }, { "epoch": 4.008726172240119, "grad_norm": 2.22739315032959, "learning_rate": 8.396803343901774e-06, "loss": 0.1258, "step": 342000 }, { "epoch": 4.008895810114149, "grad_norm": 6.209226608276367, "learning_rate": 8.393410586421168e-06, "loss": 0.1188, "step": 342100 }, { "epoch": 4.0090654479881795, "grad_norm": 2.9117813110351562, "learning_rate": 8.390017828940561e-06, "loss": 0.1128, "step": 342200 }, { "epoch": 4.00923508586221, "grad_norm": 6.999342441558838, "learning_rate": 8.386625071459955e-06, "loss": 0.1282, "step": 342300 }, { "epoch": 4.009404723736241, "grad_norm": 17.391786575317383, "learning_rate": 8.383232313979349e-06, "loss": 0.1037, "step": 342400 }, { "epoch": 4.00957436161027, "grad_norm": 12.371906280517578, "learning_rate": 8.379839556498743e-06, "loss": 0.1354, "step": 342500 }, { "epoch": 4.009743999484301, "grad_norm": 26.41329574584961, "learning_rate": 8.376446799018137e-06, "loss": 0.1278, "step": 342600 }, { "epoch": 4.0099136373583315, "grad_norm": 19.788253784179688, "learning_rate": 8.37305404153753e-06, "loss": 0.1185, "step": 342700 }, { "epoch": 4.010083275232361, "grad_norm": 10.428492546081543, "learning_rate": 8.369661284056924e-06, "loss": 0.1107, "step": 342800 }, { "epoch": 4.010252913106392, "grad_norm": 9.066603660583496, "learning_rate": 8.366268526576318e-06, "loss": 0.1305, "step": 342900 }, { "epoch": 4.010422550980422, "grad_norm": 14.27758502960205, "learning_rate": 8.362875769095712e-06, "loss": 0.123, "step": 343000 }, { "epoch": 4.010592188854452, "grad_norm": 9.269153594970703, "learning_rate": 8.359483011615106e-06, "loss": 0.141, "step": 343100 }, { "epoch": 4.010761826728483, "grad_norm": 18.361230850219727, "learning_rate": 8.3560902541345e-06, "loss": 0.1355, "step": 343200 }, { "epoch": 4.010931464602513, "grad_norm": 19.991859436035156, "learning_rate": 8.352697496653893e-06, "loss": 0.1054, "step": 343300 }, { "epoch": 4.011101102476544, "grad_norm": 20.192245483398438, "learning_rate": 8.349304739173287e-06, "loss": 0.1324, "step": 343400 }, { "epoch": 4.011270740350573, "grad_norm": 8.221390724182129, "learning_rate": 8.34591198169268e-06, "loss": 0.1211, "step": 343500 }, { "epoch": 4.011440378224604, "grad_norm": 13.013278007507324, "learning_rate": 8.342519224212076e-06, "loss": 0.1102, "step": 343600 }, { "epoch": 4.0116100160986345, "grad_norm": 11.90462589263916, "learning_rate": 8.339126466731468e-06, "loss": 0.1181, "step": 343700 }, { "epoch": 4.011779653972664, "grad_norm": 3.2053580284118652, "learning_rate": 8.335733709250862e-06, "loss": 0.1232, "step": 343800 }, { "epoch": 4.011949291846695, "grad_norm": 7.971137523651123, "learning_rate": 8.332340951770258e-06, "loss": 0.1212, "step": 343900 }, { "epoch": 4.012118929720725, "grad_norm": 16.3214054107666, "learning_rate": 8.328948194289652e-06, "loss": 0.1178, "step": 344000 }, { "epoch": 4.012288567594755, "grad_norm": 5.22127103805542, "learning_rate": 8.325555436809044e-06, "loss": 0.1255, "step": 344100 }, { "epoch": 4.012458205468786, "grad_norm": 15.117521286010742, "learning_rate": 8.322162679328437e-06, "loss": 0.136, "step": 344200 }, { "epoch": 4.012627843342816, "grad_norm": 3.054384231567383, "learning_rate": 8.318769921847833e-06, "loss": 0.1275, "step": 344300 }, { "epoch": 4.012797481216847, "grad_norm": 8.193516731262207, "learning_rate": 8.315377164367225e-06, "loss": 0.1345, "step": 344400 }, { "epoch": 4.0129671190908764, "grad_norm": 14.622005462646484, "learning_rate": 8.311984406886619e-06, "loss": 0.1321, "step": 344500 }, { "epoch": 4.013136756964907, "grad_norm": 1.3674663305282593, "learning_rate": 8.308591649406014e-06, "loss": 0.1214, "step": 344600 }, { "epoch": 4.013306394838938, "grad_norm": 7.983667373657227, "learning_rate": 8.305198891925408e-06, "loss": 0.113, "step": 344700 }, { "epoch": 4.013476032712967, "grad_norm": 10.49072551727295, "learning_rate": 8.3018061344448e-06, "loss": 0.1159, "step": 344800 }, { "epoch": 4.013645670586998, "grad_norm": 12.936163902282715, "learning_rate": 8.298413376964196e-06, "loss": 0.1377, "step": 344900 }, { "epoch": 4.013815308461028, "grad_norm": 11.379878997802734, "learning_rate": 8.29502061948359e-06, "loss": 0.1151, "step": 345000 }, { "epoch": 4.013984946335059, "grad_norm": 1.9266237020492554, "learning_rate": 8.291627862002983e-06, "loss": 0.1266, "step": 345100 }, { "epoch": 4.014154584209089, "grad_norm": 9.137473106384277, "learning_rate": 8.288235104522377e-06, "loss": 0.1155, "step": 345200 }, { "epoch": 4.014324222083119, "grad_norm": 1.0795109272003174, "learning_rate": 8.284842347041771e-06, "loss": 0.1158, "step": 345300 }, { "epoch": 4.01449385995715, "grad_norm": 9.868939399719238, "learning_rate": 8.281449589561165e-06, "loss": 0.1322, "step": 345400 }, { "epoch": 4.0146634978311795, "grad_norm": 8.113648414611816, "learning_rate": 8.278056832080558e-06, "loss": 0.1262, "step": 345500 }, { "epoch": 4.01483313570521, "grad_norm": 44.553653717041016, "learning_rate": 8.274664074599952e-06, "loss": 0.1082, "step": 345600 }, { "epoch": 4.015002773579241, "grad_norm": 24.069320678710938, "learning_rate": 8.271271317119346e-06, "loss": 0.1367, "step": 345700 }, { "epoch": 4.01517241145327, "grad_norm": 6.197901248931885, "learning_rate": 8.26787855963874e-06, "loss": 0.1179, "step": 345800 }, { "epoch": 4.015342049327301, "grad_norm": 6.067958831787109, "learning_rate": 8.264485802158134e-06, "loss": 0.1133, "step": 345900 }, { "epoch": 4.0155116872013314, "grad_norm": 15.382827758789062, "learning_rate": 8.261093044677527e-06, "loss": 0.1234, "step": 346000 }, { "epoch": 4.015681325075362, "grad_norm": 11.560909271240234, "learning_rate": 8.257700287196921e-06, "loss": 0.1174, "step": 346100 }, { "epoch": 4.015850962949392, "grad_norm": 18.342487335205078, "learning_rate": 8.254307529716315e-06, "loss": 0.1219, "step": 346200 }, { "epoch": 4.016020600823422, "grad_norm": 13.920736312866211, "learning_rate": 8.250914772235709e-06, "loss": 0.1242, "step": 346300 }, { "epoch": 4.016190238697453, "grad_norm": 2.9301042556762695, "learning_rate": 8.247522014755103e-06, "loss": 0.1304, "step": 346400 }, { "epoch": 4.0163598765714825, "grad_norm": 14.16393756866455, "learning_rate": 8.244129257274496e-06, "loss": 0.1161, "step": 346500 }, { "epoch": 4.016529514445513, "grad_norm": 9.511248588562012, "learning_rate": 8.24073649979389e-06, "loss": 0.1392, "step": 346600 }, { "epoch": 4.016699152319544, "grad_norm": 11.840932846069336, "learning_rate": 8.237343742313284e-06, "loss": 0.1173, "step": 346700 }, { "epoch": 4.016868790193573, "grad_norm": 10.93812370300293, "learning_rate": 8.233950984832678e-06, "loss": 0.1225, "step": 346800 }, { "epoch": 4.017038428067604, "grad_norm": 10.380459785461426, "learning_rate": 8.230558227352072e-06, "loss": 0.1177, "step": 346900 }, { "epoch": 4.0172080659416345, "grad_norm": 2.376262664794922, "learning_rate": 8.227165469871467e-06, "loss": 0.1312, "step": 347000 }, { "epoch": 4.017377703815665, "grad_norm": 6.312952041625977, "learning_rate": 8.22377271239086e-06, "loss": 0.1294, "step": 347100 }, { "epoch": 4.017547341689695, "grad_norm": 8.470458984375, "learning_rate": 8.220379954910253e-06, "loss": 0.1156, "step": 347200 }, { "epoch": 4.017716979563725, "grad_norm": 8.567611694335938, "learning_rate": 8.216987197429648e-06, "loss": 0.1254, "step": 347300 }, { "epoch": 4.017886617437756, "grad_norm": 12.712966918945312, "learning_rate": 8.213594439949042e-06, "loss": 0.129, "step": 347400 }, { "epoch": 4.018056255311786, "grad_norm": 2.8734512329101562, "learning_rate": 8.210201682468434e-06, "loss": 0.1101, "step": 347500 }, { "epoch": 4.018225893185816, "grad_norm": 19.900754928588867, "learning_rate": 8.20680892498783e-06, "loss": 0.1323, "step": 347600 }, { "epoch": 4.018395531059847, "grad_norm": 14.73266315460205, "learning_rate": 8.203416167507224e-06, "loss": 0.1313, "step": 347700 }, { "epoch": 4.018565168933877, "grad_norm": 7.524212837219238, "learning_rate": 8.200023410026617e-06, "loss": 0.1089, "step": 347800 }, { "epoch": 4.018734806807907, "grad_norm": 16.132570266723633, "learning_rate": 8.196630652546011e-06, "loss": 0.1106, "step": 347900 }, { "epoch": 4.0189044446819375, "grad_norm": 7.607890605926514, "learning_rate": 8.193237895065405e-06, "loss": 0.1274, "step": 348000 }, { "epoch": 4.019074082555968, "grad_norm": 3.2361605167388916, "learning_rate": 8.189845137584799e-06, "loss": 0.1262, "step": 348100 }, { "epoch": 4.019243720429998, "grad_norm": 12.859869003295898, "learning_rate": 8.186452380104191e-06, "loss": 0.1131, "step": 348200 }, { "epoch": 4.019413358304028, "grad_norm": 12.418172836303711, "learning_rate": 8.183059622623586e-06, "loss": 0.113, "step": 348300 }, { "epoch": 4.019582996178059, "grad_norm": 4.697198390960693, "learning_rate": 8.17966686514298e-06, "loss": 0.1359, "step": 348400 }, { "epoch": 4.019752634052089, "grad_norm": 33.50651931762695, "learning_rate": 8.176274107662374e-06, "loss": 0.1121, "step": 348500 }, { "epoch": 4.019922271926119, "grad_norm": 17.201929092407227, "learning_rate": 8.172881350181768e-06, "loss": 0.1214, "step": 348600 }, { "epoch": 4.02009190980015, "grad_norm": 1.6107405424118042, "learning_rate": 8.169488592701162e-06, "loss": 0.123, "step": 348700 }, { "epoch": 4.02026154767418, "grad_norm": 7.8145551681518555, "learning_rate": 8.166095835220555e-06, "loss": 0.1253, "step": 348800 }, { "epoch": 4.02043118554821, "grad_norm": 2.854302167892456, "learning_rate": 8.16270307773995e-06, "loss": 0.1357, "step": 348900 }, { "epoch": 4.020600823422241, "grad_norm": 16.306047439575195, "learning_rate": 8.159310320259343e-06, "loss": 0.1362, "step": 349000 }, { "epoch": 4.020770461296271, "grad_norm": 1.1139909029006958, "learning_rate": 8.155917562778737e-06, "loss": 0.1265, "step": 349100 }, { "epoch": 4.020940099170301, "grad_norm": 12.16659164428711, "learning_rate": 8.15252480529813e-06, "loss": 0.1258, "step": 349200 }, { "epoch": 4.021109737044331, "grad_norm": 19.98220443725586, "learning_rate": 8.149132047817524e-06, "loss": 0.1191, "step": 349300 }, { "epoch": 4.021279374918362, "grad_norm": 27.745275497436523, "learning_rate": 8.145739290336918e-06, "loss": 0.1158, "step": 349400 }, { "epoch": 4.021449012792392, "grad_norm": 7.404868125915527, "learning_rate": 8.142346532856312e-06, "loss": 0.1292, "step": 349500 }, { "epoch": 4.021618650666422, "grad_norm": 1.6541012525558472, "learning_rate": 8.138953775375706e-06, "loss": 0.1241, "step": 349600 }, { "epoch": 4.021788288540453, "grad_norm": 4.041659832000732, "learning_rate": 8.1355610178951e-06, "loss": 0.1126, "step": 349700 }, { "epoch": 4.021957926414483, "grad_norm": 6.789789199829102, "learning_rate": 8.132168260414493e-06, "loss": 0.12, "step": 349800 }, { "epoch": 4.022127564288513, "grad_norm": 13.931510925292969, "learning_rate": 8.128775502933887e-06, "loss": 0.1289, "step": 349900 }, { "epoch": 4.022297202162544, "grad_norm": 23.473278045654297, "learning_rate": 8.125382745453283e-06, "loss": 0.1136, "step": 350000 }, { "epoch": 4.022466840036574, "grad_norm": 9.654302597045898, "learning_rate": 8.121989987972675e-06, "loss": 0.1273, "step": 350100 }, { "epoch": 4.022636477910604, "grad_norm": 9.047483444213867, "learning_rate": 8.118597230492069e-06, "loss": 0.1302, "step": 350200 }, { "epoch": 4.022806115784634, "grad_norm": 20.0140323638916, "learning_rate": 8.115204473011464e-06, "loss": 0.123, "step": 350300 }, { "epoch": 4.022975753658665, "grad_norm": 6.775856971740723, "learning_rate": 8.111811715530858e-06, "loss": 0.118, "step": 350400 }, { "epoch": 4.023145391532696, "grad_norm": 10.502120018005371, "learning_rate": 8.10841895805025e-06, "loss": 0.1337, "step": 350500 }, { "epoch": 4.023315029406725, "grad_norm": 6.88502311706543, "learning_rate": 8.105026200569644e-06, "loss": 0.1177, "step": 350600 }, { "epoch": 4.023484667280756, "grad_norm": 1.2850745916366577, "learning_rate": 8.10163344308904e-06, "loss": 0.1343, "step": 350700 }, { "epoch": 4.023654305154786, "grad_norm": 22.36053466796875, "learning_rate": 8.098240685608433e-06, "loss": 0.1112, "step": 350800 }, { "epoch": 4.023823943028816, "grad_norm": 5.1058454513549805, "learning_rate": 8.094847928127825e-06, "loss": 0.1332, "step": 350900 }, { "epoch": 4.023993580902847, "grad_norm": 18.764307022094727, "learning_rate": 8.09145517064722e-06, "loss": 0.1347, "step": 351000 }, { "epoch": 4.024163218776877, "grad_norm": 13.133614540100098, "learning_rate": 8.088062413166614e-06, "loss": 0.1343, "step": 351100 }, { "epoch": 4.024332856650907, "grad_norm": 18.693710327148438, "learning_rate": 8.084669655686008e-06, "loss": 0.1208, "step": 351200 }, { "epoch": 4.0245024945249375, "grad_norm": 11.377923965454102, "learning_rate": 8.081276898205402e-06, "loss": 0.1342, "step": 351300 }, { "epoch": 4.024672132398968, "grad_norm": 23.671886444091797, "learning_rate": 8.077884140724796e-06, "loss": 0.1273, "step": 351400 }, { "epoch": 4.024841770272999, "grad_norm": 5.256482124328613, "learning_rate": 8.07449138324419e-06, "loss": 0.1301, "step": 351500 }, { "epoch": 4.025011408147028, "grad_norm": 3.166217088699341, "learning_rate": 8.071098625763583e-06, "loss": 0.1319, "step": 351600 }, { "epoch": 4.025181046021059, "grad_norm": 4.691116809844971, "learning_rate": 8.067705868282977e-06, "loss": 0.1088, "step": 351700 }, { "epoch": 4.025350683895089, "grad_norm": 5.852519989013672, "learning_rate": 8.064313110802371e-06, "loss": 0.1406, "step": 351800 }, { "epoch": 4.025520321769119, "grad_norm": 12.814229965209961, "learning_rate": 8.060920353321765e-06, "loss": 0.1329, "step": 351900 }, { "epoch": 4.02568995964315, "grad_norm": 22.606258392333984, "learning_rate": 8.057527595841159e-06, "loss": 0.1033, "step": 352000 }, { "epoch": 4.02585959751718, "grad_norm": 10.76799488067627, "learning_rate": 8.054134838360552e-06, "loss": 0.1241, "step": 352100 }, { "epoch": 4.026029235391211, "grad_norm": 7.967154502868652, "learning_rate": 8.050742080879946e-06, "loss": 0.1232, "step": 352200 }, { "epoch": 4.0261988732652405, "grad_norm": 5.316176891326904, "learning_rate": 8.04734932339934e-06, "loss": 0.1259, "step": 352300 }, { "epoch": 4.026368511139271, "grad_norm": 3.9711930751800537, "learning_rate": 8.043956565918734e-06, "loss": 0.1482, "step": 352400 }, { "epoch": 4.026538149013302, "grad_norm": 21.198040008544922, "learning_rate": 8.040563808438128e-06, "loss": 0.1212, "step": 352500 }, { "epoch": 4.026707786887331, "grad_norm": 7.742687702178955, "learning_rate": 8.037171050957521e-06, "loss": 0.1086, "step": 352600 }, { "epoch": 4.026877424761362, "grad_norm": 5.0289459228515625, "learning_rate": 8.033778293476915e-06, "loss": 0.1214, "step": 352700 }, { "epoch": 4.0270470626353925, "grad_norm": 10.210003852844238, "learning_rate": 8.030385535996309e-06, "loss": 0.1315, "step": 352800 }, { "epoch": 4.027216700509422, "grad_norm": 1.7948375940322876, "learning_rate": 8.026992778515703e-06, "loss": 0.1212, "step": 352900 }, { "epoch": 4.027386338383453, "grad_norm": 5.884891510009766, "learning_rate": 8.023600021035096e-06, "loss": 0.1256, "step": 353000 }, { "epoch": 4.027555976257483, "grad_norm": 5.018660545349121, "learning_rate": 8.020207263554492e-06, "loss": 0.1399, "step": 353100 }, { "epoch": 4.027725614131514, "grad_norm": 9.203808784484863, "learning_rate": 8.016814506073884e-06, "loss": 0.1365, "step": 353200 }, { "epoch": 4.0278952520055435, "grad_norm": 12.197157859802246, "learning_rate": 8.013421748593278e-06, "loss": 0.1265, "step": 353300 }, { "epoch": 4.028064889879574, "grad_norm": 8.138416290283203, "learning_rate": 8.010028991112673e-06, "loss": 0.1121, "step": 353400 }, { "epoch": 4.028234527753605, "grad_norm": 8.020289421081543, "learning_rate": 8.006636233632065e-06, "loss": 0.1077, "step": 353500 }, { "epoch": 4.028404165627634, "grad_norm": 0.26797807216644287, "learning_rate": 8.00324347615146e-06, "loss": 0.1292, "step": 353600 }, { "epoch": 4.028573803501665, "grad_norm": 7.19188117980957, "learning_rate": 7.999850718670855e-06, "loss": 0.1264, "step": 353700 }, { "epoch": 4.0287434413756955, "grad_norm": 2.2471537590026855, "learning_rate": 7.996457961190249e-06, "loss": 0.1065, "step": 353800 }, { "epoch": 4.028913079249725, "grad_norm": 8.665640830993652, "learning_rate": 7.99306520370964e-06, "loss": 0.1475, "step": 353900 }, { "epoch": 4.029082717123756, "grad_norm": 9.641529083251953, "learning_rate": 7.989672446229036e-06, "loss": 0.1138, "step": 354000 }, { "epoch": 4.029252354997786, "grad_norm": 4.446402072906494, "learning_rate": 7.98627968874843e-06, "loss": 0.1378, "step": 354100 }, { "epoch": 4.029421992871817, "grad_norm": 22.353796005249023, "learning_rate": 7.982886931267824e-06, "loss": 0.1147, "step": 354200 }, { "epoch": 4.029591630745847, "grad_norm": 16.44369888305664, "learning_rate": 7.979494173787218e-06, "loss": 0.1275, "step": 354300 }, { "epoch": 4.029761268619877, "grad_norm": 2.831658363342285, "learning_rate": 7.976101416306611e-06, "loss": 0.1508, "step": 354400 }, { "epoch": 4.029930906493908, "grad_norm": 4.023041725158691, "learning_rate": 7.972708658826005e-06, "loss": 0.1363, "step": 354500 }, { "epoch": 4.030100544367937, "grad_norm": 15.089393615722656, "learning_rate": 7.969315901345399e-06, "loss": 0.1276, "step": 354600 }, { "epoch": 4.030270182241968, "grad_norm": 12.617644309997559, "learning_rate": 7.965923143864793e-06, "loss": 0.1188, "step": 354700 }, { "epoch": 4.0304398201159986, "grad_norm": 25.986032485961914, "learning_rate": 7.962530386384187e-06, "loss": 0.1317, "step": 354800 }, { "epoch": 4.030609457990029, "grad_norm": 4.795702934265137, "learning_rate": 7.95913762890358e-06, "loss": 0.1186, "step": 354900 }, { "epoch": 4.030779095864059, "grad_norm": 11.333296775817871, "learning_rate": 7.955744871422974e-06, "loss": 0.1347, "step": 355000 }, { "epoch": 4.030948733738089, "grad_norm": 3.6900744438171387, "learning_rate": 7.952352113942368e-06, "loss": 0.1139, "step": 355100 }, { "epoch": 4.03111837161212, "grad_norm": 1.382995367050171, "learning_rate": 7.948959356461762e-06, "loss": 0.1154, "step": 355200 }, { "epoch": 4.03128800948615, "grad_norm": 0.8488509654998779, "learning_rate": 7.945566598981155e-06, "loss": 0.1308, "step": 355300 }, { "epoch": 4.03145764736018, "grad_norm": 7.4142165184021, "learning_rate": 7.94217384150055e-06, "loss": 0.1303, "step": 355400 }, { "epoch": 4.031627285234211, "grad_norm": 17.168750762939453, "learning_rate": 7.938781084019943e-06, "loss": 0.1329, "step": 355500 }, { "epoch": 4.0317969231082404, "grad_norm": 18.117557525634766, "learning_rate": 7.935388326539337e-06, "loss": 0.1239, "step": 355600 }, { "epoch": 4.031966560982271, "grad_norm": 2.656669855117798, "learning_rate": 7.93199556905873e-06, "loss": 0.1235, "step": 355700 }, { "epoch": 4.032136198856302, "grad_norm": 10.05706787109375, "learning_rate": 7.928602811578124e-06, "loss": 0.1236, "step": 355800 }, { "epoch": 4.032305836730332, "grad_norm": 2.718982696533203, "learning_rate": 7.925210054097518e-06, "loss": 0.1315, "step": 355900 }, { "epoch": 4.032475474604362, "grad_norm": 22.3010196685791, "learning_rate": 7.921817296616912e-06, "loss": 0.1426, "step": 356000 }, { "epoch": 4.032645112478392, "grad_norm": 6.683438777923584, "learning_rate": 7.918424539136308e-06, "loss": 0.1237, "step": 356100 }, { "epoch": 4.032814750352423, "grad_norm": 19.58755874633789, "learning_rate": 7.9150317816557e-06, "loss": 0.1272, "step": 356200 }, { "epoch": 4.032984388226453, "grad_norm": 12.796217918395996, "learning_rate": 7.911639024175093e-06, "loss": 0.1281, "step": 356300 }, { "epoch": 4.033154026100483, "grad_norm": 9.74236011505127, "learning_rate": 7.908246266694489e-06, "loss": 0.1297, "step": 356400 }, { "epoch": 4.033323663974514, "grad_norm": 3.078132152557373, "learning_rate": 7.904853509213883e-06, "loss": 0.1226, "step": 356500 }, { "epoch": 4.0334933018485435, "grad_norm": 3.1809394359588623, "learning_rate": 7.901460751733275e-06, "loss": 0.1292, "step": 356600 }, { "epoch": 4.033662939722574, "grad_norm": 16.197250366210938, "learning_rate": 7.898067994252669e-06, "loss": 0.1248, "step": 356700 }, { "epoch": 4.033832577596605, "grad_norm": 33.16566848754883, "learning_rate": 7.894675236772064e-06, "loss": 0.132, "step": 356800 }, { "epoch": 4.034002215470635, "grad_norm": 4.770212173461914, "learning_rate": 7.891282479291456e-06, "loss": 0.1276, "step": 356900 }, { "epoch": 4.034171853344665, "grad_norm": 0.21923674643039703, "learning_rate": 7.88788972181085e-06, "loss": 0.1128, "step": 357000 }, { "epoch": 4.0343414912186955, "grad_norm": 3.716287612915039, "learning_rate": 7.884496964330246e-06, "loss": 0.1214, "step": 357100 }, { "epoch": 4.034511129092726, "grad_norm": 0.9230369925498962, "learning_rate": 7.88110420684964e-06, "loss": 0.1433, "step": 357200 }, { "epoch": 4.034680766966756, "grad_norm": 18.84702491760254, "learning_rate": 7.877711449369031e-06, "loss": 0.1211, "step": 357300 }, { "epoch": 4.034850404840786, "grad_norm": 22.713438034057617, "learning_rate": 7.874318691888427e-06, "loss": 0.1077, "step": 357400 }, { "epoch": 4.035020042714817, "grad_norm": 13.04042911529541, "learning_rate": 7.87092593440782e-06, "loss": 0.1353, "step": 357500 }, { "epoch": 4.035189680588847, "grad_norm": 9.89605712890625, "learning_rate": 7.867533176927214e-06, "loss": 0.124, "step": 357600 }, { "epoch": 4.035359318462877, "grad_norm": 9.460679054260254, "learning_rate": 7.864140419446608e-06, "loss": 0.1125, "step": 357700 }, { "epoch": 4.035528956336908, "grad_norm": 9.523773193359375, "learning_rate": 7.860747661966002e-06, "loss": 0.1296, "step": 357800 }, { "epoch": 4.035698594210938, "grad_norm": 6.1430840492248535, "learning_rate": 7.857354904485396e-06, "loss": 0.1253, "step": 357900 }, { "epoch": 4.035868232084968, "grad_norm": 18.14493751525879, "learning_rate": 7.85396214700479e-06, "loss": 0.1188, "step": 358000 }, { "epoch": 4.0360378699589985, "grad_norm": 2.310302972793579, "learning_rate": 7.850569389524183e-06, "loss": 0.1426, "step": 358100 }, { "epoch": 4.036207507833029, "grad_norm": 22.470827102661133, "learning_rate": 7.847176632043577e-06, "loss": 0.1321, "step": 358200 }, { "epoch": 4.036377145707059, "grad_norm": 6.043875217437744, "learning_rate": 7.843783874562971e-06, "loss": 0.134, "step": 358300 }, { "epoch": 4.036546783581089, "grad_norm": 6.1675004959106445, "learning_rate": 7.840391117082365e-06, "loss": 0.1251, "step": 358400 }, { "epoch": 4.03671642145512, "grad_norm": 11.85997486114502, "learning_rate": 7.836998359601759e-06, "loss": 0.1275, "step": 358500 }, { "epoch": 4.0368860593291505, "grad_norm": 26.950542449951172, "learning_rate": 7.833605602121152e-06, "loss": 0.1178, "step": 358600 }, { "epoch": 4.03705569720318, "grad_norm": 13.268157958984375, "learning_rate": 7.830212844640546e-06, "loss": 0.1172, "step": 358700 }, { "epoch": 4.037225335077211, "grad_norm": 1.196342945098877, "learning_rate": 7.82682008715994e-06, "loss": 0.1253, "step": 358800 }, { "epoch": 4.037394972951241, "grad_norm": 19.877538681030273, "learning_rate": 7.823427329679334e-06, "loss": 0.1115, "step": 358900 }, { "epoch": 4.037564610825271, "grad_norm": 11.74163818359375, "learning_rate": 7.820034572198728e-06, "loss": 0.1345, "step": 359000 }, { "epoch": 4.0377342486993015, "grad_norm": 16.080305099487305, "learning_rate": 7.816641814718121e-06, "loss": 0.1248, "step": 359100 }, { "epoch": 4.037903886573332, "grad_norm": 4.524606704711914, "learning_rate": 7.813249057237515e-06, "loss": 0.1329, "step": 359200 }, { "epoch": 4.038073524447362, "grad_norm": 13.553335189819336, "learning_rate": 7.809856299756909e-06, "loss": 0.1055, "step": 359300 }, { "epoch": 4.038243162321392, "grad_norm": 19.220924377441406, "learning_rate": 7.806463542276303e-06, "loss": 0.1331, "step": 359400 }, { "epoch": 4.038412800195423, "grad_norm": 32.390777587890625, "learning_rate": 7.803070784795698e-06, "loss": 0.1314, "step": 359500 }, { "epoch": 4.0385824380694535, "grad_norm": 7.59092378616333, "learning_rate": 7.79967802731509e-06, "loss": 0.1219, "step": 359600 }, { "epoch": 4.038752075943483, "grad_norm": 6.204862594604492, "learning_rate": 7.796285269834484e-06, "loss": 0.1202, "step": 359700 }, { "epoch": 4.038921713817514, "grad_norm": 7.0418314933776855, "learning_rate": 7.79289251235388e-06, "loss": 0.1149, "step": 359800 }, { "epoch": 4.039091351691544, "grad_norm": 2.6479501724243164, "learning_rate": 7.789499754873273e-06, "loss": 0.1175, "step": 359900 }, { "epoch": 4.039260989565574, "grad_norm": 14.36772346496582, "learning_rate": 7.786106997392666e-06, "loss": 0.1329, "step": 360000 }, { "epoch": 4.039430627439605, "grad_norm": 18.620929718017578, "learning_rate": 7.782714239912061e-06, "loss": 0.1242, "step": 360100 }, { "epoch": 4.039600265313635, "grad_norm": 16.722660064697266, "learning_rate": 7.779321482431455e-06, "loss": 0.1274, "step": 360200 }, { "epoch": 4.039769903187666, "grad_norm": 3.4092962741851807, "learning_rate": 7.775928724950849e-06, "loss": 0.1185, "step": 360300 }, { "epoch": 4.039939541061695, "grad_norm": 0.43524035811424255, "learning_rate": 7.772535967470242e-06, "loss": 0.1303, "step": 360400 }, { "epoch": 4.040109178935726, "grad_norm": 4.447220325469971, "learning_rate": 7.769143209989636e-06, "loss": 0.122, "step": 360500 }, { "epoch": 4.0402788168097565, "grad_norm": 2.7123196125030518, "learning_rate": 7.76575045250903e-06, "loss": 0.1426, "step": 360600 }, { "epoch": 4.040448454683786, "grad_norm": 5.453602313995361, "learning_rate": 7.762357695028422e-06, "loss": 0.1117, "step": 360700 }, { "epoch": 4.040618092557817, "grad_norm": 8.66477108001709, "learning_rate": 7.758964937547818e-06, "loss": 0.1335, "step": 360800 }, { "epoch": 4.040787730431847, "grad_norm": 12.53063678741455, "learning_rate": 7.755572180067211e-06, "loss": 0.1273, "step": 360900 }, { "epoch": 4.040957368305877, "grad_norm": 11.013915061950684, "learning_rate": 7.752179422586605e-06, "loss": 0.131, "step": 361000 }, { "epoch": 4.041127006179908, "grad_norm": 12.035384178161621, "learning_rate": 7.748786665105999e-06, "loss": 0.1168, "step": 361100 }, { "epoch": 4.041296644053938, "grad_norm": 3.426048755645752, "learning_rate": 7.745393907625393e-06, "loss": 0.114, "step": 361200 }, { "epoch": 4.041466281927969, "grad_norm": 0.9180991649627686, "learning_rate": 7.742001150144787e-06, "loss": 0.1213, "step": 361300 }, { "epoch": 4.041635919801998, "grad_norm": 14.788309097290039, "learning_rate": 7.73860839266418e-06, "loss": 0.1286, "step": 361400 }, { "epoch": 4.041805557676029, "grad_norm": 7.532767295837402, "learning_rate": 7.735215635183574e-06, "loss": 0.1465, "step": 361500 }, { "epoch": 4.04197519555006, "grad_norm": 2.191800594329834, "learning_rate": 7.731822877702968e-06, "loss": 0.1429, "step": 361600 }, { "epoch": 4.042144833424089, "grad_norm": 7.706140041351318, "learning_rate": 7.728430120222362e-06, "loss": 0.1177, "step": 361700 }, { "epoch": 4.04231447129812, "grad_norm": 22.981639862060547, "learning_rate": 7.725037362741756e-06, "loss": 0.1162, "step": 361800 }, { "epoch": 4.04248410917215, "grad_norm": 1.8569549322128296, "learning_rate": 7.72164460526115e-06, "loss": 0.1311, "step": 361900 }, { "epoch": 4.04265374704618, "grad_norm": 11.40339469909668, "learning_rate": 7.718251847780543e-06, "loss": 0.1293, "step": 362000 }, { "epoch": 4.042823384920211, "grad_norm": 12.146950721740723, "learning_rate": 7.714859090299937e-06, "loss": 0.1201, "step": 362100 }, { "epoch": 4.042993022794241, "grad_norm": 5.323840141296387, "learning_rate": 7.71146633281933e-06, "loss": 0.1153, "step": 362200 }, { "epoch": 4.043162660668272, "grad_norm": 18.146507263183594, "learning_rate": 7.708073575338725e-06, "loss": 0.1498, "step": 362300 }, { "epoch": 4.0433322985423015, "grad_norm": 4.672055244445801, "learning_rate": 7.704680817858118e-06, "loss": 0.11, "step": 362400 }, { "epoch": 4.043501936416332, "grad_norm": 12.007573127746582, "learning_rate": 7.701288060377514e-06, "loss": 0.1157, "step": 362500 }, { "epoch": 4.043671574290363, "grad_norm": 7.508735179901123, "learning_rate": 7.697895302896906e-06, "loss": 0.1188, "step": 362600 }, { "epoch": 4.043841212164392, "grad_norm": 15.135965347290039, "learning_rate": 7.6945025454163e-06, "loss": 0.1289, "step": 362700 }, { "epoch": 4.044010850038423, "grad_norm": 9.50838565826416, "learning_rate": 7.691109787935695e-06, "loss": 0.129, "step": 362800 }, { "epoch": 4.044180487912453, "grad_norm": 8.23912525177002, "learning_rate": 7.687717030455089e-06, "loss": 0.12, "step": 362900 }, { "epoch": 4.044350125786484, "grad_norm": 30.895610809326172, "learning_rate": 7.684324272974481e-06, "loss": 0.1285, "step": 363000 }, { "epoch": 4.044519763660514, "grad_norm": 12.113358497619629, "learning_rate": 7.680931515493875e-06, "loss": 0.1245, "step": 363100 }, { "epoch": 4.044689401534544, "grad_norm": 7.9205193519592285, "learning_rate": 7.67753875801327e-06, "loss": 0.1173, "step": 363200 }, { "epoch": 4.044859039408575, "grad_norm": 2.7667593955993652, "learning_rate": 7.674146000532664e-06, "loss": 0.1102, "step": 363300 }, { "epoch": 4.0450286772826045, "grad_norm": 24.25559425354004, "learning_rate": 7.670753243052056e-06, "loss": 0.1193, "step": 363400 }, { "epoch": 4.045198315156635, "grad_norm": 5.335149765014648, "learning_rate": 7.667360485571452e-06, "loss": 0.1191, "step": 363500 }, { "epoch": 4.045367953030666, "grad_norm": 4.730679035186768, "learning_rate": 7.663967728090846e-06, "loss": 0.1182, "step": 363600 }, { "epoch": 4.045537590904695, "grad_norm": 9.881540298461914, "learning_rate": 7.66057497061024e-06, "loss": 0.1337, "step": 363700 }, { "epoch": 4.045707228778726, "grad_norm": 6.502552032470703, "learning_rate": 7.657182213129633e-06, "loss": 0.1225, "step": 363800 }, { "epoch": 4.0458768666527565, "grad_norm": 1.0621236562728882, "learning_rate": 7.653789455649027e-06, "loss": 0.1293, "step": 363900 }, { "epoch": 4.046046504526787, "grad_norm": 4.250152111053467, "learning_rate": 7.65039669816842e-06, "loss": 0.1195, "step": 364000 }, { "epoch": 4.046216142400817, "grad_norm": 3.230217695236206, "learning_rate": 7.647003940687815e-06, "loss": 0.1237, "step": 364100 }, { "epoch": 4.046385780274847, "grad_norm": 13.442312240600586, "learning_rate": 7.643611183207208e-06, "loss": 0.1156, "step": 364200 }, { "epoch": 4.046555418148878, "grad_norm": 10.13038158416748, "learning_rate": 7.640218425726602e-06, "loss": 0.1338, "step": 364300 }, { "epoch": 4.0467250560229076, "grad_norm": 5.36207389831543, "learning_rate": 7.636825668245996e-06, "loss": 0.1315, "step": 364400 }, { "epoch": 4.046894693896938, "grad_norm": 15.336673736572266, "learning_rate": 7.63343291076539e-06, "loss": 0.1213, "step": 364500 }, { "epoch": 4.047064331770969, "grad_norm": 14.062495231628418, "learning_rate": 7.630040153284784e-06, "loss": 0.127, "step": 364600 }, { "epoch": 4.047233969644999, "grad_norm": 0.5270054340362549, "learning_rate": 7.626647395804177e-06, "loss": 0.119, "step": 364700 }, { "epoch": 4.047403607519029, "grad_norm": 11.093452453613281, "learning_rate": 7.623254638323572e-06, "loss": 0.118, "step": 364800 }, { "epoch": 4.0475732453930595, "grad_norm": 7.196959018707275, "learning_rate": 7.619861880842965e-06, "loss": 0.1375, "step": 364900 }, { "epoch": 4.04774288326709, "grad_norm": 10.369841575622559, "learning_rate": 7.616469123362359e-06, "loss": 0.1281, "step": 365000 }, { "epoch": 4.04791252114112, "grad_norm": 25.31781578063965, "learning_rate": 7.613076365881753e-06, "loss": 0.1081, "step": 365100 }, { "epoch": 4.04808215901515, "grad_norm": 6.905468940734863, "learning_rate": 7.609683608401147e-06, "loss": 0.1209, "step": 365200 }, { "epoch": 4.048251796889181, "grad_norm": 18.7513484954834, "learning_rate": 7.60629085092054e-06, "loss": 0.1327, "step": 365300 }, { "epoch": 4.048421434763211, "grad_norm": 9.800435066223145, "learning_rate": 7.602898093439934e-06, "loss": 0.1265, "step": 365400 }, { "epoch": 4.048591072637241, "grad_norm": 8.709671020507812, "learning_rate": 7.5995053359593286e-06, "loss": 0.1239, "step": 365500 }, { "epoch": 4.048760710511272, "grad_norm": 20.430940628051758, "learning_rate": 7.596112578478722e-06, "loss": 0.1179, "step": 365600 }, { "epoch": 4.048930348385302, "grad_norm": 7.849871635437012, "learning_rate": 7.592719820998115e-06, "loss": 0.1288, "step": 365700 }, { "epoch": 4.049099986259332, "grad_norm": 9.84711742401123, "learning_rate": 7.58932706351751e-06, "loss": 0.1337, "step": 365800 }, { "epoch": 4.0492696241333626, "grad_norm": 6.880946636199951, "learning_rate": 7.585934306036904e-06, "loss": 0.1259, "step": 365900 }, { "epoch": 4.049439262007393, "grad_norm": 5.211828708648682, "learning_rate": 7.582541548556297e-06, "loss": 0.1368, "step": 366000 }, { "epoch": 4.049608899881423, "grad_norm": 18.675809860229492, "learning_rate": 7.579148791075691e-06, "loss": 0.1186, "step": 366100 }, { "epoch": 4.049778537755453, "grad_norm": 5.621703624725342, "learning_rate": 7.575756033595085e-06, "loss": 0.1245, "step": 366200 }, { "epoch": 4.049948175629484, "grad_norm": 8.703566551208496, "learning_rate": 7.57236327611448e-06, "loss": 0.1274, "step": 366300 }, { "epoch": 4.050117813503514, "grad_norm": 5.8477349281311035, "learning_rate": 7.568970518633873e-06, "loss": 0.1316, "step": 366400 }, { "epoch": 4.050287451377544, "grad_norm": 11.259109497070312, "learning_rate": 7.5655777611532665e-06, "loss": 0.1115, "step": 366500 }, { "epoch": 4.050457089251575, "grad_norm": 7.147488117218018, "learning_rate": 7.56218500367266e-06, "loss": 0.1178, "step": 366600 }, { "epoch": 4.050626727125605, "grad_norm": 0.8318095803260803, "learning_rate": 7.558792246192055e-06, "loss": 0.1314, "step": 366700 }, { "epoch": 4.050796364999635, "grad_norm": 0.7817901968955994, "learning_rate": 7.555399488711448e-06, "loss": 0.1365, "step": 366800 }, { "epoch": 4.050966002873666, "grad_norm": 6.125122547149658, "learning_rate": 7.552006731230842e-06, "loss": 0.1132, "step": 366900 }, { "epoch": 4.051135640747696, "grad_norm": 30.93744468688965, "learning_rate": 7.548613973750236e-06, "loss": 0.1284, "step": 367000 }, { "epoch": 4.051305278621726, "grad_norm": 4.898610591888428, "learning_rate": 7.54522121626963e-06, "loss": 0.1209, "step": 367100 }, { "epoch": 4.051474916495756, "grad_norm": 14.491631507873535, "learning_rate": 7.541828458789023e-06, "loss": 0.1224, "step": 367200 }, { "epoch": 4.051644554369787, "grad_norm": 18.10207748413086, "learning_rate": 7.538435701308418e-06, "loss": 0.1375, "step": 367300 }, { "epoch": 4.051814192243818, "grad_norm": 21.75994873046875, "learning_rate": 7.5350429438278115e-06, "loss": 0.1331, "step": 367400 }, { "epoch": 4.051983830117847, "grad_norm": 2.1756489276885986, "learning_rate": 7.5316501863472045e-06, "loss": 0.122, "step": 367500 }, { "epoch": 4.052153467991878, "grad_norm": 2.862952709197998, "learning_rate": 7.528257428866599e-06, "loss": 0.1269, "step": 367600 }, { "epoch": 4.052323105865908, "grad_norm": 5.82924222946167, "learning_rate": 7.524864671385993e-06, "loss": 0.1403, "step": 367700 }, { "epoch": 4.052492743739938, "grad_norm": 12.353471755981445, "learning_rate": 7.521471913905387e-06, "loss": 0.1296, "step": 367800 }, { "epoch": 4.052662381613969, "grad_norm": 7.276542663574219, "learning_rate": 7.5180791564247805e-06, "loss": 0.1192, "step": 367900 }, { "epoch": 4.052832019487999, "grad_norm": 5.039012908935547, "learning_rate": 7.514686398944174e-06, "loss": 0.1176, "step": 368000 }, { "epoch": 4.053001657362029, "grad_norm": 13.093533515930176, "learning_rate": 7.511293641463568e-06, "loss": 0.1288, "step": 368100 }, { "epoch": 4.0531712952360595, "grad_norm": 9.817602157592773, "learning_rate": 7.507900883982963e-06, "loss": 0.1171, "step": 368200 }, { "epoch": 4.05334093311009, "grad_norm": 21.449562072753906, "learning_rate": 7.504508126502356e-06, "loss": 0.118, "step": 368300 }, { "epoch": 4.053510570984121, "grad_norm": 16.87347984313965, "learning_rate": 7.5011153690217495e-06, "loss": 0.1352, "step": 368400 }, { "epoch": 4.05368020885815, "grad_norm": 0.8061085939407349, "learning_rate": 7.497722611541144e-06, "loss": 0.1281, "step": 368500 }, { "epoch": 4.053849846732181, "grad_norm": 1.1158016920089722, "learning_rate": 7.494329854060538e-06, "loss": 0.1195, "step": 368600 }, { "epoch": 4.054019484606211, "grad_norm": 8.122390747070312, "learning_rate": 7.490937096579931e-06, "loss": 0.1334, "step": 368700 }, { "epoch": 4.054189122480241, "grad_norm": 26.828027725219727, "learning_rate": 7.4875443390993255e-06, "loss": 0.1277, "step": 368800 }, { "epoch": 4.054358760354272, "grad_norm": 20.987964630126953, "learning_rate": 7.484151581618719e-06, "loss": 0.1205, "step": 368900 }, { "epoch": 4.054528398228302, "grad_norm": 13.523033142089844, "learning_rate": 7.480758824138113e-06, "loss": 0.1469, "step": 369000 }, { "epoch": 4.054698036102332, "grad_norm": 6.376496315002441, "learning_rate": 7.477366066657507e-06, "loss": 0.1213, "step": 369100 }, { "epoch": 4.0548676739763625, "grad_norm": 7.582065582275391, "learning_rate": 7.473973309176901e-06, "loss": 0.1297, "step": 369200 }, { "epoch": 4.055037311850393, "grad_norm": 1.807626724243164, "learning_rate": 7.4705805516962945e-06, "loss": 0.117, "step": 369300 }, { "epoch": 4.055206949724424, "grad_norm": 32.22711944580078, "learning_rate": 7.4671877942156874e-06, "loss": 0.1226, "step": 369400 }, { "epoch": 4.055376587598453, "grad_norm": 0.9489873051643372, "learning_rate": 7.463795036735082e-06, "loss": 0.1476, "step": 369500 }, { "epoch": 4.055546225472484, "grad_norm": 9.578985214233398, "learning_rate": 7.460402279254476e-06, "loss": 0.1285, "step": 369600 }, { "epoch": 4.0557158633465145, "grad_norm": 12.119617462158203, "learning_rate": 7.4570095217738705e-06, "loss": 0.1295, "step": 369700 }, { "epoch": 4.055885501220544, "grad_norm": 11.76108455657959, "learning_rate": 7.4536167642932635e-06, "loss": 0.1396, "step": 369800 }, { "epoch": 4.056055139094575, "grad_norm": 4.761253833770752, "learning_rate": 7.450224006812657e-06, "loss": 0.1407, "step": 369900 }, { "epoch": 4.056224776968605, "grad_norm": 10.482038497924805, "learning_rate": 7.446831249332052e-06, "loss": 0.1147, "step": 370000 }, { "epoch": 4.056394414842636, "grad_norm": 15.542064666748047, "learning_rate": 7.443438491851446e-06, "loss": 0.1186, "step": 370100 }, { "epoch": 4.0565640527166655, "grad_norm": 0.8093328475952148, "learning_rate": 7.440045734370839e-06, "loss": 0.1177, "step": 370200 }, { "epoch": 4.056733690590696, "grad_norm": 2.1466662883758545, "learning_rate": 7.436652976890233e-06, "loss": 0.1512, "step": 370300 }, { "epoch": 4.056903328464727, "grad_norm": 18.87828826904297, "learning_rate": 7.433260219409627e-06, "loss": 0.125, "step": 370400 }, { "epoch": 4.057072966338756, "grad_norm": 5.879200458526611, "learning_rate": 7.429867461929021e-06, "loss": 0.1209, "step": 370500 }, { "epoch": 4.057242604212787, "grad_norm": 7.920973300933838, "learning_rate": 7.426474704448414e-06, "loss": 0.132, "step": 370600 }, { "epoch": 4.0574122420868175, "grad_norm": 17.495237350463867, "learning_rate": 7.4230819469678085e-06, "loss": 0.1172, "step": 370700 }, { "epoch": 4.057581879960847, "grad_norm": 20.217382431030273, "learning_rate": 7.419689189487202e-06, "loss": 0.1137, "step": 370800 }, { "epoch": 4.057751517834878, "grad_norm": 4.175044059753418, "learning_rate": 7.416296432006597e-06, "loss": 0.1309, "step": 370900 }, { "epoch": 4.057921155708908, "grad_norm": 3.648073434829712, "learning_rate": 7.41290367452599e-06, "loss": 0.119, "step": 371000 }, { "epoch": 4.058090793582939, "grad_norm": 7.77937650680542, "learning_rate": 7.409510917045384e-06, "loss": 0.1227, "step": 371100 }, { "epoch": 4.058260431456969, "grad_norm": 0.24404045939445496, "learning_rate": 7.406118159564778e-06, "loss": 0.1133, "step": 371200 }, { "epoch": 4.058430069330999, "grad_norm": 6.26303768157959, "learning_rate": 7.402725402084171e-06, "loss": 0.1238, "step": 371300 }, { "epoch": 4.05859970720503, "grad_norm": 6.627842426300049, "learning_rate": 7.399332644603565e-06, "loss": 0.1298, "step": 371400 }, { "epoch": 4.058769345079059, "grad_norm": 4.656120777130127, "learning_rate": 7.39593988712296e-06, "loss": 0.1289, "step": 371500 }, { "epoch": 4.05893898295309, "grad_norm": 11.548340797424316, "learning_rate": 7.3925471296423535e-06, "loss": 0.1311, "step": 371600 }, { "epoch": 4.0591086208271205, "grad_norm": 22.985538482666016, "learning_rate": 7.3891543721617464e-06, "loss": 0.1381, "step": 371700 }, { "epoch": 4.05927825870115, "grad_norm": 6.3693437576293945, "learning_rate": 7.38576161468114e-06, "loss": 0.1282, "step": 371800 }, { "epoch": 4.059447896575181, "grad_norm": 4.062643051147461, "learning_rate": 7.382368857200535e-06, "loss": 0.1119, "step": 371900 }, { "epoch": 4.059617534449211, "grad_norm": 18.065673828125, "learning_rate": 7.378976099719929e-06, "loss": 0.1166, "step": 372000 }, { "epoch": 4.059787172323242, "grad_norm": 19.877395629882812, "learning_rate": 7.375583342239322e-06, "loss": 0.1356, "step": 372100 }, { "epoch": 4.059956810197272, "grad_norm": 4.275839328765869, "learning_rate": 7.372190584758716e-06, "loss": 0.1263, "step": 372200 }, { "epoch": 4.060126448071302, "grad_norm": 12.25257682800293, "learning_rate": 7.36879782727811e-06, "loss": 0.1374, "step": 372300 }, { "epoch": 4.060296085945333, "grad_norm": 29.053752899169922, "learning_rate": 7.365405069797505e-06, "loss": 0.1074, "step": 372400 }, { "epoch": 4.060465723819362, "grad_norm": 7.9035749435424805, "learning_rate": 7.362012312316898e-06, "loss": 0.1241, "step": 372500 }, { "epoch": 4.060635361693393, "grad_norm": 7.221467971801758, "learning_rate": 7.3586195548362914e-06, "loss": 0.1234, "step": 372600 }, { "epoch": 4.060804999567424, "grad_norm": 2.1667354106903076, "learning_rate": 7.355226797355686e-06, "loss": 0.1192, "step": 372700 }, { "epoch": 4.060974637441454, "grad_norm": 11.128185272216797, "learning_rate": 7.35183403987508e-06, "loss": 0.1191, "step": 372800 }, { "epoch": 4.061144275315484, "grad_norm": 5.348910331726074, "learning_rate": 7.348441282394473e-06, "loss": 0.1235, "step": 372900 }, { "epoch": 4.061313913189514, "grad_norm": 26.764183044433594, "learning_rate": 7.345048524913867e-06, "loss": 0.1306, "step": 373000 }, { "epoch": 4.061483551063545, "grad_norm": 11.539952278137207, "learning_rate": 7.341655767433261e-06, "loss": 0.14, "step": 373100 }, { "epoch": 4.061653188937575, "grad_norm": 6.6829986572265625, "learning_rate": 7.338263009952654e-06, "loss": 0.121, "step": 373200 }, { "epoch": 4.061822826811605, "grad_norm": 6.53891658782959, "learning_rate": 7.334870252472048e-06, "loss": 0.1353, "step": 373300 }, { "epoch": 4.061992464685636, "grad_norm": 0.3320728838443756, "learning_rate": 7.331477494991443e-06, "loss": 0.1147, "step": 373400 }, { "epoch": 4.0621621025596655, "grad_norm": 23.63065528869629, "learning_rate": 7.3280847375108365e-06, "loss": 0.129, "step": 373500 }, { "epoch": 4.062331740433696, "grad_norm": 11.268319129943848, "learning_rate": 7.324691980030229e-06, "loss": 0.1104, "step": 373600 }, { "epoch": 4.062501378307727, "grad_norm": 11.77419662475586, "learning_rate": 7.321299222549624e-06, "loss": 0.1295, "step": 373700 }, { "epoch": 4.062671016181757, "grad_norm": 28.353261947631836, "learning_rate": 7.317906465069018e-06, "loss": 0.132, "step": 373800 }, { "epoch": 4.062840654055787, "grad_norm": 31.32401466369629, "learning_rate": 7.3145137075884125e-06, "loss": 0.1189, "step": 373900 }, { "epoch": 4.063010291929817, "grad_norm": 28.497865676879883, "learning_rate": 7.3111209501078054e-06, "loss": 0.1145, "step": 374000 }, { "epoch": 4.063179929803848, "grad_norm": 14.035061836242676, "learning_rate": 7.307728192627199e-06, "loss": 0.1298, "step": 374100 }, { "epoch": 4.063349567677878, "grad_norm": 0.6585734486579895, "learning_rate": 7.304335435146593e-06, "loss": 0.1369, "step": 374200 }, { "epoch": 4.063519205551908, "grad_norm": 15.417983055114746, "learning_rate": 7.300942677665988e-06, "loss": 0.1077, "step": 374300 }, { "epoch": 4.063688843425939, "grad_norm": 1.052061676979065, "learning_rate": 7.297549920185381e-06, "loss": 0.121, "step": 374400 }, { "epoch": 4.0638584812999685, "grad_norm": 1.2836958169937134, "learning_rate": 7.294157162704774e-06, "loss": 0.1193, "step": 374500 }, { "epoch": 4.064028119173999, "grad_norm": 13.582634925842285, "learning_rate": 7.290764405224169e-06, "loss": 0.1237, "step": 374600 }, { "epoch": 4.06419775704803, "grad_norm": 40.146202087402344, "learning_rate": 7.287371647743562e-06, "loss": 0.1057, "step": 374700 }, { "epoch": 4.06436739492206, "grad_norm": 6.798552513122559, "learning_rate": 7.283978890262956e-06, "loss": 0.1283, "step": 374800 }, { "epoch": 4.06453703279609, "grad_norm": 6.005340576171875, "learning_rate": 7.2805861327823504e-06, "loss": 0.119, "step": 374900 }, { "epoch": 4.0647066706701205, "grad_norm": 3.195955991744995, "learning_rate": 7.277193375301744e-06, "loss": 0.1388, "step": 375000 }, { "epoch": 4.064876308544151, "grad_norm": 9.446000099182129, "learning_rate": 7.273800617821137e-06, "loss": 0.1087, "step": 375100 }, { "epoch": 4.065045946418181, "grad_norm": 3.6266846656799316, "learning_rate": 7.270407860340532e-06, "loss": 0.1297, "step": 375200 }, { "epoch": 4.065215584292211, "grad_norm": 29.350561141967773, "learning_rate": 7.267015102859926e-06, "loss": 0.1226, "step": 375300 }, { "epoch": 4.065385222166242, "grad_norm": 2.543923854827881, "learning_rate": 7.263622345379319e-06, "loss": 0.1313, "step": 375400 }, { "epoch": 4.0655548600402724, "grad_norm": 13.054374694824219, "learning_rate": 7.260229587898713e-06, "loss": 0.1179, "step": 375500 }, { "epoch": 4.065724497914302, "grad_norm": 32.902191162109375, "learning_rate": 7.256836830418107e-06, "loss": 0.1126, "step": 375600 }, { "epoch": 4.065894135788333, "grad_norm": 8.577073097229004, "learning_rate": 7.253444072937501e-06, "loss": 0.1229, "step": 375700 }, { "epoch": 4.066063773662363, "grad_norm": 19.73938751220703, "learning_rate": 7.2500513154568954e-06, "loss": 0.127, "step": 375800 }, { "epoch": 4.066233411536393, "grad_norm": 7.374815940856934, "learning_rate": 7.246658557976288e-06, "loss": 0.1371, "step": 375900 }, { "epoch": 4.0664030494104235, "grad_norm": 59.37343215942383, "learning_rate": 7.243265800495682e-06, "loss": 0.141, "step": 376000 }, { "epoch": 4.066572687284454, "grad_norm": 13.792487144470215, "learning_rate": 7.239873043015077e-06, "loss": 0.1127, "step": 376100 }, { "epoch": 4.066742325158484, "grad_norm": 5.568021297454834, "learning_rate": 7.236480285534471e-06, "loss": 0.1233, "step": 376200 }, { "epoch": 4.066911963032514, "grad_norm": 17.204553604125977, "learning_rate": 7.233087528053864e-06, "loss": 0.1312, "step": 376300 }, { "epoch": 4.067081600906545, "grad_norm": 21.083192825317383, "learning_rate": 7.229694770573258e-06, "loss": 0.1261, "step": 376400 }, { "epoch": 4.0672512387805755, "grad_norm": 8.29477310180664, "learning_rate": 7.226302013092652e-06, "loss": 0.1258, "step": 376500 }, { "epoch": 4.067420876654605, "grad_norm": 2.490724802017212, "learning_rate": 7.222909255612045e-06, "loss": 0.1356, "step": 376600 }, { "epoch": 4.067590514528636, "grad_norm": 2.228787422180176, "learning_rate": 7.219516498131439e-06, "loss": 0.1201, "step": 376700 }, { "epoch": 4.067760152402666, "grad_norm": 13.581604957580566, "learning_rate": 7.216123740650833e-06, "loss": 0.1477, "step": 376800 }, { "epoch": 4.067929790276696, "grad_norm": 4.180962562561035, "learning_rate": 7.212730983170227e-06, "loss": 0.1116, "step": 376900 }, { "epoch": 4.068099428150727, "grad_norm": 5.852220058441162, "learning_rate": 7.20933822568962e-06, "loss": 0.1356, "step": 377000 }, { "epoch": 4.068269066024757, "grad_norm": 12.019867897033691, "learning_rate": 7.205945468209015e-06, "loss": 0.1217, "step": 377100 }, { "epoch": 4.068438703898787, "grad_norm": 6.804008483886719, "learning_rate": 7.202552710728409e-06, "loss": 0.1363, "step": 377200 }, { "epoch": 4.068608341772817, "grad_norm": 15.696606636047363, "learning_rate": 7.199159953247803e-06, "loss": 0.1311, "step": 377300 }, { "epoch": 4.068777979646848, "grad_norm": 8.556803703308105, "learning_rate": 7.195767195767196e-06, "loss": 0.1286, "step": 377400 }, { "epoch": 4.0689476175208785, "grad_norm": 9.361321449279785, "learning_rate": 7.19237443828659e-06, "loss": 0.1392, "step": 377500 }, { "epoch": 4.069117255394908, "grad_norm": 5.05167818069458, "learning_rate": 7.188981680805985e-06, "loss": 0.117, "step": 377600 }, { "epoch": 4.069286893268939, "grad_norm": 22.954055786132812, "learning_rate": 7.185588923325378e-06, "loss": 0.1316, "step": 377700 }, { "epoch": 4.069456531142969, "grad_norm": 9.200949668884277, "learning_rate": 7.182196165844771e-06, "loss": 0.1287, "step": 377800 }, { "epoch": 4.069626169016999, "grad_norm": 24.8885440826416, "learning_rate": 7.178803408364165e-06, "loss": 0.1191, "step": 377900 }, { "epoch": 4.06979580689103, "grad_norm": 2.325914144515991, "learning_rate": 7.17541065088356e-06, "loss": 0.1285, "step": 378000 }, { "epoch": 4.06996544476506, "grad_norm": 10.984456062316895, "learning_rate": 7.172017893402954e-06, "loss": 0.1319, "step": 378100 }, { "epoch": 4.070135082639091, "grad_norm": 5.936614513397217, "learning_rate": 7.1686251359223465e-06, "loss": 0.1178, "step": 378200 }, { "epoch": 4.07030472051312, "grad_norm": 14.728145599365234, "learning_rate": 7.165232378441741e-06, "loss": 0.1313, "step": 378300 }, { "epoch": 4.070474358387151, "grad_norm": 19.414968490600586, "learning_rate": 7.161839620961135e-06, "loss": 0.1195, "step": 378400 }, { "epoch": 4.070643996261182, "grad_norm": 0.5971322059631348, "learning_rate": 7.158446863480528e-06, "loss": 0.1087, "step": 378500 }, { "epoch": 4.070813634135211, "grad_norm": 10.073734283447266, "learning_rate": 7.1550541059999226e-06, "loss": 0.1174, "step": 378600 }, { "epoch": 4.070983272009242, "grad_norm": 16.03798484802246, "learning_rate": 7.151661348519316e-06, "loss": 0.1181, "step": 378700 }, { "epoch": 4.071152909883272, "grad_norm": 8.78952693939209, "learning_rate": 7.148268591038711e-06, "loss": 0.1373, "step": 378800 }, { "epoch": 4.071322547757302, "grad_norm": 16.47785758972168, "learning_rate": 7.144875833558104e-06, "loss": 0.1242, "step": 378900 }, { "epoch": 4.071492185631333, "grad_norm": 1.914650321006775, "learning_rate": 7.141483076077498e-06, "loss": 0.1172, "step": 379000 }, { "epoch": 4.071661823505363, "grad_norm": 4.692215442657471, "learning_rate": 7.1380903185968915e-06, "loss": 0.1253, "step": 379100 }, { "epoch": 4.071831461379394, "grad_norm": 2.8297646045684814, "learning_rate": 7.134697561116286e-06, "loss": 0.1251, "step": 379200 }, { "epoch": 4.0720010992534235, "grad_norm": 19.418249130249023, "learning_rate": 7.131304803635679e-06, "loss": 0.1272, "step": 379300 }, { "epoch": 4.072170737127454, "grad_norm": 15.396575927734375, "learning_rate": 7.127912046155073e-06, "loss": 0.1284, "step": 379400 }, { "epoch": 4.072340375001485, "grad_norm": 6.253955841064453, "learning_rate": 7.124519288674468e-06, "loss": 0.1182, "step": 379500 }, { "epoch": 4.072510012875514, "grad_norm": 0.719024658203125, "learning_rate": 7.121126531193861e-06, "loss": 0.1289, "step": 379600 }, { "epoch": 4.072679650749545, "grad_norm": 27.748533248901367, "learning_rate": 7.117733773713254e-06, "loss": 0.1209, "step": 379700 }, { "epoch": 4.072849288623575, "grad_norm": 1.720976710319519, "learning_rate": 7.114341016232649e-06, "loss": 0.1021, "step": 379800 }, { "epoch": 4.073018926497606, "grad_norm": 10.143156051635742, "learning_rate": 7.110948258752043e-06, "loss": 0.1209, "step": 379900 }, { "epoch": 4.073188564371636, "grad_norm": 22.51791000366211, "learning_rate": 7.107555501271436e-06, "loss": 0.1239, "step": 380000 }, { "epoch": 4.073358202245666, "grad_norm": 22.537511825561523, "learning_rate": 7.10416274379083e-06, "loss": 0.1268, "step": 380100 }, { "epoch": 4.073527840119697, "grad_norm": 15.88259506225586, "learning_rate": 7.100769986310224e-06, "loss": 0.1423, "step": 380200 }, { "epoch": 4.0736974779937265, "grad_norm": 5.585384368896484, "learning_rate": 7.097377228829618e-06, "loss": 0.1231, "step": 380300 }, { "epoch": 4.073867115867757, "grad_norm": 24.152284622192383, "learning_rate": 7.093984471349012e-06, "loss": 0.1291, "step": 380400 }, { "epoch": 4.074036753741788, "grad_norm": 20.948566436767578, "learning_rate": 7.0905917138684055e-06, "loss": 0.1265, "step": 380500 }, { "epoch": 4.074206391615817, "grad_norm": 2.920719623565674, "learning_rate": 7.087198956387799e-06, "loss": 0.1274, "step": 380600 }, { "epoch": 4.074376029489848, "grad_norm": 12.002219200134277, "learning_rate": 7.083806198907194e-06, "loss": 0.1112, "step": 380700 }, { "epoch": 4.0745456673638785, "grad_norm": 14.208619117736816, "learning_rate": 7.080413441426587e-06, "loss": 0.1344, "step": 380800 }, { "epoch": 4.074715305237909, "grad_norm": 2.0338268280029297, "learning_rate": 7.077020683945981e-06, "loss": 0.1227, "step": 380900 }, { "epoch": 4.074884943111939, "grad_norm": 23.2507266998291, "learning_rate": 7.073627926465375e-06, "loss": 0.1246, "step": 381000 }, { "epoch": 4.075054580985969, "grad_norm": 19.14887237548828, "learning_rate": 7.070235168984769e-06, "loss": 0.1283, "step": 381100 }, { "epoch": 4.07522421886, "grad_norm": 1.838423728942871, "learning_rate": 7.066842411504162e-06, "loss": 0.1183, "step": 381200 }, { "epoch": 4.0753938567340295, "grad_norm": 27.258195877075195, "learning_rate": 7.063449654023557e-06, "loss": 0.1279, "step": 381300 }, { "epoch": 4.07556349460806, "grad_norm": 15.230782508850098, "learning_rate": 7.0600568965429505e-06, "loss": 0.1255, "step": 381400 }, { "epoch": 4.075733132482091, "grad_norm": 31.025846481323242, "learning_rate": 7.056664139062344e-06, "loss": 0.1146, "step": 381500 }, { "epoch": 4.07590277035612, "grad_norm": 25.809707641601562, "learning_rate": 7.053271381581738e-06, "loss": 0.1214, "step": 381600 }, { "epoch": 4.076072408230151, "grad_norm": 8.543561935424805, "learning_rate": 7.049878624101132e-06, "loss": 0.127, "step": 381700 }, { "epoch": 4.0762420461041815, "grad_norm": 24.935762405395508, "learning_rate": 7.046485866620526e-06, "loss": 0.1102, "step": 381800 }, { "epoch": 4.076411683978212, "grad_norm": 3.1377768516540527, "learning_rate": 7.043093109139919e-06, "loss": 0.1297, "step": 381900 }, { "epoch": 4.076581321852242, "grad_norm": 7.1335625648498535, "learning_rate": 7.039700351659313e-06, "loss": 0.1243, "step": 382000 }, { "epoch": 4.076750959726272, "grad_norm": 6.5739426612854, "learning_rate": 7.036307594178707e-06, "loss": 0.1289, "step": 382100 }, { "epoch": 4.076920597600303, "grad_norm": 16.078632354736328, "learning_rate": 7.032914836698102e-06, "loss": 0.1182, "step": 382200 }, { "epoch": 4.077090235474333, "grad_norm": 14.078588485717773, "learning_rate": 7.029522079217495e-06, "loss": 0.1462, "step": 382300 }, { "epoch": 4.077259873348363, "grad_norm": 18.637439727783203, "learning_rate": 7.0261293217368885e-06, "loss": 0.1207, "step": 382400 }, { "epoch": 4.077429511222394, "grad_norm": 16.114866256713867, "learning_rate": 7.022736564256283e-06, "loss": 0.1386, "step": 382500 }, { "epoch": 4.077599149096424, "grad_norm": 0.5848320722579956, "learning_rate": 7.019343806775677e-06, "loss": 0.1185, "step": 382600 }, { "epoch": 4.077768786970454, "grad_norm": 6.044220924377441, "learning_rate": 7.01595104929507e-06, "loss": 0.1191, "step": 382700 }, { "epoch": 4.0779384248444845, "grad_norm": 0.6891208291053772, "learning_rate": 7.0125582918144645e-06, "loss": 0.1234, "step": 382800 }, { "epoch": 4.078108062718515, "grad_norm": 40.168846130371094, "learning_rate": 7.009165534333858e-06, "loss": 0.1247, "step": 382900 }, { "epoch": 4.078277700592545, "grad_norm": 8.6153564453125, "learning_rate": 7.005772776853252e-06, "loss": 0.1025, "step": 383000 }, { "epoch": 4.078447338466575, "grad_norm": 12.969165802001953, "learning_rate": 7.002380019372645e-06, "loss": 0.1273, "step": 383100 }, { "epoch": 4.078616976340606, "grad_norm": 37.95918273925781, "learning_rate": 6.99898726189204e-06, "loss": 0.1268, "step": 383200 }, { "epoch": 4.078786614214636, "grad_norm": 9.27798843383789, "learning_rate": 6.9955945044114335e-06, "loss": 0.1257, "step": 383300 }, { "epoch": 4.078956252088666, "grad_norm": 55.505218505859375, "learning_rate": 6.992201746930828e-06, "loss": 0.1227, "step": 383400 }, { "epoch": 4.079125889962697, "grad_norm": 2.05892014503479, "learning_rate": 6.988808989450221e-06, "loss": 0.127, "step": 383500 }, { "epoch": 4.079295527836727, "grad_norm": 1.3264784812927246, "learning_rate": 6.985416231969615e-06, "loss": 0.1193, "step": 383600 }, { "epoch": 4.079465165710757, "grad_norm": 7.870678901672363, "learning_rate": 6.9820234744890095e-06, "loss": 0.1082, "step": 383700 }, { "epoch": 4.079634803584788, "grad_norm": 5.4343132972717285, "learning_rate": 6.9786307170084025e-06, "loss": 0.1363, "step": 383800 }, { "epoch": 4.079804441458818, "grad_norm": 2.571004629135132, "learning_rate": 6.975237959527796e-06, "loss": 0.1267, "step": 383900 }, { "epoch": 4.079974079332848, "grad_norm": 11.00088882446289, "learning_rate": 6.971845202047191e-06, "loss": 0.1068, "step": 384000 }, { "epoch": 4.080143717206878, "grad_norm": 7.789837837219238, "learning_rate": 6.968452444566585e-06, "loss": 0.1305, "step": 384100 }, { "epoch": 4.080313355080909, "grad_norm": 28.846837997436523, "learning_rate": 6.965059687085978e-06, "loss": 0.1246, "step": 384200 }, { "epoch": 4.080482992954939, "grad_norm": 4.640471458435059, "learning_rate": 6.9616669296053715e-06, "loss": 0.118, "step": 384300 }, { "epoch": 4.080652630828969, "grad_norm": 2.3916268348693848, "learning_rate": 6.958274172124766e-06, "loss": 0.1138, "step": 384400 }, { "epoch": 4.080822268703, "grad_norm": 3.5489296913146973, "learning_rate": 6.95488141464416e-06, "loss": 0.1311, "step": 384500 }, { "epoch": 4.08099190657703, "grad_norm": 8.473621368408203, "learning_rate": 6.951488657163553e-06, "loss": 0.1087, "step": 384600 }, { "epoch": 4.08116154445106, "grad_norm": 6.622286319732666, "learning_rate": 6.9480958996829475e-06, "loss": 0.1291, "step": 384700 }, { "epoch": 4.081331182325091, "grad_norm": 12.822169303894043, "learning_rate": 6.944703142202341e-06, "loss": 0.1239, "step": 384800 }, { "epoch": 4.081500820199121, "grad_norm": 1.7283954620361328, "learning_rate": 6.941310384721736e-06, "loss": 0.109, "step": 384900 }, { "epoch": 4.081670458073151, "grad_norm": 25.066255569458008, "learning_rate": 6.937917627241129e-06, "loss": 0.1219, "step": 385000 }, { "epoch": 4.0818400959471814, "grad_norm": 1.7801122665405273, "learning_rate": 6.934524869760523e-06, "loss": 0.1074, "step": 385100 }, { "epoch": 4.082009733821212, "grad_norm": 24.907737731933594, "learning_rate": 6.9311321122799165e-06, "loss": 0.1237, "step": 385200 }, { "epoch": 4.082179371695243, "grad_norm": 13.930689811706543, "learning_rate": 6.927739354799311e-06, "loss": 0.1223, "step": 385300 }, { "epoch": 4.082349009569272, "grad_norm": 17.831302642822266, "learning_rate": 6.924346597318704e-06, "loss": 0.1053, "step": 385400 }, { "epoch": 4.082518647443303, "grad_norm": 36.64804458618164, "learning_rate": 6.920953839838098e-06, "loss": 0.1421, "step": 385500 }, { "epoch": 4.082688285317333, "grad_norm": 13.213944435119629, "learning_rate": 6.9175610823574925e-06, "loss": 0.1263, "step": 385600 }, { "epoch": 4.082857923191363, "grad_norm": 0.8227165937423706, "learning_rate": 6.9141683248768854e-06, "loss": 0.1068, "step": 385700 }, { "epoch": 4.083027561065394, "grad_norm": 15.745683670043945, "learning_rate": 6.910775567396279e-06, "loss": 0.1336, "step": 385800 }, { "epoch": 4.083197198939424, "grad_norm": 10.015626907348633, "learning_rate": 6.907382809915674e-06, "loss": 0.1152, "step": 385900 }, { "epoch": 4.083366836813454, "grad_norm": 2.138612985610962, "learning_rate": 6.903990052435068e-06, "loss": 0.1314, "step": 386000 }, { "epoch": 4.0835364746874845, "grad_norm": 12.372211456298828, "learning_rate": 6.900597294954461e-06, "loss": 0.1143, "step": 386100 }, { "epoch": 4.083706112561515, "grad_norm": 4.047888278961182, "learning_rate": 6.897204537473855e-06, "loss": 0.1151, "step": 386200 }, { "epoch": 4.083875750435546, "grad_norm": 14.514519691467285, "learning_rate": 6.893811779993249e-06, "loss": 0.1329, "step": 386300 }, { "epoch": 4.084045388309575, "grad_norm": 23.906997680664062, "learning_rate": 6.890419022512643e-06, "loss": 0.1246, "step": 386400 }, { "epoch": 4.084215026183606, "grad_norm": 4.800948143005371, "learning_rate": 6.887026265032037e-06, "loss": 0.1277, "step": 386500 }, { "epoch": 4.0843846640576364, "grad_norm": 5.343560695648193, "learning_rate": 6.8836335075514305e-06, "loss": 0.1184, "step": 386600 }, { "epoch": 4.084554301931666, "grad_norm": 7.419631481170654, "learning_rate": 6.880240750070824e-06, "loss": 0.1181, "step": 386700 }, { "epoch": 4.084723939805697, "grad_norm": 5.970947265625, "learning_rate": 6.876847992590219e-06, "loss": 0.1007, "step": 386800 }, { "epoch": 4.084893577679727, "grad_norm": 6.9458160400390625, "learning_rate": 6.873455235109612e-06, "loss": 0.1147, "step": 386900 }, { "epoch": 4.085063215553758, "grad_norm": 8.499213218688965, "learning_rate": 6.870062477629006e-06, "loss": 0.1316, "step": 387000 }, { "epoch": 4.0852328534277875, "grad_norm": 14.02662181854248, "learning_rate": 6.8666697201484e-06, "loss": 0.1188, "step": 387100 }, { "epoch": 4.085402491301818, "grad_norm": 11.077070236206055, "learning_rate": 6.863276962667793e-06, "loss": 0.1246, "step": 387200 }, { "epoch": 4.085572129175849, "grad_norm": 7.514576435089111, "learning_rate": 6.859884205187187e-06, "loss": 0.124, "step": 387300 }, { "epoch": 4.085741767049878, "grad_norm": 4.242919921875, "learning_rate": 6.856491447706582e-06, "loss": 0.1411, "step": 387400 }, { "epoch": 4.085911404923909, "grad_norm": 3.11575984954834, "learning_rate": 6.8530986902259755e-06, "loss": 0.1215, "step": 387500 }, { "epoch": 4.0860810427979395, "grad_norm": 11.378223419189453, "learning_rate": 6.849705932745368e-06, "loss": 0.1275, "step": 387600 }, { "epoch": 4.086250680671969, "grad_norm": 15.134997367858887, "learning_rate": 6.846313175264763e-06, "loss": 0.124, "step": 387700 }, { "epoch": 4.086420318546, "grad_norm": 14.206661224365234, "learning_rate": 6.842920417784157e-06, "loss": 0.1134, "step": 387800 }, { "epoch": 4.08658995642003, "grad_norm": 17.224123001098633, "learning_rate": 6.839527660303551e-06, "loss": 0.1087, "step": 387900 }, { "epoch": 4.086759594294061, "grad_norm": 11.5643310546875, "learning_rate": 6.836134902822944e-06, "loss": 0.1161, "step": 388000 }, { "epoch": 4.086929232168091, "grad_norm": 13.77760124206543, "learning_rate": 6.832742145342338e-06, "loss": 0.1322, "step": 388100 }, { "epoch": 4.087098870042121, "grad_norm": 14.917258262634277, "learning_rate": 6.829349387861732e-06, "loss": 0.1061, "step": 388200 }, { "epoch": 4.087268507916152, "grad_norm": 15.576033592224121, "learning_rate": 6.825956630381127e-06, "loss": 0.1279, "step": 388300 }, { "epoch": 4.087438145790181, "grad_norm": 14.606051445007324, "learning_rate": 6.82256387290052e-06, "loss": 0.1207, "step": 388400 }, { "epoch": 4.087607783664212, "grad_norm": 60.014122009277344, "learning_rate": 6.819171115419913e-06, "loss": 0.1278, "step": 388500 }, { "epoch": 4.0877774215382425, "grad_norm": 15.271240234375, "learning_rate": 6.815778357939308e-06, "loss": 0.1282, "step": 388600 }, { "epoch": 4.087947059412272, "grad_norm": 0.46856725215911865, "learning_rate": 6.812385600458702e-06, "loss": 0.1174, "step": 388700 }, { "epoch": 4.088116697286303, "grad_norm": 20.379669189453125, "learning_rate": 6.808992842978095e-06, "loss": 0.1347, "step": 388800 }, { "epoch": 4.088286335160333, "grad_norm": 3.873016357421875, "learning_rate": 6.8056000854974894e-06, "loss": 0.1152, "step": 388900 }, { "epoch": 4.088455973034364, "grad_norm": 3.953172206878662, "learning_rate": 6.802207328016883e-06, "loss": 0.1468, "step": 389000 }, { "epoch": 4.088625610908394, "grad_norm": 13.85262393951416, "learning_rate": 6.798814570536276e-06, "loss": 0.1195, "step": 389100 }, { "epoch": 4.088795248782424, "grad_norm": 1.1751266717910767, "learning_rate": 6.79542181305567e-06, "loss": 0.1196, "step": 389200 }, { "epoch": 4.088964886656455, "grad_norm": 14.297279357910156, "learning_rate": 6.792029055575065e-06, "loss": 0.1226, "step": 389300 }, { "epoch": 4.089134524530484, "grad_norm": 17.842914581298828, "learning_rate": 6.7886362980944584e-06, "loss": 0.1179, "step": 389400 }, { "epoch": 4.089304162404515, "grad_norm": 6.045706272125244, "learning_rate": 6.785243540613851e-06, "loss": 0.1403, "step": 389500 }, { "epoch": 4.089473800278546, "grad_norm": 2.445030450820923, "learning_rate": 6.781850783133246e-06, "loss": 0.117, "step": 389600 }, { "epoch": 4.089643438152576, "grad_norm": 0.8255632519721985, "learning_rate": 6.77845802565264e-06, "loss": 0.1224, "step": 389700 }, { "epoch": 4.089813076026606, "grad_norm": 16.856687545776367, "learning_rate": 6.7750652681720345e-06, "loss": 0.1198, "step": 389800 }, { "epoch": 4.089982713900636, "grad_norm": 2.7511260509490967, "learning_rate": 6.771672510691427e-06, "loss": 0.1081, "step": 389900 }, { "epoch": 4.090152351774667, "grad_norm": 6.402744770050049, "learning_rate": 6.768279753210821e-06, "loss": 0.1276, "step": 390000 }, { "epoch": 4.090321989648697, "grad_norm": 9.632039070129395, "learning_rate": 6.764886995730216e-06, "loss": 0.1305, "step": 390100 }, { "epoch": 4.090491627522727, "grad_norm": 28.992027282714844, "learning_rate": 6.76149423824961e-06, "loss": 0.1296, "step": 390200 }, { "epoch": 4.090661265396758, "grad_norm": 8.775055885314941, "learning_rate": 6.758101480769003e-06, "loss": 0.1219, "step": 390300 }, { "epoch": 4.0908309032707875, "grad_norm": 8.507307052612305, "learning_rate": 6.754708723288396e-06, "loss": 0.134, "step": 390400 }, { "epoch": 4.091000541144818, "grad_norm": 3.8074724674224854, "learning_rate": 6.751315965807791e-06, "loss": 0.1306, "step": 390500 }, { "epoch": 4.091170179018849, "grad_norm": 14.448866844177246, "learning_rate": 6.747923208327185e-06, "loss": 0.1078, "step": 390600 }, { "epoch": 4.091339816892879, "grad_norm": 12.395476341247559, "learning_rate": 6.744530450846578e-06, "loss": 0.1187, "step": 390700 }, { "epoch": 4.091509454766909, "grad_norm": 16.313297271728516, "learning_rate": 6.741137693365972e-06, "loss": 0.1178, "step": 390800 }, { "epoch": 4.091679092640939, "grad_norm": 14.028096199035645, "learning_rate": 6.737744935885366e-06, "loss": 0.1363, "step": 390900 }, { "epoch": 4.09184873051497, "grad_norm": 13.458547592163086, "learning_rate": 6.734352178404759e-06, "loss": 0.1331, "step": 391000 }, { "epoch": 4.092018368389, "grad_norm": 8.461715698242188, "learning_rate": 6.730959420924154e-06, "loss": 0.1483, "step": 391100 }, { "epoch": 4.09218800626303, "grad_norm": 7.109163284301758, "learning_rate": 6.727566663443548e-06, "loss": 0.129, "step": 391200 }, { "epoch": 4.092357644137061, "grad_norm": 2.1214890480041504, "learning_rate": 6.724173905962942e-06, "loss": 0.1431, "step": 391300 }, { "epoch": 4.0925272820110905, "grad_norm": 7.462599754333496, "learning_rate": 6.720781148482335e-06, "loss": 0.1211, "step": 391400 }, { "epoch": 4.092696919885121, "grad_norm": 6.319705009460449, "learning_rate": 6.717388391001729e-06, "loss": 0.1291, "step": 391500 }, { "epoch": 4.092866557759152, "grad_norm": 7.526228904724121, "learning_rate": 6.713995633521123e-06, "loss": 0.1098, "step": 391600 }, { "epoch": 4.093036195633182, "grad_norm": 11.044734001159668, "learning_rate": 6.710602876040517e-06, "loss": 0.1156, "step": 391700 }, { "epoch": 4.093205833507212, "grad_norm": 0.8589796423912048, "learning_rate": 6.70721011855991e-06, "loss": 0.1146, "step": 391800 }, { "epoch": 4.0933754713812425, "grad_norm": 8.938477516174316, "learning_rate": 6.703817361079304e-06, "loss": 0.118, "step": 391900 }, { "epoch": 4.093545109255273, "grad_norm": 12.657191276550293, "learning_rate": 6.700424603598699e-06, "loss": 0.1236, "step": 392000 }, { "epoch": 4.093714747129303, "grad_norm": 12.623083114624023, "learning_rate": 6.697031846118093e-06, "loss": 0.096, "step": 392100 }, { "epoch": 4.093884385003333, "grad_norm": 4.069339275360107, "learning_rate": 6.6936390886374855e-06, "loss": 0.135, "step": 392200 }, { "epoch": 4.094054022877364, "grad_norm": 11.837642669677734, "learning_rate": 6.69024633115688e-06, "loss": 0.1353, "step": 392300 }, { "epoch": 4.094223660751394, "grad_norm": 6.240734577178955, "learning_rate": 6.686853573676274e-06, "loss": 0.1238, "step": 392400 }, { "epoch": 4.094393298625424, "grad_norm": 10.832537651062012, "learning_rate": 6.683460816195667e-06, "loss": 0.1197, "step": 392500 }, { "epoch": 4.094562936499455, "grad_norm": 21.929208755493164, "learning_rate": 6.680068058715062e-06, "loss": 0.1264, "step": 392600 }, { "epoch": 4.094732574373485, "grad_norm": 1.7310981750488281, "learning_rate": 6.676675301234455e-06, "loss": 0.1201, "step": 392700 }, { "epoch": 4.094902212247515, "grad_norm": 2.2251532077789307, "learning_rate": 6.673282543753849e-06, "loss": 0.1226, "step": 392800 }, { "epoch": 4.0950718501215455, "grad_norm": 7.891847610473633, "learning_rate": 6.669889786273243e-06, "loss": 0.1214, "step": 392900 }, { "epoch": 4.095241487995576, "grad_norm": 13.689579963684082, "learning_rate": 6.666497028792637e-06, "loss": 0.1274, "step": 393000 }, { "epoch": 4.095411125869606, "grad_norm": 13.683553695678711, "learning_rate": 6.6631042713120306e-06, "loss": 0.1233, "step": 393100 }, { "epoch": 4.095580763743636, "grad_norm": 23.275352478027344, "learning_rate": 6.659711513831425e-06, "loss": 0.1285, "step": 393200 }, { "epoch": 4.095750401617667, "grad_norm": 21.27773666381836, "learning_rate": 6.656318756350818e-06, "loss": 0.1175, "step": 393300 }, { "epoch": 4.0959200394916975, "grad_norm": 9.449995040893555, "learning_rate": 6.652925998870212e-06, "loss": 0.127, "step": 393400 }, { "epoch": 4.096089677365727, "grad_norm": 4.002708911895752, "learning_rate": 6.649533241389607e-06, "loss": 0.1251, "step": 393500 }, { "epoch": 4.096259315239758, "grad_norm": 12.354340553283691, "learning_rate": 6.646140483909e-06, "loss": 0.1266, "step": 393600 }, { "epoch": 4.096428953113788, "grad_norm": 3.9678099155426025, "learning_rate": 6.642747726428393e-06, "loss": 0.1272, "step": 393700 }, { "epoch": 4.096598590987818, "grad_norm": 9.143670082092285, "learning_rate": 6.639354968947788e-06, "loss": 0.1202, "step": 393800 }, { "epoch": 4.0967682288618485, "grad_norm": 10.903497695922852, "learning_rate": 6.635962211467182e-06, "loss": 0.1341, "step": 393900 }, { "epoch": 4.096937866735879, "grad_norm": 11.499448776245117, "learning_rate": 6.6325694539865756e-06, "loss": 0.1288, "step": 394000 }, { "epoch": 4.097107504609909, "grad_norm": 2.5792582035064697, "learning_rate": 6.629176696505969e-06, "loss": 0.1257, "step": 394100 }, { "epoch": 4.097277142483939, "grad_norm": 2.4846622943878174, "learning_rate": 6.625783939025363e-06, "loss": 0.1117, "step": 394200 }, { "epoch": 4.09744678035797, "grad_norm": 19.608179092407227, "learning_rate": 6.622391181544757e-06, "loss": 0.1185, "step": 394300 }, { "epoch": 4.0976164182320005, "grad_norm": 16.667945861816406, "learning_rate": 6.61899842406415e-06, "loss": 0.1236, "step": 394400 }, { "epoch": 4.09778605610603, "grad_norm": 8.160429000854492, "learning_rate": 6.6156056665835445e-06, "loss": 0.1275, "step": 394500 }, { "epoch": 4.097955693980061, "grad_norm": 3.2135274410247803, "learning_rate": 6.612212909102938e-06, "loss": 0.1214, "step": 394600 }, { "epoch": 4.098125331854091, "grad_norm": 11.069377899169922, "learning_rate": 6.608820151622333e-06, "loss": 0.1278, "step": 394700 }, { "epoch": 4.098294969728121, "grad_norm": 6.828614711761475, "learning_rate": 6.605427394141726e-06, "loss": 0.1221, "step": 394800 }, { "epoch": 4.098464607602152, "grad_norm": 9.585511207580566, "learning_rate": 6.60203463666112e-06, "loss": 0.1217, "step": 394900 }, { "epoch": 4.098634245476182, "grad_norm": 7.183375358581543, "learning_rate": 6.598641879180514e-06, "loss": 0.121, "step": 395000 }, { "epoch": 4.098803883350213, "grad_norm": 17.678075790405273, "learning_rate": 6.595249121699908e-06, "loss": 0.1332, "step": 395100 }, { "epoch": 4.098973521224242, "grad_norm": 20.850067138671875, "learning_rate": 6.591856364219301e-06, "loss": 0.121, "step": 395200 }, { "epoch": 4.099143159098273, "grad_norm": 13.473274230957031, "learning_rate": 6.588463606738696e-06, "loss": 0.1315, "step": 395300 }, { "epoch": 4.0993127969723036, "grad_norm": 10.54035758972168, "learning_rate": 6.5850708492580895e-06, "loss": 0.1295, "step": 395400 }, { "epoch": 4.099482434846333, "grad_norm": 12.843173027038574, "learning_rate": 6.581678091777483e-06, "loss": 0.1168, "step": 395500 }, { "epoch": 4.099652072720364, "grad_norm": 4.90543270111084, "learning_rate": 6.578285334296876e-06, "loss": 0.1243, "step": 395600 }, { "epoch": 4.099821710594394, "grad_norm": 6.47235631942749, "learning_rate": 6.574892576816271e-06, "loss": 0.1305, "step": 395700 }, { "epoch": 4.099991348468424, "grad_norm": 1.5433365106582642, "learning_rate": 6.571499819335665e-06, "loss": 0.1263, "step": 395800 }, { "epoch": 4.100160986342455, "grad_norm": 18.644304275512695, "learning_rate": 6.568107061855059e-06, "loss": 0.1146, "step": 395900 }, { "epoch": 4.100330624216485, "grad_norm": 24.000244140625, "learning_rate": 6.564714304374452e-06, "loss": 0.1244, "step": 396000 }, { "epoch": 4.100500262090516, "grad_norm": 9.524909019470215, "learning_rate": 6.561321546893846e-06, "loss": 0.1304, "step": 396100 }, { "epoch": 4.1006698999645455, "grad_norm": 24.35831642150879, "learning_rate": 6.557928789413241e-06, "loss": 0.1334, "step": 396200 }, { "epoch": 4.100839537838576, "grad_norm": 9.8112154006958, "learning_rate": 6.554536031932634e-06, "loss": 0.1183, "step": 396300 }, { "epoch": 4.101009175712607, "grad_norm": 8.223645210266113, "learning_rate": 6.5511432744520275e-06, "loss": 0.1172, "step": 396400 }, { "epoch": 4.101178813586636, "grad_norm": 41.328407287597656, "learning_rate": 6.547750516971421e-06, "loss": 0.1209, "step": 396500 }, { "epoch": 4.101348451460667, "grad_norm": 2.6834325790405273, "learning_rate": 6.544357759490816e-06, "loss": 0.1243, "step": 396600 }, { "epoch": 4.101518089334697, "grad_norm": 6.87346887588501, "learning_rate": 6.540965002010209e-06, "loss": 0.1238, "step": 396700 }, { "epoch": 4.101687727208727, "grad_norm": 9.561772346496582, "learning_rate": 6.537572244529603e-06, "loss": 0.116, "step": 396800 }, { "epoch": 4.101857365082758, "grad_norm": 4.926433563232422, "learning_rate": 6.534179487048997e-06, "loss": 0.1208, "step": 396900 }, { "epoch": 4.102027002956788, "grad_norm": 10.411685943603516, "learning_rate": 6.530786729568391e-06, "loss": 0.1374, "step": 397000 }, { "epoch": 4.102196640830819, "grad_norm": 5.555187225341797, "learning_rate": 6.527393972087784e-06, "loss": 0.129, "step": 397100 }, { "epoch": 4.1023662787048485, "grad_norm": 6.412428855895996, "learning_rate": 6.524001214607179e-06, "loss": 0.1203, "step": 397200 }, { "epoch": 4.102535916578879, "grad_norm": 9.939200401306152, "learning_rate": 6.5206084571265725e-06, "loss": 0.1314, "step": 397300 }, { "epoch": 4.10270555445291, "grad_norm": 8.08715534210205, "learning_rate": 6.517215699645967e-06, "loss": 0.1109, "step": 397400 }, { "epoch": 4.102875192326939, "grad_norm": 9.143040657043457, "learning_rate": 6.51382294216536e-06, "loss": 0.1056, "step": 397500 }, { "epoch": 4.10304483020097, "grad_norm": 8.41372299194336, "learning_rate": 6.510430184684754e-06, "loss": 0.128, "step": 397600 }, { "epoch": 4.1032144680750005, "grad_norm": 17.72060775756836, "learning_rate": 6.507037427204148e-06, "loss": 0.126, "step": 397700 }, { "epoch": 4.103384105949031, "grad_norm": 14.042823791503906, "learning_rate": 6.5036446697235415e-06, "loss": 0.1268, "step": 397800 }, { "epoch": 4.103553743823061, "grad_norm": 4.259908199310303, "learning_rate": 6.500251912242935e-06, "loss": 0.1228, "step": 397900 }, { "epoch": 4.103723381697091, "grad_norm": 33.240596771240234, "learning_rate": 6.496859154762329e-06, "loss": 0.1046, "step": 398000 }, { "epoch": 4.103893019571122, "grad_norm": 9.029052734375, "learning_rate": 6.493466397281724e-06, "loss": 0.1253, "step": 398100 }, { "epoch": 4.1040626574451515, "grad_norm": 6.633867263793945, "learning_rate": 6.490073639801117e-06, "loss": 0.1301, "step": 398200 }, { "epoch": 4.104232295319182, "grad_norm": 12.243508338928223, "learning_rate": 6.4866808823205105e-06, "loss": 0.1059, "step": 398300 }, { "epoch": 4.104401933193213, "grad_norm": 17.416114807128906, "learning_rate": 6.483288124839905e-06, "loss": 0.1428, "step": 398400 }, { "epoch": 4.104571571067242, "grad_norm": 9.388175010681152, "learning_rate": 6.479895367359299e-06, "loss": 0.1257, "step": 398500 }, { "epoch": 4.104741208941273, "grad_norm": 11.63109302520752, "learning_rate": 6.476502609878692e-06, "loss": 0.11, "step": 398600 }, { "epoch": 4.1049108468153035, "grad_norm": 12.846633911132812, "learning_rate": 6.4731098523980865e-06, "loss": 0.1196, "step": 398700 }, { "epoch": 4.105080484689334, "grad_norm": 9.889016151428223, "learning_rate": 6.46971709491748e-06, "loss": 0.122, "step": 398800 }, { "epoch": 4.105250122563364, "grad_norm": 12.024748802185059, "learning_rate": 6.466324337436874e-06, "loss": 0.121, "step": 398900 }, { "epoch": 4.105419760437394, "grad_norm": 8.326468467712402, "learning_rate": 6.462931579956268e-06, "loss": 0.1193, "step": 399000 }, { "epoch": 4.105589398311425, "grad_norm": 3.3768062591552734, "learning_rate": 6.459538822475662e-06, "loss": 0.1367, "step": 399100 }, { "epoch": 4.105759036185455, "grad_norm": 21.34464454650879, "learning_rate": 6.4561460649950555e-06, "loss": 0.1206, "step": 399200 }, { "epoch": 4.105928674059485, "grad_norm": 21.801605224609375, "learning_rate": 6.45275330751445e-06, "loss": 0.1062, "step": 399300 }, { "epoch": 4.106098311933516, "grad_norm": 9.61803150177002, "learning_rate": 6.449360550033843e-06, "loss": 0.1347, "step": 399400 }, { "epoch": 4.106267949807545, "grad_norm": 4.604773998260498, "learning_rate": 6.445967792553237e-06, "loss": 0.1328, "step": 399500 }, { "epoch": 4.106437587681576, "grad_norm": 8.072755813598633, "learning_rate": 6.4425750350726315e-06, "loss": 0.1192, "step": 399600 }, { "epoch": 4.1066072255556065, "grad_norm": 8.15114974975586, "learning_rate": 6.4391822775920245e-06, "loss": 0.1202, "step": 399700 }, { "epoch": 4.106776863429637, "grad_norm": 14.356123924255371, "learning_rate": 6.435789520111418e-06, "loss": 0.1322, "step": 399800 }, { "epoch": 4.106946501303667, "grad_norm": 1.0744404792785645, "learning_rate": 6.432396762630813e-06, "loss": 0.1162, "step": 399900 }, { "epoch": 4.107116139177697, "grad_norm": 14.923460006713867, "learning_rate": 6.429004005150207e-06, "loss": 0.1156, "step": 400000 }, { "epoch": 4.107285777051728, "grad_norm": 8.944058418273926, "learning_rate": 6.4256112476696e-06, "loss": 0.1314, "step": 400100 }, { "epoch": 4.107455414925758, "grad_norm": 12.335264205932617, "learning_rate": 6.422218490188994e-06, "loss": 0.1176, "step": 400200 }, { "epoch": 4.107625052799788, "grad_norm": 9.501726150512695, "learning_rate": 6.418825732708388e-06, "loss": 0.1305, "step": 400300 }, { "epoch": 4.107794690673819, "grad_norm": 7.4366607666015625, "learning_rate": 6.415432975227782e-06, "loss": 0.1128, "step": 400400 }, { "epoch": 4.107964328547849, "grad_norm": 14.440910339355469, "learning_rate": 6.412040217747175e-06, "loss": 0.1201, "step": 400500 }, { "epoch": 4.108133966421879, "grad_norm": 7.193301677703857, "learning_rate": 6.4086474602665695e-06, "loss": 0.1233, "step": 400600 }, { "epoch": 4.10830360429591, "grad_norm": 21.39470100402832, "learning_rate": 6.405254702785963e-06, "loss": 0.1144, "step": 400700 }, { "epoch": 4.10847324216994, "grad_norm": 18.405773162841797, "learning_rate": 6.401861945305358e-06, "loss": 0.1255, "step": 400800 }, { "epoch": 4.10864288004397, "grad_norm": 21.07414436340332, "learning_rate": 6.398469187824751e-06, "loss": 0.121, "step": 400900 }, { "epoch": 4.108812517918, "grad_norm": 5.929955959320068, "learning_rate": 6.395076430344145e-06, "loss": 0.1152, "step": 401000 }, { "epoch": 4.108982155792031, "grad_norm": 1.4416608810424805, "learning_rate": 6.391683672863539e-06, "loss": 0.1184, "step": 401100 }, { "epoch": 4.109151793666061, "grad_norm": 5.297900199890137, "learning_rate": 6.388290915382933e-06, "loss": 0.1158, "step": 401200 }, { "epoch": 4.109321431540091, "grad_norm": 11.595690727233887, "learning_rate": 6.384898157902326e-06, "loss": 0.1175, "step": 401300 }, { "epoch": 4.109491069414122, "grad_norm": 20.272695541381836, "learning_rate": 6.381505400421721e-06, "loss": 0.1048, "step": 401400 }, { "epoch": 4.109660707288152, "grad_norm": 10.954911231994629, "learning_rate": 6.3781126429411145e-06, "loss": 0.136, "step": 401500 }, { "epoch": 4.109830345162182, "grad_norm": 14.983708381652832, "learning_rate": 6.374719885460507e-06, "loss": 0.1138, "step": 401600 }, { "epoch": 4.109999983036213, "grad_norm": 8.176724433898926, "learning_rate": 6.371327127979901e-06, "loss": 0.1302, "step": 401700 }, { "epoch": 4.110169620910243, "grad_norm": 3.1861302852630615, "learning_rate": 6.367934370499296e-06, "loss": 0.113, "step": 401800 }, { "epoch": 4.110339258784273, "grad_norm": 13.21297836303711, "learning_rate": 6.36454161301869e-06, "loss": 0.112, "step": 401900 }, { "epoch": 4.110508896658303, "grad_norm": 0.4000560939311981, "learning_rate": 6.361148855538083e-06, "loss": 0.1272, "step": 402000 }, { "epoch": 4.110678534532334, "grad_norm": 34.58991622924805, "learning_rate": 6.357756098057477e-06, "loss": 0.1298, "step": 402100 }, { "epoch": 4.110848172406364, "grad_norm": 3.4877521991729736, "learning_rate": 6.354363340576871e-06, "loss": 0.1242, "step": 402200 }, { "epoch": 4.111017810280394, "grad_norm": 14.389931678771973, "learning_rate": 6.350970583096266e-06, "loss": 0.1215, "step": 402300 }, { "epoch": 4.111187448154425, "grad_norm": 37.03853225708008, "learning_rate": 6.347577825615659e-06, "loss": 0.1292, "step": 402400 }, { "epoch": 4.111357086028455, "grad_norm": 27.75765609741211, "learning_rate": 6.3441850681350524e-06, "loss": 0.1193, "step": 402500 }, { "epoch": 4.111526723902485, "grad_norm": 21.146751403808594, "learning_rate": 6.340792310654447e-06, "loss": 0.1135, "step": 402600 }, { "epoch": 4.111696361776516, "grad_norm": 14.435354232788086, "learning_rate": 6.337399553173841e-06, "loss": 0.1164, "step": 402700 }, { "epoch": 4.111865999650546, "grad_norm": 2.9360485076904297, "learning_rate": 6.334006795693234e-06, "loss": 0.1161, "step": 402800 }, { "epoch": 4.112035637524576, "grad_norm": 4.2193603515625, "learning_rate": 6.330614038212628e-06, "loss": 0.1188, "step": 402900 }, { "epoch": 4.1122052753986065, "grad_norm": 8.44333267211914, "learning_rate": 6.327221280732022e-06, "loss": 0.1211, "step": 403000 }, { "epoch": 4.112374913272637, "grad_norm": 6.193358421325684, "learning_rate": 6.323828523251416e-06, "loss": 0.1152, "step": 403100 }, { "epoch": 4.112544551146668, "grad_norm": 22.641128540039062, "learning_rate": 6.320435765770809e-06, "loss": 0.1169, "step": 403200 }, { "epoch": 4.112714189020697, "grad_norm": 19.256744384765625, "learning_rate": 6.317043008290204e-06, "loss": 0.1357, "step": 403300 }, { "epoch": 4.112883826894728, "grad_norm": 7.843400478363037, "learning_rate": 6.3136502508095974e-06, "loss": 0.1132, "step": 403400 }, { "epoch": 4.113053464768758, "grad_norm": 10.0936861038208, "learning_rate": 6.31025749332899e-06, "loss": 0.1178, "step": 403500 }, { "epoch": 4.113223102642788, "grad_norm": 0.9975932836532593, "learning_rate": 6.306864735848385e-06, "loss": 0.1317, "step": 403600 }, { "epoch": 4.113392740516819, "grad_norm": 18.7751522064209, "learning_rate": 6.303471978367779e-06, "loss": 0.1238, "step": 403700 }, { "epoch": 4.113562378390849, "grad_norm": 3.970158338546753, "learning_rate": 6.3000792208871735e-06, "loss": 0.1278, "step": 403800 }, { "epoch": 4.113732016264879, "grad_norm": 0.21265581250190735, "learning_rate": 6.296686463406566e-06, "loss": 0.109, "step": 403900 }, { "epoch": 4.1139016541389095, "grad_norm": 13.239800453186035, "learning_rate": 6.29329370592596e-06, "loss": 0.1346, "step": 404000 }, { "epoch": 4.11407129201294, "grad_norm": 6.586028575897217, "learning_rate": 6.289900948445354e-06, "loss": 0.106, "step": 404100 }, { "epoch": 4.114240929886971, "grad_norm": 11.02481746673584, "learning_rate": 6.286508190964749e-06, "loss": 0.1178, "step": 404200 }, { "epoch": 4.114410567761, "grad_norm": 19.65526580810547, "learning_rate": 6.283115433484142e-06, "loss": 0.111, "step": 404300 }, { "epoch": 4.114580205635031, "grad_norm": 26.382144927978516, "learning_rate": 6.279722676003535e-06, "loss": 0.1316, "step": 404400 }, { "epoch": 4.1147498435090615, "grad_norm": 4.712153434753418, "learning_rate": 6.27632991852293e-06, "loss": 0.1242, "step": 404500 }, { "epoch": 4.114919481383091, "grad_norm": 1.0792763233184814, "learning_rate": 6.272937161042324e-06, "loss": 0.1211, "step": 404600 }, { "epoch": 4.115089119257122, "grad_norm": 11.060771942138672, "learning_rate": 6.269544403561717e-06, "loss": 0.1285, "step": 404700 }, { "epoch": 4.115258757131152, "grad_norm": 1.5611143112182617, "learning_rate": 6.266151646081111e-06, "loss": 0.1245, "step": 404800 }, { "epoch": 4.115428395005183, "grad_norm": 0.39772316813468933, "learning_rate": 6.262758888600505e-06, "loss": 0.1239, "step": 404900 }, { "epoch": 4.1155980328792126, "grad_norm": 1.7654950618743896, "learning_rate": 6.259366131119898e-06, "loss": 0.1255, "step": 405000 }, { "epoch": 4.115767670753243, "grad_norm": 17.399856567382812, "learning_rate": 6.255973373639293e-06, "loss": 0.1208, "step": 405100 }, { "epoch": 4.115937308627274, "grad_norm": 5.5798726081848145, "learning_rate": 6.252580616158687e-06, "loss": 0.117, "step": 405200 }, { "epoch": 4.116106946501303, "grad_norm": 17.268577575683594, "learning_rate": 6.24918785867808e-06, "loss": 0.1266, "step": 405300 }, { "epoch": 4.116276584375334, "grad_norm": 0.7737802267074585, "learning_rate": 6.245795101197474e-06, "loss": 0.1288, "step": 405400 }, { "epoch": 4.1164462222493645, "grad_norm": 12.694964408874512, "learning_rate": 6.242402343716868e-06, "loss": 0.1166, "step": 405500 }, { "epoch": 4.116615860123394, "grad_norm": 2.385915994644165, "learning_rate": 6.239009586236262e-06, "loss": 0.1214, "step": 405600 }, { "epoch": 4.116785497997425, "grad_norm": 7.625898361206055, "learning_rate": 6.2356168287556564e-06, "loss": 0.1261, "step": 405700 }, { "epoch": 4.116955135871455, "grad_norm": 8.831022262573242, "learning_rate": 6.232224071275049e-06, "loss": 0.1303, "step": 405800 }, { "epoch": 4.117124773745486, "grad_norm": 3.9241127967834473, "learning_rate": 6.228831313794443e-06, "loss": 0.1155, "step": 405900 }, { "epoch": 4.117294411619516, "grad_norm": 1.7932560443878174, "learning_rate": 6.225438556313838e-06, "loss": 0.1184, "step": 406000 }, { "epoch": 4.117464049493546, "grad_norm": 7.483323574066162, "learning_rate": 6.222045798833232e-06, "loss": 0.1248, "step": 406100 }, { "epoch": 4.117633687367577, "grad_norm": 18.458415985107422, "learning_rate": 6.2186530413526246e-06, "loss": 0.1283, "step": 406200 }, { "epoch": 4.117803325241606, "grad_norm": 0.5708405375480652, "learning_rate": 6.215260283872019e-06, "loss": 0.1152, "step": 406300 }, { "epoch": 4.117972963115637, "grad_norm": 36.32638931274414, "learning_rate": 6.211867526391413e-06, "loss": 0.1151, "step": 406400 }, { "epoch": 4.118142600989668, "grad_norm": 6.188488960266113, "learning_rate": 6.208474768910807e-06, "loss": 0.1356, "step": 406500 }, { "epoch": 4.118312238863697, "grad_norm": 11.374059677124023, "learning_rate": 6.205082011430201e-06, "loss": 0.1225, "step": 406600 }, { "epoch": 4.118481876737728, "grad_norm": 1.614737629890442, "learning_rate": 6.201689253949594e-06, "loss": 0.1166, "step": 406700 }, { "epoch": 4.118651514611758, "grad_norm": 10.638137817382812, "learning_rate": 6.198296496468988e-06, "loss": 0.1139, "step": 406800 }, { "epoch": 4.118821152485789, "grad_norm": 3.247307062149048, "learning_rate": 6.194903738988381e-06, "loss": 0.116, "step": 406900 }, { "epoch": 4.118990790359819, "grad_norm": 4.9236907958984375, "learning_rate": 6.191510981507776e-06, "loss": 0.1152, "step": 407000 }, { "epoch": 4.119160428233849, "grad_norm": 11.412353515625, "learning_rate": 6.1881182240271696e-06, "loss": 0.1412, "step": 407100 }, { "epoch": 4.11933006610788, "grad_norm": 6.1894612312316895, "learning_rate": 6.184725466546564e-06, "loss": 0.1228, "step": 407200 }, { "epoch": 4.1194997039819095, "grad_norm": 17.216278076171875, "learning_rate": 6.181332709065957e-06, "loss": 0.1288, "step": 407300 }, { "epoch": 4.11966934185594, "grad_norm": 1.101974368095398, "learning_rate": 6.177939951585351e-06, "loss": 0.1146, "step": 407400 }, { "epoch": 4.119838979729971, "grad_norm": 9.500003814697266, "learning_rate": 6.174547194104746e-06, "loss": 0.1204, "step": 407500 }, { "epoch": 4.120008617604001, "grad_norm": 9.752473831176758, "learning_rate": 6.171154436624139e-06, "loss": 0.1329, "step": 407600 }, { "epoch": 4.120178255478031, "grad_norm": 10.531750679016113, "learning_rate": 6.167761679143532e-06, "loss": 0.1306, "step": 407700 }, { "epoch": 4.120347893352061, "grad_norm": 20.847135543823242, "learning_rate": 6.164368921662926e-06, "loss": 0.1465, "step": 407800 }, { "epoch": 4.120517531226092, "grad_norm": 23.819353103637695, "learning_rate": 6.160976164182321e-06, "loss": 0.118, "step": 407900 }, { "epoch": 4.120687169100122, "grad_norm": 10.447949409484863, "learning_rate": 6.1575834067017146e-06, "loss": 0.1096, "step": 408000 }, { "epoch": 4.120856806974152, "grad_norm": 10.087700843811035, "learning_rate": 6.1541906492211075e-06, "loss": 0.1179, "step": 408100 }, { "epoch": 4.121026444848183, "grad_norm": 4.59835147857666, "learning_rate": 6.150797891740502e-06, "loss": 0.1169, "step": 408200 }, { "epoch": 4.1211960827222125, "grad_norm": 5.23657751083374, "learning_rate": 6.147405134259896e-06, "loss": 0.1377, "step": 408300 }, { "epoch": 4.121365720596243, "grad_norm": 9.091748237609863, "learning_rate": 6.144012376779291e-06, "loss": 0.1134, "step": 408400 }, { "epoch": 4.121535358470274, "grad_norm": 20.243488311767578, "learning_rate": 6.1406196192986835e-06, "loss": 0.1124, "step": 408500 }, { "epoch": 4.121704996344304, "grad_norm": 4.846342086791992, "learning_rate": 6.137226861818077e-06, "loss": 0.1286, "step": 408600 }, { "epoch": 4.121874634218334, "grad_norm": 20.212146759033203, "learning_rate": 6.133834104337472e-06, "loss": 0.126, "step": 408700 }, { "epoch": 4.1220442720923645, "grad_norm": 9.41696834564209, "learning_rate": 6.130441346856865e-06, "loss": 0.1202, "step": 408800 }, { "epoch": 4.122213909966395, "grad_norm": 17.517623901367188, "learning_rate": 6.127048589376259e-06, "loss": 0.1161, "step": 408900 }, { "epoch": 4.122383547840425, "grad_norm": 14.106881141662598, "learning_rate": 6.1236558318956525e-06, "loss": 0.1065, "step": 409000 }, { "epoch": 4.122553185714455, "grad_norm": 6.102715969085693, "learning_rate": 6.120263074415047e-06, "loss": 0.1177, "step": 409100 }, { "epoch": 4.122722823588486, "grad_norm": 27.790987014770508, "learning_rate": 6.11687031693444e-06, "loss": 0.1293, "step": 409200 }, { "epoch": 4.1228924614625155, "grad_norm": 26.540987014770508, "learning_rate": 6.113477559453834e-06, "loss": 0.1318, "step": 409300 }, { "epoch": 4.123062099336546, "grad_norm": 1.436174988746643, "learning_rate": 6.1100848019732286e-06, "loss": 0.1323, "step": 409400 }, { "epoch": 4.123231737210577, "grad_norm": 23.07426643371582, "learning_rate": 6.106692044492622e-06, "loss": 0.1366, "step": 409500 }, { "epoch": 4.123401375084607, "grad_norm": 6.997931480407715, "learning_rate": 6.103299287012015e-06, "loss": 0.1243, "step": 409600 }, { "epoch": 4.123571012958637, "grad_norm": 0.6569910049438477, "learning_rate": 6.09990652953141e-06, "loss": 0.1235, "step": 409700 }, { "epoch": 4.1237406508326675, "grad_norm": 24.332027435302734, "learning_rate": 6.096513772050804e-06, "loss": 0.1243, "step": 409800 }, { "epoch": 4.123910288706698, "grad_norm": 21.124120712280273, "learning_rate": 6.093121014570198e-06, "loss": 0.125, "step": 409900 }, { "epoch": 4.124079926580728, "grad_norm": 22.69645118713379, "learning_rate": 6.089728257089591e-06, "loss": 0.1237, "step": 410000 }, { "epoch": 4.124249564454758, "grad_norm": 5.552475452423096, "learning_rate": 6.086335499608985e-06, "loss": 0.1245, "step": 410100 }, { "epoch": 4.124419202328789, "grad_norm": 3.730905771255493, "learning_rate": 6.082942742128379e-06, "loss": 0.1249, "step": 410200 }, { "epoch": 4.1245888402028195, "grad_norm": 8.940705299377441, "learning_rate": 6.079549984647773e-06, "loss": 0.1047, "step": 410300 }, { "epoch": 4.124758478076849, "grad_norm": 9.326803207397461, "learning_rate": 6.0761572271671665e-06, "loss": 0.1524, "step": 410400 }, { "epoch": 4.12492811595088, "grad_norm": 20.728240966796875, "learning_rate": 6.07276446968656e-06, "loss": 0.1165, "step": 410500 }, { "epoch": 4.12509775382491, "grad_norm": 8.523305892944336, "learning_rate": 6.069371712205955e-06, "loss": 0.1157, "step": 410600 }, { "epoch": 4.12526739169894, "grad_norm": 7.816564559936523, "learning_rate": 6.065978954725348e-06, "loss": 0.1224, "step": 410700 }, { "epoch": 4.1254370295729705, "grad_norm": 11.925230026245117, "learning_rate": 6.062586197244742e-06, "loss": 0.1231, "step": 410800 }, { "epoch": 4.125606667447001, "grad_norm": 1.3695433139801025, "learning_rate": 6.059193439764136e-06, "loss": 0.1128, "step": 410900 }, { "epoch": 4.125776305321031, "grad_norm": 20.02553367614746, "learning_rate": 6.05580068228353e-06, "loss": 0.1236, "step": 411000 }, { "epoch": 4.125945943195061, "grad_norm": 11.211836814880371, "learning_rate": 6.052407924802923e-06, "loss": 0.1295, "step": 411100 }, { "epoch": 4.126115581069092, "grad_norm": 16.013132095336914, "learning_rate": 6.049015167322318e-06, "loss": 0.1149, "step": 411200 }, { "epoch": 4.1262852189431225, "grad_norm": 19.966793060302734, "learning_rate": 6.0456224098417115e-06, "loss": 0.1247, "step": 411300 }, { "epoch": 4.126454856817152, "grad_norm": 13.985477447509766, "learning_rate": 6.042229652361105e-06, "loss": 0.1111, "step": 411400 }, { "epoch": 4.126624494691183, "grad_norm": 0.277375727891922, "learning_rate": 6.038836894880499e-06, "loss": 0.1111, "step": 411500 }, { "epoch": 4.126794132565213, "grad_norm": 3.8561112880706787, "learning_rate": 6.035444137399893e-06, "loss": 0.1397, "step": 411600 }, { "epoch": 4.126963770439243, "grad_norm": 1.843510389328003, "learning_rate": 6.032051379919287e-06, "loss": 0.1322, "step": 411700 }, { "epoch": 4.127133408313274, "grad_norm": 4.903130531311035, "learning_rate": 6.028658622438681e-06, "loss": 0.1374, "step": 411800 }, { "epoch": 4.127303046187304, "grad_norm": 9.16454029083252, "learning_rate": 6.025265864958074e-06, "loss": 0.1244, "step": 411900 }, { "epoch": 4.127472684061335, "grad_norm": 27.47962760925293, "learning_rate": 6.021873107477468e-06, "loss": 0.1362, "step": 412000 }, { "epoch": 4.127642321935364, "grad_norm": 10.730592727661133, "learning_rate": 6.018480349996863e-06, "loss": 0.1265, "step": 412100 }, { "epoch": 4.127811959809395, "grad_norm": 20.302291870117188, "learning_rate": 6.015087592516256e-06, "loss": 0.1145, "step": 412200 }, { "epoch": 4.1279815976834255, "grad_norm": 12.211941719055176, "learning_rate": 6.0116948350356495e-06, "loss": 0.1368, "step": 412300 }, { "epoch": 4.128151235557455, "grad_norm": 7.034080982208252, "learning_rate": 6.008302077555044e-06, "loss": 0.1132, "step": 412400 }, { "epoch": 4.128320873431486, "grad_norm": 18.96527862548828, "learning_rate": 6.004909320074438e-06, "loss": 0.1132, "step": 412500 }, { "epoch": 4.128490511305516, "grad_norm": 19.94338607788086, "learning_rate": 6.001516562593831e-06, "loss": 0.1202, "step": 412600 }, { "epoch": 4.128660149179546, "grad_norm": 15.721869468688965, "learning_rate": 5.9981238051132255e-06, "loss": 0.1292, "step": 412700 }, { "epoch": 4.128829787053577, "grad_norm": 15.175695419311523, "learning_rate": 5.994731047632619e-06, "loss": 0.1181, "step": 412800 }, { "epoch": 4.128999424927607, "grad_norm": 6.918076038360596, "learning_rate": 5.991338290152013e-06, "loss": 0.141, "step": 412900 }, { "epoch": 4.129169062801638, "grad_norm": 3.71687388420105, "learning_rate": 5.987945532671406e-06, "loss": 0.1133, "step": 413000 }, { "epoch": 4.129338700675667, "grad_norm": 13.88613224029541, "learning_rate": 5.984552775190801e-06, "loss": 0.113, "step": 413100 }, { "epoch": 4.129508338549698, "grad_norm": 6.198386192321777, "learning_rate": 5.9811600177101945e-06, "loss": 0.1095, "step": 413200 }, { "epoch": 4.129677976423729, "grad_norm": 15.295766830444336, "learning_rate": 5.977767260229589e-06, "loss": 0.1283, "step": 413300 }, { "epoch": 4.129847614297758, "grad_norm": 1.3633476495742798, "learning_rate": 5.974374502748982e-06, "loss": 0.1157, "step": 413400 }, { "epoch": 4.130017252171789, "grad_norm": 3.4762251377105713, "learning_rate": 5.970981745268376e-06, "loss": 0.1087, "step": 413500 }, { "epoch": 4.130186890045819, "grad_norm": 14.099475860595703, "learning_rate": 5.9675889877877705e-06, "loss": 0.1238, "step": 413600 }, { "epoch": 4.130356527919849, "grad_norm": 14.464191436767578, "learning_rate": 5.964196230307164e-06, "loss": 0.1148, "step": 413700 }, { "epoch": 4.13052616579388, "grad_norm": 0.23428601026535034, "learning_rate": 5.960803472826557e-06, "loss": 0.1187, "step": 413800 }, { "epoch": 4.13069580366791, "grad_norm": 18.431804656982422, "learning_rate": 5.957410715345952e-06, "loss": 0.1081, "step": 413900 }, { "epoch": 4.130865441541941, "grad_norm": 6.473383903503418, "learning_rate": 5.954017957865346e-06, "loss": 0.1127, "step": 414000 }, { "epoch": 4.1310350794159705, "grad_norm": 13.232316017150879, "learning_rate": 5.950625200384739e-06, "loss": 0.1333, "step": 414100 }, { "epoch": 4.131204717290001, "grad_norm": 13.854168891906738, "learning_rate": 5.9472324429041324e-06, "loss": 0.1304, "step": 414200 }, { "epoch": 4.131374355164032, "grad_norm": 6.856204986572266, "learning_rate": 5.943839685423527e-06, "loss": 0.1339, "step": 414300 }, { "epoch": 4.131543993038061, "grad_norm": 19.8378963470459, "learning_rate": 5.940446927942921e-06, "loss": 0.117, "step": 414400 }, { "epoch": 4.131713630912092, "grad_norm": 19.874685287475586, "learning_rate": 5.937054170462314e-06, "loss": 0.137, "step": 414500 }, { "epoch": 4.131883268786122, "grad_norm": 1.1106826066970825, "learning_rate": 5.9336614129817085e-06, "loss": 0.1178, "step": 414600 }, { "epoch": 4.132052906660153, "grad_norm": 13.482819557189941, "learning_rate": 5.930268655501102e-06, "loss": 0.1264, "step": 414700 }, { "epoch": 4.132222544534183, "grad_norm": 5.59124755859375, "learning_rate": 5.926875898020497e-06, "loss": 0.1282, "step": 414800 }, { "epoch": 4.132392182408213, "grad_norm": 2.0004255771636963, "learning_rate": 5.92348314053989e-06, "loss": 0.129, "step": 414900 }, { "epoch": 4.132561820282244, "grad_norm": 12.902405738830566, "learning_rate": 5.920090383059284e-06, "loss": 0.1236, "step": 415000 }, { "epoch": 4.1327314581562735, "grad_norm": 6.045074462890625, "learning_rate": 5.916697625578678e-06, "loss": 0.1222, "step": 415100 }, { "epoch": 4.132901096030304, "grad_norm": 8.823320388793945, "learning_rate": 5.913304868098072e-06, "loss": 0.1201, "step": 415200 }, { "epoch": 4.133070733904335, "grad_norm": 21.15587043762207, "learning_rate": 5.909912110617465e-06, "loss": 0.1163, "step": 415300 }, { "epoch": 4.133240371778364, "grad_norm": 15.698034286499023, "learning_rate": 5.906519353136859e-06, "loss": 0.1156, "step": 415400 }, { "epoch": 4.133410009652395, "grad_norm": 7.130799770355225, "learning_rate": 5.9031265956562535e-06, "loss": 0.1358, "step": 415500 }, { "epoch": 4.1335796475264255, "grad_norm": 4.329277992248535, "learning_rate": 5.8997338381756464e-06, "loss": 0.1316, "step": 415600 }, { "epoch": 4.133749285400456, "grad_norm": 9.439695358276367, "learning_rate": 5.89634108069504e-06, "loss": 0.1363, "step": 415700 }, { "epoch": 4.133918923274486, "grad_norm": 12.175934791564941, "learning_rate": 5.892948323214435e-06, "loss": 0.1183, "step": 415800 }, { "epoch": 4.134088561148516, "grad_norm": 20.81033706665039, "learning_rate": 5.889555565733829e-06, "loss": 0.1122, "step": 415900 }, { "epoch": 4.134258199022547, "grad_norm": 22.079803466796875, "learning_rate": 5.886162808253222e-06, "loss": 0.1319, "step": 416000 }, { "epoch": 4.134427836896577, "grad_norm": 32.99759292602539, "learning_rate": 5.882770050772616e-06, "loss": 0.1104, "step": 416100 }, { "epoch": 4.134597474770607, "grad_norm": 13.879011154174805, "learning_rate": 5.87937729329201e-06, "loss": 0.1343, "step": 416200 }, { "epoch": 4.134767112644638, "grad_norm": 19.230188369750977, "learning_rate": 5.875984535811405e-06, "loss": 0.1083, "step": 416300 }, { "epoch": 4.134936750518667, "grad_norm": 1.8381648063659668, "learning_rate": 5.872591778330798e-06, "loss": 0.1336, "step": 416400 }, { "epoch": 4.135106388392698, "grad_norm": 9.054830551147461, "learning_rate": 5.8691990208501914e-06, "loss": 0.1132, "step": 416500 }, { "epoch": 4.1352760262667285, "grad_norm": 9.631292343139648, "learning_rate": 5.865806263369585e-06, "loss": 0.1032, "step": 416600 }, { "epoch": 4.135445664140759, "grad_norm": 3.9058432579040527, "learning_rate": 5.86241350588898e-06, "loss": 0.1185, "step": 416700 }, { "epoch": 4.135615302014789, "grad_norm": 19.27168083190918, "learning_rate": 5.859020748408373e-06, "loss": 0.118, "step": 416800 }, { "epoch": 4.135784939888819, "grad_norm": 11.036360740661621, "learning_rate": 5.855627990927767e-06, "loss": 0.1266, "step": 416900 }, { "epoch": 4.13595457776285, "grad_norm": 12.796171188354492, "learning_rate": 5.852235233447161e-06, "loss": 0.1087, "step": 417000 }, { "epoch": 4.13612421563688, "grad_norm": 5.90040922164917, "learning_rate": 5.848842475966555e-06, "loss": 0.1278, "step": 417100 }, { "epoch": 4.13629385351091, "grad_norm": 0.6688133478164673, "learning_rate": 5.845449718485948e-06, "loss": 0.1047, "step": 417200 }, { "epoch": 4.136463491384941, "grad_norm": 34.76011276245117, "learning_rate": 5.842056961005343e-06, "loss": 0.1347, "step": 417300 }, { "epoch": 4.136633129258971, "grad_norm": 1.3853967189788818, "learning_rate": 5.8386642035247364e-06, "loss": 0.1256, "step": 417400 }, { "epoch": 4.136802767133001, "grad_norm": 3.3147244453430176, "learning_rate": 5.835271446044129e-06, "loss": 0.1229, "step": 417500 }, { "epoch": 4.136972405007032, "grad_norm": 11.806756973266602, "learning_rate": 5.831878688563524e-06, "loss": 0.1345, "step": 417600 }, { "epoch": 4.137142042881062, "grad_norm": 25.9591007232666, "learning_rate": 5.828485931082918e-06, "loss": 0.1225, "step": 417700 }, { "epoch": 4.137311680755092, "grad_norm": 1.7926803827285767, "learning_rate": 5.825093173602312e-06, "loss": 0.1207, "step": 417800 }, { "epoch": 4.137481318629122, "grad_norm": 27.478742599487305, "learning_rate": 5.821700416121705e-06, "loss": 0.1286, "step": 417900 }, { "epoch": 4.137650956503153, "grad_norm": 11.38985824584961, "learning_rate": 5.818307658641099e-06, "loss": 0.1329, "step": 418000 }, { "epoch": 4.137820594377183, "grad_norm": 13.047979354858398, "learning_rate": 5.814914901160493e-06, "loss": 0.1285, "step": 418100 }, { "epoch": 4.137990232251213, "grad_norm": 9.576428413391113, "learning_rate": 5.811522143679888e-06, "loss": 0.1233, "step": 418200 }, { "epoch": 4.138159870125244, "grad_norm": 0.9438449144363403, "learning_rate": 5.808129386199281e-06, "loss": 0.1198, "step": 418300 }, { "epoch": 4.138329507999274, "grad_norm": 14.571768760681152, "learning_rate": 5.804736628718674e-06, "loss": 0.143, "step": 418400 }, { "epoch": 4.138499145873304, "grad_norm": 0.5408310890197754, "learning_rate": 5.801343871238069e-06, "loss": 0.1316, "step": 418500 }, { "epoch": 4.138668783747335, "grad_norm": 2.290682077407837, "learning_rate": 5.797951113757463e-06, "loss": 0.1009, "step": 418600 }, { "epoch": 4.138838421621365, "grad_norm": 10.857550621032715, "learning_rate": 5.794558356276856e-06, "loss": 0.1243, "step": 418700 }, { "epoch": 4.139008059495395, "grad_norm": 11.968220710754395, "learning_rate": 5.7911655987962504e-06, "loss": 0.122, "step": 418800 }, { "epoch": 4.139177697369425, "grad_norm": 1.7733838558197021, "learning_rate": 5.787772841315644e-06, "loss": 0.1272, "step": 418900 }, { "epoch": 4.139347335243456, "grad_norm": 3.040386438369751, "learning_rate": 5.784380083835038e-06, "loss": 0.1386, "step": 419000 }, { "epoch": 4.139516973117486, "grad_norm": 16.805509567260742, "learning_rate": 5.780987326354431e-06, "loss": 0.1254, "step": 419100 }, { "epoch": 4.139686610991516, "grad_norm": 20.544286727905273, "learning_rate": 5.777594568873826e-06, "loss": 0.1093, "step": 419200 }, { "epoch": 4.139856248865547, "grad_norm": 4.153665542602539, "learning_rate": 5.774201811393219e-06, "loss": 0.1233, "step": 419300 }, { "epoch": 4.140025886739577, "grad_norm": 3.4744088649749756, "learning_rate": 5.770809053912612e-06, "loss": 0.1303, "step": 419400 }, { "epoch": 4.140195524613607, "grad_norm": 12.885334968566895, "learning_rate": 5.767416296432007e-06, "loss": 0.1225, "step": 419500 }, { "epoch": 4.140365162487638, "grad_norm": 24.783815383911133, "learning_rate": 5.764023538951401e-06, "loss": 0.111, "step": 419600 }, { "epoch": 4.140534800361668, "grad_norm": 19.816476821899414, "learning_rate": 5.7606307814707954e-06, "loss": 0.1346, "step": 419700 }, { "epoch": 4.140704438235698, "grad_norm": 16.108043670654297, "learning_rate": 5.757238023990188e-06, "loss": 0.116, "step": 419800 }, { "epoch": 4.1408740761097285, "grad_norm": 2.153003692626953, "learning_rate": 5.753845266509582e-06, "loss": 0.1249, "step": 419900 }, { "epoch": 4.141043713983759, "grad_norm": 10.011972427368164, "learning_rate": 5.750452509028977e-06, "loss": 0.1355, "step": 420000 }, { "epoch": 4.14121335185779, "grad_norm": 11.62508487701416, "learning_rate": 5.747059751548371e-06, "loss": 0.1217, "step": 420100 }, { "epoch": 4.141382989731819, "grad_norm": 12.086817741394043, "learning_rate": 5.7436669940677636e-06, "loss": 0.1292, "step": 420200 }, { "epoch": 4.14155262760585, "grad_norm": 10.95189380645752, "learning_rate": 5.740274236587157e-06, "loss": 0.1088, "step": 420300 }, { "epoch": 4.14172226547988, "grad_norm": 19.07208251953125, "learning_rate": 5.736881479106552e-06, "loss": 0.1242, "step": 420400 }, { "epoch": 4.14189190335391, "grad_norm": 11.889034271240234, "learning_rate": 5.733488721625946e-06, "loss": 0.1119, "step": 420500 }, { "epoch": 4.142061541227941, "grad_norm": 10.076273918151855, "learning_rate": 5.730095964145339e-06, "loss": 0.1227, "step": 420600 }, { "epoch": 4.142231179101971, "grad_norm": 16.522918701171875, "learning_rate": 5.726703206664733e-06, "loss": 0.1267, "step": 420700 }, { "epoch": 4.142400816976001, "grad_norm": 8.970725059509277, "learning_rate": 5.723310449184127e-06, "loss": 0.1364, "step": 420800 }, { "epoch": 4.1425704548500315, "grad_norm": 13.779717445373535, "learning_rate": 5.719917691703522e-06, "loss": 0.1315, "step": 420900 }, { "epoch": 4.142740092724062, "grad_norm": 60.513118743896484, "learning_rate": 5.716524934222915e-06, "loss": 0.1181, "step": 421000 }, { "epoch": 4.142858839235883, "eval_accuracy": 0.8196407564656073, "eval_f1": 0.87582716291724, "eval_loss": 0.6810571551322937, "eval_runtime": 385.4176, "eval_samples_per_second": 868.995, "eval_steps_per_second": 27.158, "step": 421070 } ], "logging_steps": 100, "max_steps": 589491, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8733532766050714e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }